From a5eca5d00489d5e7cf8cc416083d88211725da16 Mon Sep 17 00:00:00 2001 From: Mingrui Date: Mon, 9 Mar 2026 15:44:15 +0000 Subject: [PATCH 01/20] fix reading field in python scope avoiding gpu-cpu sync --- genesis/engine/solvers/rigid/rigid_solver.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/genesis/engine/solvers/rigid/rigid_solver.py b/genesis/engine/solvers/rigid/rigid_solver.py index f6d32eb6f..f1a0fb94f 100644 --- a/genesis/engine/solvers/rigid/rigid_solver.py +++ b/genesis/engine/solvers/rigid/rigid_solver.py @@ -440,6 +440,9 @@ def _create_data_manager(self): self._errno = self.data_manager.errno self._rigid_global_info = self.data_manager.rigid_global_info + self._rigid_global_info._n_iterations = ( + self._options.iterations + ) # Python-native mirror to avoid CPU-GPU sync in Python-scope functions self._rigid_adjoint_cache = self.data_manager.rigid_adjoint_cache if self._use_hibernation: self.n_awake_dofs = self._rigid_global_info.n_awake_dofs From b786ecc868c146087f1e0cc940333a8e0bc40c1f Mon Sep 17 00:00:00 2001 From: Mingrui Date: Mon, 9 Mar 2026 16:29:05 +0000 Subject: [PATCH 02/20] update --- genesis/engine/solvers/rigid/rigid_solver.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/genesis/engine/solvers/rigid/rigid_solver.py b/genesis/engine/solvers/rigid/rigid_solver.py index f1a0fb94f..f6d32eb6f 100644 --- a/genesis/engine/solvers/rigid/rigid_solver.py +++ b/genesis/engine/solvers/rigid/rigid_solver.py @@ -440,9 +440,6 @@ def _create_data_manager(self): self._errno = self.data_manager.errno self._rigid_global_info = self.data_manager.rigid_global_info - self._rigid_global_info._n_iterations = ( - self._options.iterations - ) # Python-native mirror to avoid CPU-GPU sync in Python-scope functions self._rigid_adjoint_cache = self.data_manager.rigid_adjoint_cache if self._use_hibernation: self.n_awake_dofs = self._rigid_global_info.n_awake_dofs From 3f1737e4023d0f10cbf13dbbc2bc79290c5858d5 Mon Sep 17 00:00:00 2001 From: Mingrui Date: 
Mon, 9 Mar 2026 18:10:05 +0000 Subject: [PATCH 03/20] Replace sequential linesearch with parallel multi-kernel linesearch Replace the sequential per-env linesearch (_kernel_linesearch) with a parallel linesearch pipeline using 6 specialized kernels: - _kernel_parallel_linesearch_mv: mv = M @ search, ndrange(dof, env) - _kernel_parallel_linesearch_jv: jv = J @ search, ndrange(constraint, env) - _kernel_parallel_linesearch_p0: fused snorm/quad_gauss/eq_sum/p0_cost with shared memory reductions - _kernel_parallel_linesearch_eval: K=16 log-spaced candidates evaluated in parallel with shared memory argmin - _kernel_parallel_linesearch_apply_alpha_dofs: apply best alpha to qacc/Ma - _kernel_parallel_linesearch_apply_alpha_constraints: apply best alpha to Jaref Also includes decomposed update_constraint (3 kernels) in the iteration loop. Additional changes: - Add dofs_info to func_solve_body dispatch signature - Add _log_scale helper function to solver.py - Exclude requires_grad from decomposed path (parallel LS is sensitive to FP precision) - Update test_grad.py to pass dofs_info Co-Authored-By: Claude Opus 4.6 --- .../engine/solvers/rigid/constraint/solver.py | 9 + .../rigid/constraint/solver_breakdown.py | 484 ++++++++++++++++-- tests/test_grad.py | 1 + 3 files changed, 454 insertions(+), 40 deletions(-) diff --git a/genesis/engine/solvers/rigid/constraint/solver.py b/genesis/engine/solvers/rigid/constraint/solver.py index 986a8c1d4..d0a42e2d4 100644 --- a/genesis/engine/solvers/rigid/constraint/solver.py +++ b/genesis/engine/solvers/rigid/constraint/solver.py @@ -219,6 +219,7 @@ def resolve(self): func_solve_body( self._solver.entities_info, + self._solver.dofs_info, self._solver.dofs_state, self.constraint_state, self._solver._rigid_global_info, @@ -2354,6 +2355,12 @@ def update_bracket_no_eval_local( return flag, p_alpha, p_cost, p_grad, p_hess, p_next_alpha +@qd.func +def _log_scale(min_value: gs.qd_float, max_value: gs.qd_float, num_values: qd.i32, i: qd.i32) -> 
gs.qd_float: + step = (qd.log(max_value) - qd.log(min_value)) / qd.max(1.0, gs.qd_float(num_values - 1)) + return qd.exp(qd.log(min_value) + gs.qd_float(i) * step) + + @qd.func def func_linesearch_and_apply_alpha( i_b, @@ -3105,6 +3112,7 @@ def func_solve_iter( ) def func_solve_body( entities_info: array_class.EntitiesInfo, + dofs_info: array_class.DofsInfo, dofs_state: array_class.DofsState, constraint_state: array_class.ConstraintState, rigid_global_info: array_class.RigidGlobalInfo, @@ -3117,6 +3125,7 @@ def func_solve_body( @qd.kernel(fastcache=gs.use_fastcache) def func_solve_body_monolith( entities_info: array_class.EntitiesInfo, + dofs_info: array_class.DofsInfo, dofs_state: array_class.DofsState, constraint_state: array_class.ConstraintState, rigid_global_info: array_class.RigidGlobalInfo, diff --git a/genesis/engine/solvers/rigid/constraint/solver_breakdown.py b/genesis/engine/solvers/rigid/constraint/solver_breakdown.py index e5cb7f655..a05cb2242 100644 --- a/genesis/engine/solvers/rigid/constraint/solver_breakdown.py +++ b/genesis/engine/solvers/rigid/constraint/solver_breakdown.py @@ -1,62 +1,429 @@ -import quadrants as ti +import quadrants as qd import genesis as gs import genesis.utils.array_class as array_class from genesis.engine.solvers.rigid.constraint import solver +LS_PARALLEL_K = 16 +LS_PARALLEL_MIN_STEP = 1e-6 +LS_PARALLEL_N_REFINE = 1 # number of successive refinement passes in parallel linesearch +_P0_BLOCK = 32 +_JV_BLOCK = 32 -@ti.kernel(fastcache=gs.use_fastcache) -def _kernel_linesearch( + +@qd.kernel(fastcache=gs.use_fastcache) +def _kernel_parallel_linesearch_mv( + dofs_info: array_class.DofsInfo, entities_info: array_class.EntitiesInfo, + constraint_state: array_class.ConstraintState, + rigid_global_info: array_class.RigidGlobalInfo, + static_rigid_sim_config: qd.template(), +): + """Compute mv = M @ search, parallelized over (dof, env). 
+ + Uses per-dof entity lookup to find the entity block boundaries, giving n_dofs * B + threads (each computing a single ~6-element dot product) instead of n_entities * B + threads (each computing the full block matvec). + """ + n_dofs = constraint_state.search.shape[0] + _B = constraint_state.grad.shape[1] + + qd.loop_config(serialize=static_rigid_sim_config.para_level < gs.PARA_LEVEL.ALL) + for i_d1, i_b in qd.ndrange(n_dofs, _B): + if constraint_state.n_constraints[i_b] > 0: + I_d1 = [i_d1, i_b] if qd.static(static_rigid_sim_config.batch_dofs_info) else i_d1 + i_e = dofs_info.entity_idx[I_d1] + mv = gs.qd_float(0.0) + for i_d2 in range(entities_info.dof_start[i_e], entities_info.dof_end[i_e]): + mv = mv + rigid_global_info.mass_mat[i_d1, i_d2, i_b] * constraint_state.search[i_d2, i_b] + constraint_state.mv[i_d1, i_b] = mv + + +@qd.kernel(fastcache=gs.use_fastcache) +def _kernel_parallel_linesearch_jv( + constraint_state: array_class.ConstraintState, + static_rigid_sim_config: qd.template(), +): + """Compute jv = J @ search, parallelized over (constraint, env).""" + n_dofs = constraint_state.search.shape[0] + len_constraints = constraint_state.jac.shape[0] + _B = constraint_state.grad.shape[1] + + qd.loop_config(serialize=static_rigid_sim_config.para_level < gs.PARA_LEVEL.ALL) + for i_c, i_b in qd.ndrange(len_constraints, _B): + if i_c < constraint_state.n_constraints[i_b]: + jv = gs.qd_float(0.0) + if qd.static(static_rigid_sim_config.sparse_solve): + for i_d_ in range(constraint_state.jac_n_relevant_dofs[i_c, i_b]): + i_d = constraint_state.jac_relevant_dofs[i_c, i_d_, i_b] + jv = jv + constraint_state.jac[i_c, i_d, i_b] * constraint_state.search[i_d, i_b] + else: + for i_d in range(n_dofs): + jv = jv + constraint_state.jac[i_c, i_d, i_b] * constraint_state.search[i_d, i_b] + constraint_state.jv[i_c, i_b] = jv + + +@qd.kernel(fastcache=gs.use_fastcache) +def _kernel_parallel_linesearch_p0( dofs_state: array_class.DofsState, constraint_state: 
array_class.ConstraintState, rigid_global_info: array_class.RigidGlobalInfo, - static_rigid_sim_config: ti.template(), + static_rigid_sim_config: qd.template(), ): + """Snorm check, quad_gauss, eq_sum, and p0_cost. T threads per env with shared memory reductions. + + Phase 1: Fused snorm + quad_gauss parallel reduction over n_dofs (Options A+B). + Phase 2: Parallel reduction over n_constraints for eq_sum and p0_cost. + """ _B = constraint_state.grad.shape[1] - ti.loop_config(serialize=static_rigid_sim_config.para_level < gs.PARA_LEVEL.ALL, block_dim=32) - for i_b in range(_B): + _T = qd.static(_P0_BLOCK) + + qd.loop_config(block_dim=_T) + for i_flat in range(_B * _T): + tid = i_flat % _T + i_b = i_flat // _T + + # 6 shared arrays for parallel reductions (reused across phases) + sh_snorm_sq = qd.simt.block.SharedArray((_T,), gs.qd_float) + sh_qg_grad = qd.simt.block.SharedArray((_T,), gs.qd_float) + sh_qg_hess = qd.simt.block.SharedArray((_T,), gs.qd_float) + sh_p0_cost = qd.simt.block.SharedArray((_T,), gs.qd_float) + sh_constraint_grad = qd.simt.block.SharedArray((_T,), gs.qd_float) + sh_constraint_hess = qd.simt.block.SharedArray((_T,), gs.qd_float) + + if constraint_state.n_constraints[i_b] > 0: + n_dofs = constraint_state.search.shape[0] + + # === Phase 1: Fused snorm + quad_gauss, parallel over n_dofs === + local_snorm_sq = gs.qd_float(0.0) + local_qg_grad = gs.qd_float(0.0) + local_qg_hess = gs.qd_float(0.0) + + i_d = tid + while i_d < n_dofs: + s = constraint_state.search[i_d, i_b] + local_snorm_sq += s * s + local_qg_grad += s * constraint_state.Ma[i_d, i_b] - s * dofs_state.force[i_d, i_b] + local_qg_hess += 0.5 * s * constraint_state.mv[i_d, i_b] + i_d += _T + + sh_snorm_sq[tid] = local_snorm_sq + sh_qg_grad[tid] = local_qg_grad + sh_qg_hess[tid] = local_qg_hess + + qd.simt.block.sync() + + # Tree reduction for 3 accumulators + stride = _T // 2 + while stride > 0: + if tid < stride: + sh_snorm_sq[tid] += sh_snorm_sq[tid + stride] + sh_qg_grad[tid] += 
sh_qg_grad[tid + stride] + sh_qg_hess[tid] += sh_qg_hess[tid + stride] + qd.simt.block.sync() + stride //= 2 + + # All threads read the reduced snorm + snorm = qd.sqrt(sh_snorm_sq[0]) + + if snorm < rigid_global_info.EPS[None]: + # Converged — only thread 0 writes + if tid == 0: + constraint_state.candidates[0, i_b] = 0.0 + constraint_state.candidates[1, i_b] = 0.0 + constraint_state.improved[i_b] = False + else: + # Thread 0 writes quad_gauss to global memory + if tid == 0: + constraint_state.improved[i_b] = True + constraint_state.quad_gauss[0, i_b] = constraint_state.gauss[i_b] + constraint_state.quad_gauss[1, i_b] = sh_qg_grad[0] + constraint_state.quad_gauss[2, i_b] = sh_qg_hess[0] + + # === Phase 2: Constraint cost, parallel over n_constraints === + ne = constraint_state.n_constraints_equality[i_b] + nef = ne + constraint_state.n_constraints_frictionloss[i_b] + n_con = constraint_state.n_constraints[i_b] + + local_eq_cost = gs.qd_float(0.0) + local_eq_grad = gs.qd_float(0.0) + local_eq_hess = gs.qd_float(0.0) + local_p0_cost = gs.qd_float(0.0) + local_constraint_grad = gs.qd_float(0.0) + local_constraint_hess = gs.qd_float(0.0) + + i_c = tid + while i_c < n_con: + Jaref_c = constraint_state.Jaref[i_c, i_b] + jv_c = constraint_state.jv[i_c, i_b] + D = constraint_state.efc_D[i_c, i_b] + qf_0 = D * (0.5 * Jaref_c * Jaref_c) + qf_1 = D * (jv_c * Jaref_c) + qf_2 = D * (0.5 * jv_c * jv_c) + + if i_c < ne: + # Equality: always active + local_eq_cost += qf_0 + local_eq_grad += qf_1 + local_eq_hess += qf_2 + local_p0_cost += qf_0 + local_constraint_grad += qf_1 + local_constraint_hess += qf_2 + elif i_c < nef: + # Friction: check linear regime at alpha=0 + f = constraint_state.efc_frictionloss[i_c, i_b] + r = constraint_state.diag[i_c, i_b] + rf = r * f + linear_neg = Jaref_c <= -rf + linear_pos = Jaref_c >= rf + if linear_neg or linear_pos: + qf_0 = linear_neg * f * (-0.5 * rf - Jaref_c) + linear_pos * f * (-0.5 * rf + Jaref_c) + qf_1 = linear_neg * (-f * jv_c) + 
linear_pos * (f * jv_c) + qf_2 = 0.0 + local_p0_cost += qf_0 + local_constraint_grad += qf_1 + local_constraint_hess += qf_2 + else: + # Contact: active if Jaref < 0 + active = Jaref_c < 0 + local_p0_cost += qf_0 * active + local_constraint_grad += qf_1 * active + local_constraint_hess += qf_2 * active + + i_c += _T + + # Reuse shared arrays for Phase 2 reduction + sh_snorm_sq[tid] = local_eq_cost + sh_qg_grad[tid] = local_eq_grad + sh_qg_hess[tid] = local_eq_hess + sh_p0_cost[tid] = local_p0_cost + sh_constraint_grad[tid] = local_constraint_grad + sh_constraint_hess[tid] = local_constraint_hess + + qd.simt.block.sync() + + # Tree reduction for 6 accumulators + stride = _T // 2 + while stride > 0: + if tid < stride: + sh_snorm_sq[tid] += sh_snorm_sq[tid + stride] + sh_qg_grad[tid] += sh_qg_grad[tid + stride] + sh_qg_hess[tid] += sh_qg_hess[tid + stride] + sh_p0_cost[tid] += sh_p0_cost[tid + stride] + sh_constraint_grad[tid] += sh_constraint_grad[tid + stride] + sh_constraint_hess[tid] += sh_constraint_hess[tid + stride] + qd.simt.block.sync() + stride //= 2 + + if tid == 0: + constraint_state.eq_sum[0, i_b] = sh_snorm_sq[0] + constraint_state.eq_sum[1, i_b] = sh_qg_grad[0] + constraint_state.eq_sum[2, i_b] = sh_qg_hess[0] + constraint_state.ls_it[i_b] = 1 + constraint_state.candidates[1, i_b] = constraint_state.gauss[i_b] + sh_p0_cost[0] + # Initialize best alpha, search range, and best-cost tracker for parallel linesearch + constraint_state.candidates[0, i_b] = 0.0 # default: no step + + # Use full Newton step (DOF + all constraints) as the range center. 
+ total_hess = 2.0 * (constraint_state.quad_gauss[2, i_b] + sh_constraint_hess[0]) + if total_hess > 0.0: + total_grad = constraint_state.quad_gauss[1, i_b] + sh_constraint_grad[0] + alpha_newton = qd.max(qd.abs(total_grad / total_hess), gs.qd_float(LS_PARALLEL_MIN_STEP)) + constraint_state.candidates[2, i_b] = alpha_newton * 1e-2 + constraint_state.candidates[3, i_b] = alpha_newton * 1e2 + else: + constraint_state.candidates[2, i_b] = 1e-6 + constraint_state.candidates[3, i_b] = 1e2 + constraint_state.candidates[4, i_b] = gs.qd_float(1e30) # best cost across passes + + +@qd.kernel(fastcache=gs.use_fastcache) +def _kernel_parallel_linesearch_eval( + constraint_state: array_class.ConstraintState, + rigid_global_info: array_class.RigidGlobalInfo, + static_rigid_sim_config: qd.template(), +): + """Evaluate K candidate alphas in parallel per env, pick the best via reduction. + + Reads the search range from candidates[2] (lo) and candidates[3] (hi). + Writes narrowed range back to candidates[2,3] for successive refinement. 
+ """ + _B = constraint_state.grad.shape[1] + _K = qd.static(LS_PARALLEL_K) + + qd.loop_config(block_dim=_K) + for i_flat in range(_B * _K): + tid = i_flat % _K + i_b = i_flat // _K + + # Shared memory for argmin reduction + sh_cost = qd.simt.block.SharedArray((_K,), gs.qd_float) + sh_idx = qd.simt.block.SharedArray((_K,), qd.i32) + if constraint_state.n_constraints[i_b] > 0 and constraint_state.improved[i_b]: - solver.func_linesearch_and_apply_alpha( - i_b, - entities_info=entities_info, - dofs_state=dofs_state, - rigid_global_info=rigid_global_info, - constraint_state=constraint_state, - static_rigid_sim_config=static_rigid_sim_config, + ne = constraint_state.n_constraints_equality[i_b] + nef = ne + constraint_state.n_constraints_frictionloss[i_b] + n_con = constraint_state.n_constraints[i_b] + + lo = constraint_state.candidates[2, i_b] + hi = constraint_state.candidates[3, i_b] + + # Generate log-spaced alpha within [lo, hi] + alpha = solver._log_scale(lo, hi, _K, tid) + + # Evaluate cost at this alpha + cost = ( + alpha * alpha * constraint_state.quad_gauss[2, i_b] + + alpha * constraint_state.quad_gauss[1, i_b] + + constraint_state.quad_gauss[0, i_b] ) + + # Equality constraints (always active) - use eq_sum precomputed during init + cost = ( + cost + + alpha * alpha * constraint_state.eq_sum[2, i_b] + + alpha * constraint_state.eq_sum[1, i_b] + + constraint_state.eq_sum[0, i_b] + ) + + # Friction constraints + for i_c in range(ne, nef): + Jaref_c = constraint_state.Jaref[i_c, i_b] + jv_c = constraint_state.jv[i_c, i_b] + D = constraint_state.efc_D[i_c, i_b] + f = constraint_state.efc_frictionloss[i_c, i_b] + r = constraint_state.diag[i_c, i_b] + x = Jaref_c + alpha * jv_c + rf = r * f + linear_neg = x <= -rf + linear_pos = x >= rf + if linear_neg or linear_pos: + cost = cost + linear_neg * f * (-0.5 * rf - Jaref_c - alpha * jv_c) + cost = cost + linear_pos * f * (-0.5 * rf + Jaref_c + alpha * jv_c) + else: + cost = cost + D * 0.5 * x * x + + # Contact 
constraints (active if x < 0) + for i_c in range(nef, n_con): + Jaref_c = constraint_state.Jaref[i_c, i_b] + jv_c = constraint_state.jv[i_c, i_b] + D = constraint_state.efc_D[i_c, i_b] + x = Jaref_c + alpha * jv_c + if x < 0: + cost += D * 0.5 * x * x + + sh_cost[tid] = cost + sh_idx[tid] = tid else: - constraint_state.improved[i_b] = False + sh_cost[tid] = gs.qd_float(1e30) + sh_idx[tid] = tid + + qd.simt.block.sync() + + # Tree reduction for argmin + stride = _K // 2 + while stride > 0: + if tid < stride: + if sh_cost[tid + stride] < sh_cost[tid]: + sh_cost[tid] = sh_cost[tid + stride] + sh_idx[tid] = sh_idx[tid + stride] + qd.simt.block.sync() + stride = stride // 2 + + # Thread 0: acceptance check, write result, and narrow range for next pass + if tid == 0: + if constraint_state.n_constraints[i_b] > 0 and constraint_state.improved[i_b]: + p0_cost = constraint_state.candidates[1, i_b] + best_tid = sh_idx[0] + best_cost = sh_cost[0] + lo = constraint_state.candidates[2, i_b] + hi = constraint_state.candidates[3, i_b] + best_alpha = solver._log_scale(lo, hi, _K, best_tid) + + # Only update best alpha if this pass improved over ALL previous passes + best_cost_prev = constraint_state.candidates[4, i_b] + if best_cost < p0_cost and best_cost < best_cost_prev: + constraint_state.candidates[0, i_b] = best_alpha + constraint_state.candidates[4, i_b] = best_cost + + # Narrow range around accepted point for next refinement pass + lo_idx = qd.max(0, best_tid - 1) + hi_idx = qd.min(_K - 1, best_tid + 1) + constraint_state.candidates[2, i_b] = solver._log_scale(lo, hi, _K, lo_idx) + constraint_state.candidates[3, i_b] = solver._log_scale(lo, hi, _K, hi_idx) + else: + constraint_state.candidates[0, i_b] = 0.0 + + +@qd.kernel(fastcache=gs.use_fastcache) +def _kernel_parallel_linesearch_apply_alpha_dofs( + constraint_state: array_class.ConstraintState, + rigid_global_info: array_class.RigidGlobalInfo, + static_rigid_sim_config: qd.template(), +): + """Apply best alpha to qacc 
and Ma, parallelized over (dof, env).""" + n_dofs = constraint_state.qacc.shape[0] + _B = constraint_state.grad.shape[1] + + qd.loop_config(serialize=static_rigid_sim_config.para_level < gs.PARA_LEVEL.ALL) + for i_d, i_b in qd.ndrange(n_dofs, _B): + if constraint_state.n_constraints[i_b] > 0 and constraint_state.improved[i_b]: + alpha = constraint_state.candidates[0, i_b] + if qd.abs(alpha) < rigid_global_info.EPS[None]: + if i_d == 0: + constraint_state.improved[i_b] = False + else: + constraint_state.qacc[i_d, i_b] += constraint_state.search[i_d, i_b] * alpha + constraint_state.Ma[i_d, i_b] += constraint_state.mv[i_d, i_b] * alpha -@ti.kernel(fastcache=gs.use_fastcache) +@qd.kernel(fastcache=gs.use_fastcache) +def _kernel_parallel_linesearch_apply_alpha_constraints( + constraint_state: array_class.ConstraintState, + static_rigid_sim_config: qd.template(), +): + """Apply best alpha to Jaref, parallelized over (constraint, env).""" + len_constraints = constraint_state.Jaref.shape[0] + _B = constraint_state.grad.shape[1] + + qd.loop_config(serialize=static_rigid_sim_config.para_level < gs.PARA_LEVEL.ALL) + for i_c, i_b in qd.ndrange(len_constraints, _B): + if i_c < constraint_state.n_constraints[i_b] and constraint_state.improved[i_b]: + alpha = constraint_state.candidates[0, i_b] + constraint_state.Jaref[i_c, i_b] += constraint_state.jv[i_c, i_b] * alpha + + +# ============================================== Shared iteration kernels ============================================== + + +@qd.kernel(fastcache=gs.use_fastcache) def _kernel_cg_only_save_prev_grad( constraint_state: array_class.ConstraintState, - static_rigid_sim_config: ti.template(), + static_rigid_sim_config: qd.template(), ): """Save prev_grad and prev_Mgrad (CG only)""" _B = constraint_state.grad.shape[1] - ti.loop_config(serialize=static_rigid_sim_config.para_level < gs.PARA_LEVEL.ALL, block_dim=32) + qd.loop_config(serialize=static_rigid_sim_config.para_level < gs.PARA_LEVEL.ALL, block_dim=32) for 
i_b in range(_B): if constraint_state.n_constraints[i_b] > 0 and constraint_state.improved[i_b]: solver.func_save_prev_grad(i_b, constraint_state=constraint_state) -@ti.kernel(fastcache=gs.use_fastcache) +@qd.kernel(fastcache=gs.use_fastcache) def _kernel_update_constraint_forces( constraint_state: array_class.ConstraintState, - static_rigid_sim_config: ti.template(), + static_rigid_sim_config: qd.template(), ): """Compute active flags and efc_force, parallelized over (constraint, env).""" len_constraints = constraint_state.active.shape[0] _B = constraint_state.grad.shape[1] - for i_c, i_b in ti.ndrange(len_constraints, _B): + for i_c, i_b in qd.ndrange(len_constraints, _B): if i_c < constraint_state.n_constraints[i_b] and constraint_state.improved[i_b]: ne = constraint_state.n_constraints_equality[i_b] nef = ne + constraint_state.n_constraints_frictionloss[i_b] - if ti.static(static_rigid_sim_config.solver_type == gs.constraint_solver.Newton): + if qd.static(static_rigid_sim_config.solver_type == gs.constraint_solver.Newton): constraint_state.prev_active[i_c, i_b] = constraint_state.active[i_c, i_b] constraint_state.active[i_c, i_b] = True @@ -78,16 +445,16 @@ def _kernel_update_constraint_forces( ) -@ti.kernel(fastcache=gs.use_fastcache) +@qd.kernel(fastcache=gs.use_fastcache) def _kernel_update_constraint_qfrc( constraint_state: array_class.ConstraintState, - static_rigid_sim_config: ti.template(), + static_rigid_sim_config: qd.template(), ): """Compute qfrc_constraint = J^T @ efc_force, parallelized over (dof, env).""" n_dofs = constraint_state.qfrc_constraint.shape[0] _B = constraint_state.grad.shape[1] - for i_d, i_b in ti.ndrange(n_dofs, _B): + for i_d, i_b in qd.ndrange(n_dofs, _B): if constraint_state.n_constraints[i_b] > 0 and constraint_state.improved[i_b]: n_con = constraint_state.n_constraints[i_b] qfrc = gs.qd_float(0.0) @@ -96,16 +463,16 @@ def _kernel_update_constraint_qfrc( constraint_state.qfrc_constraint[i_d, i_b] = qfrc 
-@ti.kernel(fastcache=gs.use_fastcache) +@qd.kernel(fastcache=gs.use_fastcache) def _kernel_update_constraint_cost( dofs_state: array_class.DofsState, constraint_state: array_class.ConstraintState, - static_rigid_sim_config: ti.template(), + static_rigid_sim_config: qd.template(), ): """Compute gauss and cost (reductions over dofs and constraints). One thread per env.""" _B = constraint_state.grad.shape[1] - ti.loop_config(block_dim=32) + qd.loop_config(block_dim=32) for i_b in range(_B): if constraint_state.n_constraints[i_b] > 0 and constraint_state.improved[i_b]: n_dofs = constraint_state.qfrc_constraint.shape[0] @@ -149,15 +516,15 @@ def _kernel_update_constraint_cost( constraint_state.cost[i_b] = cost_i -@ti.kernel(fastcache=gs.use_fastcache) +@qd.kernel(fastcache=gs.use_fastcache) def _kernel_newton_only_nt_hessian( constraint_state: array_class.ConstraintState, rigid_global_info: array_class.RigidGlobalInfo, - static_rigid_sim_config: ti.template(), + static_rigid_sim_config: qd.template(), ): """Step 4: Newton Hessian update (Newton only)""" solver.func_hessian_direct_tiled(constraint_state=constraint_state, rigid_global_info=rigid_global_info) - if ti.static(static_rigid_sim_config.enable_tiled_cholesky_hessian): + if qd.static(static_rigid_sim_config.enable_tiled_cholesky_hessian): solver.func_cholesky_factor_direct_tiled( constraint_state=constraint_state, rigid_global_info=rigid_global_info, @@ -165,7 +532,7 @@ def _kernel_newton_only_nt_hessian( ) else: _B = constraint_state.jac.shape[2] - ti.loop_config(serialize=static_rigid_sim_config.para_level < gs.PARA_LEVEL.ALL, block_dim=32) + qd.loop_config(serialize=static_rigid_sim_config.para_level < gs.PARA_LEVEL.ALL, block_dim=32) for i_b in range(_B): if constraint_state.n_constraints[i_b] > 0 and constraint_state.improved[i_b]: solver.func_cholesky_factor_direct_batch( @@ -173,17 +540,17 @@ def _kernel_newton_only_nt_hessian( ) -@ti.kernel(fastcache=gs.use_fastcache) 
+@qd.kernel(fastcache=gs.use_fastcache) def _kernel_update_gradient( entities_info: array_class.EntitiesInfo, dofs_state: array_class.DofsState, constraint_state: array_class.ConstraintState, rigid_global_info: array_class.RigidGlobalInfo, - static_rigid_sim_config: ti.template(), + static_rigid_sim_config: qd.template(), ): """Step 5: Update gradient""" _B = constraint_state.grad.shape[1] - ti.loop_config(serialize=static_rigid_sim_config.para_level < gs.PARA_LEVEL.ALL, block_dim=32) + qd.loop_config(serialize=static_rigid_sim_config.para_level < gs.PARA_LEVEL.ALL, block_dim=32) for i_b in range(_B): if constraint_state.n_constraints[i_b] > 0 and constraint_state.improved[i_b]: solver.func_update_gradient_batch( @@ -196,15 +563,15 @@ def _kernel_update_gradient( ) -@ti.kernel(fastcache=gs.use_fastcache) +@qd.kernel(fastcache=gs.use_fastcache) def _kernel_update_search_direction( constraint_state: array_class.ConstraintState, rigid_global_info: array_class.RigidGlobalInfo, - static_rigid_sim_config: ti.template(), + static_rigid_sim_config: qd.template(), ): """Step 6: Check convergence and update search direction""" _B = constraint_state.grad.shape[1] - ti.loop_config(serialize=static_rigid_sim_config.para_level < gs.PARA_LEVEL.ALL, block_dim=32) + qd.loop_config(serialize=static_rigid_sim_config.para_level < gs.PARA_LEVEL.ALL, block_dim=32) for i_b in range(_B): if constraint_state.n_constraints[i_b] > 0 and constraint_state.improved[i_b]: solver.func_terminate_or_update_descent_batch( @@ -215,9 +582,19 @@ def _kernel_update_search_direction( ) -@solver.func_solve_body.register(is_compatible=lambda *args, **kwargs: gs.backend in {gs.cuda}) +# ============================================== Solve body dispatch ================================================ + + +@solver.func_solve_body.register( + is_compatible=lambda *args, **kwargs: ( + # Note: we do not use parallel linesearch for finite difference gradient validation, as it is highly + # sensitive to numerical 
precision and GPU float64 rounding errors can accumulate over many trials. + gs.backend in {gs.cuda} and not (args[5] if len(args) > 5 else kwargs["static_rigid_sim_config"]).requires_grad + ) +) def func_solve_decomposed( entities_info, + dofs_info, dofs_state, constraint_state, rigid_global_info, @@ -228,17 +605,43 @@ def func_solve_decomposed( Uses separate kernels for each solver step per iteration. This maximizes kernel granularity, potentially allowing better GPU scheduling - and more flexibility in execution, at the cost of more Python→C++ boundary crossings. + and more flexibility in execution, at the cost of more Python->C++ boundary crossings. """ # _n_iterations is a Python-native int to avoid CPU-GPU sync (vs rigid_global_info.iterations[None]) for _it in range(_n_iterations): - _kernel_linesearch( + _kernel_parallel_linesearch_mv( + dofs_info, entities_info, + constraint_state, + rigid_global_info, + static_rigid_sim_config, + ) + _kernel_parallel_linesearch_jv( + constraint_state, + static_rigid_sim_config, + ) + _kernel_parallel_linesearch_p0( dofs_state, constraint_state, rigid_global_info, static_rigid_sim_config, ) + # Successive refinement: each pass narrows the search range around the best alpha. 
+ for _refine in range(LS_PARALLEL_N_REFINE): + _kernel_parallel_linesearch_eval( + constraint_state, + rigid_global_info, + static_rigid_sim_config, + ) + _kernel_parallel_linesearch_apply_alpha_dofs( + constraint_state, + rigid_global_info, + static_rigid_sim_config, + ) + _kernel_parallel_linesearch_apply_alpha_constraints( + constraint_state, + static_rigid_sim_config, + ) if static_rigid_sim_config.solver_type == gs.constraint_solver.CG: _kernel_cg_only_save_prev_grad( constraint_state, @@ -257,6 +660,7 @@ def func_solve_decomposed( constraint_state, static_rigid_sim_config, ) + if static_rigid_sim_config.solver_type == gs.constraint_solver.Newton: _kernel_newton_only_nt_hessian( constraint_state, diff --git a/tests/test_grad.py b/tests/test_grad.py index b637065a2..3c49a6b7f 100644 --- a/tests/test_grad.py +++ b/tests/test_grad.py @@ -263,6 +263,7 @@ def constraint_solver_resolve(): ) func_solve_body( entities_info=rigid_solver.entities_info, + dofs_info=rigid_solver.dofs_info, dofs_state=rigid_solver.dofs_state, constraint_state=constraint_solver.constraint_state, rigid_global_info=rigid_solver._rigid_global_info, From ae95f8c14612226239d718dbf22b0c2118dfafec Mon Sep 17 00:00:00 2001 From: Mingrui Date: Tue, 10 Mar 2026 00:04:35 +0000 Subject: [PATCH 04/20] update --- .../engine/solvers/rigid/constraint/solver.py | 6 ---- .../rigid/constraint/solver_breakdown.py | 31 ++++++++++++++++--- 2 files changed, 26 insertions(+), 11 deletions(-) diff --git a/genesis/engine/solvers/rigid/constraint/solver.py b/genesis/engine/solvers/rigid/constraint/solver.py index d0a42e2d4..51f7269c9 100644 --- a/genesis/engine/solvers/rigid/constraint/solver.py +++ b/genesis/engine/solvers/rigid/constraint/solver.py @@ -2355,12 +2355,6 @@ def update_bracket_no_eval_local( return flag, p_alpha, p_cost, p_grad, p_hess, p_next_alpha -@qd.func -def _log_scale(min_value: gs.qd_float, max_value: gs.qd_float, num_values: qd.i32, i: qd.i32) -> gs.qd_float: - step = (qd.log(max_value) - 
qd.log(min_value)) / qd.max(1.0, gs.qd_float(num_values - 1)) - return qd.exp(qd.log(min_value) + gs.qd_float(i) * step) - - @qd.func def func_linesearch_and_apply_alpha( i_b, diff --git a/genesis/engine/solvers/rigid/constraint/solver_breakdown.py b/genesis/engine/solvers/rigid/constraint/solver_breakdown.py index a05cb2242..8eafdde45 100644 --- a/genesis/engine/solvers/rigid/constraint/solver_breakdown.py +++ b/genesis/engine/solvers/rigid/constraint/solver_breakdown.py @@ -4,9 +4,25 @@ import genesis.utils.array_class as array_class from genesis.engine.solvers.rigid.constraint import solver +# --- Parallel linesearch constants --- +# Number of candidate step sizes evaluated simultaneously per env. +# Each CUDA block processes one env with K threads, using shared memory for the argmin reduction. +# Similar to BLOCK_DIM in func_hessian_direct_tiled: determines parallelism and shared memory layout. LS_PARALLEL_K = 16 + +# Floor for the Newton step estimate used to center the log-spaced search range. +# When |grad/hess| is near-zero the search range [alpha*1e-2, alpha*1e2] would collapse; +# this clamp keeps the range meaningful. The value is well below typical linesearch tolerances +# (ls_tolerance * tolerance ~ 1e-2 * 1e-8 for double, ~ 1e-2 * 1e-5 for float) so it never +# masks a genuinely small optimal step. LS_PARALLEL_MIN_STEP = 1e-6 -LS_PARALLEL_N_REFINE = 1 # number of successive refinement passes in parallel linesearch + +# Number of successive refinement passes: after picking the best of K candidates the search +# range is narrowed around the winner and re-evaluated. 1 pass (K=16 candidates) already +# gives sufficient resolution; increase for tighter convergence at the cost of more kernels. +LS_PARALLEL_N_REFINE = 1 + +# Block sizes for shared-memory reductions in _kernel_parallel_linesearch_p0 and _jv. 
_P0_BLOCK = 32 _JV_BLOCK = 32 @@ -269,7 +285,8 @@ def _kernel_parallel_linesearch_eval( hi = constraint_state.candidates[3, i_b] # Generate log-spaced alpha within [lo, hi] - alpha = solver._log_scale(lo, hi, _K, tid) + _step = (qd.log(hi) - qd.log(lo)) / qd.max(1.0, gs.qd_float(_K - 1)) + alpha = qd.exp(qd.log(lo) + gs.qd_float(tid) * _step) # Evaluate cost at this alpha cost = ( @@ -338,7 +355,9 @@ def _kernel_parallel_linesearch_eval( best_cost = sh_cost[0] lo = constraint_state.candidates[2, i_b] hi = constraint_state.candidates[3, i_b] - best_alpha = solver._log_scale(lo, hi, _K, best_tid) + # Recover best alpha from log-spaced grid + _step = (qd.log(hi) - qd.log(lo)) / qd.max(1.0, gs.qd_float(_K - 1)) + best_alpha = qd.exp(qd.log(lo) + gs.qd_float(best_tid) * _step) # Only update best alpha if this pass improved over ALL previous passes best_cost_prev = constraint_state.candidates[4, i_b] @@ -349,8 +368,10 @@ def _kernel_parallel_linesearch_eval( # Narrow range around accepted point for next refinement pass lo_idx = qd.max(0, best_tid - 1) hi_idx = qd.min(_K - 1, best_tid + 1) - constraint_state.candidates[2, i_b] = solver._log_scale(lo, hi, _K, lo_idx) - constraint_state.candidates[3, i_b] = solver._log_scale(lo, hi, _K, hi_idx) + # Narrow search range to neighbors of best candidate + _step = (qd.log(hi) - qd.log(lo)) / qd.max(1.0, gs.qd_float(_K - 1)) + constraint_state.candidates[2, i_b] = qd.exp(qd.log(lo) + gs.qd_float(lo_idx) * _step) + constraint_state.candidates[3, i_b] = qd.exp(qd.log(lo) + gs.qd_float(hi_idx) * _step) else: constraint_state.candidates[0, i_b] = 0.0 From a09bef867ec9f29ed3d63393e12d8089ba26cffa Mon Sep 17 00:00:00 2001 From: Mingrui Date: Fri, 13 Mar 2026 16:15:38 +0000 Subject: [PATCH 05/20] update cast --- .../solvers/rigid/constraint/solver_breakdown.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/genesis/engine/solvers/rigid/constraint/solver_breakdown.py 
b/genesis/engine/solvers/rigid/constraint/solver_breakdown.py index 8eafdde45..ae2aa3316 100644 --- a/genesis/engine/solvers/rigid/constraint/solver_breakdown.py +++ b/genesis/engine/solvers/rigid/constraint/solver_breakdown.py @@ -285,8 +285,8 @@ def _kernel_parallel_linesearch_eval( hi = constraint_state.candidates[3, i_b] # Generate log-spaced alpha within [lo, hi] - _step = (qd.log(hi) - qd.log(lo)) / qd.max(1.0, gs.qd_float(_K - 1)) - alpha = qd.exp(qd.log(lo) + gs.qd_float(tid) * _step) + _step = (qd.log(hi) - qd.log(lo)) / qd.max(1.0, qd.cast(_K - 1, gs.qd_float)) + alpha = qd.exp(qd.log(lo) + qd.cast(tid, gs.qd_float) * _step) # Evaluate cost at this alpha cost = ( @@ -356,8 +356,8 @@ def _kernel_parallel_linesearch_eval( lo = constraint_state.candidates[2, i_b] hi = constraint_state.candidates[3, i_b] # Recover best alpha from log-spaced grid - _step = (qd.log(hi) - qd.log(lo)) / qd.max(1.0, gs.qd_float(_K - 1)) - best_alpha = qd.exp(qd.log(lo) + gs.qd_float(best_tid) * _step) + _step = (qd.log(hi) - qd.log(lo)) / qd.max(1.0, qd.cast(_K - 1, gs.qd_float)) + best_alpha = qd.exp(qd.log(lo) + qd.cast(best_tid, gs.qd_float) * _step) # Only update best alpha if this pass improved over ALL previous passes best_cost_prev = constraint_state.candidates[4, i_b] @@ -369,9 +369,9 @@ def _kernel_parallel_linesearch_eval( lo_idx = qd.max(0, best_tid - 1) hi_idx = qd.min(_K - 1, best_tid + 1) # Narrow search range to neighbors of best candidate - _step = (qd.log(hi) - qd.log(lo)) / qd.max(1.0, gs.qd_float(_K - 1)) - constraint_state.candidates[2, i_b] = qd.exp(qd.log(lo) + gs.qd_float(lo_idx) * _step) - constraint_state.candidates[3, i_b] = qd.exp(qd.log(lo) + gs.qd_float(hi_idx) * _step) + _step = (qd.log(hi) - qd.log(lo)) / qd.max(1.0, qd.cast(_K - 1, gs.qd_float)) + constraint_state.candidates[2, i_b] = qd.exp(qd.log(lo) + qd.cast(lo_idx, gs.qd_float) * _step) + constraint_state.candidates[3, i_b] = qd.exp(qd.log(lo) + qd.cast(hi_idx, gs.qd_float) * _step) else: 
constraint_state.candidates[0, i_b] = 0.0 From 30f2cf00dfd0ebe8ec9acb102a227eebbd042296 Mon Sep 17 00:00:00 2001 From: Mingrui Date: Thu, 19 Mar 2026 00:48:39 +0000 Subject: [PATCH 06/20] test --- genesis/engine/solvers/rigid/constraint/solver.py | 2 +- tests/test_rigid_benchmarks.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/genesis/engine/solvers/rigid/constraint/solver.py b/genesis/engine/solvers/rigid/constraint/solver.py index 51f7269c9..bffd2475e 100644 --- a/genesis/engine/solvers/rigid/constraint/solver.py +++ b/genesis/engine/solvers/rigid/constraint/solver.py @@ -3115,7 +3115,7 @@ def func_solve_body( ) -> None: ... -@func_solve_body.register(is_compatible=lambda *args, **kwargs: True) +@func_solve_body.register(is_compatible=lambda *args, **kwargs: False) @qd.kernel(fastcache=gs.use_fastcache) def func_solve_body_monolith( entities_info: array_class.EntitiesInfo, diff --git a/tests/test_rigid_benchmarks.py b/tests/test_rigid_benchmarks.py index e6fcf1698..0fee77bdb 100644 --- a/tests/test_rigid_benchmarks.py +++ b/tests/test_rigid_benchmarks.py @@ -725,7 +725,7 @@ def run_benchmark(step_fn, *, n_envs, meta): @pytest.fixture(scope="session") -def stream_writers(printer_session, request): +def stream_writers(request): report_path = Path(request.config.getoption("--speed-test-filepath")) # Delete old unrelated worker-specific reports @@ -746,7 +746,7 @@ def stream_writers(printer_session, request): report_path.unlink() fd = open(report_path, "w") - yield (lambda msg: print(msg, file=fd, flush=True), printer_session) + yield (lambda msg: print(msg, file=fd, flush=True), print) fd.close() From 008716dfb96f5480fc5ebabe642931a2fe897af6 Mon Sep 17 00:00:00 2001 From: Mingrui Date: Sat, 21 Mar 2026 14:18:47 +0000 Subject: [PATCH 07/20] update --- .../rigid/constraint/solver_breakdown.py | 71 ++++++++----------- 1 file changed, 31 insertions(+), 40 deletions(-) diff --git a/genesis/engine/solvers/rigid/constraint/solver_breakdown.py 
b/genesis/engine/solvers/rigid/constraint/solver_breakdown.py index ae2aa3316..86dd99b60 100644 --- a/genesis/engine/solvers/rigid/constraint/solver_breakdown.py +++ b/genesis/engine/solvers/rigid/constraint/solver_breakdown.py @@ -8,7 +8,7 @@ # Number of candidate step sizes evaluated simultaneously per env. # Each CUDA block processes one env with K threads, using shared memory for the argmin reduction. # Similar to BLOCK_DIM in func_hessian_direct_tiled: determines parallelism and shared memory layout. -LS_PARALLEL_K = 16 +LS_PARALLEL_K = 32 # Floor for the Newton step estimate used to center the log-spaced search range. # When |grad/hess| is near-zero the search range [alpha*1e-2, alpha*1e2] would collapse; @@ -20,7 +20,7 @@ # Number of successive refinement passes: after picking the best of K candidates the search # range is narrowed around the winner and re-evaluated. 1 pass (K=16 candidates) already # gives sufficient resolution; increase for tighter convergence at the cost of more kernels. -LS_PARALLEL_N_REFINE = 1 +LS_PARALLEL_N_REFINE = 3 # Block sizes for shared-memory reductions in _kernel_parallel_linesearch_p0 and _jv. 
_P0_BLOCK = 32 @@ -46,7 +46,7 @@ def _kernel_parallel_linesearch_mv( qd.loop_config(serialize=static_rigid_sim_config.para_level < gs.PARA_LEVEL.ALL) for i_d1, i_b in qd.ndrange(n_dofs, _B): - if constraint_state.n_constraints[i_b] > 0: + if constraint_state.n_constraints[i_b] > 0 and constraint_state.improved[i_b]: I_d1 = [i_d1, i_b] if qd.static(static_rigid_sim_config.batch_dofs_info) else i_d1 i_e = dofs_info.entity_idx[I_d1] mv = gs.qd_float(0.0) @@ -67,7 +67,7 @@ def _kernel_parallel_linesearch_jv( qd.loop_config(serialize=static_rigid_sim_config.para_level < gs.PARA_LEVEL.ALL) for i_c, i_b in qd.ndrange(len_constraints, _B): - if i_c < constraint_state.n_constraints[i_b]: + if i_c < constraint_state.n_constraints[i_b] and constraint_state.improved[i_b]: jv = gs.qd_float(0.0) if qd.static(static_rigid_sim_config.sparse_solve): for i_d_ in range(constraint_state.jac_n_relevant_dofs[i_c, i_b]): @@ -107,7 +107,7 @@ def _kernel_parallel_linesearch_p0( sh_constraint_grad = qd.simt.block.SharedArray((_T,), gs.qd_float) sh_constraint_hess = qd.simt.block.SharedArray((_T,), gs.qd_float) - if constraint_state.n_constraints[i_b] > 0: + if constraint_state.n_constraints[i_b] > 0 and constraint_state.improved[i_b]: n_dofs = constraint_state.search.shape[0] # === Phase 1: Fused snorm + quad_gauss, parallel over n_dofs === @@ -151,7 +151,6 @@ def _kernel_parallel_linesearch_p0( else: # Thread 0 writes quad_gauss to global memory if tid == 0: - constraint_state.improved[i_b] = True constraint_state.quad_gauss[0, i_b] = constraint_state.gauss[i_b] constraint_state.quad_gauss[1, i_b] = sh_qg_grad[0] constraint_state.quad_gauss[2, i_b] = sh_qg_hess[0] @@ -237,20 +236,25 @@ def _kernel_parallel_linesearch_p0( constraint_state.eq_sum[2, i_b] = sh_qg_hess[0] constraint_state.ls_it[i_b] = 1 constraint_state.candidates[1, i_b] = constraint_state.gauss[i_b] + sh_p0_cost[0] - # Initialize best alpha, search range, and best-cost tracker for parallel linesearch - 
constraint_state.candidates[0, i_b] = 0.0 # default: no step - - # Use full Newton step (DOF + all constraints) as the range center. + # Use full Newton step as initial best guess + search range center. total_hess = 2.0 * (constraint_state.quad_gauss[2, i_b] + sh_constraint_hess[0]) + total_grad = constraint_state.quad_gauss[1, i_b] + sh_constraint_grad[0] + p0_cost_val = constraint_state.gauss[i_b] + sh_p0_cost[0] + if total_hess > 0.0: - total_grad = constraint_state.quad_gauss[1, i_b] + sh_constraint_grad[0] alpha_newton = qd.max(qd.abs(total_grad / total_hess), gs.qd_float(LS_PARALLEL_MIN_STEP)) + # Initialize with Newton step as the best alpha so far. + # Its quadratic-approximated cost is p0 - grad²/(2*hess), always < p0. + newton_cost = p0_cost_val - 0.5 * total_grad * total_grad / total_hess + constraint_state.candidates[0, i_b] = alpha_newton + constraint_state.candidates[4, i_b] = newton_cost constraint_state.candidates[2, i_b] = alpha_newton * 1e-2 - constraint_state.candidates[3, i_b] = alpha_newton * 1e2 + constraint_state.candidates[3, i_b] = alpha_newton * 10.0 else: + constraint_state.candidates[0, i_b] = 0.0 + constraint_state.candidates[4, i_b] = gs.qd_float(1e30) constraint_state.candidates[2, i_b] = 1e-6 constraint_state.candidates[3, i_b] = 1e2 - constraint_state.candidates[4, i_b] = gs.qd_float(1e30) # best cost across passes @qd.kernel(fastcache=gs.use_fastcache) @@ -377,41 +381,32 @@ def _kernel_parallel_linesearch_eval( @qd.kernel(fastcache=gs.use_fastcache) -def _kernel_parallel_linesearch_apply_alpha_dofs( +def _kernel_parallel_linesearch_apply_alpha( constraint_state: array_class.ConstraintState, rigid_global_info: array_class.RigidGlobalInfo, static_rigid_sim_config: qd.template(), ): - """Apply best alpha to qacc and Ma, parallelized over (dof, env).""" + """Apply best alpha to qacc, Ma, and Jaref. 
Fuses dof and constraint updates.""" n_dofs = constraint_state.qacc.shape[0] + len_constraints = constraint_state.Jaref.shape[0] _B = constraint_state.grad.shape[1] + n_items = qd.max(n_dofs, len_constraints) qd.loop_config(serialize=static_rigid_sim_config.para_level < gs.PARA_LEVEL.ALL) - for i_d, i_b in qd.ndrange(n_dofs, _B): + for i, i_b in qd.ndrange(n_items, _B): if constraint_state.n_constraints[i_b] > 0 and constraint_state.improved[i_b]: alpha = constraint_state.candidates[0, i_b] if qd.abs(alpha) < rigid_global_info.EPS[None]: - if i_d == 0: + if i == 0: constraint_state.improved[i_b] = False else: - constraint_state.qacc[i_d, i_b] += constraint_state.search[i_d, i_b] * alpha - constraint_state.Ma[i_d, i_b] += constraint_state.mv[i_d, i_b] * alpha - - -@qd.kernel(fastcache=gs.use_fastcache) -def _kernel_parallel_linesearch_apply_alpha_constraints( - constraint_state: array_class.ConstraintState, - static_rigid_sim_config: qd.template(), -): - """Apply best alpha to Jaref, parallelized over (constraint, env).""" - len_constraints = constraint_state.Jaref.shape[0] - _B = constraint_state.grad.shape[1] - - qd.loop_config(serialize=static_rigid_sim_config.para_level < gs.PARA_LEVEL.ALL) - for i_c, i_b in qd.ndrange(len_constraints, _B): - if i_c < constraint_state.n_constraints[i_b] and constraint_state.improved[i_b]: - alpha = constraint_state.candidates[0, i_b] - constraint_state.Jaref[i_c, i_b] += constraint_state.jv[i_c, i_b] * alpha + # Apply to dofs + if i < n_dofs: + constraint_state.qacc[i, i_b] += constraint_state.search[i, i_b] * alpha + constraint_state.Ma[i, i_b] += constraint_state.mv[i, i_b] * alpha + # Apply to constraints + if i < constraint_state.n_constraints[i_b]: + constraint_state.Jaref[i, i_b] += constraint_state.jv[i, i_b] * alpha # ============================================== Shared iteration kernels ============================================== @@ -654,15 +649,11 @@ def func_solve_decomposed( rigid_global_info, 
static_rigid_sim_config, ) - _kernel_parallel_linesearch_apply_alpha_dofs( + _kernel_parallel_linesearch_apply_alpha( constraint_state, rigid_global_info, static_rigid_sim_config, ) - _kernel_parallel_linesearch_apply_alpha_constraints( - constraint_state, - static_rigid_sim_config, - ) if static_rigid_sim_config.solver_type == gs.constraint_solver.CG: _kernel_cg_only_save_prev_grad( constraint_state, From e7cd2763c11cab5ae650be533c4b3ac21abaa958 Mon Sep 17 00:00:00 2001 From: Mingrui Date: Sat, 21 Mar 2026 15:18:50 +0000 Subject: [PATCH 08/20] update --- genesis/engine/solvers/rigid/constraint/solver.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/genesis/engine/solvers/rigid/constraint/solver.py b/genesis/engine/solvers/rigid/constraint/solver.py index bffd2475e..51f7269c9 100644 --- a/genesis/engine/solvers/rigid/constraint/solver.py +++ b/genesis/engine/solvers/rigid/constraint/solver.py @@ -3115,7 +3115,7 @@ def func_solve_body( ) -> None: ... -@func_solve_body.register(is_compatible=lambda *args, **kwargs: False) +@func_solve_body.register(is_compatible=lambda *args, **kwargs: True) @qd.kernel(fastcache=gs.use_fastcache) def func_solve_body_monolith( entities_info: array_class.EntitiesInfo, From 54d0b3d522f63ee62b36cac532a4374cf15cd4d5 Mon Sep 17 00:00:00 2001 From: Mingrui Date: Sat, 21 Mar 2026 15:19:55 +0000 Subject: [PATCH 09/20] update --- tests/test_rigid_benchmarks.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/test_rigid_benchmarks.py b/tests/test_rigid_benchmarks.py index 0fee77bdb..e6fcf1698 100644 --- a/tests/test_rigid_benchmarks.py +++ b/tests/test_rigid_benchmarks.py @@ -725,7 +725,7 @@ def run_benchmark(step_fn, *, n_envs, meta): @pytest.fixture(scope="session") -def stream_writers(request): +def stream_writers(printer_session, request): report_path = Path(request.config.getoption("--speed-test-filepath")) # Delete old unrelated worker-specific reports @@ -746,7 +746,7 @@ def 
stream_writers(request): report_path.unlink() fd = open(report_path, "w") - yield (lambda msg: print(msg, file=fd, flush=True), print) + yield (lambda msg: print(msg, file=fd, flush=True), printer_session) fd.close() From c0546258e73791bdcfa788dad186db65a002d189 Mon Sep 17 00:00:00 2001 From: Mingrui Date: Sun, 22 Mar 2026 21:00:44 +0000 Subject: [PATCH 10/20] safe newton init --- .../rigid/constraint/solver_breakdown.py | 35 +++++++++++-------- 1 file changed, 21 insertions(+), 14 deletions(-) diff --git a/genesis/engine/solvers/rigid/constraint/solver_breakdown.py b/genesis/engine/solvers/rigid/constraint/solver_breakdown.py index 86dd99b60..76214ebf2 100644 --- a/genesis/engine/solvers/rigid/constraint/solver_breakdown.py +++ b/genesis/engine/solvers/rigid/constraint/solver_breakdown.py @@ -93,6 +93,7 @@ def _kernel_parallel_linesearch_p0( """ _B = constraint_state.grad.shape[1] _T = qd.static(_P0_BLOCK) + _LS_PARALLEL_MIN_STEP = qd.static(LS_PARALLEL_MIN_STEP) qd.loop_config(block_dim=_T) for i_flat in range(_B * _T): @@ -236,25 +237,22 @@ def _kernel_parallel_linesearch_p0( constraint_state.eq_sum[2, i_b] = sh_qg_hess[0] constraint_state.ls_it[i_b] = 1 constraint_state.candidates[1, i_b] = constraint_state.gauss[i_b] + sh_p0_cost[0] - # Use full Newton step as initial best guess + search range center. - total_hess = 2.0 * (constraint_state.quad_gauss[2, i_b] + sh_constraint_hess[0]) - total_grad = constraint_state.quad_gauss[1, i_b] + sh_constraint_grad[0] - p0_cost_val = constraint_state.gauss[i_b] + sh_p0_cost[0] + # Initialize best alpha, search range, and best-cost tracker for parallel linesearch + constraint_state.candidates[0, i_b] = 0.0 # default: no step + # Use full Newton step (DOF + all constraints) as the range center. 
+ total_hess = 2.0 * (constraint_state.quad_gauss[2, i_b] + sh_constraint_hess[0]) if total_hess > 0.0: - alpha_newton = qd.max(qd.abs(total_grad / total_hess), gs.qd_float(LS_PARALLEL_MIN_STEP)) - # Initialize with Newton step as the best alpha so far. - # Its quadratic-approximated cost is p0 - grad²/(2*hess), always < p0. - newton_cost = p0_cost_val - 0.5 * total_grad * total_grad / total_hess - constraint_state.candidates[0, i_b] = alpha_newton - constraint_state.candidates[4, i_b] = newton_cost + total_grad = constraint_state.quad_gauss[1, i_b] + sh_constraint_grad[0] + alpha_newton = qd.max(qd.abs(total_grad / total_hess), gs.qd_float(_LS_PARALLEL_MIN_STEP)) constraint_state.candidates[2, i_b] = alpha_newton * 1e-2 constraint_state.candidates[3, i_b] = alpha_newton * 10.0 + constraint_state.candidates[5, i_b] = alpha_newton # exact Newton step for eval else: - constraint_state.candidates[0, i_b] = 0.0 - constraint_state.candidates[4, i_b] = gs.qd_float(1e30) constraint_state.candidates[2, i_b] = 1e-6 constraint_state.candidates[3, i_b] = 1e2 + constraint_state.candidates[5, i_b] = 0.0 + constraint_state.candidates[4, i_b] = gs.qd_float(1e30) # best cost across passes @qd.kernel(fastcache=gs.use_fastcache) @@ -288,9 +286,15 @@ def _kernel_parallel_linesearch_eval( lo = constraint_state.candidates[2, i_b] hi = constraint_state.candidates[3, i_b] - # Generate log-spaced alpha within [lo, hi] + # Generate log-spaced alpha within [lo, hi]. + # Thread 0 evaluates the exact Newton step instead of the grid point at lo. + # This gives the Newton alpha a fair cost-based comparison with grid candidates. 
_step = (qd.log(hi) - qd.log(lo)) / qd.max(1.0, qd.cast(_K - 1, gs.qd_float)) alpha = qd.exp(qd.log(lo) + qd.cast(tid, gs.qd_float) * _step) + if tid == 0: + alpha_newton_val = constraint_state.candidates[5, i_b] + if alpha_newton_val > 0.0: + alpha = alpha_newton_val # Evaluate cost at this alpha cost = ( @@ -359,9 +363,12 @@ def _kernel_parallel_linesearch_eval( best_cost = sh_cost[0] lo = constraint_state.candidates[2, i_b] hi = constraint_state.candidates[3, i_b] - # Recover best alpha from log-spaced grid + # Recover best alpha: thread 0 used Newton alpha, others used grid _step = (qd.log(hi) - qd.log(lo)) / qd.max(1.0, qd.cast(_K - 1, gs.qd_float)) best_alpha = qd.exp(qd.log(lo) + qd.cast(best_tid, gs.qd_float) * _step) + # If thread 0 (Newton candidate) won, use the exact Newton alpha + if best_tid == 0 and constraint_state.candidates[5, i_b] > 0.0: + best_alpha = constraint_state.candidates[5, i_b] # Only update best alpha if this pass improved over ALL previous passes best_cost_prev = constraint_state.candidates[4, i_b] From 2e56fec6964244a7d0f98720a63b49bef3615b43 Mon Sep 17 00:00:00 2001 From: Mingrui Date: Mon, 23 Mar 2026 00:48:14 +0000 Subject: [PATCH 11/20] grad check and newton correction --- .../rigid/constraint/solver_breakdown.py | 62 +++++++++++++++++-- 1 file changed, 58 insertions(+), 4 deletions(-) diff --git a/genesis/engine/solvers/rigid/constraint/solver_breakdown.py b/genesis/engine/solvers/rigid/constraint/solver_breakdown.py index 76214ebf2..68b381f98 100644 --- a/genesis/engine/solvers/rigid/constraint/solver_breakdown.py +++ b/genesis/engine/solvers/rigid/constraint/solver_breakdown.py @@ -20,7 +20,7 @@ # Number of successive refinement passes: after picking the best of K candidates the search # range is narrowed around the winner and re-evaluated. 1 pass (K=16 candidates) already # gives sufficient resolution; increase for tighter convergence at the cost of more kernels. 
-LS_PARALLEL_N_REFINE = 3 +LS_PARALLEL_N_REFINE = 1 # Block sizes for shared-memory reductions in _kernel_parallel_linesearch_p0 and _jv. _P0_BLOCK = 32 @@ -93,7 +93,6 @@ def _kernel_parallel_linesearch_p0( """ _B = constraint_state.grad.shape[1] _T = qd.static(_P0_BLOCK) - _LS_PARALLEL_MIN_STEP = qd.static(LS_PARALLEL_MIN_STEP) qd.loop_config(block_dim=_T) for i_flat in range(_B * _T): @@ -244,7 +243,7 @@ def _kernel_parallel_linesearch_p0( total_hess = 2.0 * (constraint_state.quad_gauss[2, i_b] + sh_constraint_hess[0]) if total_hess > 0.0: total_grad = constraint_state.quad_gauss[1, i_b] + sh_constraint_grad[0] - alpha_newton = qd.max(qd.abs(total_grad / total_hess), gs.qd_float(_LS_PARALLEL_MIN_STEP)) + alpha_newton = qd.max(qd.abs(total_grad / total_hess), gs.qd_float(LS_PARALLEL_MIN_STEP)) constraint_state.candidates[2, i_b] = alpha_newton * 1e-2 constraint_state.candidates[3, i_b] = alpha_newton * 10.0 constraint_state.candidates[5, i_b] = alpha_newton # exact Newton step for eval @@ -253,6 +252,12 @@ def _kernel_parallel_linesearch_p0( constraint_state.candidates[3, i_b] = 1e2 constraint_state.candidates[5, i_b] = 0.0 constraint_state.candidates[4, i_b] = gs.qd_float(1e30) # best cost across passes + # Store gtol for gradient check on last eval pass + n_dofs_val = constraint_state.search.shape[0] + scale = rigid_global_info.meaninertia[i_b] * qd.max(1, n_dofs_val) + constraint_state.candidates[7, i_b] = ( + rigid_global_info.tolerance[None] * rigid_global_info.ls_tolerance[None] * snorm * scale + ) @qd.kernel(fastcache=gs.use_fastcache) @@ -260,11 +265,14 @@ def _kernel_parallel_linesearch_eval( constraint_state: array_class.ConstraintState, rigid_global_info: array_class.RigidGlobalInfo, static_rigid_sim_config: qd.template(), + _is_last_pass: qd.template(), ): """Evaluate K candidate alphas in parallel per env, pick the best via reduction. Reads the search range from candidates[2] (lo) and candidates[3] (hi). 
Writes narrowed range back to candidates[2,3] for successive refinement. + On the last pass, computes analytical gradient at best_alpha and does one + Newton correction step if |grad| > gtol. """ _B = constraint_state.grad.shape[1] _K = qd.static(LS_PARALLEL_K) @@ -383,6 +391,51 @@ def _kernel_parallel_linesearch_eval( _step = (qd.log(hi) - qd.log(lo)) / qd.max(1.0, qd.cast(_K - 1, gs.qd_float)) constraint_state.candidates[2, i_b] = qd.exp(qd.log(lo) + qd.cast(lo_idx, gs.qd_float) * _step) constraint_state.candidates[3, i_b] = qd.exp(qd.log(lo) + qd.cast(hi_idx, gs.qd_float) * _step) + + # On last pass: analytical gradient check + Newton correction + if qd.static(_is_last_pass): + final_alpha = constraint_state.candidates[0, i_b] + if qd.abs(final_alpha) > rigid_global_info.EPS[None]: + gtol = constraint_state.candidates[7, i_b] + ne_g = constraint_state.n_constraints_equality[i_b] + nef_g = ne_g + constraint_state.n_constraints_frictionloss[i_b] + n_con_g = constraint_state.n_constraints[i_b] + + # Accumulate gradient and hessian coefficients at final_alpha + g1 = constraint_state.quad_gauss[1, i_b] + constraint_state.eq_sum[1, i_b] + g2 = constraint_state.quad_gauss[2, i_b] + constraint_state.eq_sum[2, i_b] + + for i_cg in range(ne_g, nef_g): + Jaref_c = constraint_state.Jaref[i_cg, i_b] + jv_c = constraint_state.jv[i_cg, i_b] + D = constraint_state.efc_D[i_cg, i_b] + f_val = constraint_state.efc_frictionloss[i_cg, i_b] + r_val = constraint_state.diag[i_cg, i_b] + x = Jaref_c + final_alpha * jv_c + rf = r_val * f_val + if x <= -rf or x >= rf: + g1 += (x <= -rf) * (-f_val * jv_c) + (x >= rf) * (f_val * jv_c) + else: + g1 += D * jv_c * Jaref_c + g2 += D * 0.5 * jv_c * jv_c + + for i_cg in range(nef_g, n_con_g): + Jaref_c = constraint_state.Jaref[i_cg, i_b] + jv_c = constraint_state.jv[i_cg, i_b] + D = constraint_state.efc_D[i_cg, i_b] + x = Jaref_c + final_alpha * jv_c + if x < 0: + g1 += D * jv_c * Jaref_c + g2 += D * 0.5 * jv_c * jv_c + + grad_val = 2.0 * 
final_alpha * g2 + g1 + hess_val = 2.0 * g2 + + # Newton correction if gradient is not near zero + if qd.abs(grad_val) > gtol and hess_val > rigid_global_info.EPS[None]: + alpha_corrected = final_alpha - grad_val / hess_val + if alpha_corrected > 0.0: + constraint_state.candidates[0, i_b] = alpha_corrected else: constraint_state.candidates[0, i_b] = 0.0 @@ -649,12 +702,13 @@ def func_solve_decomposed( rigid_global_info, static_rigid_sim_config, ) - # Successive refinement: each pass narrows the search range around the best alpha. + # Successive refinement: last pass includes gradient check + Newton correction. for _refine in range(LS_PARALLEL_N_REFINE): _kernel_parallel_linesearch_eval( constraint_state, rigid_global_info, static_rigid_sim_config, + _refine == LS_PARALLEL_N_REFINE - 1, ) _kernel_parallel_linesearch_apply_alpha( constraint_state, From c37fb44f1cd9f16ef851bc012040ce2be03bf456 Mon Sep 17 00:00:00 2001 From: Mingrui Date: Mon, 23 Mar 2026 01:41:41 +0000 Subject: [PATCH 12/20] add more newton correction --- .../rigid/constraint/solver_breakdown.py | 148 ++++++++++-------- 1 file changed, 86 insertions(+), 62 deletions(-) diff --git a/genesis/engine/solvers/rigid/constraint/solver_breakdown.py b/genesis/engine/solvers/rigid/constraint/solver_breakdown.py index 68b381f98..6c6313de1 100644 --- a/genesis/engine/solvers/rigid/constraint/solver_breakdown.py +++ b/genesis/engine/solvers/rigid/constraint/solver_breakdown.py @@ -17,15 +17,64 @@ # masks a genuinely small optimal step. LS_PARALLEL_MIN_STEP = 1e-6 -# Number of successive refinement passes: after picking the best of K candidates the search -# range is narrowed around the winner and re-evaluated. 1 pass (K=16 candidates) already -# gives sufficient resolution; increase for tighter convergence at the cost of more kernels. -LS_PARALLEL_N_REFINE = 1 - # Block sizes for shared-memory reductions in _kernel_parallel_linesearch_p0 and _jv. 
_P0_BLOCK = 32 _JV_BLOCK = 32 +# Maximum allowed alpha (prevents divergence from degenerate Newton steps) +LS_ALPHA_MAX = 1e4 +# Maximum bisection iterations (2^8 = 256x precision improvement) +LS_BISECT_STEPS = 8 +# Range expansion: max steps and growth factor +LS_EXPANSION_STEPS = 6 +LS_EXPANSION_FACTOR = 4.0 + + +@qd.func +def _ls_compute_grad_hess( + alpha, + i_b, + constraint_state: array_class.ConstraintState, +): + """Compute analytical gradient and hessian of the linesearch cost at alpha. + + Accumulates quadratic coefficients with alpha-dependent constraint activation. + Returns (grad, hess). + """ + ne = constraint_state.n_constraints_equality[i_b] + nef = ne + constraint_state.n_constraints_frictionloss[i_b] + n_con = constraint_state.n_constraints[i_b] + + g1 = constraint_state.quad_gauss[1, i_b] + constraint_state.eq_sum[1, i_b] + g2 = constraint_state.quad_gauss[2, i_b] + constraint_state.eq_sum[2, i_b] + + for i_c in range(ne, nef): + Jaref_c = constraint_state.Jaref[i_c, i_b] + jv_c = constraint_state.jv[i_c, i_b] + D = constraint_state.efc_D[i_c, i_b] + f_val = constraint_state.efc_frictionloss[i_c, i_b] + r_val = constraint_state.diag[i_c, i_b] + x = Jaref_c + alpha * jv_c + rf = r_val * f_val + if x <= -rf or x >= rf: + g1 += (x <= -rf) * (-f_val * jv_c) + (x >= rf) * (f_val * jv_c) + else: + g1 += D * jv_c * Jaref_c + g2 += D * 0.5 * jv_c * jv_c + + for i_c in range(nef, n_con): + Jaref_c = constraint_state.Jaref[i_c, i_b] + jv_c = constraint_state.jv[i_c, i_b] + D = constraint_state.efc_D[i_c, i_b] + x = Jaref_c + alpha * jv_c + if x < 0: + g1 += D * jv_c * Jaref_c + g2 += D * 0.5 * jv_c * jv_c + + grad = 2.0 * alpha * g2 + g1 + hess = 2.0 * g2 + return grad, hess + @qd.kernel(fastcache=gs.use_fastcache) def _kernel_parallel_linesearch_mv( @@ -265,13 +314,12 @@ def _kernel_parallel_linesearch_eval( constraint_state: array_class.ConstraintState, rigid_global_info: array_class.RigidGlobalInfo, static_rigid_sim_config: qd.template(), - 
_is_last_pass: qd.template(), ): - """Evaluate K candidate alphas in parallel per env, pick the best via reduction. + """Evaluate K candidate alphas, pick the best, then apply gradient-guided refinement. - Reads the search range from candidates[2] (lo) and candidates[3] (hi). - Writes narrowed range back to candidates[2,3] for successive refinement. - On the last pass, computes analytical gradient at best_alpha and does one + Phase 1: K threads evaluate log-spaced candidates (thread 0 = Newton alpha). + Phase 2: Argmin reduction to find lowest-cost candidate. + Phase 3: Thread 0 computes analytical gradient at best_alpha and does one Newton correction step if |grad| > gtol. """ _B = constraint_state.grad.shape[1] @@ -392,50 +440,28 @@ def _kernel_parallel_linesearch_eval( constraint_state.candidates[2, i_b] = qd.exp(qd.log(lo) + qd.cast(lo_idx, gs.qd_float) * _step) constraint_state.candidates[3, i_b] = qd.exp(qd.log(lo) + qd.cast(hi_idx, gs.qd_float) * _step) - # On last pass: analytical gradient check + Newton correction - if qd.static(_is_last_pass): - final_alpha = constraint_state.candidates[0, i_b] - if qd.abs(final_alpha) > rigid_global_info.EPS[None]: - gtol = constraint_state.candidates[7, i_b] - ne_g = constraint_state.n_constraints_equality[i_b] - nef_g = ne_g + constraint_state.n_constraints_frictionloss[i_b] - n_con_g = constraint_state.n_constraints[i_b] - - # Accumulate gradient and hessian coefficients at final_alpha - g1 = constraint_state.quad_gauss[1, i_b] + constraint_state.eq_sum[1, i_b] - g2 = constraint_state.quad_gauss[2, i_b] + constraint_state.eq_sum[2, i_b] - - for i_cg in range(ne_g, nef_g): - Jaref_c = constraint_state.Jaref[i_cg, i_b] - jv_c = constraint_state.jv[i_cg, i_b] - D = constraint_state.efc_D[i_cg, i_b] - f_val = constraint_state.efc_frictionloss[i_cg, i_b] - r_val = constraint_state.diag[i_cg, i_b] - x = Jaref_c + final_alpha * jv_c - rf = r_val * f_val - if x <= -rf or x >= rf: - g1 += (x <= -rf) * (-f_val * jv_c) + (x >= 
rf) * (f_val * jv_c) - else: - g1 += D * jv_c * Jaref_c - g2 += D * 0.5 * jv_c * jv_c - - for i_cg in range(nef_g, n_con_g): - Jaref_c = constraint_state.Jaref[i_cg, i_b] - jv_c = constraint_state.jv[i_cg, i_b] - D = constraint_state.efc_D[i_cg, i_b] - x = Jaref_c + final_alpha * jv_c - if x < 0: - g1 += D * jv_c * Jaref_c - g2 += D * 0.5 * jv_c * jv_c - - grad_val = 2.0 * final_alpha * g2 + g1 - hess_val = 2.0 * g2 - - # Newton correction if gradient is not near zero - if qd.abs(grad_val) > gtol and hess_val > rigid_global_info.EPS[None]: - alpha_corrected = final_alpha - grad_val / hess_val - if alpha_corrected > 0.0: - constraint_state.candidates[0, i_b] = alpha_corrected + # --- Gradient-guided refinement (thread 0) --- + final_alpha = constraint_state.candidates[0, i_b] + gtol = constraint_state.candidates[7, i_b] + + if qd.abs(final_alpha) < rigid_global_info.EPS[None]: + # Branch A: grid found no improvement. Check grad(0). + grad_0, hess_0 = _ls_compute_grad_hess(gs.qd_float(0.0), i_b, constraint_state) + if grad_0 < -gtol and hess_0 > rigid_global_info.EPS[None]: + # Cost descending at alpha=0 → Newton step from 0 + alpha_n = -grad_0 / hess_0 + alpha_n = qd.min(alpha_n, gs.qd_float(LS_ALPHA_MAX)) + if alpha_n > 0.0: + constraint_state.candidates[0, i_b] = alpha_n + else: + # Branch B: improvement found. Gradient check + Newton correction. + grad_val, hess_val = _ls_compute_grad_hess(final_alpha, i_b, constraint_state) + + if qd.abs(grad_val) > gtol and hess_val > rigid_global_info.EPS[None]: + alpha_corrected = final_alpha - grad_val / hess_val + alpha_corrected = qd.min(qd.max(alpha_corrected, gs.qd_float(0.0)), gs.qd_float(LS_ALPHA_MAX)) + if alpha_corrected > 0.0: + constraint_state.candidates[0, i_b] = alpha_corrected else: constraint_state.candidates[0, i_b] = 0.0 @@ -702,14 +728,12 @@ def func_solve_decomposed( rigid_global_info, static_rigid_sim_config, ) - # Successive refinement: last pass includes gradient check + Newton correction. 
- for _refine in range(LS_PARALLEL_N_REFINE): - _kernel_parallel_linesearch_eval( - constraint_state, - rigid_global_info, - static_rigid_sim_config, - _refine == LS_PARALLEL_N_REFINE - 1, - ) + # Grid search + gradient-guided Newton correction (single pass) + _kernel_parallel_linesearch_eval( + constraint_state, + rigid_global_info, + static_rigid_sim_config, + ) _kernel_parallel_linesearch_apply_alpha( constraint_state, rigid_global_info, From 2ade80c26010e2452bf2f0d297279ee8ea8eb278 Mon Sep 17 00:00:00 2001 From: Mingrui Date: Mon, 23 Mar 2026 12:15:26 +0000 Subject: [PATCH 13/20] fix quadrants constant --- .../engine/solvers/rigid/constraint/solver_breakdown.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/genesis/engine/solvers/rigid/constraint/solver_breakdown.py b/genesis/engine/solvers/rigid/constraint/solver_breakdown.py index 6c6313de1..4fff250f2 100644 --- a/genesis/engine/solvers/rigid/constraint/solver_breakdown.py +++ b/genesis/engine/solvers/rigid/constraint/solver_breakdown.py @@ -142,6 +142,7 @@ def _kernel_parallel_linesearch_p0( """ _B = constraint_state.grad.shape[1] _T = qd.static(_P0_BLOCK) + _LS_PARALLEL_MIN_STEP = qd.static(LS_PARALLEL_MIN_STEP) qd.loop_config(block_dim=_T) for i_flat in range(_B * _T): @@ -292,7 +293,7 @@ total_hess = 2.0 * (constraint_state.quad_gauss[2, i_b] + sh_constraint_hess[0]) if total_hess > 0.0: total_grad = constraint_state.quad_gauss[1, i_b] + sh_constraint_grad[0] - alpha_newton = qd.max(qd.abs(total_grad / total_hess), gs.qd_float(LS_PARALLEL_MIN_STEP)) + alpha_newton = qd.max(qd.abs(total_grad / total_hess), gs.qd_float(_LS_PARALLEL_MIN_STEP)) constraint_state.candidates[2, i_b] = alpha_newton * 1e-2 constraint_state.candidates[3, i_b] = alpha_newton * 10.0 constraint_state.candidates[5, i_b] = alpha_newton # exact Newton step for eval @@ -324,6 +325,7 @@ def _kernel_parallel_linesearch_eval( """ _B = constraint_state.grad.shape[1] _K =
qd.static(LS_PARALLEL_K) + _LS_ALPHA_MAX = qd.static(LS_ALPHA_MAX) qd.loop_config(block_dim=_K) for i_flat in range(_B * _K): @@ -450,7 +452,7 @@ def _kernel_parallel_linesearch_eval( if grad_0 < -gtol and hess_0 > rigid_global_info.EPS[None]: # Cost descending at alpha=0 → Newton step from 0 alpha_n = -grad_0 / hess_0 - alpha_n = qd.min(alpha_n, gs.qd_float(LS_ALPHA_MAX)) + alpha_n = qd.min(alpha_n, gs.qd_float(_LS_ALPHA_MAX)) if alpha_n > 0.0: constraint_state.candidates[0, i_b] = alpha_n else: @@ -459,7 +461,7 @@ def _kernel_parallel_linesearch_eval( if qd.abs(grad_val) > gtol and hess_val > rigid_global_info.EPS[None]: alpha_corrected = final_alpha - grad_val / hess_val - alpha_corrected = qd.min(qd.max(alpha_corrected, gs.qd_float(0.0)), gs.qd_float(LS_ALPHA_MAX)) + alpha_corrected = qd.min(qd.max(alpha_corrected, gs.qd_float(0.0)), gs.qd_float(_LS_ALPHA_MAX)) if alpha_corrected > 0.0: constraint_state.candidates[0, i_b] = alpha_corrected else: From 42eb2e3bea92d577ba3b4e83698af38fd9f377f8 Mon Sep 17 00:00:00 2001 From: Mingrui Date: Mon, 23 Mar 2026 15:00:45 +0000 Subject: [PATCH 14/20] re-enable re-benchmarking --- .../rigid/constraint/solver_breakdown.py | 110 +++--------------- 1 file changed, 15 insertions(+), 95 deletions(-) diff --git a/genesis/engine/solvers/rigid/constraint/solver_breakdown.py b/genesis/engine/solvers/rigid/constraint/solver_breakdown.py index 4fff250f2..76214ebf2 100644 --- a/genesis/engine/solvers/rigid/constraint/solver_breakdown.py +++ b/genesis/engine/solvers/rigid/constraint/solver_breakdown.py @@ -17,64 +17,15 @@ # masks a genuinely small optimal step. LS_PARALLEL_MIN_STEP = 1e-6 +# Number of successive refinement passes: after picking the best of K candidates the search +# range is narrowed around the winner and re-evaluated. 1 pass (K=16 candidates) already +# gives sufficient resolution; increase for tighter convergence at the cost of more kernels. 
+LS_PARALLEL_N_REFINE = 3 + # Block sizes for shared-memory reductions in _kernel_parallel_linesearch_p0 and _jv. _P0_BLOCK = 32 _JV_BLOCK = 32 -# Maximum allowed alpha (prevents divergence from degenerate Newton steps) -LS_ALPHA_MAX = 1e4 -# Maximum bisection iterations (2^8 = 256x precision improvement) -LS_BISECT_STEPS = 8 -# Range expansion: max steps and growth factor -LS_EXPANSION_STEPS = 6 -LS_EXPANSION_FACTOR = 4.0 - - -@qd.func -def _ls_compute_grad_hess( - alpha, - i_b, - constraint_state: array_class.ConstraintState, -): - """Compute analytical gradient and hessian of the linesearch cost at alpha. - - Accumulates quadratic coefficients with alpha-dependent constraint activation. - Returns (grad, hess). - """ - ne = constraint_state.n_constraints_equality[i_b] - nef = ne + constraint_state.n_constraints_frictionloss[i_b] - n_con = constraint_state.n_constraints[i_b] - - g1 = constraint_state.quad_gauss[1, i_b] + constraint_state.eq_sum[1, i_b] - g2 = constraint_state.quad_gauss[2, i_b] + constraint_state.eq_sum[2, i_b] - - for i_c in range(ne, nef): - Jaref_c = constraint_state.Jaref[i_c, i_b] - jv_c = constraint_state.jv[i_c, i_b] - D = constraint_state.efc_D[i_c, i_b] - f_val = constraint_state.efc_frictionloss[i_c, i_b] - r_val = constraint_state.diag[i_c, i_b] - x = Jaref_c + alpha * jv_c - rf = r_val * f_val - if x <= -rf or x >= rf: - g1 += (x <= -rf) * (-f_val * jv_c) + (x >= rf) * (f_val * jv_c) - else: - g1 += D * jv_c * Jaref_c - g2 += D * 0.5 * jv_c * jv_c - - for i_c in range(nef, n_con): - Jaref_c = constraint_state.Jaref[i_c, i_b] - jv_c = constraint_state.jv[i_c, i_b] - D = constraint_state.efc_D[i_c, i_b] - x = Jaref_c + alpha * jv_c - if x < 0: - g1 += D * jv_c * Jaref_c - g2 += D * 0.5 * jv_c * jv_c - - grad = 2.0 * alpha * g2 + g1 - hess = 2.0 * g2 - return grad, hess - @qd.kernel(fastcache=gs.use_fastcache) def _kernel_parallel_linesearch_mv( @@ -302,12 +253,6 @@ def _kernel_parallel_linesearch_p0( constraint_state.candidates[3, i_b] = 
1e2 constraint_state.candidates[5, i_b] = 0.0 constraint_state.candidates[4, i_b] = gs.qd_float(1e30) # best cost across passes - # Store gtol for gradient check on last eval pass - n_dofs_val = constraint_state.search.shape[0] - scale = rigid_global_info.meaninertia[i_b] * qd.max(1, n_dofs_val) - constraint_state.candidates[7, i_b] = ( - rigid_global_info.tolerance[None] * rigid_global_info.ls_tolerance[None] * snorm * scale - ) @qd.kernel(fastcache=gs.use_fastcache) @@ -316,16 +261,13 @@ def _kernel_parallel_linesearch_eval( rigid_global_info: array_class.RigidGlobalInfo, static_rigid_sim_config: qd.template(), ): - """Evaluate K candidate alphas, pick the best, then apply gradient-guided refinement. + """Evaluate K candidate alphas in parallel per env, pick the best via reduction. - Phase 1: K threads evaluate log-spaced candidates (thread 0 = Newton alpha). - Phase 2: Argmin reduction to find lowest-cost candidate. - Phase 3: Thread 0 computes analytical gradient at best_alpha and does one - Newton correction step if |grad| > gtol. + Reads the search range from candidates[2] (lo) and candidates[3] (hi). + Writes narrowed range back to candidates[2,3] for successive refinement. """ _B = constraint_state.grad.shape[1] _K = qd.static(LS_PARALLEL_K) - _LS_ALPHA_MAX = qd.static(LS_ALPHA_MAX) qd.loop_config(block_dim=_K) for i_flat in range(_B * _K): @@ -441,29 +383,6 @@ def _kernel_parallel_linesearch_eval( _step = (qd.log(hi) - qd.log(lo)) / qd.max(1.0, qd.cast(_K - 1, gs.qd_float)) constraint_state.candidates[2, i_b] = qd.exp(qd.log(lo) + qd.cast(lo_idx, gs.qd_float) * _step) constraint_state.candidates[3, i_b] = qd.exp(qd.log(lo) + qd.cast(hi_idx, gs.qd_float) * _step) - - # --- Gradient-guided refinement (thread 0) --- - final_alpha = constraint_state.candidates[0, i_b] - gtol = constraint_state.candidates[7, i_b] - - if qd.abs(final_alpha) < rigid_global_info.EPS[None]: - # Branch A: grid found no improvement. Check grad(0). 
- grad_0, hess_0 = _ls_compute_grad_hess(gs.qd_float(0.0), i_b, constraint_state) - if grad_0 < -gtol and hess_0 > rigid_global_info.EPS[None]: - # Cost descending at alpha=0 → Newton step from 0 - alpha_n = -grad_0 / hess_0 - alpha_n = qd.min(alpha_n, gs.qd_float(_LS_ALPHA_MAX)) - if alpha_n > 0.0: - constraint_state.candidates[0, i_b] = alpha_n - else: - # Branch B: improvement found. Gradient check + Newton correction. - grad_val, hess_val = _ls_compute_grad_hess(final_alpha, i_b, constraint_state) - - if qd.abs(grad_val) > gtol and hess_val > rigid_global_info.EPS[None]: - alpha_corrected = final_alpha - grad_val / hess_val - alpha_corrected = qd.min(qd.max(alpha_corrected, gs.qd_float(0.0)), gs.qd_float(_LS_ALPHA_MAX)) - if alpha_corrected > 0.0: - constraint_state.candidates[0, i_b] = alpha_corrected else: constraint_state.candidates[0, i_b] = 0.0 @@ -730,12 +649,13 @@ def func_solve_decomposed( rigid_global_info, static_rigid_sim_config, ) - # Grid search + gradient-guided Newton correction (single pass) - _kernel_parallel_linesearch_eval( - constraint_state, - rigid_global_info, - static_rigid_sim_config, - ) + # Successive refinement: each pass narrows the search range around the best alpha. 
+ for _refine in range(LS_PARALLEL_N_REFINE): + _kernel_parallel_linesearch_eval( + constraint_state, + rigid_global_info, + static_rigid_sim_config, + ) _kernel_parallel_linesearch_apply_alpha( constraint_state, rigid_global_info, From ff32f252bbb2056dee94e796e08bd3b4560f6f80 Mon Sep 17 00:00:00 2001 From: Mingrui Date: Tue, 24 Mar 2026 20:08:31 +0000 Subject: [PATCH 15/20] update with new opt --- .../rigid/constraint/solver_breakdown.py | 552 +++++++++++++----- 1 file changed, 409 insertions(+), 143 deletions(-) diff --git a/genesis/engine/solvers/rigid/constraint/solver_breakdown.py b/genesis/engine/solvers/rigid/constraint/solver_breakdown.py index 76214ebf2..9a3e672f5 100644 --- a/genesis/engine/solvers/rigid/constraint/solver_breakdown.py +++ b/genesis/engine/solvers/rigid/constraint/solver_breakdown.py @@ -17,15 +17,83 @@ # masks a genuinely small optimal step. LS_PARALLEL_MIN_STEP = 1e-6 -# Number of successive refinement passes: after picking the best of K candidates the search -# range is narrowed around the winner and re-evaluated. 1 pass (K=16 candidates) already -# gives sufficient resolution; increase for tighter convergence at the cost of more kernels. -LS_PARALLEL_N_REFINE = 3 - # Block sizes for shared-memory reductions in _kernel_parallel_linesearch_p0 and _jv. _P0_BLOCK = 32 _JV_BLOCK = 32 +# Maximum bisection iterations for gradient-guided refinement after grid search. +LS_BISECT_STEPS = 12 + +# Number of alpha candidates evaluated via cooperative constraint reduction. +# Each candidate is evaluated by ALL K threads cooperating on the constraint sum, +# reducing per-thread work from O(n_constraints) to O(n_constraints/K). +LS_N_CANDIDATES = 8 + +# Maximum allowed alpha (prevents divergence from degenerate steps). +LS_ALPHA_MAX = 1e4 + + +@qd.func +def _ls_eval_cost_grad( + alpha, + i_b, + constraint_state: array_class.ConstraintState, +): + """Compute cost and analytical gradient at alpha (thread-0 only). 
+ + Follows the same quadratic-coefficient approach as func_ls_point_fn_opt in solver.py. + Reuses quad_gauss and eq_sum precomputed by the p0 kernel. + Returns (cost, grad). + """ + ne = constraint_state.n_constraints_equality[i_b] + nef = ne + constraint_state.n_constraints_frictionloss[i_b] + n_con = constraint_state.n_constraints[i_b] + + # Start from precomputed DOF + equality coefficients + qt_0 = constraint_state.quad_gauss[0, i_b] + constraint_state.eq_sum[0, i_b] + qt_1 = constraint_state.quad_gauss[1, i_b] + constraint_state.eq_sum[1, i_b] + qt_2 = constraint_state.quad_gauss[2, i_b] + constraint_state.eq_sum[2, i_b] + + # Friction constraints: accumulate activation-dependent quad coefficients + for i_c in range(ne, nef): + Jaref_c = constraint_state.Jaref[i_c, i_b] + jv_c = constraint_state.jv[i_c, i_b] + D = constraint_state.efc_D[i_c, i_b] + f_val = constraint_state.efc_frictionloss[i_c, i_b] + r_val = constraint_state.diag[i_c, i_b] + qf_0 = D * (0.5 * Jaref_c * Jaref_c) + qf_1 = D * (jv_c * Jaref_c) + qf_2 = D * (0.5 * jv_c * jv_c) + x = Jaref_c + alpha * jv_c + rf = r_val * f_val + linear_neg = x <= -rf + linear_pos = x >= rf + if linear_neg or linear_pos: + qf_0 = linear_neg * f_val * (-0.5 * rf - Jaref_c) + linear_pos * f_val * (-0.5 * rf + Jaref_c) + qf_1 = linear_neg * (-f_val * jv_c) + linear_pos * (f_val * jv_c) + qf_2 = 0.0 + qt_0 = qt_0 + qf_0 + qt_1 = qt_1 + qf_1 + qt_2 = qt_2 + qf_2 + + # Contact constraints: active when x < 0 + for i_c in range(nef, n_con): + Jaref_c = constraint_state.Jaref[i_c, i_b] + jv_c = constraint_state.jv[i_c, i_b] + D = constraint_state.efc_D[i_c, i_b] + x = Jaref_c + alpha * jv_c + active = x < 0 + qf_0 = D * (0.5 * Jaref_c * Jaref_c) + qf_1 = D * (jv_c * Jaref_c) + qf_2 = D * (0.5 * jv_c * jv_c) + qt_0 = qt_0 + qf_0 * active + qt_1 = qt_1 + qf_1 * active + qt_2 = qt_2 + qf_2 * active + + cost = alpha * alpha * qt_2 + alpha * qt_1 + qt_0 + grad = 2.0 * alpha * qt_2 + qt_1 + return cost, grad + 
@qd.kernel(fastcache=gs.use_fastcache) def _kernel_parallel_linesearch_mv( @@ -81,19 +149,22 @@ def _kernel_parallel_linesearch_jv( @qd.kernel(fastcache=gs.use_fastcache) def _kernel_parallel_linesearch_p0( + dofs_info: array_class.DofsInfo, + entities_info: array_class.EntitiesInfo, dofs_state: array_class.DofsState, constraint_state: array_class.ConstraintState, rigid_global_info: array_class.RigidGlobalInfo, static_rigid_sim_config: qd.template(), ): - """Snorm check, quad_gauss, eq_sum, and p0_cost. T threads per env with shared memory reductions. + """Fused mv + jv + snorm + quad_gauss + eq_sum + p0_cost. - Phase 1: Fused snorm + quad_gauss parallel reduction over n_dofs (Options A+B). + Phase 0a: Compute mv = M @ search (cooperative over DOFs, 32 threads). + Phase 0b: Compute jv = J @ search (cooperative over constraints, 32 threads). + Phase 1: Fused snorm + quad_gauss parallel reduction over n_dofs. Phase 2: Parallel reduction over n_constraints for eq_sum and p0_cost. """ _B = constraint_state.grad.shape[1] _T = qd.static(_P0_BLOCK) - _LS_PARALLEL_MIN_STEP = qd.static(LS_PARALLEL_MIN_STEP) qd.loop_config(block_dim=_T) for i_flat in range(_B * _T): @@ -110,6 +181,34 @@ def _kernel_parallel_linesearch_p0( if constraint_state.n_constraints[i_b] > 0 and constraint_state.improved[i_b]: n_dofs = constraint_state.search.shape[0] + n_con = constraint_state.n_constraints[i_b] + + # === Phase 0a: Compute mv = M @ search (cooperative over DOFs) === + i_d1 = tid + while i_d1 < n_dofs: + I_d1 = [i_d1, i_b] if qd.static(static_rigid_sim_config.batch_dofs_info) else i_d1 + i_e = dofs_info.entity_idx[I_d1] + mv_val = gs.qd_float(0.0) + for i_d2 in range(entities_info.dof_start[i_e], entities_info.dof_end[i_e]): + mv_val = mv_val + rigid_global_info.mass_mat[i_d1, i_d2, i_b] * constraint_state.search[i_d2, i_b] + constraint_state.mv[i_d1, i_b] = mv_val + i_d1 += _T + + # === Phase 0b: Compute jv = J @ search (cooperative over constraints) === + i_c = tid + while i_c < 
n_con: + jv_val = gs.qd_float(0.0) + if qd.static(static_rigid_sim_config.sparse_solve): + for i_d_ in range(constraint_state.jac_n_relevant_dofs[i_c, i_b]): + i_d = constraint_state.jac_relevant_dofs[i_c, i_d_, i_b] + jv_val = jv_val + constraint_state.jac[i_c, i_d, i_b] * constraint_state.search[i_d, i_b] + else: + for i_d in range(n_dofs): + jv_val = jv_val + constraint_state.jac[i_c, i_d, i_b] * constraint_state.search[i_d, i_b] + constraint_state.jv[i_c, i_b] = jv_val + i_c += _T + + qd.simt.block.sync() # Ensure mv and jv are written before Phase 1 reads them # === Phase 1: Fused snorm + quad_gauss, parallel over n_dofs === local_snorm_sq = gs.qd_float(0.0) @@ -244,7 +343,9 @@ def _kernel_parallel_linesearch_p0( total_hess = 2.0 * (constraint_state.quad_gauss[2, i_b] + sh_constraint_hess[0]) if total_hess > 0.0: total_grad = constraint_state.quad_gauss[1, i_b] + sh_constraint_grad[0] - alpha_newton = qd.max(qd.abs(total_grad / total_hess), gs.qd_float(_LS_PARALLEL_MIN_STEP)) + alpha_newton = qd.max( + qd.abs(total_grad / total_hess), gs.qd_float(qd.static(LS_PARALLEL_MIN_STEP)) + ) constraint_state.candidates[2, i_b] = alpha_newton * 1e-2 constraint_state.candidates[3, i_b] = alpha_newton * 10.0 constraint_state.candidates[5, i_b] = alpha_newton # exact Newton step for eval @@ -253,6 +354,12 @@ def _kernel_parallel_linesearch_p0( constraint_state.candidates[3, i_b] = 1e2 constraint_state.candidates[5, i_b] = 0.0 constraint_state.candidates[4, i_b] = gs.qd_float(1e30) # best cost across passes + # Store gtol for gradient-guided bisection after grid search + n_dofs_val = constraint_state.search.shape[0] + scale = rigid_global_info.meaninertia[i_b] * qd.max(1, n_dofs_val) + constraint_state.candidates[7, i_b] = ( + rigid_global_info.tolerance[None] * rigid_global_info.ls_tolerance[None] * snorm * scale + ) @qd.kernel(fastcache=gs.use_fastcache) @@ -261,159 +368,234 @@ def _kernel_parallel_linesearch_eval( rigid_global_info: array_class.RigidGlobalInfo, 
static_rigid_sim_config: qd.template(), ): - """Evaluate K candidate alphas in parallel per env, pick the best via reduction. + """Evaluate alpha candidates via cooperative constraint reduction, then bisect. + + All K threads cooperate on each candidate: each thread reduces n_constraints/K + constraints, then a shared-memory tree reduction sums the partial costs. This is + O(n_candidates × n_constraints/K) per thread instead of O(K × n_constraints). - Reads the search range from candidates[2] (lo) and candidates[3] (hi). - Writes narrowed range back to candidates[2,3] for successive refinement. + Phase 1: Cooperatively evaluate N_CANDIDATES + Newton alpha, pick best via argmin. + Phase 2: Cooperatively evaluate analytical gradient at best, then bisect if needed. """ _B = constraint_state.grad.shape[1] _K = qd.static(LS_PARALLEL_K) + _NC = qd.static(LS_N_CANDIDATES) qd.loop_config(block_dim=_K) for i_flat in range(_B * _K): tid = i_flat % _K i_b = i_flat // _K - # Shared memory for argmin reduction - sh_cost = qd.simt.block.SharedArray((_K,), gs.qd_float) - sh_idx = qd.simt.block.SharedArray((_K,), qd.i32) + # Shared memory for reductions (reused across phases) + sh_val = qd.simt.block.SharedArray((_K,), gs.qd_float) + sh_val2 = qd.simt.block.SharedArray((_K,), gs.qd_float) + # Shared arrays for candidate costs and alphas (only _NC+1 used) + sh_cand_cost = qd.simt.block.SharedArray((_K,), gs.qd_float) + sh_cand_alpha = qd.simt.block.SharedArray((_K,), gs.qd_float) - if constraint_state.n_constraints[i_b] > 0 and constraint_state.improved[i_b]: + active = constraint_state.n_constraints[i_b] > 0 and constraint_state.improved[i_b] + + if active: ne = constraint_state.n_constraints_equality[i_b] nef = ne + constraint_state.n_constraints_frictionloss[i_b] n_con = constraint_state.n_constraints[i_b] - lo = constraint_state.candidates[2, i_b] hi = constraint_state.candidates[3, i_b] - - # Generate log-spaced alpha within [lo, hi]. 
- # Thread 0 evaluates the exact Newton step instead of the grid point at lo. - # This gives the Newton alpha a fair cost-based comparison with grid candidates. - _step = (qd.log(hi) - qd.log(lo)) / qd.max(1.0, qd.cast(_K - 1, gs.qd_float)) - alpha = qd.exp(qd.log(lo) + qd.cast(tid, gs.qd_float) * _step) - if tid == 0: - alpha_newton_val = constraint_state.candidates[5, i_b] - if alpha_newton_val > 0.0: - alpha = alpha_newton_val - - # Evaluate cost at this alpha - cost = ( - alpha * alpha * constraint_state.quad_gauss[2, i_b] - + alpha * constraint_state.quad_gauss[1, i_b] - + constraint_state.quad_gauss[0, i_b] - ) - - # Equality constraints (always active) - use eq_sum precomputed during init - cost = ( - cost - + alpha * alpha * constraint_state.eq_sum[2, i_b] - + alpha * constraint_state.eq_sum[1, i_b] - + constraint_state.eq_sum[0, i_b] - ) - - # Friction constraints - for i_c in range(ne, nef): - Jaref_c = constraint_state.Jaref[i_c, i_b] - jv_c = constraint_state.jv[i_c, i_b] - D = constraint_state.efc_D[i_c, i_b] - f = constraint_state.efc_frictionloss[i_c, i_b] - r = constraint_state.diag[i_c, i_b] - x = Jaref_c + alpha * jv_c - rf = r * f - linear_neg = x <= -rf - linear_pos = x >= rf - if linear_neg or linear_pos: - cost = cost + linear_neg * f * (-0.5 * rf - Jaref_c - alpha * jv_c) - cost = cost + linear_pos * f * (-0.5 * rf + Jaref_c + alpha * jv_c) + p0_cost = constraint_state.candidates[1, i_b] + gtol = constraint_state.candidates[7, i_b] + + # Pre-compute log-space step for candidate generation + _log_lo = qd.log(lo) + _cand_step = (qd.log(hi) - _log_lo) / qd.max(1.0, qd.cast(_NC - 1, gs.qd_float)) + alpha_newton = constraint_state.candidates[5, i_b] + + # === Phase 1: Cooperative evaluation of N_CANDIDATES alphas === + # Evaluate each candidate sequentially; all K threads cooperate on constraint reduction. 
+ n_total_cands = _NC + 1 # +1 for Newton alpha + for cand_idx in range(n_total_cands): + # Generate alpha for this candidate + alpha_c = gs.qd_float(0.0) + if cand_idx < _NC: + alpha_c = qd.exp(_log_lo + qd.cast(cand_idx, gs.qd_float) * _cand_step) else: - cost = cost + D * 0.5 * x * x - - # Contact constraints (active if x < 0) - for i_c in range(nef, n_con): - Jaref_c = constraint_state.Jaref[i_c, i_b] - jv_c = constraint_state.jv[i_c, i_b] - D = constraint_state.efc_D[i_c, i_b] - x = Jaref_c + alpha * jv_c - if x < 0: - cost += D * 0.5 * x * x - - sh_cost[tid] = cost - sh_idx[tid] = tid - else: - sh_cost[tid] = gs.qd_float(1e30) - sh_idx[tid] = tid + alpha_c = alpha_newton # last candidate is Newton alpha + + # DOF + equality cost (O(1), same for all threads) + dof_eq_cost = ( + alpha_c * alpha_c * constraint_state.quad_gauss[2, i_b] + + alpha_c * constraint_state.quad_gauss[1, i_b] + + constraint_state.quad_gauss[0, i_b] + + alpha_c * alpha_c * constraint_state.eq_sum[2, i_b] + + alpha_c * constraint_state.eq_sum[1, i_b] + + constraint_state.eq_sum[0, i_b] + ) - qd.simt.block.sync() + # Cooperative constraint cost: each thread handles strided constraints + local_cost = gs.qd_float(0.0) + i_c = ne + tid # start from ne (skip equality, already in eq_sum) + while i_c < n_con: + Jaref_c = constraint_state.Jaref[i_c, i_b] + jv_c = constraint_state.jv[i_c, i_b] + D = constraint_state.efc_D[i_c, i_b] + x = Jaref_c + alpha_c * jv_c + if i_c < nef: + # Friction constraint + f_val = constraint_state.efc_frictionloss[i_c, i_b] + r_val = constraint_state.diag[i_c, i_b] + rf = r_val * f_val + linear_neg = x <= -rf + linear_pos = x >= rf + if linear_neg or linear_pos: + local_cost = local_cost + linear_neg * f_val * (-0.5 * rf - Jaref_c - alpha_c * jv_c) + local_cost = local_cost + linear_pos * f_val * (-0.5 * rf + Jaref_c + alpha_c * jv_c) + else: + local_cost = local_cost + D * 0.5 * x * x + else: + # Contact constraint (active if x < 0) + if x < 0: + local_cost = 
local_cost + D * 0.5 * x * x + i_c += _K - # Tree reduction for argmin - stride = _K // 2 - while stride > 0: - if tid < stride: - if sh_cost[tid + stride] < sh_cost[tid]: - sh_cost[tid] = sh_cost[tid + stride] - sh_idx[tid] = sh_idx[tid + stride] - qd.simt.block.sync() - stride = stride // 2 + # Tree reduction for constraint cost + sh_val[tid] = local_cost + qd.simt.block.sync() + stride = _K // 2 + while stride > 0: + if tid < stride: + sh_val[tid] += sh_val[tid + stride] + qd.simt.block.sync() + stride //= 2 - # Thread 0: acceptance check, write result, and narrow range for next pass - if tid == 0: - if constraint_state.n_constraints[i_b] > 0 and constraint_state.improved[i_b]: - p0_cost = constraint_state.candidates[1, i_b] - best_tid = sh_idx[0] - best_cost = sh_cost[0] - lo = constraint_state.candidates[2, i_b] - hi = constraint_state.candidates[3, i_b] - # Recover best alpha: thread 0 used Newton alpha, others used grid - _step = (qd.log(hi) - qd.log(lo)) / qd.max(1.0, qd.cast(_K - 1, gs.qd_float)) - best_alpha = qd.exp(qd.log(lo) + qd.cast(best_tid, gs.qd_float) * _step) - # If thread 0 (Newton candidate) won, use the exact Newton alpha - if best_tid == 0 and constraint_state.candidates[5, i_b] > 0.0: - best_alpha = constraint_state.candidates[5, i_b] - - # Only update best alpha if this pass improved over ALL previous passes + # Thread 0 stores total cost for this candidate + if tid == 0: + total_cost = dof_eq_cost + sh_val[0] + sh_cand_cost[cand_idx] = total_cost + sh_cand_alpha[cand_idx] = alpha_c + qd.simt.block.sync() + + # === Phase 2: Find best candidate (thread 0) === + if tid == 0: + best_alpha = gs.qd_float(0.0) + best_cost = p0_cost best_cost_prev = constraint_state.candidates[4, i_b] - if best_cost < p0_cost and best_cost < best_cost_prev: - constraint_state.candidates[0, i_b] = best_alpha + for ci in range(n_total_cands): + c = sh_cand_cost[ci] + if c < best_cost and c < best_cost_prev: + best_cost = c + best_alpha = sh_cand_alpha[ci] + + 
constraint_state.candidates[0, i_b] = best_alpha + if best_alpha > 0.0: constraint_state.candidates[4, i_b] = best_cost + # Store best alpha for Phase 3 cooperative bisection + sh_cand_alpha[0] = best_alpha + qd.simt.block.sync() - # Narrow range around accepted point for next refinement pass - lo_idx = qd.max(0, best_tid - 1) - hi_idx = qd.min(_K - 1, best_tid + 1) - # Narrow search range to neighbors of best candidate - _step = (qd.log(hi) - qd.log(lo)) / qd.max(1.0, qd.cast(_K - 1, gs.qd_float)) - constraint_state.candidates[2, i_b] = qd.exp(qd.log(lo) + qd.cast(lo_idx, gs.qd_float) * _step) - constraint_state.candidates[3, i_b] = qd.exp(qd.log(lo) + qd.cast(hi_idx, gs.qd_float) * _step) - else: - constraint_state.candidates[0, i_b] = 0.0 + # === Phase 3: Cooperative gradient bisection === + best_alpha_shared = sh_cand_alpha[0] + if best_alpha_shared > 0.0: + # Cooperatively compute gradient at best_alpha + alpha_eval = best_alpha_shared + # Cooperative gradient: accumulate quad_total_1 and quad_total_2 + local_qt1 = gs.qd_float(0.0) + local_qt2 = gs.qd_float(0.0) + i_c = ne + tid + while i_c < n_con: + Jaref_c = constraint_state.Jaref[i_c, i_b] + jv_c = constraint_state.jv[i_c, i_b] + D = constraint_state.efc_D[i_c, i_b] + x = Jaref_c + alpha_eval * jv_c + if i_c < nef: + f_val = constraint_state.efc_frictionloss[i_c, i_b] + r_val = constraint_state.diag[i_c, i_b] + rf = r_val * f_val + linear_neg = x <= -rf + linear_pos = x >= rf + qf_1 = D * (jv_c * Jaref_c) + qf_2 = D * (0.5 * jv_c * jv_c) + if linear_neg or linear_pos: + qf_1 = linear_neg * (-f_val * jv_c) + linear_pos * (f_val * jv_c) + qf_2 = 0.0 + local_qt1 = local_qt1 + qf_1 + local_qt2 = local_qt2 + qf_2 + else: + act = x < 0 + local_qt1 = local_qt1 + D * (jv_c * Jaref_c) * act + local_qt2 = local_qt2 + D * (0.5 * jv_c * jv_c) * act + i_c += _K + + # Reduce qt1 and qt2 + sh_val[tid] = local_qt1 + sh_val2[tid] = local_qt2 + qd.simt.block.sync() + stride = _K // 2 + while stride > 0: + if tid < stride: + 
sh_val[tid] += sh_val[tid + stride] + sh_val2[tid] += sh_val2[tid + stride] + qd.simt.block.sync() + stride //= 2 -@qd.kernel(fastcache=gs.use_fastcache) -def _kernel_parallel_linesearch_apply_alpha( - constraint_state: array_class.ConstraintState, - rigid_global_info: array_class.RigidGlobalInfo, - static_rigid_sim_config: qd.template(), -): - """Apply best alpha to qacc, Ma, and Jaref. Fuses dof and constraint updates.""" - n_dofs = constraint_state.qacc.shape[0] - len_constraints = constraint_state.Jaref.shape[0] - _B = constraint_state.grad.shape[1] - n_items = qd.max(n_dofs, len_constraints) + if tid == 0: + qt1_total = constraint_state.quad_gauss[1, i_b] + constraint_state.eq_sum[1, i_b] + sh_val[0] + qt2_total = constraint_state.quad_gauss[2, i_b] + constraint_state.eq_sum[2, i_b] + sh_val2[0] + g_best = 2.0 * alpha_eval * qt2_total + qt1_total + + if qd.abs(g_best) > gtol: + # Need bisection — use thread-0 sequential bisection with _ls_eval_cost_grad + # (bisection is ~5 iterations, thread-0 cost is acceptable) + bis_a = alpha_eval * 0.5 + bis_b = alpha_eval + if g_best < 0.0: + bis_a = alpha_eval + bis_b = alpha_eval * 2.0 + + _, g_a = _ls_eval_cost_grad(bis_a, i_b, constraint_state) + _, g_b = _ls_eval_cost_grad(bis_b, i_b, constraint_state) + + if g_a < 0.0 and g_b > 0.0: + _N_BISECT = qd.static(LS_BISECT_STEPS) + for _bis_it in range(_N_BISECT): + mid_b = (bis_a + bis_b) * 0.5 + c_mid_b, g_mid_b = _ls_eval_cost_grad(mid_b, i_b, constraint_state) + if qd.abs(g_mid_b) < gtol or qd.abs(bis_b - bis_a) < rigid_global_info.EPS[None]: + break + if g_mid_b < 0.0: + bis_a = mid_b + else: + bis_b = mid_b + mid_b = (bis_a + bis_b) * 0.5 + c_mid_b, _ = _ls_eval_cost_grad(mid_b, i_b, constraint_state) + if c_mid_b < p0_cost and c_mid_b < constraint_state.candidates[4, i_b]: + constraint_state.candidates[0, i_b] = mid_b + constraint_state.candidates[4, i_b] = c_mid_b + else: + if tid == 0: + constraint_state.candidates[0, i_b] = 0.0 + qd.simt.block.sync() - 
qd.loop_config(serialize=static_rigid_sim_config.para_level < gs.PARA_LEVEL.ALL) - for i, i_b in qd.ndrange(n_items, _B): - if constraint_state.n_constraints[i_b] > 0 and constraint_state.improved[i_b]: - alpha = constraint_state.candidates[0, i_b] - if qd.abs(alpha) < rigid_global_info.EPS[None]: - if i == 0: + # === Phase 4: Cooperative apply alpha (fused, saves 1 kernel launch) === + qd.simt.block.sync() + if active: + n_dofs_apply = constraint_state.qacc.shape[0] + n_con_apply = constraint_state.n_constraints[i_b] + alpha_apply = constraint_state.candidates[0, i_b] + if qd.abs(alpha_apply) < rigid_global_info.EPS[None]: + if tid == 0: constraint_state.improved[i_b] = False else: - # Apply to dofs - if i < n_dofs: - constraint_state.qacc[i, i_b] += constraint_state.search[i, i_b] * alpha - constraint_state.Ma[i, i_b] += constraint_state.mv[i, i_b] * alpha - # Apply to constraints - if i < constraint_state.n_constraints[i_b]: - constraint_state.Jaref[i, i_b] += constraint_state.jv[i, i_b] * alpha + # Apply to dofs (strided over threads) + i_d = tid + while i_d < n_dofs_apply: + constraint_state.qacc[i_d, i_b] += constraint_state.search[i_d, i_b] * alpha_apply + constraint_state.Ma[i_d, i_b] += constraint_state.mv[i_d, i_b] * alpha_apply + i_d += _K + # Apply to constraints (strided over threads) + i_c = tid + while i_c < n_con_apply: + constraint_state.Jaref[i_c, i_b] += constraint_state.jv[i_c, i_b] * alpha_apply + i_c += _K # ============================================== Shared iteration kernels ============================================== @@ -605,6 +787,39 @@ def _kernel_update_search_direction( ) +# ============================================ Sequential linesearch ================================================ + + +@qd.kernel(fastcache=gs.use_fastcache) +def _kernel_linesearch( + entities_info: array_class.EntitiesInfo, + dofs_state: array_class.DofsState, + constraint_state: array_class.ConstraintState, + rigid_global_info: array_class.RigidGlobalInfo, 
+ static_rigid_sim_config: qd.template(), +): + """Sequential iterative linesearch (same as main branch). + + Each thread handles one env, using Newton-guided derivative linesearch. + Lower per-env parallelism but less total work than the K=32 grid search. + Better for scenes with many constraints per env (e.g. humanoid contact). + """ + _B = constraint_state.grad.shape[1] + qd.loop_config(serialize=static_rigid_sim_config.para_level < gs.PARA_LEVEL.ALL, block_dim=32) + for i_b in range(_B): + if constraint_state.n_constraints[i_b] > 0 and constraint_state.improved[i_b]: + solver.func_linesearch_and_apply_alpha( + i_b, + entities_info=entities_info, + dofs_state=dofs_state, + rigid_global_info=rigid_global_info, + constraint_state=constraint_state, + static_rigid_sim_config=static_rigid_sim_config, + ) + else: + constraint_state.improved[i_b] = False + + # ============================================== Solve body dispatch ================================================ @@ -632,31 +847,83 @@ def func_solve_decomposed( """ # _n_iterations is a Python-native int to avoid CPU-GPU sync (vs rigid_global_info.iterations[None]) for _it in range(_n_iterations): - _kernel_parallel_linesearch_mv( + # Fused: mv + jv + snorm + quad_gauss + eq_sum + p0_cost + _kernel_parallel_linesearch_p0( dofs_info, entities_info, + dofs_state, + constraint_state, + rigid_global_info, + static_rigid_sim_config, + ) + # Fused: grid search + bisection + apply alpha + _kernel_parallel_linesearch_eval( constraint_state, rigid_global_info, static_rigid_sim_config, ) - _kernel_parallel_linesearch_jv( + if static_rigid_sim_config.solver_type == gs.constraint_solver.CG: + _kernel_cg_only_save_prev_grad( + constraint_state, + static_rigid_sim_config, + ) + _kernel_update_constraint_forces( constraint_state, static_rigid_sim_config, ) - _kernel_parallel_linesearch_p0( + _kernel_update_constraint_qfrc( + constraint_state, + static_rigid_sim_config, + ) + _kernel_update_constraint_cost( dofs_state, 
constraint_state, - rigid_global_info, static_rigid_sim_config, ) - # Successive refinement: each pass narrows the search range around the best alpha. - for _refine in range(LS_PARALLEL_N_REFINE): - _kernel_parallel_linesearch_eval( + + if static_rigid_sim_config.solver_type == gs.constraint_solver.Newton: + _kernel_newton_only_nt_hessian( constraint_state, rigid_global_info, static_rigid_sim_config, ) - _kernel_parallel_linesearch_apply_alpha( + _kernel_update_gradient( + entities_info, + dofs_state, + constraint_state, + rigid_global_info, + static_rigid_sim_config, + ) + _kernel_update_search_direction( + constraint_state, + rigid_global_info, + static_rigid_sim_config, + ) + + +@solver.func_solve_body.register( + is_compatible=lambda *args, **kwargs: False # Disabled to benchmark parallel LS in isolation +) +def func_solve_decomposed_sequential( + entities_info, + dofs_info, + dofs_state, + constraint_state, + rigid_global_info, + static_rigid_sim_config, + _n_iterations, +): + """Decomposed solver with sequential iterative linesearch. + + Same kernel-per-step structure as func_solve_decomposed but uses the sequential + Newton-guided linesearch (1 thread per env) instead of the parallel grid search + (K threads per env). Better for scenes with many constraints per env where the + grid search does more total work than needed. 
+ """ + for _it in range(_n_iterations): + _kernel_linesearch( + entities_info, + dofs_state, constraint_state, rigid_global_info, static_rigid_sim_config, @@ -679,7 +946,6 @@ def func_solve_decomposed( constraint_state, static_rigid_sim_config, ) - if static_rigid_sim_config.solver_type == gs.constraint_solver.Newton: _kernel_newton_only_nt_hessian( constraint_state, From 6fa169f4f1caabe5e8669068cd488b644bc1b801 Mon Sep 17 00:00:00 2001 From: Mingrui Date: Wed, 25 Mar 2026 11:14:54 +0000 Subject: [PATCH 16/20] with repeat after count --- genesis/engine/solvers/rigid/constraint/solver.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/genesis/engine/solvers/rigid/constraint/solver.py b/genesis/engine/solvers/rigid/constraint/solver.py index 51f7269c9..00985b918 100644 --- a/genesis/engine/solvers/rigid/constraint/solver.py +++ b/genesis/engine/solvers/rigid/constraint/solver.py @@ -3102,7 +3102,11 @@ def func_solve_iter( @qd.perf_dispatch( - get_geometry_hash=lambda *args, **kwargs: (*args, frozendict(kwargs)), warmup=3, active=3, repeat_after_seconds=0.0 + get_geometry_hash=lambda *args, **kwargs: (*args, frozendict(kwargs)), + warmup=3, + active=3, + repeat_after_seconds=0, + repeat_after_count=3000, ) def func_solve_body( entities_info: array_class.EntitiesInfo, From 04266ba49192977e3b6a68aa1a4a4b8d69d184e9 Mon Sep 17 00:00:00 2001 From: Mingrui Date: Wed, 25 Mar 2026 11:19:40 +0000 Subject: [PATCH 17/20] update n candidates --- genesis/engine/solvers/rigid/constraint/solver_breakdown.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/genesis/engine/solvers/rigid/constraint/solver_breakdown.py b/genesis/engine/solvers/rigid/constraint/solver_breakdown.py index 9a3e672f5..161e14eca 100644 --- a/genesis/engine/solvers/rigid/constraint/solver_breakdown.py +++ b/genesis/engine/solvers/rigid/constraint/solver_breakdown.py @@ -27,7 +27,7 @@ # Number of alpha candidates evaluated via cooperative constraint reduction. 
# Each candidate is evaluated by ALL K threads cooperating on the constraint sum, # reducing per-thread work from O(n_constraints) to O(n_constraints/K). -LS_N_CANDIDATES = 8 +LS_N_CANDIDATES = 6 # Maximum allowed alpha (prevents divergence from degenerate steps). LS_ALPHA_MAX = 1e4 From bd18461d2188dd8c8ac1ec267f26a97dd387578c Mon Sep 17 00:00:00 2001 From: Mingrui Date: Wed, 25 Mar 2026 20:53:04 +0000 Subject: [PATCH 18/20] remove the fallback for simpler code --- .../rigid/constraint/solver_breakdown.py | 65 ------------------- 1 file changed, 65 deletions(-) diff --git a/genesis/engine/solvers/rigid/constraint/solver_breakdown.py b/genesis/engine/solvers/rigid/constraint/solver_breakdown.py index 161e14eca..223cd374c 100644 --- a/genesis/engine/solvers/rigid/constraint/solver_breakdown.py +++ b/genesis/engine/solvers/rigid/constraint/solver_breakdown.py @@ -899,68 +899,3 @@ def func_solve_decomposed( rigid_global_info, static_rigid_sim_config, ) - - -@solver.func_solve_body.register( - is_compatible=lambda *args, **kwargs: False # Disabled to benchmark parallel LS in isolation -) -def func_solve_decomposed_sequential( - entities_info, - dofs_info, - dofs_state, - constraint_state, - rigid_global_info, - static_rigid_sim_config, - _n_iterations, -): - """Decomposed solver with sequential iterative linesearch. - - Same kernel-per-step structure as func_solve_decomposed but uses the sequential - Newton-guided linesearch (1 thread per env) instead of the parallel grid search - (K threads per env). Better for scenes with many constraints per env where the - grid search does more total work than needed. 
- """ - for _it in range(_n_iterations): - _kernel_linesearch( - entities_info, - dofs_state, - constraint_state, - rigid_global_info, - static_rigid_sim_config, - ) - if static_rigid_sim_config.solver_type == gs.constraint_solver.CG: - _kernel_cg_only_save_prev_grad( - constraint_state, - static_rigid_sim_config, - ) - _kernel_update_constraint_forces( - constraint_state, - static_rigid_sim_config, - ) - _kernel_update_constraint_qfrc( - constraint_state, - static_rigid_sim_config, - ) - _kernel_update_constraint_cost( - dofs_state, - constraint_state, - static_rigid_sim_config, - ) - if static_rigid_sim_config.solver_type == gs.constraint_solver.Newton: - _kernel_newton_only_nt_hessian( - constraint_state, - rigid_global_info, - static_rigid_sim_config, - ) - _kernel_update_gradient( - entities_info, - dofs_state, - constraint_state, - rigid_global_info, - static_rigid_sim_config, - ) - _kernel_update_search_direction( - constraint_state, - rigid_global_info, - static_rigid_sim_config, - ) From 75e213d59afdb14047c1a28090a43c7b099aeb0c Mon Sep 17 00:00:00 2001 From: Mingrui Date: Wed, 25 Mar 2026 23:09:36 +0000 Subject: [PATCH 19/20] add unexpected missing newton correction.. 
--- .../rigid/constraint/solver_breakdown.py | 65 +++++++++++-------- 1 file changed, 39 insertions(+), 26 deletions(-) diff --git a/genesis/engine/solvers/rigid/constraint/solver_breakdown.py b/genesis/engine/solvers/rigid/constraint/solver_breakdown.py index 223cd374c..f0fd9c0c4 100644 --- a/genesis/engine/solvers/rigid/constraint/solver_breakdown.py +++ b/genesis/engine/solvers/rigid/constraint/solver_breakdown.py @@ -543,33 +543,46 @@ def _kernel_parallel_linesearch_eval( g_best = 2.0 * alpha_eval * qt2_total + qt1_total if qd.abs(g_best) > gtol: - # Need bisection — use thread-0 sequential bisection with _ls_eval_cost_grad - # (bisection is ~5 iterations, thread-0 cost is acceptable) - bis_a = alpha_eval * 0.5 - bis_b = alpha_eval - if g_best < 0.0: - bis_a = alpha_eval - bis_b = alpha_eval * 2.0 - - _, g_a = _ls_eval_cost_grad(bis_a, i_b, constraint_state) - _, g_b = _ls_eval_cost_grad(bis_b, i_b, constraint_state) - - if g_a < 0.0 and g_b > 0.0: - _N_BISECT = qd.static(LS_BISECT_STEPS) - for _bis_it in range(_N_BISECT): + hess_best = 2.0 * qt2_total + newton_done = False + + # Try one Newton correction first (O(1) compute + 1 cost eval) + if hess_best > rigid_global_info.EPS[None]: + alpha_nc = alpha_eval - g_best / hess_best + if alpha_nc > 0.0: + c_nc, g_nc = _ls_eval_cost_grad(alpha_nc, i_b, constraint_state) + if c_nc < p0_cost and c_nc < constraint_state.candidates[4, i_b]: + constraint_state.candidates[0, i_b] = alpha_nc + constraint_state.candidates[4, i_b] = c_nc + newton_done = True + + # Fall back to bisection if Newton didn't converge + if not newton_done: + bis_a = alpha_eval * 0.5 + bis_b = alpha_eval + if g_best < 0.0: + bis_a = alpha_eval + bis_b = alpha_eval * 2.0 + + _, g_a = _ls_eval_cost_grad(bis_a, i_b, constraint_state) + _, g_b = _ls_eval_cost_grad(bis_b, i_b, constraint_state) + + if g_a < 0.0 and g_b > 0.0: + _N_BISECT = qd.static(LS_BISECT_STEPS) + for _bis_it in range(_N_BISECT): + mid_b = (bis_a + bis_b) * 0.5 + c_mid_b, g_mid_b = 
_ls_eval_cost_grad(mid_b, i_b, constraint_state) + if qd.abs(g_mid_b) < gtol or qd.abs(bis_b - bis_a) < rigid_global_info.EPS[None]: + break + if g_mid_b < 0.0: + bis_a = mid_b + else: + bis_b = mid_b mid_b = (bis_a + bis_b) * 0.5 - c_mid_b, g_mid_b = _ls_eval_cost_grad(mid_b, i_b, constraint_state) - if qd.abs(g_mid_b) < gtol or qd.abs(bis_b - bis_a) < rigid_global_info.EPS[None]: - break - if g_mid_b < 0.0: - bis_a = mid_b - else: - bis_b = mid_b - mid_b = (bis_a + bis_b) * 0.5 - c_mid_b, _ = _ls_eval_cost_grad(mid_b, i_b, constraint_state) - if c_mid_b < p0_cost and c_mid_b < constraint_state.candidates[4, i_b]: - constraint_state.candidates[0, i_b] = mid_b - constraint_state.candidates[4, i_b] = c_mid_b + c_mid_b, _ = _ls_eval_cost_grad(mid_b, i_b, constraint_state) + if c_mid_b < p0_cost and c_mid_b < constraint_state.candidates[4, i_b]: + constraint_state.candidates[0, i_b] = mid_b + constraint_state.candidates[4, i_b] = c_mid_b else: if tid == 0: constraint_state.candidates[0, i_b] = 0.0 From 8a75bfed9666f93dbc53a624c2f16ac5b4f8a831 Mon Sep 17 00:00:00 2001 From: Mingrui Date: Thu, 26 Mar 2026 00:33:54 +0000 Subject: [PATCH 20/20] update comment --- genesis/engine/solvers/rigid/constraint/solver_breakdown.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/genesis/engine/solvers/rigid/constraint/solver_breakdown.py b/genesis/engine/solvers/rigid/constraint/solver_breakdown.py index f0fd9c0c4..c3bfd1306 100644 --- a/genesis/engine/solvers/rigid/constraint/solver_breakdown.py +++ b/genesis/engine/solvers/rigid/constraint/solver_breakdown.py @@ -375,7 +375,7 @@ def _kernel_parallel_linesearch_eval( O(n_candidates × n_constraints/K) per thread instead of O(K × n_constraints). Phase 1: Cooperatively evaluate N_CANDIDATES + Newton alpha, pick best via argmin. - Phase 2: Cooperatively evaluate analytical gradient at best, then bisect if needed. 
+ Phase 2: Cooperatively evaluate analytical gradient at best, try one Newton correction first then bisect if needed. """ _B = constraint_state.grad.shape[1] _K = qd.static(LS_PARALLEL_K)