diff --git a/examples/benchmarks/sph_weak_scale_test.py b/examples/benchmarks/sph_weak_scale_test.py
index 5a0bda2a99..a71226a6b5 100644
--- a/examples/benchmarks/sph_weak_scale_test.py
+++ b/examples/benchmarks/sph_weak_scale_test.py
@@ -63,7 +63,6 @@
     )
     cfg.set_boundary_periodic()
     cfg.set_eos_adiabatic(gamma)
-    cfg.set_max_neigh_cache_size(int(100e9))
     cfg.print_status()
     model.set_solver_config(cfg)
     model.init_scheduler(scheduler_split_val, scheduler_merge_val)
@@ -102,7 +101,7 @@
 
     model.set_value_in_a_box("uint", "f64", 0, bmin, bmax)
 
-    rinj = 8 * dr
+    rinj = 16 * dr
     u_inj = 1
     model.add_kernel_value("uint", "f64", u_inj, (0, 0, 0), rinj)
 
@@ -116,9 +115,6 @@
     model.set_cfl_cour(0.1)
     model.set_cfl_force(0.1)
 
-    model.set_cfl_multipler(1e-4)
-    model.set_cfl_mult_stiffness(1e6)
-
     shamrock.backends.reset_mem_info_max()
 
     # converge smoothing length and compute initial dt
@@ -129,8 +125,11 @@
     res_cnts = []
     res_system_metrics = []
 
-    for i in range(5):
+    for i in range(10):
         shamrock.sys.mpi_barrier()
+
+        # Force dt = 0 so that each iteration replays the same step
+        model.set_next_dt(0.0)
         model.timestep()
 
         tmp_res_rate, tmp_res_cnt, tmp_system_metrics = (
diff --git a/src/shamalgs/src/collective/sparse_exchange.cpp b/src/shamalgs/src/collective/sparse_exchange.cpp
index 6e6625544d..970d35d051 100644
--- a/src/shamalgs/src/collective/sparse_exchange.cpp
+++ b/src/shamalgs/src/collective/sparse_exchange.cpp
@@ -56,6 +56,7 @@ namespace shamalgs::collective {
     /// fetch u64_2 from global message data
     std::vector<u64_2> fetch_global_message_data(
         const std::vector<CommMessageInfo> &messages_send) {
+        __shamrock_stack_entry();
 
         std::vector<u64_2> local_data = std::vector<u64_2>(messages_send.size());
 
@@ -84,6 +85,7 @@ namespace shamalgs::collective {
 
     /// decode message to get message
     std::vector<CommMessageInfo> decode_all_message(const std::vector<u64_2> &global_data) {
+        __shamrock_stack_entry();
         std::vector<CommMessageInfo> message_all(global_data.size());
         for (u64 i = 0; i < global_data.size(); i++) {
             message_all[i] = unpack(global_data[i]);
@@ -94,6 +96,7 @@ namespace shamalgs::collective {
 
     /// compute message tags
     void compute_tags(std::vector<CommMessageInfo> &message_all) {
+        __shamrock_stack_entry();
 
         std::vector<i32> tag_map(shamcomm::world_size(), 0);
 
diff --git a/src/shammodels/gsph/src/modules/GSPHGhostHandler.cpp b/src/shammodels/gsph/src/modules/GSPHGhostHandler.cpp
index 070fc6f858..531a6e7508 100644
--- a/src/shammodels/gsph/src/modules/GSPHGhostHandler.cpp
+++ b/src/shammodels/gsph/src/modules/GSPHGhostHandler.cpp
@@ -344,13 +344,13 @@ auto GSPHGhostHandler<vec>::gen_id_table_interfaces(GeneratorMap &&gen)
 
     for (auto &[k, v] : send_count_stats) {
         if (v > 0.2) {
-            warn_log += shambase::format("\n    patch {} high interf/patch volume: {}", k, v);
+            // warn_log += shambase::format("\n    patch {} high interf/patch volume: {}", k, v);
             has_warn = true;
         }
     }
 
     if (has_warn && shamcomm::world_rank() == 0) {
-        warn_log = "\n    This can lead to high mpi "
+        warn_log = "\n    High interf/patch volume. This can lead to high mpi "
                    "overhead, try to increase the patch split crit"
                    + warn_log;
     }
diff --git a/src/shammodels/sph/include/shammodels/sph/SPHUtilities.hpp b/src/shammodels/sph/include/shammodels/sph/SPHUtilities.hpp
index af5f17a9bd..80dacb3af9 100644
--- a/src/shammodels/sph/include/shammodels/sph/SPHUtilities.hpp
+++ b/src/shammodels/sph/include/shammodels/sph/SPHUtilities.hpp
@@ -86,7 +86,14 @@ namespace shammodels::sph {
             PatchField<flt> interactR_patch = sched.map_owned_to_patch_field_simple<flt>(
                 [&](const Patch p, PatchDataLayer &pdat) -> flt {
                     if (!pdat.is_empty()) {
+#if false
+                        auto tmp = pdat.get_field<flt>(ihpart).compute_max() * h_evol_max * Rkern;
+                        shamcomm::logs::raw_ln(
+                            shambase::format("patch {}, Rghost = {}", p.id_patch, tmp));
+                        return tmp;
+#else
                         return pdat.get_field<flt>(ihpart).compute_max() * h_evol_max * Rkern;
+#endif
                     } else {
                         return shambase::VectorProperties<flt>::get_min();
                     }
diff --git a/src/shammodels/sph/src/BasicSPHGhosts.cpp b/src/shammodels/sph/src/BasicSPHGhosts.cpp
index 6b6357761c..8f5e1a164f 100644
--- a/src/shammodels/sph/src/BasicSPHGhosts.cpp
+++ b/src/shammodels/sph/src/BasicSPHGhosts.cpp
@@ -560,13 +560,13 @@ auto BasicSPHGhostHandler<vec>::gen_id_table_interfaces(GeneratorMap &&gen)
 
     for (auto &[k, v] : send_count_stats) {
         if (v > 0.2) {
-            warn_log += shambase::format("\n    patch {} high interf/patch volume: {}", k, v);
+            // warn_log += shambase::format("\n    patch {} high interf/patch volume: {}", k, v);
             has_warn = true;
         }
     }
 
     if (has_warn && shamcomm::world_rank() == 0) {
-        warn_log = "\n    This can lead to high mpi "
+        warn_log = "\n    High interf/patch volume. This can lead to high mpi "
                    "overhead, try to increase the patch split crit"
                    + warn_log;
     }
diff --git a/src/shamrock/include/shamrock/scheduler/SerialPatchTree.hpp b/src/shamrock/include/shamrock/scheduler/SerialPatchTree.hpp
index 8449755de1..b2decdfb4b 100644
--- a/src/shamrock/include/shamrock/scheduler/SerialPatchTree.hpp
+++ b/src/shamrock/include/shamrock/scheduler/SerialPatchTree.hpp
@@ -269,10 +269,13 @@ class SerialPatchTree {
         sycl::queue &queue,
         shamrock::patch::PatchField<T> pfield,
         Func &&reducer) {
+        __shamrock_stack_entry();
+
         shamrock::patch::PatchtreeField<T> ptfield;
         ptfield.allocate(get_element_count());
 
         {
+            __shamrock_stack_entry();
             sycl::host_accessor lpid{
                 shambase::get_check_ref(linked_patch_ids_buf), sycl::read_only};
             sycl::host_accessor tree_field{
@@ -280,6 +283,8 @@ class SerialPatchTree {
 
             // init reduction
             std::unordered_map<u64, u64> &idp_to_gid = sched.patch_list.id_patch_to_global_idx;
+
+#pragma omp parallel for
             for (u64 idx = 0; idx < get_element_count(); idx++) {
                 tree_field[idx] = (lpid[idx] != u64_max) ? pfield.get(lpid[idx]) : T();
             }
diff --git a/src/shamrock/include/shamrock/scheduler/loadbalance/LoadBalanceStrategy.hpp b/src/shamrock/include/shamrock/scheduler/loadbalance/LoadBalanceStrategy.hpp
index c429d06f4e..db6347a4ba 100644
--- a/src/shamrock/include/shamrock/scheduler/loadbalance/LoadBalanceStrategy.hpp
+++ b/src/shamrock/include/shamrock/scheduler/loadbalance/LoadBalanceStrategy.hpp
@@ -222,7 +222,8 @@ namespace shamrock::scheduler::details {
     inline LBMetric compute_LB_metric(
         const std::vector<TileWithLoad<Torder, Tweight>> &lb_vector,
         const std::vector<i32> &new_owners,
-        i32 world_size) {
+        i32 world_size,
+        f64 strategy_weight) {
 
         std::vector<u64> load_per_node(world_size, 0);
 
@@ -250,7 +251,11 @@ namespace shamrock::scheduler::details {
         }
         var /= world_size;
 
-        return {min, max, avg, sycl::sqrt(var)};
+        return {
+            min * strategy_weight,
+            max * strategy_weight,
+            avg * strategy_weight,
+            sycl::sqrt(var) * strategy_weight};
     }
 
 } // namespace shamrock::scheduler::details
@@ -270,30 +275,39 @@ namespace shamrock::scheduler {
         std::vector<TileWithLoad<Torder, Tweight>> &&lb_vector,
         i32 world_size = shamcomm::world_size()) {
 
-        auto tmpres        = details::lb_startegy_parallel_sweep(lb_vector, world_size);
-        auto metric_psweep = details::compute_LB_metric(lb_vector, tmpres, world_size);
+        using namespace details;
 
-        auto tmpres_2      = details::lb_startegy_roundrobin(lb_vector, world_size);
-        auto metric_rrobin = details::compute_LB_metric(lb_vector, tmpres_2, world_size);
+        f64 factor_boost_psweep = 1;
+        auto tmpres             = lb_startegy_parallel_sweep(lb_vector, world_size);
+        auto metric_psweep = compute_LB_metric(lb_vector, tmpres, world_size, factor_boost_psweep);
 
+        // Scale the round robin metric by 0.95 so it wins unless parallel sweep's max load is at
+        // least 5% lower: the increased uniformity will probably offset the cost anyway
+        f64 factor_boost_rrobin = 0.95;
+        auto tmpres_2           = lb_startegy_roundrobin(lb_vector, world_size);
+        auto metric_rrobin
+            = compute_LB_metric(lb_vector, tmpres_2, world_size, factor_boost_rrobin);
+
+        std::string strategy_name = "parallel sweep";
         if (metric_rrobin.max < metric_psweep.max) {
-            tmpres = tmpres_2;
+            tmpres        = tmpres_2;
+            strategy_name = "round robin";
         }
 
         if (shamcomm::world_rank() == 0) {
-            logger::info_ln("LoadBalance", "summary :");
-            logger::info_ln(
-                "LoadBalance",
-                " - strategy \"psweep\" : max =",
-                metric_psweep.max,
-                "min =",
-                metric_psweep.min);
             logger::info_ln(
                 "LoadBalance",
-                " - strategy \"round robin\" : max =",
-                metric_rrobin.max,
-                "min =",
-                metric_rrobin.min);
+                shambase::format(
+                    R"=(Summary (strategy = {0:}):
+ - strategy "psweep"      : max = {1:.1f} min = {2:.1f} factor = {3:}
+ - strategy "round robin" : max = {4:.1f} min = {5:.1f} factor = {6:})=",
+                    strategy_name,
+                    metric_psweep.max,
+                    metric_psweep.min,
+                    factor_boost_psweep,
+                    metric_rrobin.max,
+                    metric_rrobin.min,
+                    factor_boost_rrobin));
         }
         return tmpres;
     }
diff --git a/src/shamrock/src/solvergraph/ExchangeGhostLayer.cpp b/src/shamrock/src/solvergraph/ExchangeGhostLayer.cpp
index c87577db9a..5edb418156 100644
--- a/src/shamrock/src/solvergraph/ExchangeGhostLayer.cpp
+++ b/src/shamrock/src/solvergraph/ExchangeGhostLayer.cpp
@@ -30,6 +30,28 @@ void shamrock::solvergraph::ExchangeGhostLayer::_impl_evaluate_internal() {
     auto &ghost_layer                                   = edges.ghost_layer;
     const shamrock::solvergraph::RankGetter &rank_owner = edges.rank_owner;
 
+#if false
+    std::unordered_map<u64, u64> msg_sizes_send;
+    std::unordered_map<u64, u64> msg_sizes_max_send;
+
+    std::stringstream ss;
+    ss << "Rank " << shamcomm::world_rank() << " is sending "
+       << ghost_layer.patchdatas.get_native().size() << " patches sizes:";
+    for (auto &pdat : ghost_layer.patchdatas.get_native()) {
+        u64 key = rank_owner.get_rank_owner(pdat.first.first);
+        // ss << pdat.first.first << " " << pdat.first.second << " " << pdat.second.get_obj_cnt() <<
+        // "\n";
+        msg_sizes_send[key] += pdat.second.get_obj_cnt();
+        msg_sizes_max_send[key] = std::max(msg_sizes_max_send[key], u64(pdat.second.get_obj_cnt()));
+    }
+    for (auto &[rank, size] : msg_sizes_send) {
+        ss << "\n"
+           << "msg size from rank " << rank << " is " << size << " max is "
+           << msg_sizes_max_send[rank];
+    }
+    shamcomm::logs::raw_ln(ss.str());
+#endif
+
     shambase::DistributedDataShared<shamrock::patch::PatchDataLayer> recv_dat;
 
     shamalgs::collective::serialize_sparse_comm<shamrock::patch::PatchDataLayer>(