From b5328282045f584ccf05e86d1014a6740f48e135 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Timoth=C3=A9e=20David--Cl=C3=A9ris?= Date: Sat, 21 Mar 2026 19:22:27 +0100 Subject: [PATCH 1/2] [Scheduler] omp parallelize LB --- .../loadbalance/LoadBalanceStrategy.hpp | 26 +++++--- .../src/scheduler/HilbertLoadBalance.cpp | 61 ++++++++----------- 2 files changed, 45 insertions(+), 42 deletions(-) diff --git a/src/shamrock/include/shamrock/scheduler/loadbalance/LoadBalanceStrategy.hpp b/src/shamrock/include/shamrock/scheduler/loadbalance/LoadBalanceStrategy.hpp index adf5def7b9..c429d06f4e 100644 --- a/src/shamrock/include/shamrock/scheduler/loadbalance/LoadBalanceStrategy.hpp +++ b/src/shamrock/include/shamrock/scheduler/loadbalance/LoadBalanceStrategy.hpp @@ -41,6 +41,8 @@ namespace shamrock::scheduler::details { u64 index; i32 new_owner; + LoadBalancedTile() = default; + LoadBalancedTile(TileWithLoad in, u64 inindex) : ordering_val(in.ordering_val), load_value(in.load_value), index(inindex) {} }; @@ -76,9 +78,10 @@ namespace shamrock::scheduler::details { using LBTile = TileWithLoad; using LBTileResult = details::LoadBalancedTile; - std::vector res; + std::vector res(lb_vector.size()); +#pragma omp parallel for for (u64 i = 0; i < lb_vector.size(); i++) { - res.push_back(LBTileResult{lb_vector[i], i}); + res[i] = LBTileResult{lb_vector[i], i}; } // apply the ordering @@ -94,7 +97,9 @@ namespace shamrock::scheduler::details { double target_datacnt = double(res[res.size() - 1].accumulated_load_value) / wsize; - for (LBTileResult &tile : res) { +#pragma omp parallel for + for (u64 i = 0; i < res.size(); i++) { + LBTileResult &tile = res[i]; tile.new_owner = (target_datacnt == 0) ? 
0 @@ -102,7 +107,8 @@ namespace shamrock::scheduler::details { i32(tile.accumulated_load_value / target_datacnt), 0, wsize - 1); } - if (shamcomm::world_rank() == 0) { + if (shamcomm::world_rank() == 0 + && shamcomm::logs::get_loglevel() >= shamcomm::logs::log_debug) { for (LBTileResult t : res) { shamlog_debug_ln( "HilbertLoadBalance", @@ -141,9 +147,10 @@ namespace shamrock::scheduler::details { using LBTile = TileWithLoad; using LBTileResult = details::LoadBalancedTile; - std::vector res; + std::vector res(lb_vector.size()); +#pragma omp parallel for for (u64 i = 0; i < lb_vector.size(); i++) { - res.push_back(LBTileResult{lb_vector[i], i}); + res[i] = LBTileResult{lb_vector[i], i}; } // apply the ordering @@ -160,7 +167,9 @@ namespace shamrock::scheduler::details { double target_datacnt = double(res[res.size() - 1].accumulated_load_value) / wsize; - for (LBTileResult &tile : res) { +#pragma omp parallel for + for (u64 i = 0; i < res.size(); i++) { + LBTileResult &tile = res[i]; tile.new_owner = (target_datacnt == 0) ? 
0 @@ -168,7 +177,8 @@ namespace shamrock::scheduler::details { i32(tile.accumulated_load_value / target_datacnt), 0, wsize - 1); } - if (shamcomm::world_rank() == 0) { + if (shamcomm::world_rank() == 0 + && shamcomm::logs::get_loglevel() >= shamcomm::logs::log_debug) { for (LBTileResult t : res) { shamlog_debug_ln( "HilbertLoadBalance", diff --git a/src/shamrock/src/scheduler/HilbertLoadBalance.cpp b/src/shamrock/src/scheduler/HilbertLoadBalance.cpp index c37ea5a26b..2d988ac626 100644 --- a/src/shamrock/src/scheduler/HilbertLoadBalance.cpp +++ b/src/shamrock/src/scheduler/HilbertLoadBalance.cpp @@ -25,28 +25,13 @@ inline void apply_node_patch_packing( std::vector &global_patch_list, std::vector &new_owner_table) { - using namespace shamrock::patch; - sycl::buffer new_owner(new_owner_table.data(), new_owner_table.size()); - sycl::buffer patch_buf(global_patch_list.data(), global_patch_list.size()); - sycl::range<1> range{global_patch_list.size()}; - - // pack nodes - shamsys::instance::get_alt_queue() - .submit([&](sycl::handler &cgh) { - auto ptch = patch_buf.get_access(cgh); - // auto pdt = dt_buf.get_access(cgh); - auto chosen_node = new_owner.get_access(cgh); - - cgh.parallel_for(range, [=](sycl::item<1> item) { - u64 i = (u64) item.get_id(0); - - if (ptch[i].pack_node_index != u64_max) { - chosen_node[i] = chosen_node[ptch[i].pack_node_index]; - } - }); - }) - .wait(); +#pragma omp parallel for + for (size_t i = 0; i < global_patch_list.size(); i++) { + if (global_patch_list[i].pack_node_index != u64_max) { + new_owner_table[i] = new_owner_table[global_patch_list[i].pack_node_index]; + } + } } namespace shamrock::scheduler { @@ -102,17 +87,17 @@ namespace shamrock::scheduler { // TODO add bool for optional print verbosity // std::cout << i << " : " << old_owner << " -> " << new_owner << std::endl; + if (new_owner != old_owner) { - using ChangeOp = LoadBalancingChangeList::ChangeOp; + using ChangeOp = LoadBalancingChangeList::ChangeOp; - ChangeOp op; - 
op.patch_idx = i; - op.patch_id = global_patch_list[i].id_patch; - op.rank_owner_new = new_owner; - op.rank_owner_old = old_owner; - op.tag_comm = tags_it_node[old_owner]; + ChangeOp op; + op.patch_idx = i; + op.patch_id = global_patch_list[i].id_patch; + op.rank_owner_new = new_owner; + op.rank_owner_old = old_owner; + op.tag_comm = tags_it_node[old_owner]; - if (new_owner != old_owner) { change_list.change_ops.push_back(op); tags_it_node[old_owner]++; } @@ -126,23 +111,31 @@ namespace shamrock::scheduler { f64 avg = 0; f64 var = 0; - for (i32 nid = 0; nid < shamcomm::world_size(); nid++) { + i32 world_size = shamcomm::world_size(); + +#pragma omp parallel for reduction(min : min) reduction(max : max) reduction(+ : avg) + for (i32 nid = 0; nid < world_size; nid++) { f64 val = load_per_node[nid]; min = sycl::fmin(min, val); max = sycl::fmax(max, val); avg += val; + } - if (shamcomm::world_rank() == 0) { + if (shamcomm::world_rank() == 0 + && shamcomm::logs::get_loglevel() >= shamcomm::logs::log_debug) { + for (i32 nid = 0; nid < world_size; nid++) { shamlog_debug_ln( "HilbertLoadBalance", "node :", nid, "load :", load_per_node[nid]); } } - avg /= shamcomm::world_size(); - for (i32 nid = 0; nid < shamcomm::world_size(); nid++) { + avg /= world_size; + +#pragma omp parallel for reduction(+ : var) + for (i32 nid = 0; nid < world_size; nid++) { f64 val = load_per_node[nid]; var += (val - avg) * (val - avg); } - var /= shamcomm::world_size(); + var /= world_size; if (shamcomm::world_rank() == 0) { std::string str = "Loadbalance stats : \n"; From a4110edb8a9bde4f50f58e719fa4b125016bded6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Timoth=C3=A9e=20David--Cl=C3=A9ris?= Date: Sat, 21 Mar 2026 20:24:10 +0100 Subject: [PATCH 2/2] comment --- src/shamrock/src/scheduler/HilbertLoadBalance.cpp | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/shamrock/src/scheduler/HilbertLoadBalance.cpp b/src/shamrock/src/scheduler/HilbertLoadBalance.cpp index 2d988ac626..db0c56b303 100644 
--- a/src/shamrock/src/scheduler/HilbertLoadBalance.cpp +++ b/src/shamrock/src/scheduler/HilbertLoadBalance.cpp @@ -26,6 +26,9 @@ inline void apply_node_patch_packing( std::vector &global_patch_list, std::vector &new_owner_table) { + // Note that there seems to be a data race here + // However, this should never happen, as a packing index will only point to a patch without + // packing. As such, the data we are accessing should never be modified during this loop. #pragma omp parallel for + for (size_t i = 0; i < global_patch_list.size(); i++) { + if (global_patch_list[i].pack_node_index != u64_max) {