diff --git a/examples/benchmarks/sph_weak_scale_test.py b/examples/benchmarks/sph_weak_scale_test.py index 5a0bda2a99..a71226a6b5 100644 --- a/examples/benchmarks/sph_weak_scale_test.py +++ b/examples/benchmarks/sph_weak_scale_test.py @@ -63,7 +63,6 @@ ) cfg.set_boundary_periodic() cfg.set_eos_adiabatic(gamma) - cfg.set_max_neigh_cache_size(int(100e9)) cfg.print_status() model.set_solver_config(cfg) model.init_scheduler(scheduler_split_val, scheduler_merge_val) @@ -102,7 +101,7 @@ model.set_value_in_a_box("uint", "f64", 0, bmin, bmax) - rinj = 8 * dr + rinj = 16 * dr u_inj = 1 model.add_kernel_value("uint", "f64", u_inj, (0, 0, 0), rinj) @@ -116,9 +115,6 @@ model.set_cfl_cour(0.1) model.set_cfl_force(0.1) - model.set_cfl_multipler(1e-4) - model.set_cfl_mult_stiffness(1e6) - shamrock.backends.reset_mem_info_max() # converge smoothing length and compute initial dt @@ -129,8 +125,11 @@ res_cnts = [] res_system_metrics = [] - for i in range(5): + for i in range(10): shamrock.sys.mpi_barrier() + + # To replay the same step + model.set_next_dt(0.0) model.timestep() tmp_res_rate, tmp_res_cnt, tmp_system_metrics = ( diff --git a/src/shamalgs/src/collective/sparse_exchange.cpp b/src/shamalgs/src/collective/sparse_exchange.cpp index 6e6625544d..970d35d051 100644 --- a/src/shamalgs/src/collective/sparse_exchange.cpp +++ b/src/shamalgs/src/collective/sparse_exchange.cpp @@ -56,6 +56,7 @@ namespace shamalgs::collective { /// fetch u64_2 from global message data std::vector fetch_global_message_data( const std::vector &messages_send) { + __shamrock_stack_entry(); std::vector local_data = std::vector(messages_send.size()); @@ -84,6 +85,7 @@ namespace shamalgs::collective { /// decode message to get message std::vector decode_all_message(const std::vector &global_data) { + __shamrock_stack_entry(); std::vector message_all(global_data.size()); for (u64 i = 0; i < global_data.size(); i++) { message_all[i] = unpack(global_data[i]); @@ -94,6 +96,7 @@ namespace shamalgs::collective { /// compute message tags void compute_tags(std::vector &message_all) { + __shamrock_stack_entry(); std::vector tag_map(shamcomm::world_size(), 0); diff --git a/src/shammodels/gsph/src/modules/GSPHGhostHandler.cpp b/src/shammodels/gsph/src/modules/GSPHGhostHandler.cpp index 070fc6f858..531a6e7508 100644 --- a/src/shammodels/gsph/src/modules/GSPHGhostHandler.cpp +++ b/src/shammodels/gsph/src/modules/GSPHGhostHandler.cpp @@ -344,13 +344,13 @@ auto GSPHGhostHandler::gen_id_table_interfaces(GeneratorMap &&gen) for (auto &[k, v] : send_count_stats) { if (v > 0.2) { - warn_log += shambase::format("\n patch {} high interf/patch volume: {}", k, v); + // warn_log += shambase::format("\n patch {} high interf/patch volume: {}", k, v); has_warn = true; } } if (has_warn && shamcomm::world_rank() == 0) { - warn_log = "\n This can lead to high mpi " + warn_log = "\n High interf/patch volume. This can lead to high mpi " "overhead, try to increase the patch split crit" + warn_log; } diff --git a/src/shammodels/sph/include/shammodels/sph/SPHUtilities.hpp b/src/shammodels/sph/include/shammodels/sph/SPHUtilities.hpp index af5f17a9bd..80dacb3af9 100644 --- a/src/shammodels/sph/include/shammodels/sph/SPHUtilities.hpp +++ b/src/shammodels/sph/include/shammodels/sph/SPHUtilities.hpp @@ -86,7 +86,14 @@ namespace shammodels::sph { PatchField interactR_patch = sched.map_owned_to_patch_field_simple( [&](const Patch p, PatchDataLayer &pdat) -> flt { if (!pdat.is_empty()) { +#if false + auto tmp = pdat.get_field(ihpart).compute_max() * h_evol_max * Rkern; + shamcomm::logs::raw_ln( + shambase::format("patch {}, Rghost = {}", p.id_patch, tmp)); + return tmp; +#else return pdat.get_field(ihpart).compute_max() * h_evol_max * Rkern; +#endif } else { return shambase::VectorProperties::get_min(); } diff --git a/src/shammodels/sph/src/BasicSPHGhosts.cpp b/src/shammodels/sph/src/BasicSPHGhosts.cpp index 6b6357761c..8f5e1a164f 100644 --- a/src/shammodels/sph/src/BasicSPHGhosts.cpp +++ b/src/shammodels/sph/src/BasicSPHGhosts.cpp @@ -560,13 +560,13 @@ auto BasicSPHGhostHandler::gen_id_table_interfaces(GeneratorMap &&gen) for (auto &[k, v] : send_count_stats) { if (v > 0.2) { - warn_log += shambase::format("\n patch {} high interf/patch volume: {}", k, v); + // warn_log += shambase::format("\n patch {} high interf/patch volume: {}", k, v); has_warn = true; } } if (has_warn && shamcomm::world_rank() == 0) { - warn_log = "\n This can lead to high mpi " + warn_log = "\n High interf/patch volume. This can lead to high mpi " "overhead, try to increase the patch split crit" + warn_log; } diff --git a/src/shamrock/include/shamrock/scheduler/SerialPatchTree.hpp b/src/shamrock/include/shamrock/scheduler/SerialPatchTree.hpp index 8449755de1..b2decdfb4b 100644 --- a/src/shamrock/include/shamrock/scheduler/SerialPatchTree.hpp +++ b/src/shamrock/include/shamrock/scheduler/SerialPatchTree.hpp @@ -269,10 +269,13 @@ class SerialPatchTree { sycl::queue &queue, shamrock::patch::PatchField pfield, Func &&reducer) { + __shamrock_stack_entry(); + shamrock::patch::PatchtreeField ptfield; ptfield.allocate(get_element_count()); { + __shamrock_stack_entry(); sycl::host_accessor lpid{ shambase::get_check_ref(linked_patch_ids_buf), sycl::read_only}; sycl::host_accessor tree_field{ @@ -280,6 +283,8 @@ class SerialPatchTree { // init reduction std::unordered_map &idp_to_gid = sched.patch_list.id_patch_to_global_idx; + +#pragma omp parallel for for (u64 idx = 0; idx < get_element_count(); idx++) { tree_field[idx] = (lpid[idx] != u64_max) ? pfield.get(lpid[idx]) : T(); } diff --git a/src/shamrock/include/shamrock/scheduler/loadbalance/LoadBalanceStrategy.hpp b/src/shamrock/include/shamrock/scheduler/loadbalance/LoadBalanceStrategy.hpp index c429d06f4e..db6347a4ba 100644 --- a/src/shamrock/include/shamrock/scheduler/loadbalance/LoadBalanceStrategy.hpp +++ b/src/shamrock/include/shamrock/scheduler/loadbalance/LoadBalanceStrategy.hpp @@ -222,7 +222,8 @@ namespace shamrock::scheduler::details { inline LBMetric compute_LB_metric( const std::vector> &lb_vector, const std::vector &new_owners, - i32 world_size) { + i32 world_size, + f64 strategy_weight) { std::vector load_per_node(world_size, 0); @@ -250,7 +251,11 @@ namespace shamrock::scheduler::details { } var /= world_size; - return {min, max, avg, sycl::sqrt(var)}; + return { + min * strategy_weight, + max * strategy_weight, + avg * strategy_weight, + sycl::sqrt(var) * strategy_weight}; } } // namespace shamrock::scheduler::details @@ -270,30 +275,39 @@ namespace shamrock::scheduler { std::vector> &&lb_vector, i32 world_size = shamcomm::world_size()) { - auto tmpres = details::lb_startegy_parallel_sweep(lb_vector, world_size); - auto metric_psweep = details::compute_LB_metric(lb_vector, tmpres, world_size); + using namespace details; - auto tmpres_2 = details::lb_startegy_roundrobin(lb_vector, world_size); - auto metric_rrobin = details::compute_LB_metric(lb_vector, tmpres_2, world_size); + f64 factor_boost_psweep = 1; + auto tmpres = lb_startegy_parallel_sweep(lb_vector, world_size); + auto metric_psweep = compute_LB_metric(lb_vector, tmpres, world_size, factor_boost_psweep); + // We boost the round robin strategy to favor it if the difference is around 5% since the + // increased uniformity will probably offset the cost anyway + f64 factor_boost_rrobin = 0.95; + auto tmpres_2 = lb_startegy_roundrobin(lb_vector, world_size); + auto metric_rrobin + = compute_LB_metric(lb_vector, tmpres_2, world_size, factor_boost_rrobin); + + std::string strategy_name = "parallel sweep"; if (metric_rrobin.max < metric_psweep.max) { - tmpres = tmpres_2; + tmpres = tmpres_2; + strategy_name = "round robin"; } if (shamcomm::world_rank() == 0) { - logger::info_ln("LoadBalance", "summary :"); - logger::info_ln( - "LoadBalance", - " - strategy \"psweep\" : max =", - metric_psweep.max, - "min =", - metric_psweep.min); logger::info_ln( "LoadBalance", - " - strategy \"round robin\" : max =", - metric_rrobin.max, - "min =", - metric_rrobin.min); + shambase::format( + R"=(Summary (strategy = {0:}): + - strategy "psweep" : max = {1:.1f} min = {2:.1f} factor = {3:} + - strategy "round robin" : max = {4:.1f} min = {5:.1f} factor = {6:})=", + strategy_name, + metric_psweep.max, + metric_psweep.min, + factor_boost_psweep, + metric_rrobin.max, + metric_rrobin.min, + factor_boost_rrobin)); } return tmpres; } diff --git a/src/shamrock/src/solvergraph/ExchangeGhostLayer.cpp b/src/shamrock/src/solvergraph/ExchangeGhostLayer.cpp index c87577db9a..5edb418156 100644 --- a/src/shamrock/src/solvergraph/ExchangeGhostLayer.cpp +++ b/src/shamrock/src/solvergraph/ExchangeGhostLayer.cpp @@ -30,6 +30,28 @@ void shamrock::solvergraph::ExchangeGhostLayer::_impl_evaluate_internal() { auto &ghost_layer = edges.ghost_layer; const shamrock::solvergraph::RankGetter &rank_owner = edges.rank_owner; +#if false + std::unordered_map msg_sizes_send; + std::unordered_map msg_sizes_max_send; + + std::stringstream ss; + ss << "Rank " << shamcomm::world_rank() << " is sending " + << ghost_layer.patchdatas.get_native().size() << " patches sizes:"; + for (auto &pdat : ghost_layer.patchdatas.get_native()) { + u64 key = rank_owner.get_rank_owner(pdat.first.first); + // ss << pdat.first.first << " " << pdat.first.second << " " << pdat.second.get_obj_cnt() << + // "\n"; + msg_sizes_send[key] += pdat.second.get_obj_cnt(); + msg_sizes_max_send[key] = std::max(msg_sizes_max_send[key], u64(pdat.second.get_obj_cnt())); + } + for (auto &[rank, size] : msg_sizes_send) { + ss << "\n" + << "msg size from rank " << rank << " is " << size << " max is " + << msg_sizes_max_send[rank]; + } + shamcomm::logs::raw_ln(ss.str()); +#endif + shambase::DistributedDataShared recv_dat; shamalgs::collective::serialize_sparse_comm(