diff --git a/input.example.toml b/input.example.toml
index f890143a..5d0af7f5 100644
--- a/input.example.toml
+++ b/input.example.toml
@@ -28,6 +28,29 @@
   # @example: [2, 2, 2] (total of 8 domains)
   decomposition = ""
 
+  [simulation.domain.load_balancing]
+    # Enable load balancing
+    # @type: bool
+    # @default: false
+    enable = ""
+    # Interval (in timesteps) between load balancing steps
+    # @type: int
+    # @default: 0 (disabled)
+    interval = ""
+    # Dimensions to balance
+    # @type: array
+    # @default: []
+    # @note: An empty array disables load balancing
+    dimensions = ""
+    # Maximum number of boundary-negotiation iterations per balancing step
+    # @type: int
+    # @default: 10
+    max_iterations = ""
+    # Tolerance for the residual load imbalance
+    # @type: float
+    # @default: 0.1
+    tolerance = ""
+
 [grid]
   # Spatial resolution of the grid
   # @required
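For reference, a filled-in configuration could look like the following (the values are illustrative, not defaults):

```toml
[simulation.domain.load_balancing]
  enable         = true
  interval       = 100    # rebalance every 100 timesteps
  dimensions     = [1]    # balance along x1 only
  max_iterations = 10
  tolerance      = 0.1
```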
diff --git a/pgens/shock/pgen.hpp b/pgens/shock/pgen.hpp
index b8289c9a..f817a54f 100644
--- a/pgens/shock/pgen.hpp
+++ b/pgens/shock/pgen.hpp
@@ -327,5 +327,169 @@ namespace user {
         inj_box);
     }
   };
+
+    // update the domain decomposition if needed
+    // todo: implement in the main code like CustomPostStep
+    void CustomUpdateDomain(std::size_t             step,
+                            const SimulationParams& params,
+                            Domain&                 local_domain,
+                            Domain&                 new_domain,
+                            Domain&                 global_domain) {
+
+      // check whether rebalancing is due on this step
+      // ToDo: read the parameter once into a global/member variable
+      if (step % params.template get<std::size_t>(
+                   "setup.domain_decomposition_frequency") != 0) {
+        return;
+      }
+
+      // sizes and offsets of the local and global domains along x1
+      const auto local_size   = local_domain.mesh.n_active()[in::x1];
+      const auto local_offset = local_domain.offset_ncells()[in::x1];
+      const auto global_size  = global_domain.mesh.n_active()[in::x1];
+
+      // global number density field along x1
+      std::vector<index_t> Nx_global(global_size, 0);
+
+      /*
+        Option 1: Use the built-in particle counting kernel to compute the
+        number density field and perform an MPI allreduce to obtain the
+        global number density field. Then compute the new domain boundaries
+        from the global field and reshuffle the particles accordingly.
+      */
+      // tuple_t<std::size_t, M::Dim> local_cells { 0 }, global_x_min { 0 }, global_x_max { 0 };
+      // for (auto d = 0; d < M::Dim; ++d) {
+      //   local_cells[d]  = local_domain.mesh.n_active(d);
+      //   global_x_min[d] = local_domain.offset_ncells(d);
+      //   global_x_max[d] = local_domain.mesh.n_active(d) + local_domain.offset_ncells(d);
+      // }
+
+      // // compute the number density field
+      // array_t<real_t*> NumberOfParticles("num_particles", local_cells);
+      // auto scatter_buff = Kokkos::Experimental::create_scatter_view(NumberOfParticles);
+      // for (const auto& sp : specs) {
+      //   auto& prtl_spec = prtl_species[sp - 1];
+      //   // clang-format off
+      //   Kokkos::parallel_for(
+      //     "ComputeMoments",
+      //     prtl_spec.rangeActiveParticles(),
+      //     kernel::ParticleMoments_kernel({}, scatter_buff, buff_idx,
+      //       prtl_spec.i1, prtl_spec.i2, prtl_spec.i3,
+      //       prtl_spec.dx1, prtl_spec.dx2, prtl_spec.dx3,
+      //       prtl_spec.ux1, prtl_spec.ux2, prtl_spec.ux3,
+      //       prtl_spec.phi, prtl_spec.weight, prtl_spec.tag,
+      //       prtl_spec.mass(), prtl_spec.charge(),
+      //       false,
+      //       mesh.metric, mesh.flds_bc(),
+      //       ni2, ONE, 0));
+      //   // clang-format on
+      // }
+      // Kokkos::Experimental::contribute(NumberOfParticles, scatter_buff);
+
+      // // compute the particle profile along x1
+      // std::vector<index_t> Nx(global_size, 0);
+      // for (auto i = 0; i < local_size; ++i) {
+      //   for (auto d = 0u; d < M::Dim; ++d) {
+      //     // todo: sum over the other dimensions
+      //     Nx[local_offset + i] += buffer(i, j, buff_idx);
+      //   }
+      // }
+      // // perform an MPI allreduce to get the global Nx
+      // MPI_Allreduce(Nx.data(), Nx_global.data(), static_cast<int>(global_size),
+      //               mpi::get_type<index_t>(), MPI_SUM, MPI_COMM_WORLD);
+
+      /*
+        Option 2: Loop over the particles and compute the number density
+        field manually. Then perform an MPI allreduce to obtain the global
+        number density field, compute the new domain boundaries from it, and
+        reshuffle the particles according to the new boundaries.
+      */
+      // total number of particles in each cell along x1
+      array_t<index_t*> NumberOfParticles("num_particles", local_size);
+      // loop over the particle species
+      for (auto s { 0u }; s < local_domain.species.size(); ++s) {
+        // get the particle properties
+        auto& species = local_domain.species[s];
+        auto  i1      = species.i1;
+        auto  tag     = species.tag;
+
+        auto NumParts_scatter = Kokkos::Experimental::create_scatter_view(
+          NumberOfParticles);
+        Kokkos::parallel_for(
+          "ComputePPC",
+          species.rangeActiveParticles(),
+          Lambda(index_t p) {
+            if (tag(p) != ParticleTag::alive) {
+              return;
+            }
+            auto NumPart_acc = NumParts_scatter.access();
+            NumPart_acc(i1(p)) += 1;
+          });
+        Kokkos::Experimental::contribute(NumberOfParticles, NumParts_scatter);
+      }
+
+      // copy the per-cell counts to the host
+      auto NumberOfParticles_h = Kokkos::create_mirror_view(NumberOfParticles);
+      Kokkos::deep_copy(NumberOfParticles_h, NumberOfParticles);
+
+      // local contribution to the global number density field along x1
+      std::vector<index_t> Nx_local(global_size, 0);
+      for (std::size_t i = 0; i < local_size; ++i) {
+        Nx_local[i + local_offset] = NumberOfParticles_h(i);
+      }
+      // sum over all ranks (mpi::get_type<index_t>() is the matching MPI datatype)
+      MPI_Allreduce(Nx_local.data(), Nx_global.data(), static_cast<int>(global_size),
+                    mpi::get_type<index_t>(), MPI_SUM, MPI_COMM_WORLD);
+
+      // total particle load
+      npart_t total_N = 0;
+      for (std::size_t i = 0; i < global_size; ++i) {
+        total_N += Nx_global[i];
+      }
+
+      // number of ranks along x1 ...
+      auto N_1_ranks = global_domain.ndomains_per_dim()[in::x1];
+      // ... and sharing each x1-slab: the product (not the sum) over the
+      // other dimensions; in 1D the product stays 1, so no zero-guard needed
+      auto N_23_ranks = 1u;
+      for (auto d = 1u; d < M::Dim; ++d) {
+        N_23_ranks *= global_domain.ndomains_per_dim()[d];
+      }
+
+      // target load per rank, allowing for the configured imbalance tolerance
+      real_t tolerance = params.template get<real_t>(
+        "simulation.domain.load_balancing.tolerance");
+      index_t target_N = static_cast<index_t>(
+        total_N / (N_1_ranks * N_23_ranks) * (ONE + tolerance));
+
+      // new domain boundaries along x1
+      std::vector<index_t> bound_start(N_1_ranks, 0);
+      std::vector<index_t> bound_end(N_1_ranks, 0);
+
+      bound_start[0] = 0;
+      for (auto r = 0u; r < N_1_ranks - 1; ++r) {
+        real_t cum_N = 0;
+        for (auto i = bound_start[r]; i < global_size; ++i) {
+          cum_N += static_cast<real_t>(Nx_global[i]) / N_23_ranks;
+          if (cum_N >= target_N) {
+            bound_end[r] = i;
+            // enforce a minimum domain width of 5 cells
+            index_t Ncells = bound_end[r] - bound_start[r] + 1;
+            if (Ncells < 5) {
+              bound_end[r] = bound_start[r] + 5;
+            }
+            bound_start[r + 1] = bound_end[r] + 1;
+            break;
+          }
+        }
+      }
+      // the rest of the domain goes to the last rank
+      bound_end[N_1_ranks - 1] = global_size - 1;
+
+      // maximum load imbalance after the reshuffling (diagnostic)
+      index_t max_N = 0;
+      for (auto r = 0u; r < N_1_ranks; ++r) {
+        index_t N_r = 0;
+        for (auto i = bound_start[r]; i <= bound_end[r]; ++i) {
+          N_r += Nx_global[i];
+        }
+        if (N_r > max_N) {
+          max_N = N_r;
+        }
+      }
+      real_t imbalance = static_cast<real_t>(max_N) / (total_N / N_1_ranks);
+
+      // todo: reshuffle the particles according to the new domain boundaries
+    }
 } // namespace user
 #endif
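The boundary search in `CustomUpdateDomain` is easier to see in isolation. The sketch below implements the same greedy prefix-sum cut in plain C++ (no Entity types; all names are illustrative), with one refinement over the fixed-target loop above: the fair share is recomputed from the remaining load after each cut, which keeps a single strong peak from starving the later ranks:

```cpp
#include <cstdio>
#include <vector>

// Greedily cut a per-cell load histogram into `nranks` contiguous slabs;
// bounds[r] is the first cell of slab r, bounds[nranks] is one past the end.
std::vector<int> partition_boundaries(const std::vector<long>& load, int nranks) {
  std::vector<int> bounds { 0 };
  long remaining = 0;
  for (auto n : load) { remaining += n; }
  int cell = 0;
  for (int r = 0; r < nranks - 1; ++r) {
    // fair share for the ranks that are still unassigned
    const double target = static_cast<double>(remaining) / (nranks - r);
    double cum = 0.0;
    // leave at least one cell for each remaining rank
    while (cell < (int)load.size() - (nranks - 1 - r) && cum < target) {
      cum += load[cell++];
    }
    bounds.push_back(cell);
    remaining -= (long)cum;
  }
  bounds.push_back((int)load.size());
  return bounds;
}

int main() {
  // a strongly peaked load, e.g. a shock front near cell 4
  const std::vector<long> load { 1, 1, 2, 8, 20, 8, 2, 1, 1, 1 };
  for (auto b : partition_boundaries(load, 3)) {
    std::printf("%d ", b);
  }
  std::printf("\n");
}
```

Compiled with `g++ -std=c++17`, this prints `0 5 6 10`: the slab containing the peak ends up narrow (5 cells, then a single cell), while the quiet tail gets the remaining 4 cells.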
diff --git a/src/engines/engine.hpp b/src/engines/engine.hpp
index 597de517..bce033e6 100644
--- a/src/engines/engine.hpp
+++ b/src/engines/engine.hpp
@@ -250,7 +250,7 @@ namespace ntt {
                          "ParticleBoundaries", "Communications",
                          "Injector",           "Custom",
                          "ParticleSort",       "Output",
-                         "Checkpoint" },
+                         "Checkpoint",         "LoadBalancing" },
        []() {
          Kokkos::fence();
        },
@@ -285,6 +285,14 @@ namespace ntt {
       time += dt;
       ++step;
 
+      const auto lb_enable   = m_params.template get<bool>("simulation.domain.load_balancing.enable");
+      const auto lb_interval = m_params.template get<unsigned int>("simulation.domain.load_balancing.interval");
+      if (lb_enable && lb_interval > 0 && step % lb_interval == 0) {
+        timers.start("LoadBalancing");
+        m_metadomain.BalanceLoad(m_params);
+        timers.stop("LoadBalancing");
+      }
+
       auto print_output     = false;
       auto print_checkpoint = false;
 #if defined(OUTPUT_ENABLED)
diff --git a/src/framework/CMakeLists.txt b/src/framework/CMakeLists.txt
index df2bf4c6..826ce216 100644
--- a/src/framework/CMakeLists.txt
+++ b/src/framework/CMakeLists.txt
@@ -56,6 +56,7 @@ set(SOURCES
     ${SRC_DIR}/domain/metadomain.cpp
     ${SRC_DIR}/domain/metadomain_comm.cpp
     ${SRC_DIR}/domain/metadomain_sort.cpp
+    ${SRC_DIR}/domain/metadomain_lb.cpp
     ${SRC_DIR}/domain/metadomain_stats.cpp
     ${SRC_DIR}/containers/particles.cpp
     ${SRC_DIR}/containers/particles_sort.cpp
diff --git a/src/framework/domain/metadomain.h b/src/framework/domain/metadomain.h
index d04433fc..29154c35 100644
--- a/src/framework/domain/metadomain.h
+++ b/src/framework/domain/metadomain.h
@@ -106,6 +106,8 @@ namespace ntt {
                                const SimulationParams&,
                                Domain&) const;
 
+    void BalanceLoad(const SimulationParams&);
+
     /**
      * @param global_ndomains total number of domains
      * @param global_decomposition decomposition of the global domain
diff --git a/src/framework/domain/metadomain_lb.cpp b/src/framework/domain/metadomain_lb.cpp
new file mode 100644
index 00000000..90289a44
--- /dev/null
+++ b/src/framework/domain/metadomain_lb.cpp
@@ -0,0 +1,346 @@
+#include "framework/domain/metadomain.h"
+#include "framework/domain/domain.h"
+#include "framework/specialization_registry.h"
+#include "arch/mpi_tags.h"
+#include "utils/numeric.h"
+#include "utils/reporter.h"
+#include "framework/parameters/parameters.h"
+
+#include <cmath>
+#include <string>
+#include <utility>
+#include <vector>
+
+#if defined(MPI_ENABLED)
+  #include <mpi.h>
+#endif
+
+namespace ntt {
+
+  // load balancing helper based on the 1D julia model: negotiate the single
+  // boundary between the two neighboring slabs [left_start, mid) and
+  // [mid, right_end) so that their loads w1 and w2 even out
+  bool negotiate_boundary_single(const std::vector<double>& N,
+                                 std::vector<int>&          bounds,
+                                 int                        i,
+                                 int                        n_ghost,
+                                 double                     tol) {
+    int left_start = bounds[i];
+    int mid        = bounds[i + 1];
+    int right_end  = bounds[i + 2];
+
+    double w1 = 0;
+    for (int k = left_start; k < mid; ++k) {
+      w1 += N[k];
+    }
+    double w2 = 0;
+    for (int k = mid; k < right_end; ++k) {
+      w2 += N[k];
+    }
+
+    if (std::abs(w1 - w2) <= tol) {
+      return false;
+    }
+
+    // a domain may never shrink below 2 * n_ghost + 1 cells
+    int    L_min      = 2 * n_ghost + 1;
+    double best_diff  = std::abs(w1 - w2);
+    int    best_shift = 0;
+
+    if (w1 > w2) {
+      // move the boundary left, cell by cell, while the imbalance drops
+      int    max_shift        = (mid - left_start) - L_min;
+      double current_transfer = 0.0;
+      for (int s = 1; s <= max_shift; ++s) {
+        current_transfer += N[mid - s];
+        double new_diff = std::abs((w1 - current_transfer) - (w2 + current_transfer));
+        if (new_diff < best_diff) {
+          best_diff  = new_diff;
+          best_shift = s;
+        } else {
+          break;
+        }
+      }
+      if (best_shift > 0) {
+        bounds[i + 1] -= best_shift;
+        return true;
+      }
+    } else if (w2 > w1) {
+      // move the boundary right; mid is inclusive for the right domain
+      int    max_shift        = (right_end - mid) - L_min;
+      double current_transfer = 0.0;
+      for (int s = 1; s <= max_shift; ++s) {
+        current_transfer += N[mid + s - 1];
+        double new_diff = std::abs((w1 + current_transfer) - (w2 - current_transfer));
+        if (new_diff < best_diff) {
+          best_diff  = new_diff;
+          best_shift = s;
+        } else {
+          break;
+        }
+      }
+      if (best_shift > 0) {
+        bounds[i + 1] += best_shift;
+        return true;
+      }
+    }
+    return false;
+  }
+
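+  // worked example (illustrative): with N = {4, 4, 1, 1}, bounds = {0, 3, 4},
+  // i = 0, n_ghost = 0, tol = 0: w1 = 9, w2 = 1; shifting the boundary left
+  // by 2 cells yields loads 4 and 6 -- the best split reachable by this
+  // greedy walk -- so bounds becomes {0, 1, 4} and the call returns true
+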
+  template <SimEngine::type S, class M>
+    requires IsCompatibleWithMetadomain<S, M>
+  void Metadomain<S, M>::BalanceLoad(const SimulationParams& params) {
+    const auto lb_dims = params.template get<std::vector<unsigned int>>(
+      "simulation.domain.load_balancing.dimensions");
+    const auto lb_max_iters = static_cast<int>(params.template get<unsigned int>(
+      "simulation.domain.load_balancing.max_iterations"));
+    const auto lb_tol = static_cast<double>(params.template get<real_t>(
+      "simulation.domain.load_balancing.tolerance"));
+
+    // if no dimensions are specified, skip load balancing
+    if (lb_dims.empty()) {
+      return;
+    }
+
+    auto global_boundaries = std::vector<std::vector<int>> {};
+    auto offset_ncells     = std::vector<std::vector<int>> {};
+    auto new_ncells        = std::vector<std::vector<int>> {};
+
+    // track whether any boundary changed across all dimensions to avoid
+    // unnecessary domain updates
+    bool any_change = false;
+
+    // loop over all dimensions to be load balanced
+    for (int dim : lb_dims) {
+      // ToDo: fallback options for dimensions that should not be load-balanced
+      if (dim < 1 || dim > static_cast<int>(D)) {
+        continue;
+      }
+      int d = dim - 1;
+
+      int nx_domains = g_ndomains_per_dim[d];
+      if (nx_domains < 2) {
+        continue;
+      }
+
+      int                   global_ncells = g_mesh.n_active(static_cast<in>(d));
+      Kokkos::View<double*> d_N("N", global_ncells);
+
+      // 1. Gather the particle histogram natively on the GPU device
+      runOnLocalDomains([&](auto& dom) {
+        for (const auto& sp : dom.species) {
+          if (sp.npart() == 0) {
+            continue;
+          }
+
+          auto global_offset = dom.offset_ncells()[d];
+          auto i_view        = (d == 0) ? sp.i1 : ((d == 1) ? sp.i2 : sp.i3);
+
+          Kokkos::parallel_for(
+            "GatherHistogram",
+            sp.rangeActiveParticles(),
+            KOKKOS_LAMBDA(int p) {
+              int local_cell  = i_view(p) - N_GHOSTS;
+              int global_cell = global_offset + local_cell;
+              if (global_cell >= 0 && global_cell < global_ncells) {
+                Kokkos::atomic_add(&d_N(global_cell), ONE);
+              }
+            });
+        }
+      });
+
+      auto h_N = Kokkos::create_mirror_view(d_N);
+      Kokkos::deep_copy(h_N, d_N);
+      std::vector<double> N(global_ncells, ZERO);
+      for (int i = 0; i < global_ncells; ++i) {
+        N[i] = h_N(i);
+      }
+
+#if defined(MPI_ENABLED)
+      std::vector<double> N_global(global_ncells, ZERO);
+      MPI_Allreduce(N.data(), N_global.data(), global_ncells,
+                    MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
+      N = N_global;
+#endif
+
+      // 2. Set up the bounds vector from the current decomposition
+      std::vector<int> bounds(nx_domains + 1, 0);
+      bounds[0]          = 0;
+      bounds[nx_domains] = global_ncells;
+
+      for (int i = 0; i < nx_domains - 1; ++i) {
+        std::vector<unsigned int> target_idx(D, 0);
+        target_idx[d]            = i;
+        unsigned int flatten_idx = g_domain_offset2index[target_idx];
+        const auto&  dom         = g_subdomains[flatten_idx];
+        // global cell index of the right boundary of this domain
+        bounds[i + 1] = dom.offset_ncells()[d] + dom.mesh.n_active(static_cast<in>(d));
+      }
+
+      std::vector<int> old_bounds = bounds;
+
+      // 3. Negotiate the boundaries using an iterative method
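+      //    (the boundaries are swept in two alternating red/black phases so
+      //    that two adjacent boundaries never move in the same sweep and a
+      //    domain cannot be squeezed from both sides at once)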
+      for (int iter = 1; iter <= lb_max_iters; ++iter) {
+        bool moved_global = false;
+        // RED phase
+        for (int i = 1; i < nx_domains; i += 2) {
+          moved_global |= negotiate_boundary_single(N, bounds, i - 1, N_GHOSTS, lb_tol);
+        }
+        // BLACK phase
+        for (int i = 2; i < nx_domains; i += 2) {
+          moved_global |= negotiate_boundary_single(N, bounds, i - 1, N_GHOSTS, lb_tol);
+        }
+        if (!moved_global) {
+          break;
+        }
+      }
+
+      // check whether any boundary changed
+      for (int i = 0; i <= nx_domains; ++i) {
+        if (bounds[i] != old_bounds[i]) {
+          any_change = true;
+        }
+      }
+
+      // store the updated boundaries for this dimension
+      global_boundaries.push_back(bounds);
+
+      // store the boundary shifts for this dimension (used later when
+      // shifting particles); the first boundary is fixed at 0
+      std::vector<int> offsets(nx_domains, 0);
+      for (int i = 1; i < nx_domains; ++i) {
+        offsets[i] = bounds[i] - old_bounds[i];
+      }
+      offset_ncells.push_back(offsets);
+
+      // store the new number of cells of each domain in this dimension
+      // (used later in the domain updates)
+      std::vector<int> ncells(nx_domains, 0);
+      for (int i = 1; i <= nx_domains; ++i) {
+        ncells[i - 1] = bounds[i] - bounds[i - 1];
+      }
+      new_ncells.push_back(ncells);
+
+      info::Print(
+        fmt::format("Load balancing shifted boundaries in dimension {}", dim),
+        true);
+
+    } // loop over dimensions
+
+    // no changes, skip the domain updates
+    if (!any_change) {
+      return;
+    }
+
+    // ToDo: Mesh update
+
+    // ToDo: Field update
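+    // (the block below sketches the intended slab exchange: a shrinking
+    //  domain keeps its remaining cells and stages the cut-off slab for its
+    //  neighbor, a growing domain receives it; MPI_SEND / MPI_RECV and the
+    //  `{ ... }` field initializers are placeholders, not a final API)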
+    for (unsigned int idx { 0 }; idx < g_ndomains; ++idx) {
+#if defined(MPI_ENABLED)
+      // !TODO: need to change to support multiple domains per rank
+      // assuming ONE local subdomain
+      const auto local = ((int)idx == g_mpi_rank);
+      if (local) {
+        // (assumes every dimension 0 .. D-1 was balanced above and that the
+        //  per-dimension domain index equals the global domain index, i.e.
+        //  an effectively 1D decomposition; ToDo: handle the general case)
+        auto nxnew = std::vector<int>(D, 0);
+        auto nxold = std::vector<int>(D, 0);
+        for (auto d { 0 }; d < (short)D; ++d) {
+          nxnew[d] = new_ncells[d][idx];
+          nxold[d] = nxnew[d] - offset_ncells[d][idx];
+        }
+        // sketch: the sign of the x1 shift decides between send and receive
+        if (offset_ncells[0][idx] < 0) {
+          // domain is shrinking -> its right boundary moves left: keep the
+          // remaining cells and send the cut-off slab to the right neighbor
+          // ToDo: define the fields (old_fields is the current field array)
+          ndfield_t<D, 6> new_fields { ... };
+          ndfield_t<D, 6> send_fields { ... };
+
+          if constexpr (D == Dim::_1D) {
+            Kokkos::deep_copy(new_fields,
+              Kokkos::subview(old_fields, std::make_pair(0, nxnew[0]), Kokkos::ALL));
+            Kokkos::deep_copy(send_fields,
+              Kokkos::subview(old_fields, std::make_pair(nxnew[0], nxold[0]), Kokkos::ALL));
+          } else if constexpr (D == Dim::_2D) {
+            Kokkos::deep_copy(new_fields,
+              Kokkos::subview(old_fields, std::make_pair(0, nxnew[0]),
+                                          std::make_pair(0, nxnew[1]), Kokkos::ALL));
+            Kokkos::deep_copy(send_fields,
+              Kokkos::subview(old_fields, std::make_pair(nxnew[0], nxold[0]),
+                                          std::make_pair(nxnew[1], nxold[1]), Kokkos::ALL));
+          } else if constexpr (D == Dim::_3D) {
+            Kokkos::deep_copy(new_fields,
+              Kokkos::subview(old_fields, std::make_pair(0, nxnew[0]),
+                                          std::make_pair(0, nxnew[1]),
+                                          std::make_pair(0, nxnew[2]), Kokkos::ALL));
+            Kokkos::deep_copy(send_fields,
+              Kokkos::subview(old_fields, std::make_pair(nxnew[0], nxold[0]),
+                                          std::make_pair(nxnew[1], nxold[1]),
+                                          std::make_pair(nxnew[2], nxold[2]), Kokkos::ALL));
+          }
+
+          MPI_SEND(send_fields, ...);
+        } else if (offset_ncells[0][idx] > 0) {
+          // domain is growing -> its left boundary moves left: the old data
+          // shifts to the end, the new slab arrives from the left neighbor
+          ndfield_t<D, 6> new_fields { ... };
+          ndfield_t<D, 6> recv_fields { ... };
+
+          if constexpr (D == Dim::_1D) {
+            Kokkos::deep_copy(
+              Kokkos::subview(new_fields, std::make_pair(nxnew[0] - nxold[0], nxnew[0]), Kokkos::ALL),
+              old_fields);
+          } else if constexpr (D == Dim::_2D) {
+            Kokkos::deep_copy(
+              Kokkos::subview(new_fields, std::make_pair(nxnew[0] - nxold[0], nxnew[0]),
+                                          std::make_pair(nxnew[1] - nxold[1], nxnew[1]), Kokkos::ALL),
+              old_fields);
+          } else if constexpr (D == Dim::_3D) {
+            Kokkos::deep_copy(
+              Kokkos::subview(new_fields, std::make_pair(nxnew[0] - nxold[0], nxnew[0]),
+                                          std::make_pair(nxnew[1] - nxold[1], nxnew[1]),
+                                          std::make_pair(nxnew[2] - nxold[2], nxnew[2]), Kokkos::ALL),
+              old_fields);
+          }
+
+          MPI_RECV(recv_fields, ...);
+          Kokkos::deep_copy(
+            Kokkos::subview(new_fields, std::make_pair(0, nxnew[0] - nxold[0]), Kokkos::ALL),
+            recv_fields);
+        }
+      }
+
+      g_subdomains.back().set_mpi_rank(idx);
+      if (g_subdomains.back().mpi_rank() == g_mpi_rank) {
+        g_local_subdomain_indices.push_back(idx);
+      }
+#endif // MPI_ENABLED
+    }
+
+    // ToDo: Particle update (assuming ONE local subdomain, as above)
+    const auto loc_idx = g_local_subdomain_indices[0];
+    auto&      dom     = g_subdomains[loc_idx];
+    for (size_t s_idx = 0; s_idx < g_species_params.size(); ++s_idx) {
+      auto& sp = dom.species[s_idx];
+
+      // Reset all copied particle tags to 'alive': particles with
+      // send-direction tags from the previous pusher step must not be
+      // re-sent by CommunicateParticles; ShiftParticles below will
+      // re-tag any particle that is now outside the new domain bounds.
+      {
+        auto tag_view = sp.tag;
+        Kokkos::parallel_for(
+          "ResetTags_LB",
+          sp.rangeActiveParticles(),
+          KOKKOS_LAMBDA(int p) {
+            tag_view(p) = ParticleTag::alive;
+          });
+      }
+
+      // local indices shift opposite to the left-boundary displacement
+      int offset_diff1 = offset_ncells[0][loc_idx];
+      if constexpr (D == Dim::_1D) {
+        if (offset_diff1 != 0) {
+          auto i1_view      = sp.i1;
+          auto i1_prev_view = sp.i1_prev;
+          auto tag_view     = sp.tag;
+          int  ni1          = new_ncells[0][loc_idx];
+          Kokkos::parallel_for(
+            "ShiftParticles_1D",
+            sp.rangeActiveParticles(),
+            KOKKOS_LAMBDA(int p) {
+              i1_view(p)      -= offset_diff1;
+              i1_prev_view(p) -= offset_diff1;
+#if defined(MPI_ENABLED)
+              tag_view(p) = mpi::SendTag(tag_view(p), i1_view(p) < 0, i1_view(p) >= ni1);
+#endif
+            });
+        }
+      } else if constexpr (D == Dim::_2D) {
+        int offset_diff2 = offset_ncells[1][loc_idx];
+        if (offset_diff1 != 0 || offset_diff2 != 0) {
+          auto i1_view      = sp.i1;
+          auto i1_prev_view = sp.i1_prev;
+          auto i2_view      = sp.i2;
+          auto i2_prev_view = sp.i2_prev;
+          auto tag_view     = sp.tag;
+          int  ni1          = new_ncells[0][loc_idx];
+          int  ni2          = new_ncells[1][loc_idx];
+          Kokkos::parallel_for(
+            "ShiftParticles_2D",
+            sp.rangeActiveParticles(),
+            KOKKOS_LAMBDA(int p) {
+              i1_view(p)      -= offset_diff1;
+              i2_view(p)      -= offset_diff2;
+              i1_prev_view(p) -= offset_diff1;
+              i2_prev_view(p) -= offset_diff2;
+#if defined(MPI_ENABLED)
+              tag_view(p) = mpi::SendTag(tag_view(p),
+                                         i1_view(p) < 0, i1_view(p) >= ni1,
+                                         i2_view(p) < 0, i2_view(p) >= ni2);
+#endif
+            });
+        }
+      } else if constexpr (D == Dim::_3D) {
+        int offset_diff2 = offset_ncells[1][loc_idx];
+        int offset_diff3 = offset_ncells[2][loc_idx];
+        if (offset_diff1 != 0 || offset_diff2 != 0 || offset_diff3 != 0) {
+          auto i1_view      = sp.i1;
+          auto i2_view      = sp.i2;
+          auto i3_view      = sp.i3;
+          auto i1_prev_view = sp.i1_prev;
+          auto i2_prev_view = sp.i2_prev;
+          auto i3_prev_view = sp.i3_prev;
+          auto tag_view     = sp.tag;
+          int  ni1          = new_ncells[0][loc_idx];
+          int  ni2          = new_ncells[1][loc_idx];
+          int  ni3          = new_ncells[2][loc_idx];
+          Kokkos::parallel_for(
+            "ShiftParticles_3D",
+            sp.rangeActiveParticles(),
+            KOKKOS_LAMBDA(int p) {
+              i1_view(p)      -= offset_diff1;
+              i2_view(p)      -= offset_diff2;
+              i3_view(p)      -= offset_diff3;
+              i1_prev_view(p) -= offset_diff1;
+              i2_prev_view(p) -= offset_diff2;
+              i3_prev_view(p) -= offset_diff3;
+#if defined(MPI_ENABLED)
+              tag_view(p) = mpi::SendTag(tag_view(p),
+                                         i1_view(p) < 0, i1_view(p) >= ni1,
+                                         i2_view(p) < 0, i2_view(p) >= ni2,
+                                         i3_view(p) < 0, i3_view(p) >= ni3);
+#endif
+            });
+        }
+      }
+    }
+
+    CommunicateParticles(dom);
+    CommunicateFields(dom, Comm::E | Comm::B | Comm::J);
+  }
+
+#define METADOMAIN_LB(S, M, D)                                                \
+  template void Metadomain<S, M<D>>::BalanceLoad(const SimulationParams&);
+
+  NTT_FOREACH_SPECIALIZATION(METADOMAIN_LB)
+#undef METADOMAIN_LB
+
+} // namespace ntt
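The slab copies in the sketch rely on `Kokkos::subview` with index ranges (the patch originally called a nonexistent `Kokkos::slice`). Below is a minimal, self-contained illustration of the keep/stage split on a 1D field; the view names and sizes are made up:

```cpp
#include <Kokkos_Core.hpp>
#include <cstdio>
#include <utility>

int main(int argc, char* argv[]) {
  Kokkos::initialize(argc, argv);
  {
    // old domain: 10 cells; after rebalancing it keeps 7 and sends 3
    const int nx_old = 10, nx_new = 7;
    Kokkos::View<double*> old_field("old_field", nx_old);
    Kokkos::deep_copy(old_field, 1.0);

    Kokkos::View<double*> new_field("new_field", nx_new);
    Kokkos::View<double*> send_buf("send_buf", nx_old - nx_new);

    // keep the left part, stage the cut-off right slab for an MPI send
    Kokkos::deep_copy(new_field, Kokkos::subview(old_field, std::make_pair(0, nx_new)));
    Kokkos::deep_copy(send_buf, Kokkos::subview(old_field, std::make_pair(nx_new, nx_old)));

    std::printf("kept %d cells, staged %d cells for sending\n",
                (int)new_field.extent(0), (int)send_buf.extent(0));
  }
  Kokkos::finalize();
  return 0;
}
```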
diff --git a/src/framework/parameters/algorithms.cpp b/src/framework/parameters/algorithms.cpp
index 856d05fc..9ea0dc4a 100644
--- a/src/framework/parameters/algorithms.cpp
+++ b/src/framework/parameters/algorithms.cpp
@@ -125,6 +125,7 @@ namespace ntt {
                                             "larmor_max",
                                             ZERO);
   }
+  }
 
   void Algorithms::setParams(const std::map& extra,
diff --git a/src/framework/parameters/algorithms.h b/src/framework/parameters/algorithms.h
index a496cb7b..fbf49b9a 100644
--- a/src/framework/parameters/algorithms.h
+++ b/src/framework/parameters/algorithms.h
@@ -46,6 +46,7 @@ namespace ntt {
     real_t synchrotron_gamma_rad;
     real_t compton_gamma_rad;
 
+    void read(real_t, const std::map&, const toml::value&);
     void setParams(const std::map&, SimulationParams*) const;
   };
diff --git a/src/framework/parameters/grid.cpp b/src/framework/parameters/grid.cpp
index 9e5d39df..e0176592 100644
--- a/src/framework/parameters/grid.cpp
+++ b/src/framework/parameters/grid.cpp
@@ -385,6 +385,39 @@ namespace ntt {
                                                  "decomposition",
                                                  std::vector<int> { -1, -1, -1 });
 
+    load_balancing_enable         = toml::find_or(toml_data,
+                                                  "simulation",
+                                                  "domain",
+                                                  "load_balancing",
+                                                  "enable",
+                                                  false);
+    load_balancing_interval       = toml::find_or(toml_data,
+                                                  "simulation",
+                                                  "domain",
+                                                  "load_balancing",
+                                                  "interval",
+                                                  0u);
+    load_balancing_dimensions     = toml::find_or<std::vector<unsigned int>>(
+      toml_data,
+      "simulation",
+      "domain",
+      "load_balancing",
+      "dimensions",
+      std::vector<unsigned int> {});
+    load_balancing_max_iterations = toml::find_or(toml_data,
+                                                  "simulation",
+                                                  "domain",
+                                                  "load_balancing",
+                                                  "max_iterations",
+                                                  10u);
+    load_balancing_tolerance      = toml::find_or(toml_data,
+                                                  "simulation",
+                                                  "domain",
+                                                  "load_balancing",
+                                                  "tolerance",
+                                                  0.1);
+
     /* resolution and dimension ------------------------------------------- */
     resolution = toml::find<std::vector<std::size_t>>(toml_data, "grid", "resolution");
     raise::ErrorIf(resolution.size() < 1 || resolution.size() > 3,
@@ -533,6 +566,11 @@ namespace ntt {
   void Grid::setParams(SimulationParams* params) const {
     params->set("simulation.domain.number", number_of_domains);
     params->set("simulation.domain.decomposition", domain_decomposition);
+    params->set("simulation.domain.load_balancing.enable", load_balancing_enable);
+    params->set("simulation.domain.load_balancing.interval", load_balancing_interval);
+    params->set("simulation.domain.load_balancing.dimensions", load_balancing_dimensions);
+    params->set("simulation.domain.load_balancing.max_iterations", load_balancing_max_iterations);
+    params->set("simulation.domain.load_balancing.tolerance", load_balancing_tolerance);
     params->set("grid.resolution", resolution);
     params->set("grid.dim", dim);
diff --git a/src/framework/parameters/grid.h b/src/framework/parameters/grid.h
index 978b7f32..e44554f7 100644
--- a/src/framework/parameters/grid.h
+++ b/src/framework/parameters/grid.h
@@ -56,6 +56,11 @@ namespace ntt {
   struct Grid {
     unsigned int              number_of_domains;
     std::vector<int>          domain_decomposition;
+    bool                      load_balancing_enable;
+    unsigned int              load_balancing_interval;
+    std::vector<unsigned int> load_balancing_dimensions;
+    unsigned int              load_balancing_max_iterations;
+    real_t                    load_balancing_tolerance;
 
     std::vector<std::size_t>  resolution;
     Dimension                 dim;