diff --git a/src/ddx_constants.f90 b/src/ddx_constants.f90
index debbe22c..5ff7370b 100644
--- a/src/ddx_constants.f90
+++ b/src/ddx_constants.f90
@@ -219,6 +219,11 @@ module ddx_constants
     !! computation of forces (gradients). Allocated and used only if
     !! fmm=1.
     integer :: grad_nbasis
+    !> Number of layers in the bisection tree. Defined only if fmm=1.
+    integer :: nlayers
+    !> The first and the last clusters of each layer of the tree. The
+    !! dimension is (2, nlayers). Allocated and used only if fmm=1.
+    integer, allocatable :: layers(:, :)
     !> Inner tolerance for microiterations done when using ddLPB
     real(dp) :: inner_tol
     !> Whether the diagonal of the matrices has to be used in the mvp for
@@ -837,7 +842,11 @@ subroutine constants_geometry_init(params, constants)
             & constants % parent, constants % cnode, constants % rnode, &
             & constants % snode, constants % error_message, &
             & constants % error_flag)
-        if (params % error_flag .ne. 0) return
+        if (constants % error_flag .ne. 0) return
+
+        call compute_layers(constants)
+        if (constants % error_flag .ne. 0) return
+
         ! Get number of far and near admissible pairs
         iwork = 0
         jwork = 1
@@ -1429,6 +1438,73 @@ subroutine mkprec(lmax, nbasis, nsph, ngrid, eps, ui, wgrid, vgrid, &
     error_message = ""
 end subroutine mkprec
 
+!> Compute and store information about the layers of the tree used in
+!! the fmm operations.
+!!
+!! @param[inout] constants: Object containing all constants.
+!!
+subroutine compute_layers(constants)
+    implicit none
+    type(ddx_constants_type), intent(inout) :: constants
+    integer :: i, distance, info, max_distance
+    integer, allocatable :: distance_from_root(:)
+
+    allocate(distance_from_root(constants % nclusters), stat=info)
+    if (info.ne.0) then
+        constants % error_flag = 1
+        constants % error_message = "Allocation failed in compute_layers"
+        return
+    end if
+
+    ! first we compute the distances from the root of the tree and
+    ! the number of layers
+    distance_from_root(1) = 0
+    do i = 1, constants % nclusters
+        distance = distance_from_root(i)
+        if (constants % children(1, i) .ne. 0) &
+            & distance_from_root(constants % children(1, i)) = distance + 1
+        if (constants % children(2, i) .ne. 0) &
+            & distance_from_root(constants % children(2, i)) = distance + 1
+        if (distance .gt. max_distance) max_distance = distance
+    end do
+    constants % nlayers = max_distance + 1
+
+    ! now we can allocate the layer information using the actual number
+    ! of layers
+    allocate(constants % layers(2, constants % nlayers), stat=info)
+    if (info.ne.0) then
+        constants % error_flag = 1
+        constants % error_message = "Allocation failed in compute_layers"
+        return
+    end if
+
+    ! finally we can populate the layer information
+    distance = 0
+    constants % layers(1, 1) = 1
+    do i = 1, constants % nclusters
+        if (distance_from_root(i) .lt. distance) then
+            constants % error_flag = 1
+            constants % error_message = "Error in compute_layers"
+            return
+        end if
+        if (distance_from_root(i) .ne. distance) then
+            constants % layers(2, distance + 1) = i
+            if (distance_from_root(i) .lt. constants % nlayers) &
+                & constants % layers(1, distance + 2) = i + 1
+            distance = distance_from_root(i)
+        end if
+    end do
+    constants % layers(2, constants % nlayers) = constants % nclusters
+
+    deallocate(distance_from_root, stat=info)
+    if (info.ne.0) then
+        constants % error_flag = 1
+        constants % error_message = "Deallocation failed in compute_layers"
+        return
+    end if
+
+end subroutine compute_layers
+
 !> Build a recursive inertial binary tree
 !!
 !! Uses inertial bisection in a recursive manner until each leaf node has only
@@ -2249,6 +2325,14 @@ subroutine constants_free(constants)
             return
         end if
     end if
+    if (allocated(constants % layers)) then
+        deallocate(constants % layers, stat=istat)
+        if (istat .ne. 0) then
+            constants % error_message = "`layers` deallocation failed!"
+            constants % error_flag = 1
+            return
+        end if
+    end if
 end subroutine constants_free
 
 end module ddx_constants
diff --git a/src/ddx_core.f90 b/src/ddx_core.f90
index 73c701a6..56134ea9 100644
--- a/src/ddx_core.f90
+++ b/src/ddx_core.f90
@@ -187,6 +187,8 @@ module ddx_core
     real(dp), allocatable :: x_adj_r_grid(:, :)
     real(dp), allocatable :: x_adj_e_grid(:, :)
     real(dp), allocatable :: phi_n(:, :)
+    real(dp) :: hsp_time
+    real(dp) :: hsp_adj_time
 
 end type ddx_state_type
 
@@ -1858,7 +1860,7 @@ subroutine tree_m2m_rotation(params, constants, node_m)
     ! Output
     real(dp), intent(inout) :: node_m((params % pm+1)**2, constants % nclusters)
     ! Temporary workspace
-    real(dp) :: work(6*params % pm**2 + 19*params % pm + 8)
+    real(dp) :: work(6*params % pm**2 + 19*params % pm + 8, params % nproc)
     ! Call corresponding work routine
     call tree_m2m_rotation_work(params, constants, node_m, work)
 end subroutine tree_m2m_rotation
@@ -1874,36 +1876,40 @@ subroutine tree_m2m_rotation_work(params, constants, node_m, work)
     ! Output
     real(dp), intent(inout) :: node_m((params % pm+1)**2, constants % nclusters)
     ! Temporary workspace
-    real(dp), intent(out) :: work(6*params % pm**2 + 19*params % pm + 8)
+    real(dp), intent(out) :: work(6*params % pm**2 + 19*params % pm + 8, &
+        & params % nproc)
     ! Local variables
-    integer :: i, j
+    integer :: i, j, l, iproc
     real(dp) :: c1(3), c(3), r1, r
     ! Bottom-to-top pass
-    !!$omp parallel do default(none) shared(constants,params,node_m) &
-    !!$omp private(i,j,c1,c,r1,r,work)
-    do i = constants % nclusters, 1, -1
-        ! Leaf node does not need any update
-        if (constants % children(1, i) == 0) cycle
-        c = constants % cnode(:, i)
-        r = constants % rnode(i)
-        ! First child initializes output
-        j = constants % children(1, i)
-        c1 = constants % cnode(:, j)
-        r1 = constants % rnode(j)
-        c1 = c1 - c
-        call fmm_m2m_rotation_work(c1, r1, r, &
-            & params % pm, &
-            & constants % vscales, &
-            & constants % vcnk, one, &
-            & node_m(:, j), zero, node_m(:, i), work)
-        ! All other children update the same output
-        do j = constants % children(1, i)+1, constants % children(2, i)
+    do l = constants % nlayers, 1, -1
+        !$omp parallel do default(none) shared(constants,params,node_m,l, &
+        !$omp work) private(i,j,c1,c,r1,r,iproc)
+        do i = constants % layers(1, l), constants % layers(2, l)
+            iproc = omp_get_thread_num() + 1
+            ! Leaf node does not need any update
+            if (constants % children(1, i) == 0) cycle
+            c = constants % cnode(:, i)
+            r = constants % rnode(i)
+            ! First child initializes output
+            j = constants % children(1, i)
             c1 = constants % cnode(:, j)
             r1 = constants % rnode(j)
             c1 = c1 - c
-            call fmm_m2m_rotation_work(c1, r1, r, params % pm, &
-                & constants % vscales, constants % vcnk, one, &
-                & node_m(:, j), one, node_m(:, i), work)
+            call fmm_m2m_rotation_work(c1, r1, r, &
+                & params % pm, &
+                & constants % vscales, &
+                & constants % vcnk, one, &
+                & node_m(:, j), zero, node_m(:, i), work(:, iproc))
+            ! All other children update the same output
+            do j = constants % children(1, i)+1, constants % children(2, i)
+                c1 = constants % cnode(:, j)
+                r1 = constants % rnode(j)
+                c1 = c1 - c
+                call fmm_m2m_rotation_work(c1, r1, r, params % pm, &
+                    & constants % vscales, constants % vcnk, one, &
+                    & node_m(:, j), one, node_m(:, i), work(:, iproc))
+            end do
         end do
     end do
 end subroutine tree_m2m_rotation_work
@@ -1939,32 +1945,36 @@ subroutine tree_m2m_bessel_rotation_work(params, constants, node_m)
     real(dp) :: work(6*params % pm**2 + 19*params % pm + 8)
     complex(dp) :: work_complex(2*params % pm+1)
     ! Local variables
-    integer :: i, j
+    integer :: i, j, l
     real(dp) :: c1(3), c(3), r1, r
     ! Bottom-to-top pass
-    do i = constants % nclusters, 1, -1
-        ! Leaf node does not need any update
-        if (constants % children(1, i) == 0) cycle
-        c = constants % cnode(:, i)
-        r = constants % rnode(i)
-        ! First child initializes output
-        j = constants % children(1, i)
-        c1 = constants % cnode(:, j)
-        r1 = constants % rnode(j)
-        c1 = params % kappa*(c1 - c)
-        call fmm_m2m_bessel_rotation_work(c1, &
-            & constants % SK_rnode(:, j), constants % SK_rnode(:, i), &
-            & params % pm, constants % vscales, one, node_m(:, j), zero, &
-            & node_m(:, i), work, work_complex)
-        ! All other children update the same output
-        do j = constants % children(1, i)+1, constants % children(2, i)
+    do l = constants % nlayers, 1, -1
+        !$omp parallel do default(none) shared(constants,params,node_m,l) &
+        !$omp private(i,j,c1,c,r1,r,work,work_complex)
+        do i = constants % layers(1, l), constants % layers(2, l)
+            ! Leaf node does not need any update
+            if (constants % children(1, i) == 0) cycle
+            c = constants % cnode(:, i)
+            r = constants % rnode(i)
+            ! First child initializes output
+            j = constants % children(1, i)
             c1 = constants % cnode(:, j)
             r1 = constants % rnode(j)
             c1 = params % kappa*(c1 - c)
             call fmm_m2m_bessel_rotation_work(c1, &
                 & constants % SK_rnode(:, j), constants % SK_rnode(:, i), &
-                & params % pm, constants % vscales, one, node_m(:, j), one, &
+                & params % pm, constants % vscales, one, node_m(:, j), zero, &
                 & node_m(:, i), work, work_complex)
+            ! All other children update the same output
+            do j = constants % children(1, i)+1, constants % children(2, i)
+                c1 = constants % cnode(:, j)
+                r1 = constants % rnode(j)
+                c1 = params % kappa*(c1 - c)
+                call fmm_m2m_bessel_rotation_work(c1, &
+                    & constants % SK_rnode(:, j), constants % SK_rnode(:, i), &
+                    & params % pm, constants % vscales, one, node_m(:, j), one, &
+                    & node_m(:, i), work, work_complex)
+            end do
         end do
     end do
 end subroutine tree_m2m_bessel_rotation_work
@@ -1980,7 +1990,7 @@ subroutine tree_m2m_rotation_adj(params, constants, node_m)
     ! Output
     real(dp), intent(inout) :: node_m((params % pm+1)**2, constants % nclusters)
     ! Temporary workspace
-    real(dp) :: work(6*params % pm**2 + 19*params % pm + 8)
+    real(dp) :: work(6*params % pm**2 + 19*params % pm + 8, params % nproc)
     ! Call corresponding work routine
     call tree_m2m_rotation_adj_work(params, constants, node_m, work)
 end subroutine tree_m2m_rotation_adj
@@ -1996,21 +2006,27 @@ subroutine tree_m2m_rotation_adj_work(params, constants, node_m, work)
     ! Output
     real(dp), intent(inout) :: node_m((params % pm+1)**2, constants % nclusters)
     ! Temporary workspace
-    real(dp), intent(out) :: work(6*params % pm**2 + 19*params % pm + 8)
+    real(dp), intent(out) :: work(6*params % pm**2 + 19*params % pm + 8, &
+        & params % nproc)
     ! Local variables
-    integer :: i, j
+    integer :: i, j, l, iproc
     real(dp) :: c1(3), c(3), r1, r
     ! Top-to-bottom pass
-    do i = 2, constants % nclusters
-        j = constants % parent(i)
-        c = constants % cnode(:, j)
-        r = constants % rnode(j)
-        c1 = constants % cnode(:, i)
-        r1 = constants % rnode(i)
-        c1 = c - c1
-        call fmm_m2m_rotation_adj_work(c1, r, r1, params % pm, &
-            & constants % vscales, constants % vcnk, one, node_m(:, j), one, &
-            & node_m(:, i), work)
+    do l = 2, constants % nlayers
+        !$omp parallel do default(none) shared(constants,params,node_m,l, &
+        !$omp work) private(i,j,c1,c,r1,r,iproc)
+        do i = constants % layers(1, l), constants % layers(2, l)
+            iproc = omp_get_thread_num() + 1
+            j = constants % parent(i)
+            c = constants % cnode(:, j)
+            r = constants % rnode(j)
+            c1 = constants % cnode(:, i)
+            r1 = constants % rnode(i)
+            c1 = c - c1
+            call fmm_m2m_rotation_adj_work(c1, r, r1, params % pm, &
+                & constants % vscales, constants % vcnk, one, node_m(:, j), one, &
+                & node_m(:, i), work(:, iproc))
+        end do
     end do
 end subroutine tree_m2m_rotation_adj_work
 !------------------------------------------------------------------------------
@@ -2042,18 +2058,21 @@ subroutine tree_m2m_bessel_rotation_adj_work(params, constants, node_m)
     real(dp) :: work(6*params % pm**2 + 19*params % pm + 8)
     complex(dp) :: work_complex(2*params % pm+1)
     ! Local variables
-    integer :: i, j
+    integer :: i, j, l
     real(dp) :: c1(3), c(3)
     ! Top-to-bottom pass
-    do i = 2, constants % nclusters
-        j = constants % parent(i)
-        c = constants % cnode(:, j)
-        c1 = constants % cnode(:, i)
-        c1 = params % kappa*(c1 - c)
-        ! Little bit confusion about the i and j indices
-        call fmm_m2m_bessel_rotation_adj_work(c1, constants % SK_rnode(:, i), &
-            & constants % SK_rnode(:, j), params % pm, constants % vscales, &
-            & one, node_m(:, j), one, node_m(:, i), work, work_complex)
+    do l = 2, constants % nlayers
+        !$omp parallel do default(none) shared(constants,params,node_m,l) &
+        !$omp private(i,j,c1,c,work,work_complex)
+        do i = constants % layers(1, l), constants % layers(2, l)
+            j = constants % parent(i)
+            c = constants % cnode(:, j)
+            c1 = constants % cnode(:, i)
+            c1 = params % kappa*(c1 - c)
+            call fmm_m2m_bessel_rotation_adj_work(c1, constants % SK_rnode(:, i), &
+                & constants % SK_rnode(:, j), params % pm, constants % vscales, &
+                & one, node_m(:, j), one, node_m(:, i), work, work_complex)
+        end do
     end do
 end subroutine tree_m2m_bessel_rotation_adj_work
 
@@ -2068,7 +2087,7 @@ subroutine tree_l2l_rotation(params, constants, node_l)
     ! Output
     real(dp), intent(inout) :: node_l((params % pl+1)**2, constants % nclusters)
     ! Temporary workspace
-    real(dp) :: work(6*params % pl**2 + 19*params % pl + 8)
+    real(dp) :: work(6*params % pl**2 + 19*params % pl + 8, params % nproc)
     ! Call corresponding work routine
     call tree_l2l_rotation_work(params, constants, node_l, work)
 end subroutine tree_l2l_rotation
@@ -2084,23 +2103,27 @@ subroutine tree_l2l_rotation_work(params, constants, node_l, work)
     ! Output
     real(dp), intent(inout) :: node_l((params % pl+1)**2, constants % nclusters)
     ! Temporary workspace
-    real(dp), intent(out) :: work(6*params % pl**2 + 19*params % pl + 8)
+    real(dp), intent(out) :: work(6*params % pl**2 + 19*params % pl + 8, &
+        & params % nproc)
     ! Local variables
-    integer :: i, j
+    integer :: i, j, l, iproc
     real(dp) :: c1(3), c(3), r1, r
     ! Top-to-bottom pass
-    !!$omp parallel do default(none) shared(constants,params,node_l) &
-    !!$omp private(i,j,c1,c,r1,r,work)
-    do i = 2, constants % nclusters
-        j = constants % parent(i)
-        c = constants % cnode(:, j)
-        r = constants % rnode(j)
-        c1 = constants % cnode(:, i)
-        r1 = constants % rnode(i)
-        c1 = c - c1
-        call fmm_l2l_rotation_work(c1, r, r1, params % pl, &
-            & constants % vscales, constants % vfact, one, &
-            & node_l(:, j), one, node_l(:, i), work)
+    do l = 2, constants % nlayers
+        !$omp parallel do default(none) shared(constants,params,node_l,l, &
+        !$omp work) private(i,j,c1,c,r1,r,iproc)
+        do i = constants % layers(1, l), constants % layers(2, l)
+            iproc = omp_get_thread_num() + 1
+            j = constants % parent(i)
+            c = constants % cnode(:, j)
+            r = constants % rnode(j)
+            c1 = constants % cnode(:, i)
+            r1 = constants % rnode(i)
+            c1 = c - c1
+            call fmm_l2l_rotation_work(c1, r, r1, params % pl, &
+                & constants % vscales, constants % vfact, one, &
+                & node_l(:, j), one, node_l(:, i), work(:, iproc))
+        end do
     end do
 end subroutine tree_l2l_rotation_work
 
@@ -2135,18 +2158,22 @@ subroutine tree_l2l_bessel_rotation_work(params, constants, node_l)
     real(dp) :: work(6*params % pl**2 + 19*params % pl + 8)
     complex(dp) :: work_complex(2*params % pl+1)
     ! Local variables
-    integer :: i, j
+    integer :: i, j, l
     real(dp) :: c_child(3), c_parent(3), c_diff(3)
     ! Top-to-bottom pass
-    do i = 2, constants % nclusters
-        j = constants % parent(i)
-        c_child = constants % cnode(:, j)
-        c_parent = constants % cnode(:, i)
-        c_diff = params % kappa*(c_child - c_parent)
-        call fmm_l2l_bessel_rotation_work(c_diff, &
-            & constants % si_rnode(:, j), constants % si_rnode(:, i), &
-            & params % pl, constants % vscales, one, &
-            & node_l(:, j), one, node_l(:, i), work, work_complex)
+    do l = 2, constants % nlayers
+        !$omp parallel do default(none) shared(constants,params,node_l,l) &
+        !$omp private(i,j,c_child,c_parent,c_diff,work,work_complex)
+        do i = constants % layers(1, l), constants % layers(2, l)
+            j = constants % parent(i)
+            c_child = constants % cnode(:, j)
+            c_parent = constants % cnode(:, i)
+            c_diff = params % kappa*(c_child - c_parent)
+            call fmm_l2l_bessel_rotation_work(c_diff, &
+                & constants % si_rnode(:, j), constants % si_rnode(:, i), &
+                & params % pl, constants % vscales, one, &
+                & node_l(:, j), one, node_l(:, i), work, work_complex)
+        end do
     end do
 end subroutine tree_l2l_bessel_rotation_work
 
@@ -2161,7 +2188,7 @@ subroutine tree_l2l_rotation_adj(params, constants, node_l)
     ! Output
     real(dp), intent(inout) :: node_l((params % pl+1)**2, constants % nclusters)
     ! Temporary workspace
-    real(dp) :: work(6*params % pl**2 + 19*params % pl + 8)
+    real(dp) :: work(6*params % pl**2 + 19*params % pl + 8, params % nproc)
     ! Call corresponding work routine
     call tree_l2l_rotation_adj_work(params, constants, node_l, work)
 end subroutine tree_l2l_rotation_adj
@@ -2177,32 +2204,38 @@ subroutine tree_l2l_rotation_adj_work(params, constants, node_l, work)
     ! Output
     real(dp), intent(inout) :: node_l((params % pl+1)**2, constants % nclusters)
     ! Temporary workspace
-    real(dp), intent(out) :: work(6*params % pl**2 + 19*params % pl + 8)
+    real(dp), intent(out) :: work(6*params % pl**2 + 19*params % pl + 8, &
+        & params % nproc)
     ! Local variables
-    integer :: i, j
+    integer :: i, j, l, iproc
     real(dp) :: c1(3), c(3), r1, r
     ! Bottom-to-top pass
-    do i = constants % nclusters, 1, -1
-        ! Leaf node does not need any update
-        if (constants % children(1, i) == 0) cycle
-        c = constants % cnode(:, i)
-        r = constants % rnode(i)
-        ! First child initializes output
-        j = constants % children(1, i)
-        c1 = constants % cnode(:, j)
-        r1 = constants % rnode(j)
-        c1 = c1 - c
-        call fmm_l2l_rotation_adj_work(c1, r1, r, params % pl, &
-            & constants % vscales, constants % vfact, one, &
-            & node_l(:, j), zero, node_l(:, i), work)
-        ! All other children update the same output
-        do j = constants % children(1, i)+1, constants % children(2, i)
+    do l = constants % nlayers, 1, -1
+        !$omp parallel do default(none) shared(constants,params,node_l,l, &
+        !$omp work) private(i,j,c1,c,r1,r,iproc)
+        do i = constants % layers(1, l), constants % layers(2, l)
+            iproc = omp_get_thread_num() + 1
+            ! Leaf node does not need any update
+            if (constants % children(1, i) == 0) cycle
+            c = constants % cnode(:, i)
+            r = constants % rnode(i)
+            ! First child initializes output
+            j = constants % children(1, i)
             c1 = constants % cnode(:, j)
             r1 = constants % rnode(j)
             c1 = c1 - c
             call fmm_l2l_rotation_adj_work(c1, r1, r, params % pl, &
                 & constants % vscales, constants % vfact, one, &
-                & node_l(:, j), one, node_l(:, i), work)
+                & node_l(:, j), zero, node_l(:, i), work)
+            ! All other children update the same output
+            do j = constants % children(1, i)+1, constants % children(2, i)
+                c1 = constants % cnode(:, j)
+                r1 = constants % rnode(j)
+                c1 = c1 - c
+                call fmm_l2l_rotation_adj_work(c1, r1, r, params % pl, &
+                    & constants % vscales, constants % vfact, one, &
+                    & node_l(:, j), one, node_l(:, i), work(:, iproc))
+            end do
         end do
     end do
 end subroutine tree_l2l_rotation_adj_work
@@ -2221,19 +2254,23 @@ subroutine tree_l2l_bessel_rotation_adj(params, constants, node_l)
     real(dp) :: work(6*params % pl**2 + 19*params % pl + 8)
     complex(dp) :: work_complex(2*params % pl+1)
     ! Local variables
-    integer :: i, j
+    integer :: i, j, l
     real(dp) :: c_parent(3), c_child(3), c_diff(3)
     ! Bottom-to-top pass
-    do i = constants % nclusters, 1, -1
-        c_child = constants % cnode(:, i)
-        j = constants % parent(i)
-        if (j == 0) cycle
-        c_parent = constants % cnode(:, j)
-        c_diff = params % kappa*(c_parent - c_child)
-        call fmm_l2l_bessel_rotation_adj_work(c_diff, &
-            & constants % si_rnode(:, i), constants % si_rnode(:, j), &
-            & params % pl, constants % vscales, one, &
-            & node_l(:, i), one, node_l(:, j), work, work_complex)
+    do l = constants % nlayers, 1, -1
+        !$omp parallel do default(none) shared(constants,params,node_l,l) &
+        !$omp private(i,j,c_parent,c_child,c_diff,work,work_complex)
+        do i = constants % layers(1, l), constants % layers(2, l)
+            c_child = constants % cnode(:, i)
+            j = constants % parent(i)
+            if (j == 0) cycle
+            c_parent = constants % cnode(:, j)
+            c_diff = params % kappa*(c_parent - c_child)
+            call fmm_l2l_bessel_rotation_adj_work(c_diff, &
+                & constants % si_rnode(:, i), constants % si_rnode(:, j), &
+                & params % pl, constants % vscales, one, &
+                & node_l(:, i), one, node_l(:, j), work, work_complex)
+        end do
     end do
 end subroutine tree_l2l_bessel_rotation_adj
 
@@ -2370,10 +2407,9 @@ subroutine tree_m2l_bessel_rotation_adj_work(params, constants, node_l, node_m)
     ! Local variables
     integer :: i, j, k
     real(dp) :: c1(3), c(3), r
-    ! Any order of this cycle is OK
     node_m = zero
-    !!$omp parallel do shared(constants,params,node_l,node_m) &
-    !!$omp private(i,c,r,k,c1,work,work_complex,j) schedule(dynamic)
+    !$omp parallel do shared(constants,params,node_l,node_m) &
+    !$omp private(i,c,r,k,c1,work,work_complex,j) schedule(dynamic)
     do i = 1, constants % nclusters
         ! If no far admissible pairs just set output to zero
         if (constants % nfar(i) .eq. 0) then
@@ -2384,19 +2420,18 @@ subroutine tree_m2l_bessel_rotation_adj_work(params, constants, node_l, node_m)
         ! Use the first far admissible pair to initialize output
         k = constants % far(constants % sfar(i))
         c1 = constants % cnode(:, k)
-        c1 = params % kappa*(c1 - c)
-        call fmm_m2l_bessel_rotation_adj_work(c1, constants % SI_rnode(:, i), &
-            & constants % SK_rnode(:, k), params % pm, &
-            & constants % vscales, one, &
-            & node_l(:, i), one, node_m(:, k), work, work_complex)
+        c1 = params % kappa*(c - c1)
+        call fmm_m2l_bessel_rotation_adj_work(c1, constants % SI_rnode(:, k), &
+            & constants % SK_rnode(:, i), params % pm, constants % vscales, &
+            & one, node_l(:, k), one, node_m(:, i), work, work_complex)
         do j = constants % sfar(i)+1, constants % sfar(i+1)-1
             k = constants % far(j)
             c1 = constants % cnode(:, k)
-            c1 = params % kappa*(c1 - c)
-            call fmm_m2l_bessel_rotation_adj_work(c1, constants % SI_rnode(:, i), &
-                & constants % SK_rnode(:, k), params % pm, &
-                & constants % vscales, one, &
-                & node_l(:, i), one, node_m(:, k), work, work_complex)
+            c1 = params % kappa*(c - c1)
+            call fmm_m2l_bessel_rotation_adj_work(c1, &
+                & constants % SI_rnode(:, k), constants % SK_rnode(:, i), &
+                & params % pm, constants % vscales, one, node_l(:, k), &
+                & one, node_m(:, i), work, work_complex)
         end do
     end do
 end subroutine tree_m2l_bessel_rotation_adj_work
@@ -2415,8 +2450,9 @@ subroutine tree_m2l_rotation_adj(params, constants, node_l, node_m)
     ! Local variables
     integer :: i, j, k
     real(dp) :: c1(3), c(3), r1, r
-    ! Any order of this cycle is OK
     node_m = zero
+    !$omp parallel do default(none) shared(constants,params,node_m,node_l) &
+    !$omp private(i,c,r,k,c1,r1,j) schedule(dynamic)
     do i = 1, constants % nclusters
         ! If no far admissible pairs just set output to zero
         if (constants % nfar(i) .eq. 0) then
@@ -2428,18 +2464,18 @@ subroutine tree_m2l_rotation_adj(params, constants, node_l, node_m)
         k = constants % far(constants % sfar(i))
         c1 = constants % cnode(:, k)
         r1 = constants % rnode(k)
-        c1 = c - c1
-        call fmm_m2l_rotation_adj(c1, r, r1, params % pl, params % pm, &
+        c1 = c1 - c
+        call fmm_m2l_rotation_adj(c1, r1, r, params % pl, params % pm, &
             & constants % vscales, constants % m2l_ztranslate_adj_coef, one, &
-            & node_l(:, i), one, node_m(:, k))
+            & node_l(:, k), one, node_m(:, i))
         do j = constants % sfar(i)+1, constants % sfar(i+1)-1
             k = constants % far(j)
             c1 = constants % cnode(:, k)
             r1 = constants % rnode(k)
-            c1 = c - c1
-            call fmm_m2l_rotation_adj(c1, r, r1, params % pl, params % pm, &
+            c1 = c1 - c
+            call fmm_m2l_rotation_adj(c1, r1, r, params % pl, params % pm, &
                 & constants % vscales, constants % m2l_ztranslate_adj_coef, one, &
-                & node_l(:, i), one, node_m(:, k))
+                & node_l(:, k), one, node_m(:, i))
         end do
     end do
 end subroutine tree_m2l_rotation_adj
@@ -2720,9 +2756,9 @@ subroutine tree_m2p_adj(params, constants, p, alpha, grid_v, beta, sph_m)
         sph_m = beta * sph_m
     end if
     ! Cycle over all spheres
-    !!$omp parallel do default(none) shared(params,constants,grid_v,p, &
-    !!$omp alpha,sph_m), private(isph,inode,jnear,jnode,jsph,igrid,c,work) &
-    !!$omp schedule(dynamic)
+    !$omp parallel do default(none) shared(params,constants,grid_v,p, &
+    !$omp alpha,sph_m), private(isph,inode,jnear,jnode,jsph,igrid,c,work) &
+    !$omp schedule(dynamic)
     do isph = 1, params % nsph
         ! Cycle over all near-field admissible pairs of spheres
         inode = constants % snode(isph)
@@ -2735,12 +2771,12 @@ subroutine tree_m2p_adj(params, constants, p, alpha, grid_v, beta, sph_m)
             if(isph .eq. jsph) cycle
             ! Accumulate interaction for external grid points only
             do igrid = 1, params % ngrid
-                if(constants % ui(igrid, isph) .eq. zero) cycle
-                c = constants % cgrid(:, igrid)*params % rsph(isph) - &
-                    & params % csph(:, jsph) + params % csph(:, isph)
-                call fmm_m2p_adj_work(c, alpha*grid_v(igrid, isph), &
-                    & params % rsph(jsph), p, constants % vscales_rel, one, &
-                    & sph_m(:, jsph), work)
+                if(constants % ui(igrid, jsph) .eq. zero) cycle
+                c = constants % cgrid(:, igrid)*params % rsph(jsph) - &
+                    & params % csph(:, isph) + params % csph(:, jsph)
+                call fmm_m2p_adj_work(c, alpha*grid_v(igrid, jsph), &
+                    & params % rsph(isph), p, constants % vscales_rel, one, &
+                    & sph_m(:, isph), work)
             end do
         end do
     end do
@@ -2771,28 +2807,25 @@ subroutine tree_m2p_bessel_adj(params, constants, p, alpha, grid_v, beta, sph_p,
         sph_m = beta * sph_m
     end if
     ! Cycle over all spheres
-    !!$omp parallel do default(none) shared(params,constants,p,sph_m,tmp, &
-    !!$omp alpha,grid_v) private(isph,inode,jnear,jnode,jsph,igrid,c, &
-    !!$omp iproc) schedule(dynamic)
+    !$omp parallel do default(none) shared(params,constants,p,sph_m, &
+    !$omp alpha,grid_v) private(isph,inode,jnear,jnode,jsph,igrid,c) &
+    !$omp schedule(dynamic)
     do isph = 1, params % nsph
         ! Cycle over all near-field admissible pairs of spheres
         inode = constants % snode(isph)
-        ! iproc = omp_get_thread_num() + 1
         do jnear = constants % snear(inode), constants % snear(inode+1)-1
             ! Near-field interactions are possible only between leaf nodes,
             ! which must contain only a single input sphere
             jnode = constants % near(jnear)
             jsph = constants % order(constants % cluster(1, jnode))
-            ! Ignore self-interaction
-            !if(isph .eq. jsph) cycle
             ! Accumulate interaction for external grid points only
             do igrid = 1, params % ngrid
-                if(constants % ui(igrid, isph) .eq. zero) cycle
-                c = constants % cgrid(:, igrid)*params % rsph(isph) - &
-                    & params % csph(:, jsph) + params % csph(:, isph)
-                call fmm_m2p_bessel_adj(c, alpha*grid_v(igrid, isph), &
-                    & params % rsph(jsph), params % kappa, p, &
-                    & constants % vscales, one, sph_m(:, jsph))
+                if(constants % ui(igrid, jsph) .eq. zero) cycle
+                c = constants % cgrid(:, igrid)*params % rsph(jsph) - &
+                    & params % csph(:, isph) + params % csph(:, jsph)
+                call fmm_m2p_bessel_adj(c, alpha*grid_v(igrid, jsph), &
+                    & params % rsph(isph), params % kappa, p, &
+                    & constants % vscales, one, sph_m(:, isph))
             end do
         end do
     end do
diff --git a/src/ddx_driver.f90 b/src/ddx_driver.f90
index b3fcbe61..3157e3da 100644
--- a/src/ddx_driver.f90
+++ b/src/ddx_driver.f90
@@ -298,7 +298,12 @@ program main
             & state % x_lpb_rel_diff(i)
     end do
     ! Print number of iterations and time
-    write(6, 100) "ddlpb step time: ", state % x_lpb_time, " seconds"
+    write(6, 100) "ddlpb step time: ", state % x_lpb_time, &
+        & " seconds, of which:"
+    write(6, 100) "    ddcosmo: ", state % xs_time, " seconds"
+    write(6, 100) "    hsp: ", state % hsp_time, " seconds"
+    write(6, 100) "    coupling terms: ", state % x_lpb_time &
+        & - state % xs_time - state % hsp_time, " seconds"
     write(6, 300) "ddlpb step iterations: ", state % x_lpb_niter
 end if
 
@@ -348,7 +353,11 @@ program main
         end do
         ! Print number of iterations and time
         write(6, 100) "adjoint ddlpb step time: ", &
-            & state % x_adj_lpb_time, " seconds"
+            & state % x_adj_lpb_time, " seconds, of which:"
+        write(6, 100) "    adjoint ddcosmo: ", state % s_time, " seconds"
+        write(6, 100) "    adjoint hsp: ", state % hsp_adj_time, " seconds"
+        write(6, 100) "    adjoint coupling terms: ", state % x_adj_lpb_time &
+            & - state % s_time - state % hsp_adj_time, " seconds"
         write(6, 200) "adjoint ddlpb step iterations: ", &
             & state % x_adj_lpb_niter
     end if
diff --git a/src/ddx_lpb.f90 b/src/ddx_lpb.f90
index c6b15f34..1805293c 100644
--- a/src/ddx_lpb.f90
+++ b/src/ddx_lpb.f90
@@ -233,6 +233,8 @@ subroutine ddlpb_solve(params, constants, workspace, state, tol)
     real(dp) :: start_time
 
     state % x_lpb_niter = params % maxiter
+    workspace % xs_time = zero
+    workspace % hsp_time = zero
 
     ! solve LS using Jacobi/DIIS
     start_time = omp_get_wtime()
@@ -251,6 +253,10 @@ subroutine ddlpb_solve(params, constants, workspace, state, tol)
     ! the density so that it is consistent with the ddCOSMO and ddPCM
     ! ones.
     call convert_ddcosmo(params, constants, -1, state % x_lpb(:,:,1))
+
+    ! put the timings in the right places
+    state % xs_time = workspace % xs_time
+    state % hsp_time = workspace % hsp_time
 end subroutine ddlpb_solve
 
 
@@ -274,7 +280,8 @@ subroutine ddlpb_solve_adjoint(params, constants, workspace, state, tol)
     real(dp) :: start_time
 
     state % x_adj_lpb_niter = params % maxiter
-    constants % inner_tol = sqrt(tol)
+    workspace % s_time = zero
+    workspace % hsp_adj_time = zero
 
     ! solve adjoint LS using Jacobi/DIIS
     start_time = omp_get_wtime()
@@ -291,6 +298,10 @@ subroutine ddlpb_solve_adjoint(params, constants, workspace, state, tol)
 
     state % q = state % x_adj_lpb(:, :, 1)
 
+    ! put the timings in the right places
+    state % s_time = workspace % s_time
+    state % hsp_adj_time = workspace % hsp_adj_time
+
     call ddlpb_derivative_setup(params, constants, workspace, state)
 
 end subroutine ddlpb_solve_adjoint
diff --git a/src/ddx_operators.f90 b/src/ddx_operators.f90
index d5ef9a1b..50c0e73c 100644
--- a/src/ddx_operators.f90
+++ b/src/ddx_operators.f90
@@ -370,11 +370,14 @@ subroutine dstarx_dense(params, constants, workspace, do_diag, x, y)
     ! Local variables
     real(dp) :: vji(3), sji(3)
     real(dp) :: vvji, tji, tt, f, rho, ctheta, stheta, cphi, sphi
-    integer :: its, isph, jsph, l, m, ind
+    integer :: its, isph, jsph, l, m, ind, iproc
     real(dp), external :: dnrm2
     y = zero
-    ! this loop is easily parallelizable
+    !$omp parallel do default(none) shared(do_diag,params,constants, &
+    !$omp workspace,x,y) private(isph,jsph,its,vji,vvji,tji,sji,rho, &
+    !$omp ctheta,stheta,cphi,sphi,tt,l,ind,f,m,iproc) schedule(dynamic)
     do isph = 1, params % nsph
+        iproc = omp_get_thread_num() + 1
         do jsph = 1, params % nsph
             if (jsph.ne.isph) then
                 do its = 1, params % ngrid
@@ -389,8 +392,10 @@ subroutine dstarx_dense(params, constants, workspace, do_diag, x, y)
                         ! build the local basis
                         call ylmbas(sji, rho, ctheta, stheta, cphi, sphi, &
                             & params % lmax, constants % vscales, &
-                            & workspace % tmp_vylm, workspace % tmp_vplm, &
-                            & workspace % tmp_vcos, workspace % tmp_vsin)
+                            & workspace % tmp_vylm(:(params % lmax + 1)**2, iproc), &
+                            & workspace % tmp_vplm(:(params % lmax + 1)**2, iproc), &
+                            & workspace % tmp_vcos(:(params % lmax + 1), iproc), &
+                            & workspace % tmp_vsin(:(params % lmax + 1), iproc))
                         tt = constants % ui(its,jsph) &
                             & *dot_product(constants % vwgrid(:constants % nbasis, its), &
                             & x(:,jsph))/tji
@@ -399,7 +404,7 @@ subroutine dstarx_dense(params, constants, workspace, do_diag, x, y)
                             f = dble(l)*tt/ constants % vscales(ind)**2
                             do m = -l, l
                                 y(ind+m,isph) = y(ind+m,isph) + &
-                                    & f*workspace % tmp_vylm(ind+m, 1)
+                                    & f*workspace % tmp_vylm(ind+m, iproc)
                             end do
                             tt = tt/tji
                         end do
@@ -1355,26 +1360,34 @@ subroutine prec_tstarx(params, constants, workspace, x, y)
     real(dp), intent(inout) :: y(constants % nbasis, params % nsph, 2)
     integer :: n_iter
     real(dp), dimension(params % maxiter) :: x_rel_diff
+    real(dp) :: start_time
 
+    start_time = omp_get_wtime()
     y(:,:,1) = x(:,:,1)
     call convert_ddcosmo(params, constants, 1, y(:,:,1))
     n_iter = params % maxiter
-    call jacobi_diis(params, constants, workspace, constants % inner_tol, y(:,:,1), &
-        & workspace % ddcosmo_guess, n_iter, x_rel_diff, lstarx, ldm1x, hnorm)
+    call jacobi_diis(params, constants, workspace, constants % inner_tol, &
+        & y(:,:,1), workspace % ddcosmo_guess, n_iter, x_rel_diff, lstarx, &
+        & ldm1x, hnorm)
     if (workspace % error_flag.ne.0) then
         workspace % error_message = 'prec_tstarx: ddCOSMO failed to converge'
         return
     end if
     y(:,:,1) = workspace % ddcosmo_guess
+    workspace % s_time = workspace % s_time + omp_get_wtime() - start_time
 
+    start_time = omp_get_wtime()
     n_iter = params % maxiter
-    call jacobi_diis(params, constants, workspace, constants % inner_tol, x(:,:,2), workspace % hsp_guess, &
-        & n_iter, x_rel_diff, bstarx, bx_prec, hnorm)
+    call jacobi_diis(params, constants, workspace, constants % inner_tol, &
+        & x(:,:,2), workspace % hsp_guess, n_iter, x_rel_diff, bstarx, &
+        & bx_prec, hnorm)
     if (workspace % error_flag.ne.0) then
         workspace % error_message = 'prec_tstarx: HSP failed to converge'
         return
     end if
     y(:,:,2) = workspace % hsp_guess
+    workspace % hsp_adj_time = workspace % hsp_adj_time &
+        & + omp_get_wtime() - start_time
 
 end subroutine prec_tstarx
 
@@ -1393,11 +1406,14 @@ subroutine prec_tx(params, constants, workspace, x, y)
     real(dp), intent(inout) :: y(constants % nbasis, params % nsph, 2)
     integer :: n_iter
     real(dp), dimension(params % maxiter) :: x_rel_diff
+    real(dp) :: start_time
 
     ! perform A^-1 * Yr
+    start_time = omp_get_wtime()
     n_iter = params % maxiter
-    call jacobi_diis(params, constants, workspace, constants % inner_tol, x(:,:,1), &
-        & workspace % ddcosmo_guess, n_iter, x_rel_diff, lx, ldm1x, hnorm)
+    call jacobi_diis(params, constants, workspace, constants % inner_tol, &
+        & x(:,:,1), workspace % ddcosmo_guess, n_iter, x_rel_diff, lx, &
+        & ldm1x, hnorm)
     if (workspace % error_flag.ne.0) then
         workspace % error_message = 'prec_tx: ddCOSMO failed to converge'
         return
@@ -1406,12 +1422,16 @@ subroutine prec_tx(params, constants, workspace, x, y)
     ! Scale by the factor of (2l+1)/4Pi
     y(:,:,1) = workspace % ddcosmo_guess
     call convert_ddcosmo(params, constants, 1, y(:,:,1))
+    workspace % xs_time = workspace % xs_time + omp_get_wtime() - start_time
 
     ! perform B^-1 * Ye
+    start_time = omp_get_wtime()
     n_iter = params % maxiter
-    call jacobi_diis(params, constants, workspace, constants % inner_tol, x(:,:,2), workspace % hsp_guess, &
-        & n_iter, x_rel_diff, bx, bx_prec, hnorm)
+    call jacobi_diis(params, constants, workspace, constants % inner_tol, &
+        & x(:,:,2), workspace % hsp_guess, n_iter, x_rel_diff, bx, &
+        & bx_prec, hnorm)
     y(:,:,2) = workspace % hsp_guess
+    workspace % hsp_time = workspace % hsp_time + omp_get_wtime() - start_time
 
     if (workspace % error_flag.ne.0) then
         workspace % error_message = 'prec_tx: HSP failed to converge'
diff --git a/src/ddx_workspace.f90 b/src/ddx_workspace.f90
index c59c20db..ae01160f 100644
--- a/src/ddx_workspace.f90
+++ b/src/ddx_workspace.f90
@@ -98,6 +98,8 @@ module ddx_workspace
     real(dp), allocatable :: tmp_bmat(:, :)
     !> ddLPB solutions for the microiterations
     real(dp), allocatable :: ddcosmo_guess(:,:), hsp_guess(:,:)
+    !> ddLPB temporary timings
+    real(dp) :: xs_time, s_time, hsp_time, hsp_adj_time
     !> Flag if there were an error
     integer :: error_flag = 2
     !> Last error message