From bc9dba8d22655d39fe2f071f2876b555e3687ee7 Mon Sep 17 00:00:00 2001
From: Michele Nottoli <michele.nottoli@gmail.com>
Date: Mon, 15 Apr 2024 15:02:38 +0200
Subject: [PATCH 01/21] Profiling enabled.

---
 src/CMakeLists.txt    |  1 +
 src/ddx.f90           | 14 +++++++++++++
 src/ddx_core.f90      |  1 +
 src/ddx_driver.f90    |  6 +++++-
 src/ddx_operators.f90 |  6 +++++-
 src/ddx_profiling.f90 | 48 +++++++++++++++++++++++++++++++++++++++++++
 6 files changed, 74 insertions(+), 2 deletions(-)
 create mode 100644 src/ddx_profiling.f90

diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index 3d498a52..36af8011 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -17,6 +17,7 @@ set(SRC
     ddx.f90
     ddx_legacy.f90
     ddx_multipolar_solutes.f90
+    ddx_profiling.f90
     llgnew.f
     cbessel.f90
     ddx_cinterface.f90
diff --git a/src/ddx.f90 b/src/ddx.f90
index 4b67ef44..b9ee78ed 100644
--- a/src/ddx.f90
+++ b/src/ddx.f90
@@ -390,8 +390,10 @@ subroutine ddrun(ddx_data, state, electrostatics, psi, tol, esolv, &
         return
     end if
 
+    call time_push()
     call setup(ddx_data % params, ddx_data % constants, &
         & ddx_data % workspace, state, electrostatics, psi, ddx_error)
+    call time_pull("setup")
     if (ddx_error % flag .ne. 0) then
         call update_error(ddx_error, "ddrun: setup returned an error, exiting")
         return
@@ -399,24 +401,30 @@ subroutine ddrun(ddx_data, state, electrostatics, psi, tol, esolv, &
 
     ! solve the primal linear system
     if (do_guess) then
+        call time_push()
         call fill_guess(ddx_data % params, ddx_data % constants, &
             & ddx_data % workspace, state, tol, ddx_error)
+        call time_pull("guess")
         if (ddx_error % flag .ne. 0) then
             call update_error(ddx_error, &
                 & "ddrun: fill_guess returned an error, exiting")
             return
         end if
     end if
+    call time_push()
     call solve(ddx_data % params, ddx_data % constants, &
         & ddx_data % workspace, state, tol, ddx_error)
+    call time_pull("solve")
     if (ddx_error % flag .ne. 0) then
         call update_error(ddx_error, "ddrun: solve returned an error, exiting")
         return
     end if
 
     ! compute the energy
+    call time_push()
     call energy(ddx_data % params, ddx_data % constants, &
         & ddx_data % workspace, state, esolv, ddx_error)
+    call time_pull("energy")
     if (ddx_error % flag .ne. 0) then
         call update_error(ddx_error, "ddrun: energy returned an error, exiting")
         return
@@ -425,16 +433,20 @@ subroutine ddrun(ddx_data, state, electrostatics, psi, tol, esolv, &
     ! solve the primal linear system
     if (ddx_data % params % force .eq. 1) then
         if (do_guess) then
+            call time_push()
             call fill_guess_adjoint(ddx_data % params, ddx_data % constants, &
                 & ddx_data % workspace, state, tol, ddx_error)
+            call time_pull("guess adjoint")
             if (ddx_error % flag .ne. 0) then
                 call update_error(ddx_error, &
                     & "ddrun: fill_guess_adjoint returned an error, exiting")
                 return
             end if
         end if
+        call time_push()
         call solve_adjoint(ddx_data % params, ddx_data % constants, &
             & ddx_data % workspace, state, tol, ddx_error)
+        call time_pull("solve adjoint")
         if (ddx_error % flag .ne. 0) then
             call update_error(ddx_error, &
                 & "ddrun: solve_adjoint returned an error, exiting")
@@ -445,8 +457,10 @@ subroutine ddrun(ddx_data, state, electrostatics, psi, tol, esolv, &
     ! compute the forces
     if (ddx_data % params % force .eq. 1) then
         force = zero
+        call time_push()
         call solvation_force_terms(ddx_data % params, ddx_data % constants, &
             & ddx_data % workspace, state, electrostatics, force, ddx_error)
+        call time_pull("solvation forces")
         if (ddx_error % flag .ne. 0) then
             call update_error(ddx_error, &
                 & "ddrun: solvation_force_terms returned an error, exiting")
diff --git a/src/ddx_core.f90 b/src/ddx_core.f90
index cc957e68..1df85ca8 100644
--- a/src/ddx_core.f90
+++ b/src/ddx_core.f90
@@ -21,6 +21,7 @@ module ddx_core
 use ddx_harmonics
 ! Enable OpenMP
 use omp_lib
+use ddx_profiling
 implicit none
 
 !> @defgroup Fortran_interface_core Fortran interface: core routines
diff --git a/src/ddx_driver.f90 b/src/ddx_driver.f90
index 078551f5..422189b8 100644
--- a/src/ddx_driver.f90
+++ b/src/ddx_driver.f90
@@ -114,6 +114,7 @@ program main
 finish_time = omp_get_wtime()
 write(*, 100) "Psi time:", finish_time-start_time, " seconds"
 
+call time_push()
 if (ddx_data % params % force .eq. 1) then
     allocate(force(3, ddx_data % params % nsph), stat=info)
     if (info .ne. 0) then
@@ -126,6 +127,7 @@ program main
     call ddrun(ddx_data, state, electrostatics, psi, tol, esolv, ddx_error)
 end if
 call check_error(ddx_error)
+call time_pull("ddrun")
 
 if (ddx_data % params % force .eq. 1) write(*, 100) &
     & "solvation force terms time:", state % force_time, " seconds"
@@ -134,9 +136,11 @@ program main
 ! forces.
 if (ddx_data % params % force .eq. 1) then
     start_time = omp_get_wtime()
+    call time_push()
     call multipole_force_terms(ddx_data % params, ddx_data % constants, &
         & ddx_data % workspace, state, 0, multipoles, force, ddx_error)
-        call check_error(ddx_error)
+    call time_pull("solute forces")
+    call check_error(ddx_error)
     finish_time = omp_get_wtime()
     write(*, 100) "multipolar force terms time:", &
         & finish_time - start_time, " seconds"
diff --git a/src/ddx_operators.f90 b/src/ddx_operators.f90
index dedbf00f..161479ea 100644
--- a/src/ddx_operators.f90
+++ b/src/ddx_operators.f90
@@ -56,6 +56,7 @@ subroutine lx(params, constants, workspace, x, y, ddx_error)
             end do
         end do
     else
+        call time_push()
         !$omp parallel do default(none) shared(params,constants,workspace,x,y) &
         !$omp private(isph,iproc) schedule(dynamic)
         do isph = 1, params % nsph
@@ -69,6 +70,7 @@ subroutine lx(params, constants, workspace, x, y, ddx_error)
             ! now, fix the sign.
             y(:, isph) = - y(:, isph)
         end do
+        call time_pull("lx")
     end if
 !
 !   if required, add the diagonal.
@@ -768,7 +770,8 @@ subroutine bx(params, constants, workspace, x, y, ddx_error)
                     & one, y(:,isph), 1)
             end do
         end do
-    else 
+    else
+        call time_push()
         !$omp parallel do default(none) shared(params,constants,workspace,x,y) &
         !$omp private(isph,iproc) schedule(dynamic)
         do isph = 1, params % nsph
@@ -778,6 +781,7 @@ subroutine bx(params, constants, workspace, x, y, ddx_error)
               & constants % vgrid_nbasis, workspace % tmp_pot(:, iproc), y(:,isph))
           y(:,isph) = - y(:,isph) 
         end do
+        call time_pull("bx")
     end if
     if (constants % dodiag) y = y + x
 end subroutine bx
diff --git a/src/ddx_profiling.f90 b/src/ddx_profiling.f90
new file mode 100644
index 00000000..b5799d88
--- /dev/null
+++ b/src/ddx_profiling.f90
@@ -0,0 +1,48 @@
+module ddx_profiling
+    !! Unified Input/Output handling across the code.
+    use ddx_definitions, only: dp
+    implicit none
+    private
+
+    integer, parameter:: ntimes = 128
+    integer:: tcnt = 1
+    real(dp):: times(ntimes)
+
+    public:: time_pull, time_push
+
+    contains
+
+    subroutine time_push()
+        implicit none
+        real(dp):: omp_get_wtime
+
+        if(tcnt <= ntimes) then
+            times(tcnt) = omp_get_wtime()
+            tcnt = tcnt+1
+        else
+            write(*, *) 'time_push Cannot push another time in the buffer.'
+            stop 1
+        end if
+    end subroutine
+
+    subroutine time_pull(s)
+        implicit none
+
+        character(len=*), intent(in):: s
+        real(dp):: elap
+        character(len = 2048):: msg
+
+        real(dp):: omp_get_wtime
+
+        if(tcnt > 1) then
+            elap = omp_get_wtime() - times(tcnt-1)
+            tcnt = tcnt-1
+            write(msg, "(3a, ': ', e14.6E2, ' s')") repeat('-', tcnt), '> ', s, elap
+            write(*, *) '[fmm-time]', trim(msg)
+        else
+            write(*, *) 'time_pull Cannot pull any value.'
+            stop 1
+        end if
+    end subroutine
+
+end module ddx_profiling

From 63a99cf050e50199e3ef72c6c706ee95d1e87b3a Mon Sep 17 00:00:00 2001
From: Michele Nottoli <michele.nottoli@gmail.com>
Date: Mon, 15 Apr 2024 15:54:21 +0200
Subject: [PATCH 02/21] Opt.

---
 src/ddx_core.f90      | 19 +++++++++++++------
 src/ddx_operators.f90 | 21 ++++++++++-----------
 2 files changed, 23 insertions(+), 17 deletions(-)

diff --git a/src/ddx_core.f90 b/src/ddx_core.f90
index 1df85ca8..6f8dfa7d 100644
--- a/src/ddx_core.f90
+++ b/src/ddx_core.f90
@@ -1391,15 +1391,22 @@ real(dp) function hnorm(lmax, nbasis, nsph, x)
     implicit none
     integer, intent(in) :: lmax, nbasis, nsph
     real(dp),  dimension(nbasis, nsph), intent(in) :: x
-    integer :: isph
-    real(dp) :: vrms, fac
+    integer :: isph, l, ind, m
+    real(dp) :: vrms, fac, tmp
 
-    vrms = 0.0_dp
+    vrms = 0.0d0
     !$omp parallel do default(none) shared(nsph,lmax,nbasis,x) &
-    !$omp private(isph,fac) schedule(dynamic) reduction(+:vrms)
+    !$omp private(isph,tmp,l,ind,fac,m) schedule(static,1) reduction(+:vrms)
     do isph = 1, nsph
-        call hsnorm(lmax, nbasis, x(:,isph), fac)
-        vrms = vrms + fac*fac
+        tmp = 0.0d0
+        do l = 0, lmax
+            ind = l*l + l + 1
+            fac = 1.0d0/(1.0d0 + dble(l))
+            do m = -l, l
+                tmp = tmp + fac*x(ind+m,isph)*x(ind+m,isph)
+            end do
+        end do
+        vrms = vrms + tmp
     enddo
     hnorm = sqrt(vrms/dble(nsph))
 end function hnorm
diff --git a/src/ddx_operators.f90 b/src/ddx_operators.f90
index 161479ea..1c4e3188 100644
--- a/src/ddx_operators.f90
+++ b/src/ddx_operators.f90
@@ -62,20 +62,19 @@ subroutine lx(params, constants, workspace, x, y, ddx_error)
         do isph = 1, params % nsph
             iproc = omp_get_thread_num() + 1
             ! Compute NEGATIVE action of off-digonal blocks
-            call calcv(params, constants, isph, workspace % tmp_pot(:, iproc), x, &
+            call calcv(params, constants, isph, workspace % tmp_grid(:, isph), x, &
                 & workspace % tmp_work(:, iproc))
-            call ddintegrate(1, constants % nbasis, params % ngrid, &
-                & constants % vwgrid, constants % vgrid_nbasis, &
-                & workspace % tmp_pot(:, iproc), y(:, isph))
-            ! now, fix the sign.
-            y(:, isph) = - y(:, isph)
         end do
+        call ddintegrate(params % nsph, constants % nbasis, params % ngrid, &
+            & constants % vwgrid, constants % vgrid_nbasis, &
+            & workspace % tmp_grid, y)
+        y = - y
         call time_pull("lx")
     end if
 !
 !   if required, add the diagonal.
 !
-    if (constants % dodiag) then 
+    if (constants % dodiag) then
         do isph = 1, params % nsph
             do l = 0, params % lmax
                 ind = l*l + l + 1
@@ -776,11 +775,11 @@ subroutine bx(params, constants, workspace, x, y, ddx_error)
         !$omp private(isph,iproc) schedule(dynamic)
         do isph = 1, params % nsph
           iproc = omp_get_thread_num() + 1
-          call calcv2_lpb(params, constants, isph, workspace % tmp_pot(:, iproc), x)
-          call ddintegrate(1, constants % nbasis, params % ngrid, constants % vwgrid, &
-              & constants % vgrid_nbasis, workspace % tmp_pot(:, iproc), y(:,isph))
-          y(:,isph) = - y(:,isph) 
+          call calcv2_lpb(params, constants, isph, workspace % tmp_grid(:,isph), x)
         end do
+        call ddintegrate(params % nsph, constants % nbasis, params % ngrid, constants % vwgrid, &
+            & constants % vgrid_nbasis, workspace % tmp_grid, y)
+        y = - y
         call time_pull("bx")
     end if
     if (constants % dodiag) y = y + x

From 57c1fd01fef321065ee77ee33e0d959a59b4216a Mon Sep 17 00:00:00 2001
From: Michele Nottoli <michele.nottoli@gmail.com>
Date: Mon, 15 Apr 2024 17:13:44 +0200
Subject: [PATCH 03/21] Further profiling.

---
 src/ddx_operators.f90 | 13 ++++++++++++-
 src/ddx_profiling.f90 |  2 +-
 2 files changed, 13 insertions(+), 2 deletions(-)

diff --git a/src/ddx_operators.f90 b/src/ddx_operators.f90
index 1c4e3188..17a404d4 100644
--- a/src/ddx_operators.f90
+++ b/src/ddx_operators.f90
@@ -36,11 +36,14 @@ subroutine lx(params, constants, workspace, x, y, ddx_error)
     !! Local variables
     integer :: isph, jsph, ij, l, ind, iproc
 
+    call time_push()
     ! dummy operation on unused interface arguments
     if (ddx_error % flag .eq. 0) continue
 
     !! Initialize
+    call time_push()
     y = zero
+    call time_pull("zero")
 !
 !   incore code:
 !
@@ -65,11 +68,16 @@ subroutine lx(params, constants, workspace, x, y, ddx_error)
             call calcv(params, constants, isph, workspace % tmp_grid(:, isph), x, &
                 & workspace % tmp_work(:, iproc))
         end do
+        call time_pull("calcv")
+
+        call time_push()
         call ddintegrate(params % nsph, constants % nbasis, params % ngrid, &
             & constants % vwgrid, constants % vgrid_nbasis, &
             & workspace % tmp_grid, y)
+        call time_pull("integrate")
+        call time_push()
         y = - y
-        call time_pull("lx")
+        call time_pull("sign")
     end if
 !
 !   if required, add the diagonal.
@@ -83,6 +91,7 @@ subroutine lx(params, constants, workspace, x, y, ddx_error)
             end do
         end do
     end if
+    call time_pull("lx")
 end subroutine lx
 
 !> Adjoint single layer operator matvec 
@@ -1067,6 +1076,7 @@ subroutine cx(params, constants, workspace, x, y, ddx_error)
 
     real(dp), allocatable :: diff_re(:,:), diff0(:,:)
 
+    call time_push()
     allocate(diff_re(constants % nbasis, params % nsph), &
         & diff0(constants % nbasis0, params % nsph), stat=info)
     if (info.ne.0) then
@@ -1191,6 +1201,7 @@ subroutine cx(params, constants, workspace, x, y, ddx_error)
         call update_error(ddx_error, "Deallocation failed in cx")
         return
     end if
+    call time_pull("cx")
 
 end subroutine cx
 
diff --git a/src/ddx_profiling.f90 b/src/ddx_profiling.f90
index b5799d88..7e056f3c 100644
--- a/src/ddx_profiling.f90
+++ b/src/ddx_profiling.f90
@@ -38,7 +38,7 @@ subroutine time_pull(s)
             elap = omp_get_wtime() - times(tcnt-1)
             tcnt = tcnt-1
             write(msg, "(3a, ': ', e14.6E2, ' s')") repeat('-', tcnt), '> ', s, elap
-            write(*, *) '[fmm-time]', trim(msg)
+            write(*, *) '[ddX-time]', trim(msg)
         else
             write(*, *) 'time_pull Cannot pull any value.'
             stop 1

From 58493f8b64be6b98b422a5dc79e2016958d1df61 Mon Sep 17 00:00:00 2001
From: Michele Nottoli <michele.nottoli@gmail.com>
Date: Tue, 16 Apr 2024 10:50:51 +0200
Subject: [PATCH 04/21] Slight improvement.

---
 src/ddx_operators.f90 | 40 +++++++++++++++++++++++++++++++++-------
 1 file changed, 33 insertions(+), 7 deletions(-)

diff --git a/src/ddx_operators.f90 b/src/ddx_operators.f90
index 17a404d4..2caefb59 100644
--- a/src/ddx_operators.f90
+++ b/src/ddx_operators.f90
@@ -34,7 +34,8 @@ subroutine lx(params, constants, workspace, x, y, ddx_error)
     real(dp), intent(out) :: y(constants % nbasis, params % nsph)
     type(ddx_error_type), intent(inout) :: ddx_error
     !! Local variables
-    integer :: isph, jsph, ij, l, ind, iproc
+    integer :: isph, jsph, ij, l, ind, iproc, its
+    real(dp) :: vij(3), tij, vvij, xij, oij, thigh
 
     call time_push()
     ! dummy operation on unused interface arguments
@@ -60,13 +61,38 @@ subroutine lx(params, constants, workspace, x, y, ddx_error)
         end do
     else
         call time_push()
-        !$omp parallel do default(none) shared(params,constants,workspace,x,y) &
-        !$omp private(isph,iproc) schedule(dynamic)
+        thigh = one + pt5*(params % se + one)*params % eta
+        !$omp parallel do collapse(2) default(none) shared(params,constants,workspace,x,y,thigh) &
+        !$omp private(isph,iproc,its,ij,jsph,vij,vvij,tij,xij,oij) schedule(static,1)
         do isph = 1, params % nsph
-            iproc = omp_get_thread_num() + 1
-            ! Compute NEGATIVE action of off-digonal blocks
-            call calcv(params, constants, isph, workspace % tmp_grid(:, isph), x, &
-                & workspace % tmp_work(:, iproc))
+            do its = 1, params % ngrid
+                workspace % tmp_grid(its, isph) = zero
+                iproc = omp_get_thread_num() + 1
+                ! contribution from integration point present
+                if (constants % ui(its,isph).lt.one) then
+                    ! loop over neighbors of i-sphere
+                    do ij = constants % inl(isph), constants % inl(isph+1)-1
+                        jsph = constants % nl(ij)
+                        ! compute t_n^ij = | r_i + \rho_i s_n - r_j | / \rho_j
+                        vij  = params % csph(:,isph) + params % rsph(isph)* &
+                            & constants % cgrid(:,its) - params % csph(:,jsph)
+                        vvij = sqrt(vij(1)*vij(1) + vij(2)*vij(2) + vij(3)*vij(3))
+                        tij  = vvij / params % rsph(jsph)
+                        ! point is INSIDE j-sphere
+                        if (tij.lt.thigh .and. tij.gt.zero) then
+                            xij = fsw(tij, params % se, params % eta)
+                            if (constants % fi(its,isph).gt.one) then
+                                oij = xij / constants % fi(its,isph)
+                            else
+                                oij = xij
+                            end if
+                            call fmm_l2p_work(vij, params % rsph(jsph), params % lmax, &
+                                & constants % vscales_rel, oij, x(:, jsph), one, &
+                                & workspace % tmp_grid(its, isph), workspace % tmp_work(:, iproc))
+                        end if
+                    end do
+                end if
+            end do
         end do
         call time_pull("calcv")
 

From 0535cb269605ece136a8117d5fab8e1adfe07e7b Mon Sep 17 00:00:00 2001
From: Michele Nottoli <michele.nottoli@gmail.com>
Date: Tue, 16 Apr 2024 12:45:26 +0200
Subject: [PATCH 05/21] Big improvement by precomputing buried region.

---
 src/ddx_constants.f90 | 67 ++++++++++++++++++++++++++++++++++++++++++-
 src/ddx_operators.f90 | 55 +++++++++++++++++------------------
 2 files changed, 93 insertions(+), 29 deletions(-)

diff --git a/src/ddx_constants.f90 b/src/ddx_constants.f90
index 66577adc..d798b311 100644
--- a/src/ddx_constants.f90
+++ b/src/ddx_constants.f90
@@ -223,6 +223,12 @@ module ddx_constants
     !> Whether the diagonal of the matrices has to be used in the mvp for
     !! ddCOSMO, ddPCM or inner ddLPB iterations
     logical  :: dodiag
+    !> List of exposed points in a CSR format.
+    integer, allocatable :: iexposed(:)
+    integer, allocatable :: exposed(:)
+    !> List of buried points in a CSR format.
+    integer, allocatable :: iburied(:)
+    integer, allocatable :: buried(:)
 end type ddx_constants_type
 
 contains
@@ -423,7 +429,7 @@ subroutine constants_init(params, constants, ddx_error)
         end do
     end if
 
-    ! Generate geometry-related constants (required by the LPB code)
+    ! Generate geometry-related constants
     call constants_geometry_init(params, constants, ddx_error)
     if (ddx_error % flag .ne. 0) then
         call update_error(ddx_error, "constants_geometry_init returned an " // &
@@ -992,6 +998,37 @@ subroutine constants_geometry_init(params, constants, ddx_error)
             end if
         enddo
     enddo
+
+    allocate(constants % iburied(params % nsph + 1), &
+         & constants % iexposed(params % nsph + 1), &
+         & constants % buried(params % nsph * params % ngrid), &
+         & constants % exposed(params % nsph * params % ngrid), stat=info)
+    if (info .ne. 0) then
+        call update_error(ddx_error, "Buried exposed lists allocation failed")
+        return
+    endif
+    iwork = 1
+    do isph = 1, params % nsph
+        constants % iburied(isph) = iwork
+        do igrid = 1, params % ngrid
+            if (constants % ui(igrid, isph) .lt. one) then
+                write(7, *) isph, igrid
+                constants % buried(iwork) = igrid
+                iwork = iwork + 1
+            end if
+        end do
+    end do
+    constants % iburied(params % nsph + 1) = iwork
+
+    !!do isph = 1, params % nsph
+    !!    write(6,*) isph, constants % iburied(isph), constants % iburied(isph + 1) - 1
+    !!    do iwork = constants % iburied(isph), constants % iburied(isph + 1) - 1
+    !!        igrid = constants % buried(iwork)
+    !!        write(8, *) isph, igrid
+    !!    end do
+    !!end do
+    !!stop 0
+
     ! Build cavity array. At first get total count for each sphere
     allocate(constants % ncav_sph(params % nsph), stat=info)
     if (info .ne. 0) then
@@ -2144,6 +2181,34 @@ subroutine constants_free(constants, ddx_error)
                 & "deallocation failed!")
         end if
     end if
+    if (allocated(constants % iburied)) then
+        deallocate(constants % iburied, stat=istat)
+        if (istat .ne. 0) then
+            call update_error(ddx_error, "`iburied` " // &
+                & "deallocation failed!")
+        end if
+    end if
+    if (allocated(constants % buried)) then
+        deallocate(constants % buried, stat=istat)
+        if (istat .ne. 0) then
+            call update_error(ddx_error, "`buried` " // &
+                & "deallocation failed!")
+        end if
+    end if
+    if (allocated(constants % iexposed)) then
+        deallocate(constants % iexposed, stat=istat)
+        if (istat .ne. 0) then
+            call update_error(ddx_error, "`iexposed` " // &
+                & "deallocation failed!")
+        end if
+    end if
+    if (allocated(constants % exposed)) then
+        deallocate(constants % exposed, stat=istat)
+        if (istat .ne. 0) then
+            call update_error(ddx_error, "`exposed` " // &
+                & "deallocation failed!")
+        end if
+    end if
 end subroutine constants_free
 
 end module ddx_constants
diff --git a/src/ddx_operators.f90 b/src/ddx_operators.f90
index 2caefb59..9175fce8 100644
--- a/src/ddx_operators.f90
+++ b/src/ddx_operators.f90
@@ -34,7 +34,7 @@ subroutine lx(params, constants, workspace, x, y, ddx_error)
     real(dp), intent(out) :: y(constants % nbasis, params % nsph)
     type(ddx_error_type), intent(inout) :: ddx_error
     !! Local variables
-    integer :: isph, jsph, ij, l, ind, iproc, its
+    integer :: isph, jsph, ij, l, ind, iproc, its, i
     real(dp) :: vij(3), tij, vvij, xij, oij, thigh
 
     call time_push()
@@ -62,36 +62,35 @@ subroutine lx(params, constants, workspace, x, y, ddx_error)
     else
         call time_push()
         thigh = one + pt5*(params % se + one)*params % eta
-        !$omp parallel do collapse(2) default(none) shared(params,constants,workspace,x,y,thigh) &
-        !$omp private(isph,iproc,its,ij,jsph,vij,vvij,tij,xij,oij) schedule(static,1)
+        workspace % tmp_grid = zero
+        !$omp parallel do default(none) shared(params, &
+        !$omp constants,workspace,x,y,thigh) private(isph,iproc,its,ij, &
+        !$omp jsph,vij,vvij,tij,xij,oij,i) schedule(static,1)
         do isph = 1, params % nsph
-            do its = 1, params % ngrid
-                workspace % tmp_grid(its, isph) = zero
+            do i = constants % iburied(isph), constants % iburied(isph + 1) - 1
+                its = constants % buried(i)
                 iproc = omp_get_thread_num() + 1
-                ! contribution from integration point present
-                if (constants % ui(its,isph).lt.one) then
-                    ! loop over neighbors of i-sphere
-                    do ij = constants % inl(isph), constants % inl(isph+1)-1
-                        jsph = constants % nl(ij)
-                        ! compute t_n^ij = | r_i + \rho_i s_n - r_j | / \rho_j
-                        vij  = params % csph(:,isph) + params % rsph(isph)* &
-                            & constants % cgrid(:,its) - params % csph(:,jsph)
-                        vvij = sqrt(vij(1)*vij(1) + vij(2)*vij(2) + vij(3)*vij(3))
-                        tij  = vvij / params % rsph(jsph)
-                        ! point is INSIDE j-sphere
-                        if (tij.lt.thigh .and. tij.gt.zero) then
-                            xij = fsw(tij, params % se, params % eta)
-                            if (constants % fi(its,isph).gt.one) then
-                                oij = xij / constants % fi(its,isph)
-                            else
-                                oij = xij
-                            end if
-                            call fmm_l2p_work(vij, params % rsph(jsph), params % lmax, &
-                                & constants % vscales_rel, oij, x(:, jsph), one, &
-                                & workspace % tmp_grid(its, isph), workspace % tmp_work(:, iproc))
+                write(6,*) "thread", iproc, "working on", isph, its
+                do ij = constants % inl(isph), constants % inl(isph+1)-1
+                    jsph = constants % nl(ij)
+                    ! compute t_n^ij = | r_i + \rho_i s_n - r_j | / \rho_j
+                    vij  = params % csph(:,isph) + params % rsph(isph)* &
+                        & constants % cgrid(:,its) - params % csph(:,jsph)
+                    vvij = sqrt(vij(1)*vij(1) + vij(2)*vij(2) + vij(3)*vij(3))
+                    tij  = vvij / params % rsph(jsph)
+                    ! point is INSIDE j-sphere
+                    if (tij.lt.thigh .and. tij.gt.zero) then
+                        xij = fsw(tij, params % se, params % eta)
+                        if (constants % fi(its,isph).gt.one) then
+                            oij = xij / constants % fi(its,isph)
+                        else
+                            oij = xij
                         end if
-                    end do
-                end if
+                        call fmm_l2p_work(vij, params % rsph(jsph), params % lmax, &
+                            & constants % vscales_rel, oij, x(:, jsph), one, &
+                            & workspace % tmp_grid(its, isph), workspace % tmp_work(:, iproc))
+                    end if
+                end do
             end do
         end do
         call time_pull("calcv")

From a95dc8a5b3c7b2bb7754320d214a4cc1b58ac672 Mon Sep 17 00:00:00 2001
From: Michele Nottoli <michele.nottoli@gmail.com>
Date: Mon, 22 Apr 2024 16:07:40 +0200
Subject: [PATCH 06/21] Good parallel scaling.

---
 src/ddx_operators.f90 | 103 +++++++++++++++++++++++-------------------
 1 file changed, 57 insertions(+), 46 deletions(-)

diff --git a/src/ddx_operators.f90 b/src/ddx_operators.f90
index 9175fce8..f24e6538 100644
--- a/src/ddx_operators.f90
+++ b/src/ddx_operators.f90
@@ -34,21 +34,22 @@ subroutine lx(params, constants, workspace, x, y, ddx_error)
     real(dp), intent(out) :: y(constants % nbasis, params % nsph)
     type(ddx_error_type), intent(inout) :: ddx_error
     !! Local variables
-    integer :: isph, jsph, ij, l, ind, iproc, its, i
+    integer :: isph, jsph, ij, l, ind, its, i
     real(dp) :: vij(3), tij, vvij, xij, oij, thigh
 
+    integer :: vgrid_nbasis, nbasis, ngrid, nsph, lmax
+    real(dp), allocatable :: tmp_grid(:)
+    real(dp) :: se, eta, work(params % lmax + 1)
+
+    allocate(tmp_grid(params % ngrid))
+
     call time_push()
     ! dummy operation on unused interface arguments
     if (ddx_error % flag .eq. 0) continue
+    if (workspace % xs_time .eq. 0) continue
 
-    !! Initialize
-    call time_push()
-    y = zero
-    call time_pull("zero")
-!
-!   incore code:
-!
     if (params % matvecmem .eq. 1) then
+        y = zero
         !$omp parallel do default(none) shared(params,constants,x,y) &
         !$omp private(isph,ij,jsph) schedule(dynamic)
         do isph = 1, params % nsph
@@ -61,48 +62,57 @@ subroutine lx(params, constants, workspace, x, y, ddx_error)
         end do
     else
         call time_push()
+
+        ! set up the environment for a good usage of open mp
         thigh = one + pt5*(params % se + one)*params % eta
-        workspace % tmp_grid = zero
-        !$omp parallel do default(none) shared(params, &
-        !$omp constants,workspace,x,y,thigh) private(isph,iproc,its,ij, &
-        !$omp jsph,vij,vvij,tij,xij,oij,i) schedule(static,1)
-        do isph = 1, params % nsph
-            do i = constants % iburied(isph), constants % iburied(isph + 1) - 1
-                its = constants % buried(i)
-                iproc = omp_get_thread_num() + 1
-                write(6,*) "thread", iproc, "working on", isph, its
-                do ij = constants % inl(isph), constants % inl(isph+1)-1
-                    jsph = constants % nl(ij)
-                    ! compute t_n^ij = | r_i + \rho_i s_n - r_j | / \rho_j
-                    vij  = params % csph(:,isph) + params % rsph(isph)* &
-                        & constants % cgrid(:,its) - params % csph(:,jsph)
-                    vvij = sqrt(vij(1)*vij(1) + vij(2)*vij(2) + vij(3)*vij(3))
-                    tij  = vvij / params % rsph(jsph)
-                    ! point is INSIDE j-sphere
-                    if (tij.lt.thigh .and. tij.gt.zero) then
-                        xij = fsw(tij, params % se, params % eta)
-                        if (constants % fi(its,isph).gt.one) then
-                            oij = xij / constants % fi(its,isph)
-                        else
-                            oij = xij
+        nsph = params % nsph
+        se = params % se
+        eta = params % eta
+        lmax = params % lmax
+        nbasis = constants % nbasis
+        ngrid = params % ngrid
+        vgrid_nbasis = constants % vgrid_nbasis
+
+        associate(iburied => constants % iburied, buried => constants % buried, &
+                & inl => constants % inl, nl => constants % nl, &
+                & csph => params % csph, rsph => params % rsph, &
+                & cgrid => constants % cgrid, fi => constants % fi, &
+                & vscales_rel => constants % vscales_rel, &
+                & vwgrid => constants % vwgrid)
+
+            !$omp parallel do default(none) &
+            !$omp firstprivate(nsph,thigh,se,eta,lmax,nbasis,ngrid,vgrid_nbasis) &
+            !$omp shared(iburied,buried,inl,nl,csph,rsph,fi,x,y,vscales_rel,vwgrid) &
+            !$omp private(isph,its,ij,jsph,vij,vvij,tij,xij,oij,i,work,tmp_grid) &
+            !$omp schedule(dynamic,1)
+            do isph = 1, nsph
+                tmp_grid(:) = 0.0d0
+                do i = iburied(isph), iburied(isph + 1) - 1
+                    its = buried(i)
+                    do ij = inl(isph), inl(isph+1)-1
+                        jsph = nl(ij)
+                        vij  = csph(:,isph) + rsph(isph)*cgrid(:,its) - csph(:,jsph)
+                        vvij = sqrt(vij(1)*vij(1) + vij(2)*vij(2) + vij(3)*vij(3))
+                        tij  = vvij / rsph(jsph)
+                        if (tij.lt.thigh .and. tij.gt.0.0d0) then
+                            xij = fsw(tij, se, eta)
+                            if (fi(its, isph).gt.1.0d0) then
+                                oij = xij / fi(its, isph)
+                            else
+                                oij = xij
+                            end if
+                            call fmm_l2p_work(vij, rsph(jsph), lmax, &
+                                & vscales_rel, oij, x(:, jsph), 1.0d0, &
+                                & tmp_grid(its), work)
                         end if
-                        call fmm_l2p_work(vij, params % rsph(jsph), params % lmax, &
-                            & constants % vscales_rel, oij, x(:, jsph), one, &
-                            & workspace % tmp_grid(its, isph), workspace % tmp_work(:, iproc))
-                    end if
+                    end do
                 end do
+                call ddintegrate(1, nbasis, ngrid, vwgrid, vgrid_nbasis, &
+                    & tmp_grid, y(:, isph))
+                y(:, isph) = - y(:, isph)
             end do
-        end do
-        call time_pull("calcv")
-
-        call time_push()
-        call ddintegrate(params % nsph, constants % nbasis, params % ngrid, &
-            & constants % vwgrid, constants % vgrid_nbasis, &
-            & workspace % tmp_grid, y)
-        call time_pull("integrate")
-        call time_push()
-        y = - y
-        call time_pull("sign")
+        end associate
+        call time_pull("calcv+integrate+sign")
     end if
 !
 !   if required, add the diagonal.
@@ -117,6 +127,7 @@ subroutine lx(params, constants, workspace, x, y, ddx_error)
         end do
     end if
     call time_pull("lx")
+    deallocate(tmp_grid)
 end subroutine lx
 
 !> Adjoint single layer operator matvec 

From 05a61c266e0539026e66e249b18f4007ec7a6df8 Mon Sep 17 00:00:00 2001
From: Michele Nottoli <michele.nottoli@gmail.com>
Date: Mon, 22 Apr 2024 16:55:45 +0200
Subject: [PATCH 07/21] Fixed a bug.

---
 src/ddx_operators.f90 | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/ddx_operators.f90 b/src/ddx_operators.f90
index f24e6538..051c5e8a 100644
--- a/src/ddx_operators.f90
+++ b/src/ddx_operators.f90
@@ -82,8 +82,8 @@ subroutine lx(params, constants, workspace, x, y, ddx_error)
 
             !$omp parallel do default(none) &
             !$omp firstprivate(nsph,thigh,se,eta,lmax,nbasis,ngrid,vgrid_nbasis) &
-            !$omp shared(iburied,buried,inl,nl,csph,rsph,fi,x,y,vscales_rel,vwgrid) &
-            !$omp private(isph,its,ij,jsph,vij,vvij,tij,xij,oij,i,work,tmp_grid) &
+            !$omp shared(buried,iburied,inl,nl,csph,rsph,cgrid,fi,vscales_rel,x,y,vwgrid) &
+            !$omp private(isph,tmp_grid,i,its,ij,jsph,vij,vvij,tij,xij,oij,work) &
             !$omp schedule(dynamic,1)
             do isph = 1, nsph
                 tmp_grid(:) = 0.0d0

From 13566a1a3158d957918797a007aaaf569fdd447e Mon Sep 17 00:00:00 2001
From: Michele Nottoli <michele.nottoli@gmail.com>
Date: Tue, 23 Apr 2024 15:47:21 +0200
Subject: [PATCH 08/21] lx completely parallel.

---
 src/ddx_operators.f90 | 71 +++++++++++++++++++++++--------------------
 1 file changed, 38 insertions(+), 33 deletions(-)

diff --git a/src/ddx_operators.f90 b/src/ddx_operators.f90
index 051c5e8a..5367e874 100644
--- a/src/ddx_operators.f90
+++ b/src/ddx_operators.f90
@@ -41,35 +41,36 @@ subroutine lx(params, constants, workspace, x, y, ddx_error)
     real(dp), allocatable :: tmp_grid(:)
     real(dp) :: se, eta, work(params % lmax + 1)
 
+    call time_push()
     allocate(tmp_grid(params % ngrid))
 
-    call time_push()
     ! dummy operation on unused interface arguments
     if (ddx_error % flag .eq. 0) continue
     if (workspace % xs_time .eq. 0) continue
 
+    nsph = params % nsph
+    nbasis = constants % nbasis
+    lmax = params % lmax
+
     if (params % matvecmem .eq. 1) then
-        y = zero
-        !$omp parallel do default(none) shared(params,constants,x,y) &
-        !$omp private(isph,ij,jsph) schedule(dynamic)
-        do isph = 1, params % nsph
-            do ij = constants % inl(isph), constants % inl(isph + 1) - 1
-                jsph = constants % nl(ij)
-                call dgemv('n', constants % nbasis, constants % nbasis, one, &
-                    & constants % l(:,:,ij), constants % nbasis, x(:,jsph), 1, &
-                    & one, y(:,isph), 1)
+        associate(inl => constants % inl, nl => constants % nl, &
+                & l => constants % l)
+            !$omp parallel do default(none) shared(inl,nl,l,x,y) &
+            !$omp firstprivate(nsph,nbasis) &
+            !$omp private(isph,ij,jsph) schedule(dynamic,1)
+            do isph = 1, nsph
+                y(:,isph) = 0.0d0
+                do ij = inl(isph), inl(isph + 1) - 1
+                    jsph = nl(ij)
+                    call dgemv('n', nbasis, nbasis, 1.0d0, l(:,:,ij), &
+                        & nbasis, x(:,jsph), 1, 1.0d0, y(:,isph), 1)
+                end do
             end do
-        end do
+        end associate
     else
-        call time_push()
-
-        ! set up the environment for a good usage of open mp
         thigh = one + pt5*(params % se + one)*params % eta
-        nsph = params % nsph
         se = params % se
         eta = params % eta
-        lmax = params % lmax
-        nbasis = constants % nbasis
         ngrid = params % ngrid
         vgrid_nbasis = constants % vgrid_nbasis
 
@@ -80,10 +81,10 @@ subroutine lx(params, constants, workspace, x, y, ddx_error)
                 & vscales_rel => constants % vscales_rel, &
                 & vwgrid => constants % vwgrid)
 
-            !$omp parallel do default(none) &
-            !$omp firstprivate(nsph,thigh,se,eta,lmax,nbasis,ngrid,vgrid_nbasis) &
-            !$omp shared(buried,iburied,inl,nl,csph,rsph,cgrid,fi,vscales_rel,x,y,vwgrid) &
-            !$omp private(isph,tmp_grid,i,its,ij,jsph,vij,vvij,tij,xij,oij,work) &
+            !$omp parallel do default(none) firstprivate(nsph,thigh,se,eta, &
+            !$omp lmax,nbasis,ngrid,vgrid_nbasis) shared(buried,iburied,inl, &
+            !$omp nl,csph,rsph,cgrid,fi,vscales_rel,x,y,vwgrid) private( &
+            !$omp isph,tmp_grid,i,its,ij,jsph,vij,vvij,tij,xij,oij,work) &
             !$omp schedule(dynamic,1)
             do isph = 1, nsph
                 tmp_grid(:) = 0.0d0
@@ -91,8 +92,10 @@ subroutine lx(params, constants, workspace, x, y, ddx_error)
                     its = buried(i)
                     do ij = inl(isph), inl(isph+1)-1
                         jsph = nl(ij)
-                        vij  = csph(:,isph) + rsph(isph)*cgrid(:,its) - csph(:,jsph)
-                        vvij = sqrt(vij(1)*vij(1) + vij(2)*vij(2) + vij(3)*vij(3))
+                        vij = csph(:,isph) + rsph(isph)*cgrid(:,its) &
+                            & - csph(:,jsph)
+                        vvij = sqrt(vij(1)*vij(1) + vij(2)*vij(2) &
+                            & + vij(3)*vij(3))
                         tij  = vvij / rsph(jsph)
                         if (tij.lt.thigh .and. tij.gt.0.0d0) then
                             xij = fsw(tij, se, eta)
@@ -112,19 +115,21 @@ subroutine lx(params, constants, workspace, x, y, ddx_error)
                 y(:, isph) = - y(:, isph)
             end do
         end associate
-        call time_pull("calcv+integrate+sign")
     end if
-!
-!   if required, add the diagonal.
-!
+
+    ! if required, add the diagonal.
     if (constants % dodiag) then
-        do isph = 1, params % nsph
-            do l = 0, params % lmax
-                ind = l*l + l + 1
-                y(ind-l:ind+l, isph) = y(ind-l:ind+l, isph) + &
-                     x(ind-l:ind+l, isph) / (constants % vscales(ind)**2)
+        associate(vscales => constants % vscales)
+            !$omp parallel do default(none) private(isph,l,ind) &
+            !$omp firstprivate(nsph,lmax) shared(x,y,vscales)
+            do isph = 1, nsph
+                do l = 0, lmax
+                    ind = l*l + l + 1
+                    y(ind-l:ind+l, isph) = y(ind-l:ind+l, isph) &
+                         & + x(ind-l:ind+l, isph) / (vscales(ind)**2)
+                end do
             end do
-        end do
+        end associate
     end if
     call time_pull("lx")
     deallocate(tmp_grid)

From ca8108c500a3072d98605bc4051c3108b63619eb Mon Sep 17 00:00:00 2001
From: Michele Nottoli <michele.nottoli@gmail.com>
Date: Tue, 23 Apr 2024 17:15:39 +0200
Subject: [PATCH 09/21] Parallelized adjoint.

---
 src/ddx.f90           |   1 +
 src/ddx_operators.f90 | 169 ++++++++++++++++++++++++++++--------------
 src/ddx_solvers.f90   |   2 +-
 3 files changed, 117 insertions(+), 55 deletions(-)

diff --git a/src/ddx.f90 b/src/ddx.f90
index b9ee78ed..ba424b4d 100644
--- a/src/ddx.f90
+++ b/src/ddx.f90
@@ -411,6 +411,7 @@ subroutine ddrun(ddx_data, state, electrostatics, psi, tol, esolv, &
             return
         end if
     end if
+
     call time_push()
     call solve(ddx_data % params, ddx_data % constants, &
         & ddx_data % workspace, state, tol, ddx_error)
diff --git a/src/ddx_operators.f90 b/src/ddx_operators.f90
index 5367e874..094b3259 100644
--- a/src/ddx_operators.f90
+++ b/src/ddx_operators.f90
@@ -34,8 +34,8 @@ subroutine lx(params, constants, workspace, x, y, ddx_error)
     real(dp), intent(out) :: y(constants % nbasis, params % nsph)
     type(ddx_error_type), intent(inout) :: ddx_error
     !! Local variables
-    integer :: isph, jsph, ij, l, ind, its, i
-    real(dp) :: vij(3), tij, vvij, xij, oij, thigh
+    integer :: isph, jsph, ij, l, ind, its, i, m
+    real(dp) :: vij(3), tij, vvij, xij, oij, thigh, fac
 
     integer :: vgrid_nbasis, nbasis, ngrid, nsph, lmax
     real(dp), allocatable :: tmp_grid(:)
@@ -120,13 +120,16 @@ subroutine lx(params, constants, workspace, x, y, ddx_error)
     ! if required, add the diagonal.
     if (constants % dodiag) then
         associate(vscales => constants % vscales)
-            !$omp parallel do default(none) private(isph,l,ind) &
-            !$omp firstprivate(nsph,lmax) shared(x,y,vscales)
+            !$omp parallel do collapse(2) default(none) &
+            !$omp shared(vscales,x,y) firstprivate(nsph,lmax) &
+            !$omp private(isph,l,ind,m,fac) schedule(static,10)
             do isph = 1, nsph
                 do l = 0, lmax
                     ind = l*l + l + 1
-                    y(ind-l:ind+l, isph) = y(ind-l:ind+l, isph) &
-                         & + x(ind-l:ind+l, isph) / (vscales(ind)**2)
+                    fac = vscales(ind)**2
+                    do m = -l, l
+                        y(ind+m, isph) = y(ind+m, isph) + x(ind+m, isph)/fac
+                    end do
                 end do
             end do
         end associate
@@ -135,7 +138,7 @@ subroutine lx(params, constants, workspace, x, y, ddx_error)
     deallocate(tmp_grid)
 end subroutine lx
 
-!> Adjoint single layer operator matvec 
+!> Adjoint single layer operator matvec
 subroutine lstarx(params, constants, workspace, x, y, ddx_error)
     implicit none
     !! Inputs
@@ -148,55 +151,105 @@ subroutine lstarx(params, constants, workspace, x, y, ddx_error)
     real(dp), intent(out) :: y(constants % nbasis, params % nsph)
     type(ddx_error_type), intent(inout) :: ddx_error
     !! Local variables
-    integer :: isph, jsph, ij, indmat, igrid, l, ind, iproc
+    integer :: isph, jsph, ij, indmat, igrid, l, ind, m
+    real(dp) :: vji(3), vvji, tji, xji, oji, fac
+
+    integer :: nsph, ngrid, nbasis, lmax
+    real(dp) :: thigh, se, eta, work(params % lmax + 1)
+
+    call time_push()
 
     ! dummy operation on unused interface arguments
     if (ddx_error % flag .eq. 0) continue
 
-    y = zero
+    nsph = params % nsph
+    nbasis = constants % nbasis
+
     if (params % matvecmem .eq. 1) then
-        !$omp parallel do default(none) shared(params,constants,x,y) &
-        !$omp private(isph,ij,jsph,indmat) schedule(dynamic)
-        do isph = 1, params % nsph
-            do ij = constants % inl(isph), constants % inl(isph + 1) - 1
-                jsph = constants % nl(ij)
-                indmat = constants % itrnl(ij)
-                call dgemv('t', constants % nbasis, constants % nbasis, one, &
-                    & constants % l(:,:,indmat), constants % nbasis, x(:,jsph), 1, &
-                    & one, y(:,isph), 1)
+        associate(inl => constants % inl, nl => constants % nl, &
+                & itrnl => constants % itrnl, l => constants % l)
+            !$omp parallel do default(none) schedule(dynamic,1) &
+            !$omp firstprivate(nsph,nbasis) shared(x,y,inl,nl,itrnl,l) &
+            !$omp private(isph,ij,jsph,indmat)
+            do isph = 1, nsph
+                y(:,isph) = 0.0d0
+                do ij = inl(isph), inl(isph + 1) - 1
+                    jsph = nl(ij)
+                    indmat = itrnl(ij)
+                    call dgemv('t', nbasis, nbasis, 1.0d0, &
+                        & l(:,:,indmat), nbasis, x(:,jsph), 1, &
+                        & 1.0d0, y(:,isph), 1)
+                end do
             end do
-        end do
+        end associate
     else
+        ngrid = params % ngrid
+        thigh = one + (params % se+one)/two*params % eta
+        se = params % se
+        eta = params % eta
+        lmax = params % lmax
         ! Expand x over spherical harmonics
-        !$omp parallel do default(none) shared(params,constants,workspace,x) &
-        !$omp private(isph,igrid) schedule(dynamic)
-        do isph = 1, params % nsph
-            do igrid = 1, params % ngrid
-                workspace % tmp_grid(igrid, isph) = dot_product(x(:, isph), &
-                    & constants % vgrid(:constants % nbasis, igrid))
+        associate(vgrid => constants % vgrid, tmp_grid => workspace % tmp_grid)
+            !$omp parallel do collapse(2) default(none) shared(x,tmp_grid) &
+            !$omp firstprivate(nsph,ngrid,nbasis) &
+            !$omp private(isph,igrid) schedule(static,100)
+            do isph = 1, nsph
+                do igrid = 1, ngrid
+                    tmp_grid(igrid,isph) = dot_product(x(:, isph), vgrid(:nbasis, igrid))
+                end do
             end do
-        end do
-        ! Compute (negative) action
-        !$omp parallel do default(none) shared(params,constants,workspace,x,y) &
-        !$omp private(isph,iproc) schedule(dynamic)
-        do isph = 1, params % nsph
-            iproc = omp_get_thread_num() + 1
-            call adjrhs(params, constants, isph, workspace % tmp_grid, &
-                & y(:, isph), workspace % tmp_work(:, iproc))
-            ! fix the sign
-            y(:, isph) = - y(:, isph)
-        end do
+        end associate
+        associate(inl => constants % inl, nl => constants % nl, &
+                & csph => params % csph, rsph => params % rsph, &
+                & cgrid => constants % cgrid, fi => constants % fi, &
+                & wgrid => constants % wgrid, &
+                & tmp_grid => workspace % tmp_grid, &
+                & vscales_rel => constants % vscales_rel)
+            !$omp parallel do default(none) schedule(dynamic,1) &
+            !$omp shared(inl,nl,csph,rsph,cgrid,fi,wgrid,tmp_grid,y) &
+            !$omp firstprivate(nsph,ngrid,thigh,se,eta,lmax) &
+            !$omp private(isph,ij,jsph,igrid,vji,vvji,tji,xji,oji,fac,work)
+            do isph = 1, nsph
+                do ij = inl(isph), inl(isph+1)-1
+                    jsph = nl(ij)
+                    do igrid = 1, ngrid
+                        vji = csph(:,jsph) + rsph(jsph)*cgrid(:,igrid) - csph(:,isph)
+                        vvji = sqrt(vji(1)*vji(1) + vji(2)*vji(2) + vji(3)*vji(3))
+                        tji  = vvji/rsph(isph)
+                        if (tji.lt.thigh) then
+                            xji = fsw(tji, se, eta)
+                            if (fi(igrid,jsph).gt.1.0d0) then
+                                oji = xji/fi(igrid,jsph)
+                            else
+                                oji = xji
+                            endif
+                            fac = wgrid(igrid) * tmp_grid(igrid, jsph) * oji
+                            call fmm_l2p_adj_work(vji, fac, rsph(isph), &
+                                & lmax, vscales_rel, 1.0d0, y(:, isph), work)
+                        endif
+                    end do
+                end do
+                y(:, isph) = - y(:, isph)
+            end do
+        end associate
     end if
     if (constants % dodiag) then
-    ! Loop over harmonics
-        do isph = 1, params % nsph
-            do l = 0, params % lmax
-                ind = l*l + l + 1
-                y(ind-l:ind+l, isph) = y(ind-l:ind+l, isph) + &
-                    & x(ind-l:ind+l, isph) / (constants % vscales(ind)**2)
+        associate(vscales => constants % vscales)
+            !$omp parallel do collapse(2) default(none) &
+            !$omp shared(vscales,x,y) firstprivate(nsph,lmax) &
+            !$omp private(isph,l,ind,m,fac) schedule(static,10)
+            do isph = 1, nsph
+                do l = 0, lmax
+                    ind = l*l + l + 1
+                    fac = vscales(ind)**2
+                    do m = -l, l
+                        y(ind+m, isph) = y(ind+m, isph) + x(ind+m, isph)/fac
+                    end do
+                end do
             end do
-        end do
+        end associate
     end if
+    call time_pull("lstarx")
 end subroutine lstarx
 
 !> Diagonal preconditioning for Lx operator
@@ -214,22 +267,30 @@ subroutine ldm1x(params, constants, workspace, x, y, ddx_error)
     real(dp), intent(out) :: y(constants % nbasis, params % nsph)
     type(ddx_error_type), intent(inout) :: ddx_error
     !! Local variables
-    integer :: isph, l, ind
+    integer :: isph, l, ind, nsph, lmax, m
+    real(dp) :: fac
 
     ! dummy operation on unused interface arguments
     if ((ddx_error % flag .eq. 0) .or. &
-   &    (allocated(workspace % tmp_pot))) continue
+        & (allocated(workspace % tmp_pot))) continue
 
-    !! Loop over harmonics
-    !$omp parallel do default(none) shared(params,constants,x,y) &
-    !$omp private(isph,l,ind) schedule(dynamic)
-    do isph = 1, params % nsph
-        do l = 0, params % lmax
-            ind = l*l + l + 1
-            y(ind-l:ind+l, isph) = x(ind-l:ind+l, isph) &
-                & *(constants % vscales(ind)**2)
+    nsph = params % nsph
+    lmax = params % lmax
+
+    associate(vscales => constants % vscales)
+        !$omp parallel do collapse(2) default(none) &
+        !$omp shared(vscales,x,y) firstprivate(nsph,lmax) &
+        !$omp private(isph,l,ind,m,fac) schedule(static,10)
+        do isph = 1, nsph
+            do l = 0, lmax
+                ind = l*l + l + 1
+                fac = vscales(ind)**2
+                do m = -l, l
+                    y(ind+m, isph) = x(ind+m, isph)*fac
+                end do
+            end do
         end do
-    end do
+    end associate
 end subroutine ldm1x
 
 !> Double layer operator matvec without diagonal blocks
diff --git a/src/ddx_solvers.f90 b/src/ddx_solvers.f90
index 23ee70b0..7e5806d0 100644
--- a/src/ddx_solvers.f90
+++ b/src/ddx_solvers.f90
@@ -207,7 +207,7 @@ subroutine diis(n, nmat, ndiis, x, e, b, xnew)
     end do
     nmat = nmat + 1
 
-    deallocate (bloc, cex, stat=istatus)
+    deallocate (bloc, cex, ipiv, stat=istatus)
 end subroutine diis
 
 !> DIIS helper routine

From 14b884ff3d7ab25ae7ec33fcb04a3f9c6ea4b709 Mon Sep 17 00:00:00 2001
From: Michele Nottoli <michele.nottoli@gmail.com>
Date: Tue, 23 Apr 2024 17:18:45 +0200
Subject: [PATCH 10/21] Bug fix.

---
 src/ddx_constants.f90 | 3 +++
 src/ddx_operators.f90 | 1 +
 2 files changed, 4 insertions(+)

diff --git a/src/ddx_constants.f90 b/src/ddx_constants.f90
index d798b311..ad0a6d18 100644
--- a/src/ddx_constants.f90
+++ b/src/ddx_constants.f90
@@ -229,6 +229,9 @@ module ddx_constants
     !> List of buried points in a CSR format.
     integer, allocatable :: iburied(:)
     integer, allocatable :: buried(:)
+    !> List of points in overlapped balls
+    integer, allocatable :: igrid_overlap(:,:,:)
+    integer, allocatable :: grid_overlap(:)
 end type ddx_constants_type
 
 contains
diff --git a/src/ddx_operators.f90 b/src/ddx_operators.f90
index 094b3259..86dbe62c 100644
--- a/src/ddx_operators.f90
+++ b/src/ddx_operators.f90
@@ -210,6 +210,7 @@ subroutine lstarx(params, constants, workspace, x, y, ddx_error)
             !$omp firstprivate(nsph,ngrid,thigh,se,eta,lmax) &
             !$omp private(isph,ij,jsph,igrid,vji,vvji,tji,xji,oji,fac,work)
             do isph = 1, nsph
+                y(:,isph) = 0.0d0
                 do ij = inl(isph), inl(isph+1)-1
                     jsph = nl(ij)
                     do igrid = 1, ngrid

From 3ae4892065c81f186fd3570178e906f8ef6d6591 Mon Sep 17 00:00:00 2001
From: Michele Nottoli <michele.nottoli@gmail.com>
Date: Mon, 13 May 2024 11:38:44 +0200
Subject: [PATCH 11/21] Update.

---
 src/ddx_constants.f90 |  5 +----
 src/ddx_operators.f90 | 15 +++++++++------
 2 files changed, 10 insertions(+), 10 deletions(-)

diff --git a/src/ddx_constants.f90 b/src/ddx_constants.f90
index ad0a6d18..cce47822 100644
--- a/src/ddx_constants.f90
+++ b/src/ddx_constants.f90
@@ -229,9 +229,6 @@ module ddx_constants
     !> List of buried points in a CSR format.
     integer, allocatable :: iburied(:)
     integer, allocatable :: buried(:)
-    !> List of points in overlapped balls
-    integer, allocatable :: igrid_overlap(:,:,:)
-    integer, allocatable :: grid_overlap(:)
 end type ddx_constants_type
 
 contains
@@ -1015,7 +1012,7 @@ subroutine constants_geometry_init(params, constants, ddx_error)
         constants % iburied(isph) = iwork
         do igrid = 1, params % ngrid
             if (constants % ui(igrid, isph) .lt. one) then
-                write(7, *) isph, igrid
+                !write(7, *) isph, igrid
                 constants % buried(iwork) = igrid
                 iwork = iwork + 1
             end if
diff --git a/src/ddx_operators.f90 b/src/ddx_operators.f90
index 86dbe62c..d2f68348 100644
--- a/src/ddx_operators.f90
+++ b/src/ddx_operators.f90
@@ -97,7 +97,7 @@ subroutine lx(params, constants, workspace, x, y, ddx_error)
                         vvij = sqrt(vij(1)*vij(1) + vij(2)*vij(2) &
                             & + vij(3)*vij(3))
                         tij  = vvij / rsph(jsph)
-                        if (tij.lt.thigh .and. tij.gt.0.0d0) then
+                        if (tij.lt.thigh) then
                             xij = fsw(tij, se, eta)
                             if (fi(its, isph).gt.1.0d0) then
                                 oij = xij / fi(its, isph)
@@ -151,7 +151,7 @@ subroutine lstarx(params, constants, workspace, x, y, ddx_error)
     real(dp), intent(out) :: y(constants % nbasis, params % nsph)
     type(ddx_error_type), intent(inout) :: ddx_error
     !! Local variables
-    integer :: isph, jsph, ij, indmat, igrid, l, ind, m
+    integer :: isph, jsph, ij, indmat, igrid, l, ind, m, i
     real(dp) :: vji(3), vvji, tji, xji, oji, fac
 
     integer :: nsph, ngrid, nbasis, lmax
@@ -204,16 +204,19 @@ subroutine lstarx(params, constants, workspace, x, y, ddx_error)
                 & cgrid => constants % cgrid, fi => constants % fi, &
                 & wgrid => constants % wgrid, &
                 & tmp_grid => workspace % tmp_grid, &
-                & vscales_rel => constants % vscales_rel)
+                & vscales_rel => constants % vscales_rel, &
+                & iburied => constants % iburied, &
+                & buried => constants % buried)
             !$omp parallel do default(none) schedule(dynamic,1) &
-            !$omp shared(inl,nl,csph,rsph,cgrid,fi,wgrid,tmp_grid,y) &
+            !$omp shared(inl,nl,csph,rsph,cgrid,fi,wgrid,tmp_grid,y,buried,iburied) &
             !$omp firstprivate(nsph,ngrid,thigh,se,eta,lmax) &
-            !$omp private(isph,ij,jsph,igrid,vji,vvji,tji,xji,oji,fac,work)
+            !$omp private(isph,ij,jsph,igrid,vji,vvji,tji,xji,oji,fac,work,i)
             do isph = 1, nsph
                 y(:,isph) = 0.0d0
                 do ij = inl(isph), inl(isph+1)-1
                     jsph = nl(ij)
-                    do igrid = 1, ngrid
+                    do i = iburied(jsph), iburied(jsph + 1) - 1
+                        igrid = buried(i)
                         vji = csph(:,jsph) + rsph(jsph)*cgrid(:,igrid) - csph(:,isph)
                         vvji = sqrt(vji(1)*vji(1) + vji(2)*vji(2) + vji(3)*vji(3))
                         tji  = vvji/rsph(isph)

From a87cc545a534c14825a454a7aa24ce7038a9238a Mon Sep 17 00:00:00 2001
From: Michele Nottoli <michele.nottoli@gmail.com>
Date: Mon, 13 May 2024 15:33:27 +0200
Subject: [PATCH 12/21] Precomputing overlaps cuts LX time by 1/2.

---
 src/ddx_constants.f90 | 49 +++++++++++++++++++++++++++++++++++++++++--
 src/ddx_operators.f90 | 36 ++++++++++++++++---------------
 2 files changed, 66 insertions(+), 19 deletions(-)

diff --git a/src/ddx_constants.f90 b/src/ddx_constants.f90
index cce47822..dca0d29f 100644
--- a/src/ddx_constants.f90
+++ b/src/ddx_constants.f90
@@ -229,6 +229,9 @@ module ddx_constants
     !> List of buried points in a CSR format.
     integer, allocatable :: iburied(:)
     integer, allocatable :: buried(:)
+    !> List of overlapped points in a CSR format.
+    integer, allocatable :: ioverlap(:)
+    integer, allocatable :: overlap(:)
 end type ddx_constants_type
 
 contains
@@ -779,9 +782,9 @@ subroutine constants_geometry_init(params, constants, ddx_error)
     !! Local variables
     real(dp) :: swthr, v(3), maxv, ssqv, vv, t
     integer :: i, isph, jsph, inear, igrid, iwork, jwork, lwork, &
-        & old_lwork, icav, info
+        & old_lwork, icav, info, ij, ijgrid
     integer, allocatable :: work(:, :), tmp_work(:, :)
-    real(dp) :: start_time
+    real(dp) :: start_time, thigh, vij(3), vvij, tij
     !! The code
     ! Prepare FMM structures if needed
     start_time = omp_get_wtime()
@@ -1029,6 +1032,48 @@ subroutine constants_geometry_init(params, constants, ddx_error)
     !!end do
     !!stop 0
 
+    allocate(constants % ioverlap(constants % inl(params % nsph+1)), &
+        & constants % overlap(constants % inl(params % nsph+1) &
+        & *params % ngrid), stat=info)
+    if (info .ne. 0) then
+        call update_error(ddx_error, "Overlapped grid lists allocation failed")
+        return
+    end if
+    thigh = one + pt5*(params % se + one)*params % eta
+    iwork = 1
+    do isph = 1, params % nsph
+        do ij = constants % inl(isph), constants % inl(isph+1) - 1
+            jsph = constants % nl(ij)
+            constants % ioverlap(ij) = iwork
+            do igrid = 1, params % ngrid
+                if (constants % ui(igrid, isph) .lt. one) then
+                    vij = params % csph(:,isph) &
+                        & + params % rsph(isph)*constants % cgrid(:,igrid) &
+                        & - params % csph(:,jsph)
+                    vvij = sqrt(vij(1)*vij(1) + vij(2)*vij(2) &
+                        & + vij(3)*vij(3))
+                    tij  = vvij / params % rsph(jsph)
+                    if (tij.lt.thigh) then
+                        constants % overlap(iwork) = igrid
+                        iwork = iwork + 1
+                    end if
+                end if
+            end do
+        end do
+    end do
+    constants % ioverlap(ij) = iwork
+
+    !do isph = 1, params % nsph
+    !    do ij = constants % inl(isph), constants % inl(isph+1) - 1
+    !        jsph = constants % nl(ij)
+    !        do ijgrid = constants % ioverlap(ij), constants % ioverlap(ij+1) - 1
+    !            igrid = constants % overlap(ijgrid)
+    !            write(6,*) isph, jsph, igrid
+    !        end do
+    !    end do
+    !end do
+    !stop 0
+
     ! Build cavity array. At first get total count for each sphere
     allocate(constants % ncav_sph(params % nsph), stat=info)
     if (info .ne. 0) then
diff --git a/src/ddx_operators.f90 b/src/ddx_operators.f90
index d2f68348..113391e6 100644
--- a/src/ddx_operators.f90
+++ b/src/ddx_operators.f90
@@ -34,7 +34,7 @@ subroutine lx(params, constants, workspace, x, y, ddx_error)
     real(dp), intent(out) :: y(constants % nbasis, params % nsph)
     type(ddx_error_type), intent(inout) :: ddx_error
     !! Local variables
-    integer :: isph, jsph, ij, l, ind, its, i, m
+    integer :: isph, jsph, ij, l, ind, its, i, m, ijgrid
     real(dp) :: vij(3), tij, vvij, xij, oij, thigh, fac
 
     integer :: vgrid_nbasis, nbasis, ngrid, nsph, lmax
@@ -79,35 +79,37 @@ subroutine lx(params, constants, workspace, x, y, ddx_error)
                 & csph => params % csph, rsph => params % rsph, &
                 & cgrid => constants % cgrid, fi => constants % fi, &
                 & vscales_rel => constants % vscales_rel, &
-                & vwgrid => constants % vwgrid)
+                & vwgrid => constants % vwgrid, &
+                & ioverlap => constants % ioverlap, &
+                & overlap => constants % overlap)
 
             !$omp parallel do default(none) firstprivate(nsph,thigh,se,eta, &
             !$omp lmax,nbasis,ngrid,vgrid_nbasis) shared(buried,iburied,inl, &
             !$omp nl,csph,rsph,cgrid,fi,vscales_rel,x,y,vwgrid) private( &
-            !$omp isph,tmp_grid,i,its,ij,jsph,vij,vvij,tij,xij,oij,work) &
+            !$omp isph,tmp_grid,i,its,ij,jsph,vij,vvij,tij,xij,oij,work,ijgrid) &
             !$omp schedule(dynamic,1)
             do isph = 1, nsph
                 tmp_grid(:) = 0.0d0
-                do i = iburied(isph), iburied(isph + 1) - 1
-                    its = buried(i)
-                    do ij = inl(isph), inl(isph+1)-1
-                        jsph = nl(ij)
+                do ij = inl(isph), inl(isph+1)-1
+                    jsph = nl(ij)
+                !do i = iburied(isph), iburied(isph + 1) - 1
+                !its = buried(i)
+                    do ijgrid = ioverlap(ij), ioverlap(ij+1) - 1
+                        its = overlap(ijgrid)
                         vij = csph(:,isph) + rsph(isph)*cgrid(:,its) &
                             & - csph(:,jsph)
                         vvij = sqrt(vij(1)*vij(1) + vij(2)*vij(2) &
                             & + vij(3)*vij(3))
                         tij  = vvij / rsph(jsph)
-                        if (tij.lt.thigh) then
-                            xij = fsw(tij, se, eta)
-                            if (fi(its, isph).gt.1.0d0) then
-                                oij = xij / fi(its, isph)
-                            else
-                                oij = xij
-                            end if
-                            call fmm_l2p_work(vij, rsph(jsph), lmax, &
-                                & vscales_rel, oij, x(:, jsph), 1.0d0, &
-                                & tmp_grid(its), work)
+                        xij = fsw(tij, se, eta)
+                        if (fi(its, isph).gt.1.0d0) then
+                            oij = xij / fi(its, isph)
+                        else
+                            oij = xij
                         end if
+                        call fmm_l2p_work(vij, rsph(jsph), lmax, &
+                            & vscales_rel, oij, x(:, jsph), 1.0d0, &
+                            & tmp_grid(its), work)
                     end do
                 end do
                 call ddintegrate(1, nbasis, ngrid, vwgrid, vgrid_nbasis, &

From d68be8f95793a0c6ec460be961152f73b432f8fb Mon Sep 17 00:00:00 2001
From: Michele Nottoli <michele.nottoli@gmail.com>
Date: Mon, 13 May 2024 16:04:27 +0200
Subject: [PATCH 13/21] Precomputing adjoint overlaps.

---
 src/ddx_constants.f90 | 33 ++++++++++++++++++++----
 src/ddx_operators.f90 | 58 +++++++++++++++++++++----------------------
 2 files changed, 56 insertions(+), 35 deletions(-)

diff --git a/src/ddx_constants.f90 b/src/ddx_constants.f90
index dca0d29f..c1934e6a 100644
--- a/src/ddx_constants.f90
+++ b/src/ddx_constants.f90
@@ -232,6 +232,10 @@ module ddx_constants
     !> List of overlapped points in a CSR format.
     integer, allocatable :: ioverlap(:)
     integer, allocatable :: overlap(:)
+    !> Adjoint list of overlapped points (the relation is not self
+    ! adjoint) in a CSR format.
+    integer, allocatable :: adj_ioverlap(:)
+    integer, allocatable :: adj_overlap(:)
 end type ddx_constants_type
 
 contains
@@ -782,7 +786,7 @@ subroutine constants_geometry_init(params, constants, ddx_error)
     !! Local variables
     real(dp) :: swthr, v(3), maxv, ssqv, vv, t
     integer :: i, isph, jsph, inear, igrid, iwork, jwork, lwork, &
-        & old_lwork, icav, info, ij, ijgrid
+        & old_lwork, icav, info, ij, adj_iwork
     integer, allocatable :: work(:, :), tmp_work(:, :)
     real(dp) :: start_time, thigh, vij(3), vvij, tij
     !! The code
@@ -1033,18 +1037,22 @@ subroutine constants_geometry_init(params, constants, ddx_error)
     !!stop 0
 
     allocate(constants % ioverlap(constants % inl(params % nsph+1)), &
-        & constants % overlap(constants % inl(params % nsph+1) &
-        & *params % ngrid), stat=info)
+        & constants % adj_ioverlap(constants % inl(params % nsph+1)), &
+        & constants % overlap(constants % inl(params % nsph+1)*params % ngrid), &
+        & constants % adj_overlap(constants % inl(params % nsph+1)*params % ngrid), &
+        & stat=info)
     if (info .ne. 0) then
         call update_error(ddx_error, "Overlapped grid lists allocation failed")
         return
     end if
     thigh = one + pt5*(params % se + one)*params % eta
     iwork = 1
+    adj_iwork = 1
     do isph = 1, params % nsph
         do ij = constants % inl(isph), constants % inl(isph+1) - 1
             jsph = constants % nl(ij)
             constants % ioverlap(ij) = iwork
+            constants % adj_ioverlap(ij) = adj_iwork
             do igrid = 1, params % ngrid
                 if (constants % ui(igrid, isph) .lt. one) then
                     vij = params % csph(:,isph) &
@@ -1058,16 +1066,31 @@ subroutine constants_geometry_init(params, constants, ddx_error)
                         iwork = iwork + 1
                     end if
                 end if
+                if (constants % ui(igrid, jsph) .lt. one) then
+                    vij = params % csph(:,jsph) &
+                        & + params % rsph(jsph)*constants % cgrid(:,igrid) &
+                        & - params % csph(:,isph)
+                    vvij = sqrt(vij(1)*vij(1) + vij(2)*vij(2) &
+                        & + vij(3)*vij(3))
+                    tij  = vvij / params % rsph(isph)
+                    if (tij.lt.thigh) then
+                        constants % adj_overlap(adj_iwork) = igrid
+                        adj_iwork = adj_iwork + 1
+                    end if
+                end if
             end do
         end do
     end do
     constants % ioverlap(ij) = iwork
+    constants % adj_ioverlap(ij) = adj_iwork
 
     !do isph = 1, params % nsph
     !    do ij = constants % inl(isph), constants % inl(isph+1) - 1
     !        jsph = constants % nl(ij)
-    !        do ijgrid = constants % ioverlap(ij), constants % ioverlap(ij+1) - 1
-    !            igrid = constants % overlap(ijgrid)
+    !        !do ijgrid = constants % ioverlap(ij), constants % ioverlap(ij+1) - 1
+    !        !    igrid = constants % overlap(ijgrid)
+    !        do ijgrid = constants % adj_ioverlap(ij), constants % adj_ioverlap(ij+1) - 1
+    !            igrid = constants % adj_overlap(ijgrid)
     !            write(6,*) isph, jsph, igrid
     !        end do
     !    end do
diff --git a/src/ddx_operators.f90 b/src/ddx_operators.f90
index 113391e6..592d9a42 100644
--- a/src/ddx_operators.f90
+++ b/src/ddx_operators.f90
@@ -34,7 +34,7 @@ subroutine lx(params, constants, workspace, x, y, ddx_error)
     real(dp), intent(out) :: y(constants % nbasis, params % nsph)
     type(ddx_error_type), intent(inout) :: ddx_error
     !! Local variables
-    integer :: isph, jsph, ij, l, ind, its, i, m, ijgrid
+    integer :: isph, jsph, ij, l, ind, its, i, m, ijits
     real(dp) :: vij(3), tij, vvij, xij, oij, thigh, fac
 
     integer :: vgrid_nbasis, nbasis, ngrid, nsph, lmax
@@ -74,8 +74,7 @@ subroutine lx(params, constants, workspace, x, y, ddx_error)
         ngrid = params % ngrid
         vgrid_nbasis = constants % vgrid_nbasis
 
-        associate(iburied => constants % iburied, buried => constants % buried, &
-                & inl => constants % inl, nl => constants % nl, &
+        associate(inl => constants % inl, nl => constants % nl, &
                 & csph => params % csph, rsph => params % rsph, &
                 & cgrid => constants % cgrid, fi => constants % fi, &
                 & vscales_rel => constants % vscales_rel, &
@@ -83,19 +82,18 @@ subroutine lx(params, constants, workspace, x, y, ddx_error)
                 & ioverlap => constants % ioverlap, &
                 & overlap => constants % overlap)
 
-            !$omp parallel do default(none) firstprivate(nsph,thigh,se,eta, &
-            !$omp lmax,nbasis,ngrid,vgrid_nbasis) shared(buried,iburied,inl, &
-            !$omp nl,csph,rsph,cgrid,fi,vscales_rel,x,y,vwgrid) private( &
-            !$omp isph,tmp_grid,i,its,ij,jsph,vij,vvij,tij,xij,oij,work,ijgrid) &
-            !$omp schedule(dynamic,1)
+            !$omp parallel do default(none) schedule(dynamic,1) &
+            !$omp firstprivate(nsph,thigh,se,eta,lmax,nbasis,ngrid, &
+            !$omp vgrid_nbasis) shared(inl,nl,csph,rsph,cgrid,fi, &
+            !$omp vscales_rel,x,y,vwgrid,ioverlap,overlap) private( &
+            !$omp isph,tmp_grid,i,its,ij,jsph,vij,vvij,tij,xij,oij, &
+            !$omp work,ijits)
             do isph = 1, nsph
                 tmp_grid(:) = 0.0d0
                 do ij = inl(isph), inl(isph+1)-1
                     jsph = nl(ij)
-                !do i = iburied(isph), iburied(isph + 1) - 1
-                !its = buried(i)
-                    do ijgrid = ioverlap(ij), ioverlap(ij+1) - 1
-                        its = overlap(ijgrid)
+                    do ijits = ioverlap(ij), ioverlap(ij+1) - 1
+                        its = overlap(ijits)
                         vij = csph(:,isph) + rsph(isph)*cgrid(:,its) &
                             & - csph(:,jsph)
                         vvij = sqrt(vij(1)*vij(1) + vij(2)*vij(2) &
@@ -153,7 +151,7 @@ subroutine lstarx(params, constants, workspace, x, y, ddx_error)
     real(dp), intent(out) :: y(constants % nbasis, params % nsph)
     type(ddx_error_type), intent(inout) :: ddx_error
     !! Local variables
-    integer :: isph, jsph, ij, indmat, igrid, l, ind, m, i
+    integer :: isph, jsph, ij, indmat, igrid, l, ind, m, i, ijigrid
     real(dp) :: vji(3), vvji, tji, xji, oji, fac
 
     integer :: nsph, ngrid, nbasis, lmax
@@ -207,32 +205,32 @@ subroutine lstarx(params, constants, workspace, x, y, ddx_error)
                 & wgrid => constants % wgrid, &
                 & tmp_grid => workspace % tmp_grid, &
                 & vscales_rel => constants % vscales_rel, &
-                & iburied => constants % iburied, &
-                & buried => constants % buried)
+                & adj_ioverlap => constants % adj_ioverlap, &
+                & adj_overlap => constants % adj_overlap)
             !$omp parallel do default(none) schedule(dynamic,1) &
-            !$omp shared(inl,nl,csph,rsph,cgrid,fi,wgrid,tmp_grid,y,buried,iburied) &
-            !$omp firstprivate(nsph,ngrid,thigh,se,eta,lmax) &
-            !$omp private(isph,ij,jsph,igrid,vji,vvji,tji,xji,oji,fac,work,i)
+            !$omp shared(inl,nl,csph,rsph,cgrid,fi,wgrid,tmp_grid,y, &
+            !$omp adj_overlap,adj_ioverlap) firstprivate(nsph,ngrid, &
+            !$omp thigh,se,eta,lmax) private(isph,ij,jsph,igrid,vji, &
+            !$omp vvji,tji,xji,oji,fac,work,i,ijigrid)
             do isph = 1, nsph
                 y(:,isph) = 0.0d0
                 do ij = inl(isph), inl(isph+1)-1
                     jsph = nl(ij)
-                    do i = iburied(jsph), iburied(jsph + 1) - 1
-                        igrid = buried(i)
+                    do ijigrid = adj_ioverlap(ij), adj_ioverlap(ij+1) - 1
+                        igrid = adj_overlap(ijigrid)
+
                         vji = csph(:,jsph) + rsph(jsph)*cgrid(:,igrid) - csph(:,isph)
                         vvji = sqrt(vji(1)*vji(1) + vji(2)*vji(2) + vji(3)*vji(3))
                         tji  = vvji/rsph(isph)
-                        if (tji.lt.thigh) then
-                            xji = fsw(tji, se, eta)
-                            if (fi(igrid,jsph).gt.1.0d0) then
-                                oji = xji/fi(igrid,jsph)
-                            else
-                                oji = xji
-                            endif
-                            fac = wgrid(igrid) * tmp_grid(igrid, jsph) * oji
-                            call fmm_l2p_adj_work(vji, fac, rsph(isph), &
-                                & lmax, vscales_rel, 1.0d0, y(:, isph), work)
+                        xji = fsw(tji, se, eta)
+                        if (fi(igrid,jsph).gt.1.0d0) then
+                            oji = xji/fi(igrid,jsph)
+                        else
+                            oji = xji
                         endif
+                        fac = wgrid(igrid) * tmp_grid(igrid, jsph) * oji
+                        call fmm_l2p_adj_work(vji, fac, rsph(isph), &
+                            & lmax, vscales_rel, 1.0d0, y(:, isph), work)
                     end do
                 end do
                 y(:, isph) = - y(:, isph)

From a86fa2157e997f69f0b107906b5252e33e0ce292 Mon Sep 17 00:00:00 2001
From: Michele Nottoli <michele.nottoli@gmail.com>
Date: Mon, 13 May 2024 17:31:15 +0200
Subject: [PATCH 14/21] Bug fix and parallelized bx properly.

---
 src/ddx_operators.f90 | 95 +++++++++++++++++++++++++++++++++----------
 1 file changed, 73 insertions(+), 22 deletions(-)

diff --git a/src/ddx_operators.f90 b/src/ddx_operators.f90
index 592d9a42..4d18d588 100644
--- a/src/ddx_operators.f90
+++ b/src/ddx_operators.f90
@@ -34,8 +34,8 @@ subroutine lx(params, constants, workspace, x, y, ddx_error)
     real(dp), intent(out) :: y(constants % nbasis, params % nsph)
     type(ddx_error_type), intent(inout) :: ddx_error
     !! Local variables
-    integer :: isph, jsph, ij, l, ind, its, i, m, ijits
-    real(dp) :: vij(3), tij, vvij, xij, oij, thigh, fac
+    integer :: isph, jsph, ij, l, ind, its, m, ijits
+    real(dp) :: vij(3), tij, vvij, xij, oij, fac
 
     integer :: vgrid_nbasis, nbasis, ngrid, nsph, lmax
     real(dp), allocatable :: tmp_grid(:)
@@ -68,7 +68,6 @@ subroutine lx(params, constants, workspace, x, y, ddx_error)
             end do
         end associate
     else
-        thigh = one + pt5*(params % se + one)*params % eta
         se = params % se
         eta = params % eta
         ngrid = params % ngrid
@@ -83,10 +82,10 @@ subroutine lx(params, constants, workspace, x, y, ddx_error)
                 & overlap => constants % overlap)
 
             !$omp parallel do default(none) schedule(dynamic,1) &
-            !$omp firstprivate(nsph,thigh,se,eta,lmax,nbasis,ngrid, &
+            !$omp firstprivate(nsph,se,eta,lmax,nbasis,ngrid, &
             !$omp vgrid_nbasis) shared(inl,nl,csph,rsph,cgrid,fi, &
             !$omp vscales_rel,x,y,vwgrid,ioverlap,overlap) private( &
-            !$omp isph,tmp_grid,i,its,ij,jsph,vij,vvij,tij,xij,oij, &
+            !$omp isph,tmp_grid,its,ij,jsph,vij,vvij,tij,xij,oij, &
             !$omp work,ijits)
             do isph = 1, nsph
                 tmp_grid(:) = 0.0d0
@@ -190,7 +189,7 @@ subroutine lstarx(params, constants, workspace, x, y, ddx_error)
         lmax = params % lmax
         ! Expand x over spherical harmonics
         associate(vgrid => constants % vgrid, tmp_grid => workspace % tmp_grid)
-            !$omp parallel do collapse(2) default(none) shared(x,tmp_grid) &
+            !$omp parallel do collapse(2) default(none) shared(x,tmp_grid,vgrid) &
             !$omp firstprivate(nsph,ngrid,nbasis) &
             !$omp private(isph,igrid) schedule(static,100)
             do isph = 1, nsph
@@ -209,9 +208,9 @@ subroutine lstarx(params, constants, workspace, x, y, ddx_error)
                 & adj_overlap => constants % adj_overlap)
             !$omp parallel do default(none) schedule(dynamic,1) &
             !$omp shared(inl,nl,csph,rsph,cgrid,fi,wgrid,tmp_grid,y, &
-            !$omp adj_overlap,adj_ioverlap) firstprivate(nsph,ngrid, &
-            !$omp thigh,se,eta,lmax) private(isph,ij,jsph,igrid,vji, &
-            !$omp vvji,tji,xji,oji,fac,work,i,ijigrid)
+            !$omp adj_overlap,adj_ioverlap,vscales_rel) firstprivate( &
+            !$omp nsph,ngrid,thigh,se,eta,lmax) private(isph,ij,jsph, &
+            !$omp igrid,vji,vvji,tji,xji,oji,fac,work,i,ijigrid)
             do isph = 1, nsph
                 y(:,isph) = 0.0d0
                 do ij = inl(isph), inl(isph+1)-1
@@ -867,12 +866,26 @@ subroutine bx(params, constants, workspace, x, y, ddx_error)
     real(dp), dimension(constants % nbasis, params % nsph), intent(in) :: x
     real(dp), dimension(constants % nbasis, params % nsph), intent(out) :: y
     type(ddx_error_type), intent(inout) :: ddx_error
-    integer :: isph, jsph, ij, iproc
+    integer :: isph, jsph, ij, ijits, its
+
+    integer :: nsph, nbasis, lmax, ngrid, vgrid_nbasis
+    real(dp) :: se, eta, vij(3), vvij, tij, xij, oij, vtij(3), kappa
+    complex(dp) :: work_complex(params % lmax+1)
+    real(dp) :: work(params % lmax+1)
+
+    real(dp), allocatable :: tmp_grid(:)
+
+    call time_push()
+    allocate(tmp_grid(params % ngrid))
 
     ! dummy operation on unused interface arguments
     if (ddx_error % flag .eq. 0) continue
+    if (workspace % xs_time .eq. 0) continue
+
+    nsph = params % nsph
+    nbasis = constants % nbasis
+    lmax = params % lmax
 
-    y = zero
     if (params % matvecmem .eq. 1) then
         !$omp parallel do default(none) shared(params,constants,x,y) &
         !$omp private(isph,ij,jsph) schedule(dynamic)
@@ -885,19 +898,57 @@ subroutine bx(params, constants, workspace, x, y, ddx_error)
             end do
         end do
     else
-        call time_push()
-        !$omp parallel do default(none) shared(params,constants,workspace,x,y) &
-        !$omp private(isph,iproc) schedule(dynamic)
-        do isph = 1, params % nsph
-          iproc = omp_get_thread_num() + 1
-          call calcv2_lpb(params, constants, isph, workspace % tmp_grid(:,isph), x)
-        end do
-        call ddintegrate(params % nsph, constants % nbasis, params % ngrid, constants % vwgrid, &
-            & constants % vgrid_nbasis, workspace % tmp_grid, y)
-        y = - y
-        call time_pull("bx")
+        se = params % se
+        eta = params % eta
+        ngrid = params % ngrid
+        kappa = params % kappa
+        vgrid_nbasis = constants % vgrid_nbasis
+
+        associate(inl => constants % inl, nl => constants % nl, &
+                & csph => params % csph, rsph => params % rsph, &
+                & cgrid => constants % cgrid, fi => constants % fi, &
+                & vscales => constants % vscales, &
+                & vwgrid => constants % vwgrid, &
+                & ioverlap => constants % ioverlap, &
+                & overlap => constants % overlap, &
+                & si_ri => constants % si_ri)
+            !$omp parallel do default(none) schedule(dynamic,1) &
+            !$omp firstprivate(nsph,se,eta,lmax,nbasis,ngrid, &
+            !$omp vgrid_nbasis,kappa) shared(inl,nl,csph,rsph,cgrid,fi, &
+            !$omp vscales,x,y,vwgrid,ioverlap,overlap,si_ri) private( &
+            !$omp isph,tmp_grid,its,ij,jsph,vij,vvij,tij,xij,oij,vtij, &
+            !$omp work,work_complex,ijits)
+            do isph = 1, nsph
+                tmp_grid(:) = 0.0d0
+                do ij = inl(isph), inl(isph+1)-1
+                    jsph = nl(ij)
+                    do ijits = ioverlap(ij), ioverlap(ij+1) - 1
+                        its = overlap(ijits)
+                        vij = csph(:,isph) + rsph(isph)*cgrid(:,its) &
+                            & - csph(:,jsph)
+                        vvij = sqrt(vij(1)*vij(1) + vij(2)*vij(2) &
+                            & + vij(3)*vij(3))
+                        tij  = vvij/rsph(jsph)
+                        xij = fsw(tij, se, eta)
+                        if (fi(its,isph).gt.1.0d0) then
+                            oij = xij/fi(its, isph)
+                        else
+                            oij = xij
+                        end if
+                        vtij = vij*kappa
+                        call fmm_l2p_bessel_work(vtij, lmax, vscales, &
+                            & si_ri(:, jsph), oij, x(:, jsph), 1.0d0, &
+                            & tmp_grid(its), work_complex, work)
+                    end do
+                end do
+                call ddintegrate(1, nbasis, ngrid, vwgrid, vgrid_nbasis, &
+                    & tmp_grid, y(:, isph))
+                y(:, isph) = - y(:, isph)
+            end do
+        end associate
     end if
     if (constants % dodiag) y = y + x
+    call time_pull("bx")
 end subroutine bx
 
 !> Adjoint ddLPB matrix-vector product

From 67576f5cb7ec13acf6e6c020f48745568e425135 Mon Sep 17 00:00:00 2001
From: Michele Nottoli <michele.nottoli@gmail.com>
Date: Tue, 14 May 2024 11:24:15 +0200
Subject: [PATCH 15/21] Reshaped bstarx for parallelization.

---
 src/ddx.f90           |   2 +
 src/ddx_operators.f90 | 160 ++++++++++++++++++++++++++++++++++--------
 2 files changed, 134 insertions(+), 28 deletions(-)

diff --git a/src/ddx.f90 b/src/ddx.f90
index ba424b4d..f7193965 100644
--- a/src/ddx.f90
+++ b/src/ddx.f90
@@ -455,6 +455,8 @@ subroutine ddrun(ddx_data, state, electrostatics, psi, tol, esolv, &
         end if
     end if
 
+    stop 0
+
     ! compute the forces
     if (ddx_data % params % force .eq. 1) then
         force = zero
diff --git a/src/ddx_operators.f90 b/src/ddx_operators.f90
index 4d18d588..7eb2d542 100644
--- a/src/ddx_operators.f90
+++ b/src/ddx_operators.f90
@@ -152,9 +152,9 @@ subroutine lstarx(params, constants, workspace, x, y, ddx_error)
     !! Local variables
     integer :: isph, jsph, ij, indmat, igrid, l, ind, m, i, ijigrid
     real(dp) :: vji(3), vvji, tji, xji, oji, fac
-
     integer :: nsph, ngrid, nbasis, lmax
-    real(dp) :: thigh, se, eta, work(params % lmax + 1)
+    real(dp) :: thigh, se, eta, work(params % lmax + 1), &
+        & tmp(constants % nbasis)
 
     call time_push()
 
@@ -188,13 +188,15 @@ subroutine lstarx(params, constants, workspace, x, y, ddx_error)
         eta = params % eta
         lmax = params % lmax
         ! Expand x over spherical harmonics
-        associate(vgrid => constants % vgrid, tmp_grid => workspace % tmp_grid)
-            !$omp parallel do collapse(2) default(none) shared(x,tmp_grid,vgrid) &
-            !$omp firstprivate(nsph,ngrid,nbasis) &
+        associate(vgrid => constants % vgrid, &
+                 & tmp_grid => workspace % tmp_grid)
+            !$omp parallel do collapse(2) default(none) &
+            !$omp firstprivate(nsph,ngrid,nbasis) shared(x,tmp_grid,vgrid) &
             !$omp private(isph,igrid) schedule(static,100)
             do isph = 1, nsph
                 do igrid = 1, ngrid
-                    tmp_grid(igrid,isph) = dot_product(x(:, isph), vgrid(:nbasis, igrid))
+                    tmp_grid(igrid,isph) = dot_product(x(:, isph), &
+                        & vgrid(:nbasis, igrid))
                 end do
             end do
         end associate
@@ -210,29 +212,31 @@ subroutine lstarx(params, constants, workspace, x, y, ddx_error)
             !$omp shared(inl,nl,csph,rsph,cgrid,fi,wgrid,tmp_grid,y, &
             !$omp adj_overlap,adj_ioverlap,vscales_rel) firstprivate( &
             !$omp nsph,ngrid,thigh,se,eta,lmax) private(isph,ij,jsph, &
-            !$omp igrid,vji,vvji,tji,xji,oji,fac,work,i,ijigrid)
+            !$omp igrid,vji,vvji,tji,xji,oji,fac,work,i,ijigrid,tmp)
             do isph = 1, nsph
-                y(:,isph) = 0.0d0
+                tmp = 0.0d0
                 do ij = inl(isph), inl(isph+1)-1
                     jsph = nl(ij)
                     do ijigrid = adj_ioverlap(ij), adj_ioverlap(ij+1) - 1
                         igrid = adj_overlap(ijigrid)
 
-                        vji = csph(:,jsph) + rsph(jsph)*cgrid(:,igrid) - csph(:,isph)
-                        vvji = sqrt(vji(1)*vji(1) + vji(2)*vji(2) + vji(3)*vji(3))
+                        vji = csph(:,jsph) + rsph(jsph)*cgrid(:,igrid) &
+                            & - csph(:,isph)
+                        vvji = sqrt(vji(1)*vji(1) + vji(2)*vji(2) &
+                            & + vji(3)*vji(3))
                         tji  = vvji/rsph(isph)
                         xji = fsw(tji, se, eta)
                         if (fi(igrid,jsph).gt.1.0d0) then
                             oji = xji/fi(igrid,jsph)
                         else
                             oji = xji
-                        endif
+                        end if
                         fac = wgrid(igrid) * tmp_grid(igrid, jsph) * oji
                         call fmm_l2p_adj_work(vji, fac, rsph(isph), &
-                            & lmax, vscales_rel, 1.0d0, y(:, isph), work)
+                            & lmax, vscales_rel, 1.0d0, tmp, work)
                     end do
                 end do
-                y(:, isph) = - y(:, isph)
+                y(:, isph) = - tmp
             end do
         end associate
     end if
@@ -815,16 +819,32 @@ subroutine bstarx(params, constants, workspace, x, y, ddx_error)
     real(dp), intent(out) :: y(constants % nbasis, params % nsph)
     type(ddx_error_type), intent(inout) :: ddx_error
     ! Local variables
-    integer :: isph, jsph, ij, indmat, iproc
+    integer :: isph, jsph, ij, indmat, ind, ijigrid, igrid, l, m
+
+    real(dp) :: thigh, se, eta, kappa, vji(3), vvji, sji(3), tji
+    real(dp) :: rho, ctheta, stheta, cphi, sphi, xji, oji, fac, &
+        & basloc((params % lmax + 1)**2), vcos(params % lmax + 1), &
+        & vsin(params % lmax + 1), vplm((params % lmax + 1)**2)
+    real(dp), dimension(constants % nbasis) :: fac_hsp
+    real(dp), dimension(0:params % lmax) :: SI_rjin, DI_rjin
+    complex(dp) :: work_complex(max(2, params % lmax+1))
+    integer :: nsph, nbasis, lmax, ngrid
+
+    real(dp) :: y_copy(constants % nbasis, params % nsph)
+
+    call time_push()
+
+    nsph = params % nsph
+    nbasis = constants % nbasis
 
     ! dummy operation on unused interface arguments
     if (ddx_error % flag .eq. 0) continue
 
-    y = zero
-    if (params % matvecmem .eq. 1) then
+    !if (params % matvecmem .eq. 1) then
         !$omp parallel do default(none) shared(params,constants,x,y) &
         !$omp private(isph,ij,jsph,indmat) schedule(dynamic)
         do isph = 1, params % nsph
+            y(:,isph) = zero
             do ij = constants % inl(isph), constants % inl(isph + 1) - 1
                 jsph = constants % nl(ij)
                 indmat = constants % itrnl(ij)
@@ -833,7 +853,9 @@ subroutine bstarx(params, constants, workspace, x, y, ddx_error)
                     & one, y(:,isph), 1)
             end do
         end do
-    else
+    y_copy(:,:) = y(:,:)
+    y = zero
+    !else
         !$omp parallel do default(none) shared(params,constants,workspace,x,y) &
         !$omp private(isph) schedule(static,1)
         do isph = 1, params % nsph
@@ -841,20 +863,101 @@ subroutine bstarx(params, constants, workspace, x, y, ddx_error)
                 & constants % vgrid_nbasis, x(:, isph), 1, zero, &
                 & workspace % tmp_grid(:, isph), 1)
         end do
-        !$omp parallel do default(none) shared(params,constants,workspace,x,y) &
-        !$omp private(isph,iproc) schedule(dynamic)
-        do isph = 1, params % nsph
-            iproc = omp_get_thread_num() + 1
-            call adjrhs_lpb(params, constants, isph, workspace % tmp_grid, &
-                & y(:, isph), workspace % tmp_vylm(:, iproc), workspace % tmp_vplm(:, iproc), &
-                & workspace % tmp_vcos(:, iproc), workspace % tmp_vsin(:, iproc), &
-                & workspace % tmp_bessel(:, iproc))
-            y(:,isph)  = - y(:,isph)
-        end do
-    end if
+
+        ngrid = params % ngrid
+        thigh = one + (params % se+one)/two*params % eta
+        se = params % se
+        eta = params % eta
+        lmax = params % lmax
+        kappa = params % kappa
+
+        associate(inl => constants % inl, nl => constants % nl, &
+                & csph => params % csph, rsph => params % rsph, &
+                & cgrid => constants % cgrid, fi => constants % fi, &
+                & wgrid => constants % wgrid, &
+                & tmp_grid => workspace % tmp_grid, &
+                & vscales => constants % vscales, &
+                & adj_ioverlap => constants % adj_ioverlap, &
+                & adj_overlap => constants % adj_overlap)
+            do isph = 1, nsph
+                !y_copy(:, isph) = 0.0d0
+                !call adjrhs_lpb(params, constants, isph, workspace % tmp_grid, &
+                !    & y_copy(:, isph), workspace % tmp_vylm(:, 1), &
+                !    & workspace % tmp_vplm(:, 1), &
+                !    & workspace % tmp_vcos(:, 1), workspace % tmp_vsin(:, 1), &
+                !    & workspace % tmp_bessel(:, 1))
+                !y_copy(:, isph) = - y_copy(:, isph)
+
+                y(:, isph) = 0.0d0
+                do ij = inl(isph), inl(isph+1)-1
+                    jsph = nl(ij)
+                    do ijigrid = adj_ioverlap(ij), adj_ioverlap(ij+1) - 1
+                       igrid = adj_overlap(ijigrid)
+                    !do igrid = 1, params % ngrid
+
+                        vji = csph(:,jsph) + rsph(jsph)*cgrid(:,igrid) &
+                            & - csph(:,isph)
+                        vvji = sqrt(vji(1)*vji(1) + vji(2)*vji(2) &
+                            & + vji(3)*vji(3))
+                        tji = vvji/rsph(isph)
+                        sji = vji/vvji
+
+                        !if ( tji.gt.( one + (se+one)/two*eta ) ) continue
+
+                        call ylmbas(sji, rho, ctheta, stheta, cphi, sphi, &
+                            & lmax, vscales, basloc, vplm, vcos, vsin)
+
+                        !call inthsp_adj(params, constants, vvji, isph, basloc, fac_hsp, &
+                        !    & work_complex)
+
+                        si_rjin = 0.0d0
+                        di_rjin = 0.0d0
+
+                        call modified_spherical_bessel_first_kind(lmax, &
+                            & vvji*kappa, si_rjin, di_rjin, work_complex)
+
+                        do l = 0, lmax
+                            do  m = -l, l
+                                ind = l*l + l + 1 + m
+                                fac_hsp(ind) = SI_rjin(l)/constants % SI_ri(l,isph)*basloc(ind)
+                            end do
+                        end do
+
+                        xji = fsw(tji, se, eta)
+
+                        if (fi(igrid,jsph).gt.1.0d0) then
+                            oji = xji/fi(igrid,jsph)
+                        else
+                            oji = xji
+                        end if
+                        fac = wgrid(igrid) * tmp_grid(igrid,jsph) * oji
+                        do l = 0, lmax
+                            ind = l*l + l + 1
+                            do m = -l,l
+                                y(ind+m, isph) = y(ind+m, isph) + fac*fac_hsp(ind+m)
+                            end do
+                        end do
+                        !                            vji
+                        !call fmm_l2p_adj_bessel_work(vtij, lmax, vscales, &
+                        !    & si_ri(:, isph), oij, y(:, jsph), 1.0d0, &
+                        !    & tmp_grid(its), work_complex, work)
+                    end do
+                end do
+                y(:, isph)  = - y(:, isph)
+            end do
+        end associate
+    !end if
 
     ! add the diagonal if required
     if (constants % dodiag) y = y + x
+    call time_pull("bstarx")
+
+    do isph = 1, params % nsph
+        do l = 1, constants % nbasis
+            write(6,*) y(l, isph), y_copy(l, isph), y(l, isph) - y_copy(l, isph)
+        end do
+    end do
+    !stop 0
 end subroutine bstarx
 
 !> Primal HSP matrix vector product
@@ -890,6 +993,7 @@ subroutine bx(params, constants, workspace, x, y, ddx_error)
         !$omp parallel do default(none) shared(params,constants,x,y) &
         !$omp private(isph,ij,jsph) schedule(dynamic)
         do isph = 1, params % nsph
+            y(:, isph) = 0.0d0
             do ij = constants % inl(isph), constants % inl(isph + 1) - 1
                 jsph = constants % nl(ij)
                 call dgemv('n', constants % nbasis, constants % nbasis, one, &

From 7d74448ec31859f873871b10ac2f600a0332a658 Mon Sep 17 00:00:00 2001
From: Michele Nottoli <michele.nottoli@gmail.com>
Date: Tue, 14 May 2024 13:36:00 +0200
Subject: [PATCH 16/21] Parallelized bstarx.

---
 src/ddx.f90           |  2 --
 src/ddx_operators.f90 | 63 +++++++++++++++----------------------------
 2 files changed, 22 insertions(+), 43 deletions(-)

diff --git a/src/ddx.f90 b/src/ddx.f90
index f7193965..ba424b4d 100644
--- a/src/ddx.f90
+++ b/src/ddx.f90
@@ -455,8 +455,6 @@ subroutine ddrun(ddx_data, state, electrostatics, psi, tol, esolv, &
         end if
     end if
 
-    stop 0
-
     ! compute the forces
     if (ddx_data % params % force .eq. 1) then
         force = zero
diff --git a/src/ddx_operators.f90 b/src/ddx_operators.f90
index 7eb2d542..34dc753a 100644
--- a/src/ddx_operators.f90
+++ b/src/ddx_operators.f90
@@ -824,14 +824,13 @@ subroutine bstarx(params, constants, workspace, x, y, ddx_error)
     real(dp) :: thigh, se, eta, kappa, vji(3), vvji, sji(3), tji
     real(dp) :: rho, ctheta, stheta, cphi, sphi, xji, oji, fac, &
         & basloc((params % lmax + 1)**2), vcos(params % lmax + 1), &
-        & vsin(params % lmax + 1), vplm((params % lmax + 1)**2)
+        & vsin(params % lmax + 1), vplm((params % lmax + 1)**2), &
+        & tmp(constants % nbasis)
     real(dp), dimension(constants % nbasis) :: fac_hsp
     real(dp), dimension(0:params % lmax) :: SI_rjin, DI_rjin
     complex(dp) :: work_complex(max(2, params % lmax+1))
     integer :: nsph, nbasis, lmax, ngrid
 
-    real(dp) :: y_copy(constants % nbasis, params % nsph)
-
     call time_push()
 
     nsph = params % nsph
@@ -840,7 +839,7 @@ subroutine bstarx(params, constants, workspace, x, y, ddx_error)
     ! dummy operation on unused interface arguments
     if (ddx_error % flag .eq. 0) continue
 
-    !if (params % matvecmem .eq. 1) then
+    if (params % matvecmem .eq. 1) then
         !$omp parallel do default(none) shared(params,constants,x,y) &
         !$omp private(isph,ij,jsph,indmat) schedule(dynamic)
         do isph = 1, params % nsph
@@ -853,9 +852,7 @@ subroutine bstarx(params, constants, workspace, x, y, ddx_error)
                     & one, y(:,isph), 1)
             end do
         end do
-    y_copy(:,:) = y(:,:)
-    y = zero
-    !else
+    else
         !$omp parallel do default(none) shared(params,constants,workspace,x,y) &
         !$omp private(isph) schedule(static,1)
         do isph = 1, params % nsph
@@ -878,22 +875,21 @@ subroutine bstarx(params, constants, workspace, x, y, ddx_error)
                 & tmp_grid => workspace % tmp_grid, &
                 & vscales => constants % vscales, &
                 & adj_ioverlap => constants % adj_ioverlap, &
-                & adj_overlap => constants % adj_overlap)
+                & adj_overlap => constants % adj_overlap, &
+                & si_ri => constants % si_ri)
+            !$omp parallel do default(none) schedule(dynamic,1) &
+            !$omp firstprivate(nsph,ngrid,thigh,se,eta,lmax,kappa) &
+            !$omp shared(inl,nl,csph,rsph,cgrid,fi,wgrid,tmp_grid,y, &
+            !$omp adj_overlap,adj_ioverlap,vscales,si_ri) &
+            !$omp private(isph,ij,jsph,ijigrid,igrid,vji,vvji,tji,sji, &
+            !$omp rho,ctheta,stheta,cphi,sphi,basloc,vplm,vcos,vsin,tmp, &
+            !$omp si_rjin,di_rjin,work_complex,fac_hsp,xji,oji,fac,ind)
             do isph = 1, nsph
-                !y_copy(:, isph) = 0.0d0
-                !call adjrhs_lpb(params, constants, isph, workspace % tmp_grid, &
-                !    & y_copy(:, isph), workspace % tmp_vylm(:, 1), &
-                !    & workspace % tmp_vplm(:, 1), &
-                !    & workspace % tmp_vcos(:, 1), workspace % tmp_vsin(:, 1), &
-                !    & workspace % tmp_bessel(:, 1))
-                !y_copy(:, isph) = - y_copy(:, isph)
-
-                y(:, isph) = 0.0d0
+                tmp = 0.0d0
                 do ij = inl(isph), inl(isph+1)-1
                     jsph = nl(ij)
                     do ijigrid = adj_ioverlap(ij), adj_ioverlap(ij+1) - 1
-                       igrid = adj_overlap(ijigrid)
-                    !do igrid = 1, params % ngrid
+                        igrid = adj_overlap(ijigrid)
 
                         vji = csph(:,jsph) + rsph(jsph)*cgrid(:,igrid) &
                             & - csph(:,isph)
@@ -902,14 +898,9 @@ subroutine bstarx(params, constants, workspace, x, y, ddx_error)
                         tji = vvji/rsph(isph)
                         sji = vji/vvji
 
-                        !if ( tji.gt.( one + (se+one)/two*eta ) ) continue
-
                         call ylmbas(sji, rho, ctheta, stheta, cphi, sphi, &
                             & lmax, vscales, basloc, vplm, vcos, vsin)
 
-                        !call inthsp_adj(params, constants, vvji, isph, basloc, fac_hsp, &
-                        !    & work_complex)
-
                         si_rjin = 0.0d0
                         di_rjin = 0.0d0
 
@@ -917,47 +908,37 @@ subroutine bstarx(params, constants, workspace, x, y, ddx_error)
                             & vvji*kappa, si_rjin, di_rjin, work_complex)
 
                         do l = 0, lmax
+                            ind = l*l + l + 1
                             do  m = -l, l
-                                ind = l*l + l + 1 + m
-                                fac_hsp(ind) = SI_rjin(l)/constants % SI_ri(l,isph)*basloc(ind)
+                                fac_hsp(ind+m) = SI_rjin(l)/SI_ri(l,isph)*basloc(ind+m)
                             end do
                         end do
 
                         xji = fsw(tji, se, eta)
-
                         if (fi(igrid,jsph).gt.1.0d0) then
                             oji = xji/fi(igrid,jsph)
                         else
                             oji = xji
                         end if
+
                         fac = wgrid(igrid) * tmp_grid(igrid,jsph) * oji
                         do l = 0, lmax
                             ind = l*l + l + 1
-                            do m = -l,l
-                                y(ind+m, isph) = y(ind+m, isph) + fac*fac_hsp(ind+m)
+                            do m = -l, l
+                                tmp(ind+m) = tmp(ind+m) + fac*fac_hsp(ind+m)
                             end do
                         end do
-                        !                            vji
-                        !call fmm_l2p_adj_bessel_work(vtij, lmax, vscales, &
-                        !    & si_ri(:, isph), oij, y(:, jsph), 1.0d0, &
-                        !    & tmp_grid(its), work_complex, work)
                     end do
                 end do
-                y(:, isph)  = - y(:, isph)
+                y(:, isph) = - tmp
             end do
         end associate
-    !end if
+    end if
 
     ! add the diagonal if required
     if (constants % dodiag) y = y + x
     call time_pull("bstarx")
 
-    do isph = 1, params % nsph
-        do l = 1, constants % nbasis
-            write(6,*) y(l, isph), y_copy(l, isph), y(l, isph) - y_copy(l, isph)
-        end do
-    end do
-    !stop 0
 end subroutine bstarx
 
 !> Primal HSP matrix vector product

From d70af7e4443d10e852c1149cc96c0f7e89ed78a8 Mon Sep 17 00:00:00 2001
From: Michele Nottoli <michele.nottoli@gmail.com>
Date: Tue, 14 May 2024 15:07:02 +0200
Subject: [PATCH 17/21] Update.

---
 src/ddx_operators.f90 | 95 +++++++++++++++++++++++++------------------
 src/ddx_solvers.f90   |  2 +-
 2 files changed, 57 insertions(+), 40 deletions(-)

diff --git a/src/ddx_operators.f90 b/src/ddx_operators.f90
index 34dc753a..e39714b5 100644
--- a/src/ddx_operators.f90
+++ b/src/ddx_operators.f90
@@ -189,7 +189,7 @@ subroutine lstarx(params, constants, workspace, x, y, ddx_error)
         lmax = params % lmax
         ! Expand x over spherical harmonics
         associate(vgrid => constants % vgrid, &
-                 & tmp_grid => workspace % tmp_grid)
+                & tmp_grid => workspace % tmp_grid)
             !$omp parallel do collapse(2) default(none) &
             !$omp firstprivate(nsph,ngrid,nbasis) shared(x,tmp_grid,vgrid) &
             !$omp private(isph,igrid) schedule(static,100)
@@ -819,17 +819,15 @@ subroutine bstarx(params, constants, workspace, x, y, ddx_error)
     real(dp), intent(out) :: y(constants % nbasis, params % nsph)
     type(ddx_error_type), intent(inout) :: ddx_error
     ! Local variables
-    integer :: isph, jsph, ij, indmat, ind, ijigrid, igrid, l, m
-
-    real(dp) :: thigh, se, eta, kappa, vji(3), vvji, sji(3), tji
-    real(dp) :: rho, ctheta, stheta, cphi, sphi, xji, oji, fac, &
+    integer :: isph, jsph, ij, indmat, ind, ijigrid, igrid, l, m, &
+        & nsph, nbasis, lmax, ngrid
+    real(dp) :: thigh, se, eta, kappa, vji(3), vvji, sji(3), tji, &
+        & rho, ctheta, stheta, cphi, sphi, xji, oji, fac, &
         & basloc((params % lmax + 1)**2), vcos(params % lmax + 1), &
         & vsin(params % lmax + 1), vplm((params % lmax + 1)**2), &
-        & tmp(constants % nbasis)
-    real(dp), dimension(constants % nbasis) :: fac_hsp
-    real(dp), dimension(0:params % lmax) :: SI_rjin, DI_rjin
+        & tmp(constants % nbasis), fac_hsp(constants % nbasis), &
+        & si_rjin(0:params % lmax), di_rjin(0:params % lmax) 
     complex(dp) :: work_complex(max(2, params % lmax+1))
-    integer :: nsph, nbasis, lmax, ngrid
 
     call time_push()
 
@@ -840,27 +838,23 @@ subroutine bstarx(params, constants, workspace, x, y, ddx_error)
     if (ddx_error % flag .eq. 0) continue
 
     if (params % matvecmem .eq. 1) then
-        !$omp parallel do default(none) shared(params,constants,x,y) &
-        !$omp private(isph,ij,jsph,indmat) schedule(dynamic)
-        do isph = 1, params % nsph
-            y(:,isph) = zero
-            do ij = constants % inl(isph), constants % inl(isph + 1) - 1
-                jsph = constants % nl(ij)
-                indmat = constants % itrnl(ij)
-                call dgemv('t', constants % nbasis, constants % nbasis, one, &
-                    & constants % b(:,:,indmat), constants % nbasis, x(:,jsph), 1, &
-                    & one, y(:,isph), 1)
+        associate(inl => constants % inl, nl => constants % nl, &
+                & itrnl => constants % itrnl, b => constants % b)
+            !$omp parallel do default(none) schedule(dynamic,1) &
+            !$omp firstprivate(nsph,nbasis) shared(x,y,inl,nl,itrnl,b) &
+            !$omp private(isph,ij,jsph,indmat)
+            do isph = 1, nsph
+                y(:,isph) = zero
+                do ij = inl(isph), inl(isph + 1) - 1
+                    jsph = nl(ij)
+                    indmat = itrnl(ij)
+                    call dgemv('t', nbasis, nbasis, 1.0d0, &
+                        & b(:,:,indmat), nbasis, x(:,jsph), 1, &
+                        & 1.0d0, y(:,isph), 1)
+                end do
             end do
-        end do
+        end associate
     else
-        !$omp parallel do default(none) shared(params,constants,workspace,x,y) &
-        !$omp private(isph) schedule(static,1)
-        do isph = 1, params % nsph
-            call dgemv('t', constants % nbasis, params % ngrid, one, constants % vgrid, &
-                & constants % vgrid_nbasis, x(:, isph), 1, zero, &
-                & workspace % tmp_grid(:, isph), 1)
-        end do
-
         ngrid = params % ngrid
         thigh = one + (params % se+one)/two*params % eta
         se = params % se
@@ -868,6 +862,19 @@ subroutine bstarx(params, constants, workspace, x, y, ddx_error)
         lmax = params % lmax
         kappa = params % kappa
 
+        associate(vgrid => constants % vgrid, &
+                & tmp_grid => workspace % tmp_grid)
+            !$omp parallel do collapse(2) default(none) &
+            !$omp firstprivate(nsph,ngrid,nbasis) shared(x,tmp_grid,vgrid) &
+            !$omp private(isph,igrid) schedule(static,100)
+            do isph = 1, nsph
+                do igrid = 1, ngrid
+                    tmp_grid(igrid,isph) = dot_product(x(:, isph), &
+                        & vgrid(:nbasis, igrid))
+                end do
+            end do
+        end associate
+
         associate(inl => constants % inl, nl => constants % nl, &
                 & csph => params % csph, rsph => params % rsph, &
                 & cgrid => constants % cgrid, fi => constants % fi, &
@@ -1104,9 +1111,12 @@ subroutine prec_tstarx(params, constants, workspace, x, y, ddx_error)
     y(:,:,1) = x(:,:,1)
     call convert_ddcosmo(params, constants, 1, y(:,:,1))
     n_iter = params % maxiter
-    call jacobi_diis(params, constants, workspace, constants % inner_tol, &
-        & y(:,:,1), workspace % ddcosmo_guess, n_iter, x_rel_diff, lstarx, &
+    ! the empirical 10^-2 factor reduces the number of macro iterations
+    call jacobi_diis(params, constants, workspace, &
+        & constants % inner_tol*1.0d-2, y(:,:,1), &
+        & workspace % ddcosmo_guess, n_iter, x_rel_diff, lstarx, &
         & ldm1x, hnorm, ddx_error)
+    write(6,*) "lstarx", x_rel_diff(1:n_iter)
     if (ddx_error % flag .ne. 0) then
         call update_error(ddx_error, 'prec_tstarx: ddCOSMO failed to ' // &
             & 'converge, exiting')
@@ -1117,9 +1127,10 @@ subroutine prec_tstarx(params, constants, workspace, x, y, ddx_error)
 
     start_time = omp_get_wtime()
     n_iter = params % maxiter
-    call jacobi_diis(params, constants, workspace, constants % inner_tol, &
-        & x(:,:,2), workspace % hsp_guess, n_iter, x_rel_diff, bstarx, &
-        & bx_prec, hnorm, ddx_error)
+    call jacobi_diis(params, constants, workspace, &
+        & constants % inner_tol*1.0d1, x(:,:,2), workspace % hsp_guess, &
+        & n_iter, x_rel_diff, bstarx, bx_prec, hnorm, ddx_error)
+    write(6,*) "bstarx", x_rel_diff(1:n_iter)
     if (ddx_error % flag .ne. 0) then
         call update_error(ddx_error, 'prec_tstarx: HSP failed to ' // &
             & 'converge, exiting')
@@ -1153,14 +1164,15 @@ subroutine prec_tx(params, constants, workspace, x, y, ddx_error)
     ! perform A^-1 * Yr
     start_time = omp_get_wtime()
     n_iter = params % maxiter
-    call jacobi_diis(params, constants, workspace, constants % inner_tol, &
-        & x(:,:,1), workspace % ddcosmo_guess, n_iter, x_rel_diff, lx, &
-        & ldm1x, hnorm, ddx_error)
+    call jacobi_diis(params, constants, workspace, &
+        & constants % inner_tol, x(:,:,1), workspace % ddcosmo_guess, &
+        & n_iter, x_rel_diff, lx, ldm1x, hnorm, ddx_error)
     if (ddx_error % flag .ne. 0) then
         call update_error(ddx_error, 'prec_tx: ddCOSMO failed to ' // &
             & 'converge, exiting')
         return
     end if
+    write(6,*) "lx", x_rel_diff(1:n_iter)
 
     ! Scale by the factor of (2l+1)/4Pi
     y(:,:,1) = workspace % ddcosmo_guess
@@ -1170,11 +1182,12 @@ subroutine prec_tx(params, constants, workspace, x, y, ddx_error)
     ! perform B^-1 * Ye
     start_time = omp_get_wtime()
     n_iter = params % maxiter
-    call jacobi_diis(params, constants, workspace, constants % inner_tol, &
-        & x(:,:,2), workspace % hsp_guess, n_iter, x_rel_diff, bx, &
-        & bx_prec, hnorm, ddx_error)
+    call jacobi_diis(params, constants, workspace, &
+        & constants % inner_tol, x(:,:,2), workspace % hsp_guess, &
+        & n_iter, x_rel_diff, bx, bx_prec, hnorm, ddx_error)
     y(:,:,2) = workspace % hsp_guess
     workspace % hsp_time = workspace % hsp_time + omp_get_wtime() - start_time
+    write(6,*) "bx", x_rel_diff(1:n_iter)
 
     if (ddx_error % flag .ne. 0) then
         call update_error(ddx_error, 'prec_tx: HSP failed to ' // &
@@ -1202,6 +1215,8 @@ subroutine cstarx(params, constants, workspace, x, y, ddx_error)
     real(dp) :: val, epsilon_ratio
     real(dp), allocatable :: scratch(:,:), scratch0(:,:)
 
+    call time_push()
+
     ! dummy operation on unused interface arguments
     if (ddx_error % flag .eq. 0) continue
 
@@ -1297,6 +1312,8 @@ subroutine cstarx(params, constants, workspace, x, y, ddx_error)
         end do
     end do
 
+    call time_pull("cstarx")
+
 end subroutine cstarx
 
 !> ddLPB matrix-vector product
diff --git a/src/ddx_solvers.f90 b/src/ddx_solvers.f90
index 7e5806d0..a58ba1fa 100644
--- a/src/ddx_solvers.f90
+++ b/src/ddx_solvers.f90
@@ -331,7 +331,7 @@ subroutine jacobi_diis_external(params, constants, workspace, n, tol, rhs, x, n_
             rel_diff = diff / norm
         end if
         x_rel_diff(it) = rel_diff
-        constants % inner_tol = max(rel_diff*sqrt(tol), tol/100.0d0)
+        constants % inner_tol = max(rel_diff*1d-2, tol*1d-2)
         ! update
         x = x_new
         ! EXIT Jacobi loop here

From 0ec7779cb93dd4b699cbb70b2fe60e2c95aef044 Mon Sep 17 00:00:00 2001
From: Michele Nottoli <michele.nottoli@gmail.com>
Date: Tue, 14 May 2024 15:14:09 +0200
Subject: [PATCH 18/21] Removed allocations in lx and bx.

---
 src/ddx_operators.f90 | 10 ++--------
 1 file changed, 2 insertions(+), 8 deletions(-)

diff --git a/src/ddx_operators.f90 b/src/ddx_operators.f90
index e39714b5..bbbf4e7e 100644
--- a/src/ddx_operators.f90
+++ b/src/ddx_operators.f90
@@ -38,11 +38,9 @@ subroutine lx(params, constants, workspace, x, y, ddx_error)
     real(dp) :: vij(3), tij, vvij, xij, oij, fac
 
     integer :: vgrid_nbasis, nbasis, ngrid, nsph, lmax
-    real(dp), allocatable :: tmp_grid(:)
-    real(dp) :: se, eta, work(params % lmax + 1)
+    real(dp) :: se, eta, work(params % lmax + 1), tmp_grid(params % ngrid)
 
     call time_push()
-    allocate(tmp_grid(params % ngrid))
 
     ! dummy operation on unused interface arguments
     if (ddx_error % flag .eq. 0) continue
@@ -134,7 +132,6 @@ subroutine lx(params, constants, workspace, x, y, ddx_error)
         end associate
     end if
     call time_pull("lx")
-    deallocate(tmp_grid)
 end subroutine lx
 
 !> Adjoint single layer operator matvec
@@ -962,12 +959,9 @@ subroutine bx(params, constants, workspace, x, y, ddx_error)
     integer :: nsph, nbasis, lmax, ngrid, vgrid_nbasis
     real(dp) :: se, eta, vij(3), vvij, tij, xij, oij, vtij(3), kappa
     complex(dp) :: work_complex(params % lmax+1)
-    real(dp) :: work(params % lmax+1)
-
-    real(dp), allocatable :: tmp_grid(:)
+    real(dp) :: work(params % lmax+1), tmp_grid(params % ngrid)
 
     call time_push()
-    allocate(tmp_grid(params % ngrid))
 
     ! dummy operation on unused interface arguments
     if (ddx_error % flag .eq. 0) continue

From ae929288dfa7b3d9098f5024c8446d11ae10d5ae Mon Sep 17 00:00:00 2001
From: Michele Nottoli <michele.nottoli@gmail.com>
Date: Tue, 14 May 2024 17:32:59 +0200
Subject: [PATCH 19/21] Parallelized cx.

---
 src/ddx_core.f90      | 134 ++++++++++++++++++++++++------------------
 src/ddx_operators.f90 | 122 +++++++++++++++++++++++++-------------
 2 files changed, 157 insertions(+), 99 deletions(-)

diff --git a/src/ddx_core.f90 b/src/ddx_core.f90
index 6f8dfa7d..eb6b5b58 100644
--- a/src/ddx_core.f90
+++ b/src/ddx_core.f90
@@ -2112,40 +2112,48 @@ subroutine tree_m2l_bessel_rotation(params, constants, node_m, node_l)
     real(dp) :: work(6*params % pm**2 + 19*params % pm + 8)
     complex(dp) :: work_complex(2*params % pm+1)
     ! Local variables
-    integer :: i, j, k
-    real(dp) :: c1(3), c(3), r1, r
+    integer :: i, j, k, nclusters, pm
+    real(dp) :: c1(3), c(3), r1, r, kappa
     ! Any order of this cycle is OK
-    !$omp parallel do default(none) shared(constants,params,node_m,node_l) &
-    !$omp private(i,c,r,k,c1,r1,work,work_complex) schedule(dynamic)
-    do i = 1, constants % nclusters
-        ! If no far admissible pairs just set output to zero
-        if (constants % nfar(i) .eq. 0) then
-            node_l(:, i) = zero
-            cycle
-        end if
-        c = constants % cnode(:, i)
-        r = constants % rnode(i)
-        ! Use the first far admissible pair to initialize output
-        k = constants % far(constants % sfar(i))
-        c1 = constants % cnode(:, k)
-        r1 = constants % rnode(k)
-        c1 = params % kappa*(c1 - c)
-        call fmm_m2l_bessel_rotation_work(c1, &
-            & constants % SK_rnode(:, k), constants % SI_rnode(:, i), &
-            & params % pm, &
-            & constants % vscales, one, &
-            & node_m(:, k), zero, node_l(:, i), work, work_complex)
-        do j = constants % sfar(i)+1, constants % sfar(i+1)-1
-            k = constants % far(j)
-            c1 = constants % cnode(:, k)
-            r1 = constants % rnode(k)
-            c1 = params % kappa*(c1 - c)
-            call fmm_m2l_bessel_rotation_work(c1, constants % SK_rnode(:, k), &
-                & constants % SI_rnode(:, i), params % pm, &
-                & constants % vscales, one, &
-                & node_m(:, k), one, node_l(:, i), work, work_complex)
+    nclusters = constants % nclusters
+    kappa = params % kappa
+    pm = params % pm
+    associate(nfar => constants % nfar, cnode => constants % cnode, &
+            & rnode => constants % rnode, far => constants % far, &
+            & sfar => constants % sfar, sk_rnode => constants % sk_rnode, &
+            & si_rnode => constants % si_rnode, &
+            & vscales => constants % vscales)
+        !$omp parallel do schedule(dynamic,1) firstprivate(nclusters,kappa,pm) &
+        !$omp private(i,c,r,k,c1,r1,j,work,work_complex) &
+        !$omp shared(nfar,node_l,cnode,rnode, &
+        !$omp far,sfar,sk_rnode,si_rnode,vscales,node_m)
+        do i = 1, nclusters
+            ! If no far admissible pairs just set output to zero
+            if (nfar(i) .eq. 0) then
+                node_l(:, i) = zero
+                cycle
+            end if
+            c = cnode(:, i)
+            r = rnode(i)
+            ! Use the first far admissible pair to initialize output
+            k = far(sfar(i))
+            c1 = cnode(:, k)
+            r1 = rnode(k)
+            c1 = kappa*(c1 - c)
+            call fmm_m2l_bessel_rotation_work(c1, SK_rnode(:, k), &
+                & SI_rnode(:, i), pm, vscales, 1.0d0, node_m(:, k), &
+                & 0.0d0, node_l(:, i), work, work_complex)
+            do j = sfar(i)+1, sfar(i+1)-1
+                k = far(j)
+                c1 = cnode(:, k)
+                r1 = rnode(k)
+                c1 = kappa*(c1 - c)
+                call fmm_m2l_bessel_rotation_work(c1, SK_rnode(:, k), &
+                    & SI_rnode(:, i), pm, vscales, 1.0d0, node_m(:, k), &
+                    & 1.0d0, node_l(:, i), work, work_complex)
+            end do
         end do
-    end do
+    end associate
 end subroutine tree_m2l_bessel_rotation
 !------------------------------------------------------------------------------
 !> Adjoint transfer multipole local coefficients into local over a tree
@@ -2465,8 +2473,8 @@ subroutine tree_m2p_bessel(params, constants, p, alpha, sph_p, sph_m, beta, grid
     ! Output
     real(dp), intent(inout) :: grid_v(params % ngrid, params % nsph)
     ! Local variables
-    integer :: isph, inode, jnear, jnode, jsph, igrid
-    real(dp) :: c(3)
+    integer :: isph, inode, jnear, jnode, jsph, igrid, nsph, ngrid
+    real(dp) :: c(3), kappa
     ! Temporary workspace
     real(dp) :: work(p+1)
     complex(dp) :: work_complex(p+1)
@@ -2476,32 +2484,44 @@ subroutine tree_m2p_bessel(params, constants, p, alpha, sph_p, sph_m, beta, grid
     else
         grid_v = beta * grid_v
     end if
+    nsph = params % nsph
+    ngrid = params % ngrid
+    kappa = params % kappa
     ! Cycle over all spheres
-    !$omp parallel do default(none) shared(params,constants,grid_v,p, &
-    !$omp alpha,sph_m), private(isph,inode,jnear,jnode,jsph,igrid,c,work, &
-    !$omp work_complex) schedule(dynamic)
-    do isph = 1, params % nsph
-        ! Cycle over all near-field admissible pairs of spheres
-        inode = constants % snode(isph)
-        do jnear = constants % snear(inode), constants % snear(inode+1)-1
-            ! Near-field interactions are possible only between leaf nodes,
-            ! which must contain only a single input sphere
-            jnode = constants % near(jnear)
-            jsph = constants % order(constants % cluster(1, jnode))
-            ! Ignore self-interaction
-            !if(isph .eq. jsph) cycle
-            ! Accumulate interaction for external grid points only
-            do igrid = 1, params % ngrid
-                if(constants % ui(igrid, isph) .eq. zero) cycle
-                c = constants % cgrid(:, igrid)*params % rsph(isph) - &
-                    & params % csph(:, jsph) + params % csph(:, isph)
-                c = c * params % kappa
-                call fmm_m2p_bessel_work(c, p, constants % vscales, &
-                    & constants % SK_ri(:, jsph), alpha, sph_m(:, jsph), one, &
-                    & grid_v(igrid, isph), work_complex, work)
+    associate(snode => constants % snode, snear => constants % snear, &
+            & near => constants % near, order => constants % order, &
+            & cluster => constants % cluster, ui => constants % ui, &
+            & cgrid => constants % cgrid, rsph => params % rsph, &
+            & csph => params % csph, vscales => constants % vscales, &
+            & sk_ri => constants % sk_ri)
+        !$omp parallel do schedule(dynamic,1) default(none) &
+        !$omp firstprivate(nsph,ngrid,kappa,alpha,p) &
+        !$omp private(isph,inode,jnear,jnode,jsph,igrid,c,work_complex, &
+        !$omp work) shared(snode,snear,near,order,cluster,ui,cgrid,rsph, &
+        !$omp vscales,sk_ri,sph_m,grid_v)
+        do isph = 1, nsph
+            ! Cycle over all near-field admissible pairs of spheres
+            inode = snode(isph)
+            do jnear = snear(inode), snear(inode+1)-1
+                ! Near-field interactions are possible only between leaf nodes,
+                ! which must contain only a single input sphere
+                jnode = near(jnear)
+                jsph = order(cluster(1, jnode))
+                ! Ignore self-interaction
+                !if(isph .eq. jsph) cycle
+                ! Accumulate interaction for external grid points only
+                do igrid = 1, ngrid
+                    if(ui(igrid, isph) .eq. zero) cycle
+                    c = cgrid(:, igrid)*rsph(isph) - csph(:, jsph) &
+                        & + csph(:, isph)
+                    c = c * kappa
+                    call fmm_m2p_bessel_work(c, p, vscales, &
+                        & sk_ri(:, jsph), alpha, sph_m(:, jsph), 1.0d0, &
+                        & grid_v(igrid, isph), work_complex, work)
+                end do
             end do
         end do
-    end do
+    end associate
 end subroutine tree_m2p_bessel
 
 !------------------------------------------------------------------------------
diff --git a/src/ddx_operators.f90 b/src/ddx_operators.f90
index bbbf4e7e..b4ecf8b5 100644
--- a/src/ddx_operators.f90
+++ b/src/ddx_operators.f90
@@ -1110,7 +1110,6 @@ subroutine prec_tstarx(params, constants, workspace, x, y, ddx_error)
         & constants % inner_tol*1.0d-2, y(:,:,1), &
         & workspace % ddcosmo_guess, n_iter, x_rel_diff, lstarx, &
         & ldm1x, hnorm, ddx_error)
-    write(6,*) "lstarx", x_rel_diff(1:n_iter)
     if (ddx_error % flag .ne. 0) then
         call update_error(ddx_error, 'prec_tstarx: ddCOSMO failed to ' // &
             & 'converge, exiting')
@@ -1124,7 +1123,6 @@ subroutine prec_tstarx(params, constants, workspace, x, y, ddx_error)
     call jacobi_diis(params, constants, workspace, &
         & constants % inner_tol*1.0d1, x(:,:,2), workspace % hsp_guess, &
         & n_iter, x_rel_diff, bstarx, bx_prec, hnorm, ddx_error)
-    write(6,*) "bstarx", x_rel_diff(1:n_iter)
     if (ddx_error % flag .ne. 0) then
         call update_error(ddx_error, 'prec_tstarx: HSP failed to ' // &
             & 'converge, exiting')
@@ -1166,7 +1164,6 @@ subroutine prec_tx(params, constants, workspace, x, y, ddx_error)
             & 'converge, exiting')
         return
     end if
-    write(6,*) "lx", x_rel_diff(1:n_iter)
 
     ! Scale by the factor of (2l+1)/4Pi
     y(:,:,1) = workspace % ddcosmo_guess
@@ -1181,7 +1178,6 @@ subroutine prec_tx(params, constants, workspace, x, y, ddx_error)
         & n_iter, x_rel_diff, bx, bx_prec, hnorm, ddx_error)
     y(:,:,2) = workspace % hsp_guess
     workspace % hsp_time = workspace % hsp_time + omp_get_wtime() - start_time
-    write(6,*) "bx", x_rel_diff(1:n_iter)
 
     if (ddx_error % flag .ne. 0) then
         call update_error(ddx_error, 'prec_tx: HSP failed to ' // &
@@ -1326,6 +1322,8 @@ subroutine cx(params, constants, workspace, x, y, ddx_error)
     complex(dp) :: work_complex(constants % lmax0 + 1)
     real(dp) :: work(constants % lmax0 + 1)
     integer :: indl, inode, info
+    real(dp) :: epsp, eps, fac
+    integer :: nsph, lmax, nbasis, nbasis0, lmax0, ngrid
 
     real(dp), allocatable :: diff_re(:,:), diff0(:,:)
 
@@ -1337,44 +1335,65 @@ subroutine cx(params, constants, workspace, x, y, ddx_error)
         return
     end if
 
-    ! diff_re = params % epsp/eps*l1/ri*Xr - i'(ri)/i(ri)*Xe,
-    diff_re = zero
-    !$omp parallel do default(none) shared(params,diff_re, &
-    !$omp constants,x) private(jsph,l,m,ind)
-    do jsph = 1, params % nsph
-      do l = 0, params % lmax
-        do m = -l,l
-          ind = l**2 + l + m + 1
-          diff_re(ind,jsph) = (params % epsp/params % eps)* &
-              & (l/params % rsph(jsph))*x(ind,jsph,1) &
-              & - constants % termimat(l,jsph)*x(ind,jsph,2)
+    nsph = params % nsph
+    lmax = params % lmax
+    lmax0 = constants % lmax0
+    epsp = params % epsp
+    eps = params % eps
+    nbasis = constants % nbasis
+    nbasis0 = constants % nbasis0
+    ngrid = params % ngrid
+
+    call time_push()
+    associate(rsph => params % rsph, termimat => constants % termimat)
+        !$omp parallel do collapse(2) default(none) schedule(static,100) &
+        !$omp firstprivate(nsph,lmax,epsp,eps) private(jsph,l,ind,m) &
+        !$omp shared(diff_re,rsph,x,termimat)
+        do jsph = 1, nsph
+            do l = 0, lmax
+                ind = l**2 + l + 1
+                do m = -l, l
+                    diff_re(ind+m,jsph) = (epsp/eps)*(l/rsph(jsph)) &
+                        & *x(ind+m,jsph,1) - termimat(l,jsph)*x(ind+m,jsph,2)
+                end do
+            end do
         end do
-      end do
-    end do
+    end associate
+    call time_pull("cx1")
+    call time_push()
 
     ! diff0 = Pchi * diff_er, linear scaling
-    !$omp parallel do default(none) shared(constants,params, &
-    !$omp diff_re,diff0) private(jsph)
-    do jsph = 1, params % nsph
-      call dgemv('t', constants % nbasis, constants % nbasis0, one, &
-          & constants % pchi(1,1,jsph), constants % nbasis, &
-          & diff_re(1,jsph), 1, zero, diff0(1,jsph), 1)
-    end do
+    associate(pchi => constants % pchi)
+        !$omp parallel do default(none) schedule(static,10) &
+        !$omp firstprivate(nsph,nbasis,nbasis0) private(jsph) &
+        !$omp shared(pchi,diff_re,diff0)
+        do jsph = 1, nsph
+            call dgemv('t', nbasis, nbasis0, 1.0d0, pchi(1,1,jsph), nbasis, &
+                & diff_re(1,jsph), 1, 0.0d0, diff0(1,jsph), 1)
+        end do
+    end associate
+    call time_pull("cx2")
 
+    call time_push()
     ! Multiply diff0 by C_ik inplace
-    do isph = 1, params % nsph
-        do l = 0, constants % lmax0
-            ind0 = l*l+l+1
-            diff0(ind0-l:ind0+l, isph) = diff0(ind0-l:ind0+l, isph) * &
-                & constants % C_ik(l, isph)
+    associate(c_ik => constants % c_ik)
+        !$omp parallel do collapse(2) schedule(static,100) &
+        !$omp firstprivate(nsph,lmax0) private(isph,l,ind,fac,m) &
+        !$omp shared(diff0,c_ik)
+        do isph = 1, nsph
+            do l = 0, lmax0
+                ind = l*l+l+1
+                fac = c_ik(l, isph)
+                do m = -l, l
+                    diff0(ind+m, isph) = diff0(ind+m, isph) * fac
+                end do
+            end do
         end do
-    end do
+    end associate
+    call time_pull("cx3")
     ! avoiding N^2 storage, this code does not use the cached coefY
     y(:,:,1) = zero
     if (params % fmm .eq. 0) then
-        !$omp parallel do default(none) shared(params,constants, &
-        !$omp diff0,y) private(isph,igrid,val,vij,work, &
-        !$omp ind0,ind,vtij,work_complex)
         do isph = 1, params % nsph
             do igrid = 1, params % ngrid
                 if (constants % ui(igrid,isph).gt.zero) then
@@ -1400,6 +1419,7 @@ subroutine cx(params, constants, workspace, x, y, ddx_error)
         end do
     else
         ! Load input harmonics into tree data
+        call time_push()
         workspace % tmp_node_m = zero
         workspace % tmp_node_l = zero
         workspace % tmp_sph = zero
@@ -1424,37 +1444,55 @@ subroutine cx(params, constants, workspace, x, y, ddx_error)
                 workspace % tmp_node_m(:, inode) = workspace % tmp_sph(1:indl, isph)
             end do
         end if
+        call time_pull("cx-fmmprep")
+        call time_push()
         ! Do FMM operations
         call tree_m2m_bessel_rotation(params, constants, workspace % tmp_node_m)
+        call time_pull("cx-m2m")
+        call time_push()
         call tree_m2l_bessel_rotation(params, constants, workspace % tmp_node_m, &
             & workspace % tmp_node_l)
+        call time_pull("cx-m2l")
+        call time_push()
         call tree_l2l_bessel_rotation(params, constants, workspace % tmp_node_l)
+        call time_pull("cx-l2l")
+        call time_push()
         workspace % tmp_grid = zero
         call tree_l2p_bessel(params, constants, one, workspace % tmp_node_l, zero, &
             & workspace % tmp_grid)
+        call time_pull("cx-l2p")
+        call time_push()
         call tree_m2p_bessel(params, constants, constants % lmax0, one, &
             & params % lmax, workspace % tmp_sph, one, &
             & workspace % tmp_grid)
-
-        do isph = 1, params % nsph
-            do igrid = 1, params % ngrid
-                do ind = 1, constants % nbasis
-                    y(ind,isph,1) = y(ind,isph,1) + workspace % tmp_grid(igrid, isph)*&
-                        & constants % vwgrid(ind, igrid)*&
-                        & constants % ui(igrid,isph)
+        call time_pull("cx-m2p")
+        call time_push()
+
+        associate(tmp_grid => workspace % tmp_grid, &
+                 & vwgrid => constants % vwgrid, ui => constants % ui)
+            !$omp parallel do collapse(2) schedule(static,100) &
+            !$omp firstprivate(nsph,ngrid,nbasis), shared(y,tmp_grid, &
+            !$omp vwgrid,ui) private(isph,igrid,ind)
+            do isph = 1, nsph
+                do ind = 1, nbasis
+                    do igrid = 1, ngrid
+                        y(ind,isph,1) = y(ind,isph,1) + tmp_grid(igrid, isph)*&
+                            & vwgrid(ind, igrid)*ui(igrid,isph)
+                    end do
                 end do
             end do
-        end do
+        end associate
+        call time_pull("cx-fmmend")
     end if
 
     y(:,:,2) = y(:,:,1)
 
+    call time_pull("cx")
     deallocate(diff_re, diff0, stat=info)
     if (info.ne.0) then
         call update_error(ddx_error, "Deallocation failed in cx")
         return
     end if
-    call time_pull("cx")
 
 end subroutine cx
 

From 7a139aac88b2c9b1c87873e58afadf7acb2d1d6d Mon Sep 17 00:00:00 2001
From: Michele Nottoli <michele.nottoli@gmail.com>
Date: Wed, 15 May 2024 11:58:44 +0200
Subject: [PATCH 20/21] Update.

---
 src/ddx_core.f90      | 15 ++++++----
 src/ddx_operators.f90 | 67 +++++++++++++++++++++++--------------------
 2 files changed, 45 insertions(+), 37 deletions(-)

diff --git a/src/ddx_core.f90 b/src/ddx_core.f90
index eb6b5b58..42661799 100644
--- a/src/ddx_core.f90
+++ b/src/ddx_core.f90
@@ -2314,7 +2314,7 @@ subroutine tree_l2p_bessel(params, constants, alpha, node_l, beta, grid_v)
     real(dp), intent(inout) :: grid_v(params % ngrid, params % nsph)
     ! Local variables
     real(dp) :: sph_l((params % pl+1)**2, params % nsph)
-    integer :: isph
+    integer :: isph, nsph
     external :: dgemm
     ! Init output
     if (beta .eq. zero) then
@@ -2322,12 +2322,15 @@ subroutine tree_l2p_bessel(params, constants, alpha, node_l, beta, grid_v)
     else
         grid_v = beta * grid_v
     end if
+    nsph = params % nsph
     ! Get data from all clusters to spheres
-    !$omp parallel do default(none) shared(params,constants,node_l,sph_l) &
-    !$omp private(isph) schedule(dynamic)
-    do isph = 1, params % nsph
-        sph_l(:, isph) = node_l(:, constants % snode(isph))
-    end do
+    associate(snode => constants % snode)
+        !$omp parallel do default(none) schedule(static,100) &
+        !$omp shared(node_l,sph_l,snode) private(isph) firstprivate(nsph)
+        do isph = 1, nsph
+            sph_l(:, isph) = node_l(:, snode(isph))
+        end do
+    end associate
     ! Get values at grid points
     call dgemm('T', 'N', params % ngrid, params % nsph, &
         & (params % pl+1)**2, alpha, constants % vgrid, &
diff --git a/src/ddx_operators.f90 b/src/ddx_operators.f90
index b4ecf8b5..7ea32768 100644
--- a/src/ddx_operators.f90
+++ b/src/ddx_operators.f90
@@ -1316,15 +1316,11 @@ subroutine cx(params, constants, workspace, x, y, ddx_error)
     real(dp), dimension(constants % nbasis, params % nsph, 2), intent(out) :: y
     type(ddx_error_type), intent(inout) :: ddx_error
 
-    integer :: isph, jsph, igrid, ind, l, m, ind0
-    real(dp), dimension(3) :: vij, vtij
-    real(dp) :: val
+    integer :: isph, jsph, igrid, ind, l, m, indl, inode, info, &
+        & nsph, lmax, nbasis, nbasis0, lmax0, ngrid
+    real(dp) :: vij(3), vtij(3), val, epsp, eps, fac, &
+        & work(constants % lmax0 + 1)
     complex(dp) :: work_complex(constants % lmax0 + 1)
-    real(dp) :: work(constants % lmax0 + 1)
-    integer :: indl, inode, info
-    real(dp) :: epsp, eps, fac
-    integer :: nsph, lmax, nbasis, nbasis0, lmax0, ngrid
-
     real(dp), allocatable :: diff_re(:,:), diff0(:,:)
 
     call time_push()
@@ -1360,9 +1356,9 @@ subroutine cx(params, constants, workspace, x, y, ddx_error)
         end do
     end associate
     call time_pull("cx1")
-    call time_push()
 
     ! diff0 = Pchi * diff_er, linear scaling
+    call time_push()
     associate(pchi => constants % pchi)
         !$omp parallel do default(none) schedule(static,10) &
         !$omp firstprivate(nsph,nbasis,nbasis0) private(jsph) &
@@ -1374,8 +1370,8 @@ subroutine cx(params, constants, workspace, x, y, ddx_error)
     end associate
     call time_pull("cx2")
 
-    call time_push()
     ! Multiply diff0 by C_ik inplace
+    call time_push()
     associate(c_ik => constants % c_ik)
         !$omp parallel do collapse(2) schedule(static,100) &
         !$omp firstprivate(nsph,lmax0) private(isph,l,ind,fac,m) &
@@ -1391,10 +1387,11 @@ subroutine cx(params, constants, workspace, x, y, ddx_error)
         end do
     end associate
     call time_pull("cx3")
+
     ! avoiding N^2 storage, this code does not use the cached coefY
-    y(:,:,1) = zero
     if (params % fmm .eq. 0) then
         do isph = 1, params % nsph
+            y(:,isph,1) = zero
             do igrid = 1, params % ngrid
                 if (constants % ui(igrid,isph).gt.zero) then
                     val = zero
@@ -1422,27 +1419,33 @@ subroutine cx(params, constants, workspace, x, y, ddx_error)
         call time_push()
         workspace % tmp_node_m = zero
         workspace % tmp_node_l = zero
-        workspace % tmp_sph = zero
-        do isph = 1, params % nsph
-            do l = 0, constants % lmax0
-                ind0 = l*l+l+1
-                workspace % tmp_sph(ind0-l:ind0+l, isph) = &
-                    & diff0(ind0-l:ind0+l, isph)
-            end do
-        end do
         if(constants % lmax0 .lt. params % pm) then
-            do isph = 1, params % nsph
-                inode = constants % snode(isph)
-                workspace % tmp_node_m(1:constants % nbasis0, inode) = &
-                    & workspace % tmp_sph(1:constants % nbasis0, isph)
-                workspace % tmp_node_m(constants % nbasis0+1:, inode) = zero
-            end do
+            associate(tmp_node_m => workspace % tmp_node_m, &
+                    & snode => constants % snode)
+                !$omp parallel do schedule(static,1) default(none) &
+                !$omp firstprivate(nsph,nbasis0) &
+                !$omp shared(snode,tmp_node_m,diff0) &
+                !$omp private(isph,inode)
+                do isph = 1, nsph
+                    inode = snode(isph)
+                    tmp_node_m(1:nbasis0, inode) = &
+                        & diff0(1:nbasis0, isph)
+                    tmp_node_m(nbasis0+1:, inode) = 0.0d0
+                end do
+            end associate
         else
             indl = (params % pm+1)**2
-            do isph = 1, params % nsph
-                inode = constants % snode(isph)
-                workspace % tmp_node_m(:, inode) = workspace % tmp_sph(1:indl, isph)
-            end do
+            associate(tmp_node_m => workspace % tmp_node_m, &
+                    & snode => constants % snode)
+                !$omp parallel do schedule(static,1) default(none) &
+                !$omp firstprivate(nsph,nbasis0) &
+                !$omp shared(snode,tmp_node_m,diff0) &
+                !$omp private(isph,inode,indl)
+                do isph = 1, nsph
+                    inode = snode(isph)
+                    tmp_node_m(:, inode) = diff0(1:indl, isph)
+                end do
+            end associate
         end if
         call time_pull("cx-fmmprep")
         call time_push()
@@ -1472,13 +1475,15 @@ subroutine cx(params, constants, workspace, x, y, ddx_error)
                  & vwgrid => constants % vwgrid, ui => constants % ui)
             !$omp parallel do collapse(2) schedule(static,100) &
             !$omp firstprivate(nsph,ngrid,nbasis), shared(y,tmp_grid, &
-            !$omp vwgrid,ui) private(isph,igrid,ind)
+            !$omp vwgrid,ui) private(isph,igrid,ind,val)
             do isph = 1, nsph
                 do ind = 1, nbasis
+                    val = 0.0d0
                     do igrid = 1, ngrid
-                        y(ind,isph,1) = y(ind,isph,1) + tmp_grid(igrid, isph)*&
+                         val = val + tmp_grid(igrid, isph)*&
                             & vwgrid(ind, igrid)*ui(igrid,isph)
                     end do
+                    y(ind,isph,1) = val
                 end do
             end do
         end associate

From 3e0ba812514202b208924d29ddaeb939feab3afd Mon Sep 17 00:00:00 2001
From: Michele Nottoli <michele.nottoli@gmail.com>
Date: Fri, 24 May 2024 12:50:21 +0200
Subject: [PATCH 21/21] Bug fix.

---
 MANIFEST.in           |  3 ---
 src/ddx_core.f90      |  6 +++---
 src/ddx_operators.f90 | 12 ++++++++++++
 3 files changed, 15 insertions(+), 6 deletions(-)
 delete mode 100644 MANIFEST.in

diff --git a/MANIFEST.in b/MANIFEST.in
deleted file mode 100644
index 588679d7..00000000
--- a/MANIFEST.in
+++ /dev/null
@@ -1,3 +0,0 @@
-global-include src/**/*.h src/**/*.cpp src/**/*.f src/**/*.f90 src/**/*.cmake src/**/CMakeLists.txt
-include CMakeLists.txt
-include README.md
diff --git a/src/ddx_core.f90 b/src/ddx_core.f90
index 42661799..15860ef0 100644
--- a/src/ddx_core.f90
+++ b/src/ddx_core.f90
@@ -2499,9 +2499,9 @@ subroutine tree_m2p_bessel(params, constants, p, alpha, sph_p, sph_m, beta, grid
             & sk_ri => constants % sk_ri)
         !$omp parallel do schedule(dynamic,1) default(none) &
         !$omp firstprivate(nsph,ngrid,kappa,alpha,p) &
-        !$omp private(isph,inode,jnear,jnode,jsph,igrid,c,work_complex, &
-        !$omp work) shared(snode,snear,near,order,cluster,ui,cgrid,rsph, &
-        !$omp vscales,sk_ri,sph_m,grid_v)
+        !$omp private(isph,inode,jnear,jnode,jsph,igrid,c, &
+        !$omp work_complex,work) shared(snode,snear,near,order, &
+        !$omp csph,cluster,ui,cgrid,rsph,vscales,sk_ri,sph_m,grid_v)
         do isph = 1, nsph
             ! Cycle over all near-field admissible pairs of spheres
             inode = snode(isph)
diff --git a/src/ddx_operators.f90 b/src/ddx_operators.f90
index 7ea32768..89d60296 100644
--- a/src/ddx_operators.f90
+++ b/src/ddx_operators.f90
@@ -1419,6 +1419,18 @@ subroutine cx(params, constants, workspace, x, y, ddx_error)
         call time_push()
         workspace % tmp_node_m = zero
         workspace % tmp_node_l = zero
+        workspace % tmp_sph = zero
+        associate(tmp_sph => workspace % tmp_sph)
+            !$omp parallel do collapse(2) schedule(static,100) &
+            !$omp firstprivate(nsph,lmax0), shared(tmp_sph,diff0) &
+            !$omp private(isph,l,ind)
+            do isph = 1, nsph
+                do l = 0, lmax0
+                    ind = l*l+l+1
+                    tmp_sph(ind-l:ind+l, isph) = diff0(ind-l:ind+l, isph)
+                end do
+            end do
+        end associate
         if(constants % lmax0 .lt. params % pm) then
             associate(tmp_node_m => workspace % tmp_node_m, &
                     & snode => constants % snode)