From bc9dba8d22655d39fe2f071f2876b555e3687ee7 Mon Sep 17 00:00:00 2001 From: Michele Nottoli Date: Mon, 15 Apr 2024 15:02:38 +0200 Subject: [PATCH 01/21] Profiling enabled. --- src/CMakeLists.txt | 1 + src/ddx.f90 | 14 +++++++++++++ src/ddx_core.f90 | 1 + src/ddx_driver.f90 | 6 +++++- src/ddx_operators.f90 | 6 +++++- src/ddx_profiling.f90 | 48 +++++++++++++++++++++++++++++++++++++++++++ 6 files changed, 74 insertions(+), 2 deletions(-) create mode 100644 src/ddx_profiling.f90 diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 3d498a52..36af8011 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -17,6 +17,7 @@ set(SRC ddx.f90 ddx_legacy.f90 ddx_multipolar_solutes.f90 + ddx_profiling.f90 llgnew.f cbessel.f90 ddx_cinterface.f90 diff --git a/src/ddx.f90 b/src/ddx.f90 index 4b67ef44..b9ee78ed 100644 --- a/src/ddx.f90 +++ b/src/ddx.f90 @@ -390,8 +390,10 @@ subroutine ddrun(ddx_data, state, electrostatics, psi, tol, esolv, & return end if + call time_push() call setup(ddx_data % params, ddx_data % constants, & & ddx_data % workspace, state, electrostatics, psi, ddx_error) + call time_pull("setup") if (ddx_error % flag .ne. 0) then call update_error(ddx_error, "ddrun: setup returned an error, exiting") return @@ -399,24 +401,30 @@ subroutine ddrun(ddx_data, state, electrostatics, psi, tol, esolv, & ! solve the primal linear system if (do_guess) then + call time_push() call fill_guess(ddx_data % params, ddx_data % constants, & & ddx_data % workspace, state, tol, ddx_error) + call time_pull("guess") if (ddx_error % flag .ne. 0) then call update_error(ddx_error, & & "ddrun: fill_guess returned an error, exiting") return end if end if + call time_push() call solve(ddx_data % params, ddx_data % constants, & & ddx_data % workspace, state, tol, ddx_error) + call time_pull("solve") if (ddx_error % flag .ne. 0) then call update_error(ddx_error, "ddrun: solve returned an error, exiting") return end if ! compute the energy + call time_push() call energy(ddx_data % params, ddx_data % constants, & & ddx_data % workspace, state, esolv, ddx_error) + call time_pull("energy") if (ddx_error % flag .ne. 0) then call update_error(ddx_error, "ddrun: energy returned an error, exiting") return @@ -425,16 +433,20 @@ subroutine ddrun(ddx_data, state, electrostatics, psi, tol, esolv, & ! solve the primal linear system if (ddx_data % params % force .eq. 1) then if (do_guess) then + call time_push() call fill_guess_adjoint(ddx_data % params, ddx_data % constants, & & ddx_data % workspace, state, tol, ddx_error) + call time_pull("guess adjoint") if (ddx_error % flag .ne. 0) then call update_error(ddx_error, & & "ddrun: fill_guess_adjoint returned an error, exiting") return end if end if + call time_push() call solve_adjoint(ddx_data % params, ddx_data % constants, & & ddx_data % workspace, state, tol, ddx_error) + call time_pull("solve adjoint") if (ddx_error % flag .ne. 0) then call update_error(ddx_error, & & "ddrun: solve_adjoint returned an error, exiting") @@ -445,8 +457,10 @@ subroutine ddrun(ddx_data, state, electrostatics, psi, tol, esolv, & ! compute the forces if (ddx_data % params % force .eq. 1) then force = zero + call time_push() call solvation_force_terms(ddx_data % params, ddx_data % constants, & & ddx_data % workspace, state, electrostatics, force, ddx_error) + call time_pull("solvation forces") if (ddx_error % flag .ne. 0) then call update_error(ddx_error, & & "ddrun: solvation_force_terms returned an error, exiting") diff --git a/src/ddx_core.f90 b/src/ddx_core.f90 index cc957e68..1df85ca8 100644 --- a/src/ddx_core.f90 +++ b/src/ddx_core.f90 @@ -21,6 +21,7 @@ module ddx_core use ddx_harmonics ! Enable OpenMP use omp_lib +use ddx_profiling implicit none !> @defgroup Fortran_interface_core Fortran interface: core routines diff --git a/src/ddx_driver.f90 b/src/ddx_driver.f90 index 078551f5..422189b8 100644 --- a/src/ddx_driver.f90 +++ b/src/ddx_driver.f90 @@ -114,6 +114,7 @@ program main finish_time = omp_get_wtime() write(*, 100) "Psi time:", finish_time-start_time, " seconds" +call time_push() if (ddx_data % params % force .eq. 1) then allocate(force(3, ddx_data % params % nsph), stat=info) if (info .ne. 0) then @@ -126,6 +127,7 @@ program main call ddrun(ddx_data, state, electrostatics, psi, tol, esolv, ddx_error) end if call check_error(ddx_error) +call time_pull("ddrun") if (ddx_data % params % force .eq. 1) write(*, 100) & & "solvation force terms time:", state % force_time, " seconds" @@ -134,9 +136,11 @@ program main ! forces. if (ddx_data % params % force .eq. 1) then start_time = omp_get_wtime() + call time_push() call multipole_force_terms(ddx_data % params, ddx_data % constants, & & ddx_data % workspace, state, 0, multipoles, force, ddx_error) - call check_error(ddx_error) + call time_pull("solute forces") + call check_error(ddx_error) finish_time = omp_get_wtime() write(*, 100) "multipolar force terms time:", & & finish_time - start_time, " seconds" diff --git a/src/ddx_operators.f90 b/src/ddx_operators.f90 index dedbf00f..161479ea 100644 --- a/src/ddx_operators.f90 +++ b/src/ddx_operators.f90 @@ -56,6 +56,7 @@ subroutine lx(params, constants, workspace, x, y, ddx_error) end do end do else + call time_push() !$omp parallel do default(none) shared(params,constants,workspace,x,y) & !$omp private(isph,iproc) schedule(dynamic) do isph = 1, params % nsph @@ -69,6 +70,7 @@ subroutine lx(params, constants, workspace, x, y, ddx_error) ! now, fix the sign. y(:, isph) = - y(:, isph) end do + call time_pull("lx") end if ! ! if required, add the diagonal. @@ -768,7 +770,8 @@ subroutine bx(params, constants, workspace, x, y, ddx_error) & one, y(:,isph), 1) end do end do - else + else + call time_push() !$omp parallel do default(none) shared(params,constants,workspace,x,y) & !$omp private(isph,iproc) schedule(dynamic) do isph = 1, params % nsph @@ -778,6 +781,7 @@ subroutine bx(params, constants, workspace, x, y, ddx_error) & constants % vgrid_nbasis, workspace % tmp_pot(:, iproc), y(:,isph)) y(:,isph) = - y(:,isph) end do + call time_pull("bx") end if if (constants % dodiag) y = y + x end subroutine bx diff --git a/src/ddx_profiling.f90 b/src/ddx_profiling.f90 new file mode 100644 index 00000000..b5799d88 --- /dev/null +++ b/src/ddx_profiling.f90 @@ -0,0 +1,48 @@ +module ddx_profiling + !! Unified Input/Output handling across the code. + use ddx_definitions, only: dp + implicit none + private + + integer, parameter:: ntimes = 128 + integer:: tcnt = 1 + real(dp):: times(ntimes) + + public:: time_pull, time_push + + contains + + subroutine time_push() + implicit none + real(dp):: omp_get_wtime + + if(tcnt <= ntimes) then + times(tcnt) = omp_get_wtime() + tcnt = tcnt+1 + else + write(*, *) 'time_push Cannot push another time in the buffer.' + stop 1 + end if + end subroutine + + subroutine time_pull(s) + implicit none + + character(len=*), intent(in):: s + real(dp):: elap + character(len = 2048):: msg + + real(dp):: omp_get_wtime + + if(tcnt > 1) then + elap = omp_get_wtime() - times(tcnt-1) + tcnt = tcnt-1 + write(msg, "(3a, ': ', e14.6E2, ' s')") repeat('-', tcnt), '> ', s, elap + write(*, *) '[fmm-time]', trim(msg) + else + write(*, *) 'time_pull Cannot pull any value.' + stop 1 + end if + end subroutine + +end module ddx_profiling From 63a99cf050e50199e3ef72c6c706ee95d1e87b3a Mon Sep 17 00:00:00 2001 From: Michele Nottoli Date: Mon, 15 Apr 2024 15:54:21 +0200 Subject: [PATCH 02/21] Opt. --- src/ddx_core.f90 | 19 +++++++++++++------ src/ddx_operators.f90 | 21 ++++++++++----------- 2 files changed, 23 insertions(+), 17 deletions(-) diff --git a/src/ddx_core.f90 b/src/ddx_core.f90 index 1df85ca8..6f8dfa7d 100644 --- a/src/ddx_core.f90 +++ b/src/ddx_core.f90 @@ -1391,15 +1391,22 @@ real(dp) function hnorm(lmax, nbasis, nsph, x) implicit none integer, intent(in) :: lmax, nbasis, nsph real(dp), dimension(nbasis, nsph), intent(in) :: x - integer :: isph - real(dp) :: vrms, fac + integer :: isph, l, ind, m + real(dp) :: vrms, fac, tmp - vrms = 0.0_dp + vrms = 0.0d0 !$omp parallel do default(none) shared(nsph,lmax,nbasis,x) & - !$omp private(isph,fac) schedule(dynamic) reduction(+:vrms) + !$omp private(isph,tmp,l,ind,fac,m) schedule(static,1) reduction(+:vrms) do isph = 1, nsph - call hsnorm(lmax, nbasis, x(:,isph), fac) - vrms = vrms + fac*fac + tmp = 0.0d0 + do l = 0, lmax + ind = l*l + l + 1 + fac = 1.0d0/(1.0d0 + dble(l)) + do m = -l, l + tmp = tmp + fac*x(ind+m,isph)*x(ind+m,isph) + end do + end do + vrms = vrms + tmp enddo hnorm = sqrt(vrms/dble(nsph)) end function hnorm diff --git a/src/ddx_operators.f90 b/src/ddx_operators.f90 index 161479ea..1c4e3188 100644 --- a/src/ddx_operators.f90 +++ b/src/ddx_operators.f90 @@ -62,20 +62,19 @@ subroutine lx(params, constants, workspace, x, y, ddx_error) do isph = 1, params % nsph iproc = omp_get_thread_num() + 1 ! Compute NEGATIVE action of off-digonal blocks - call calcv(params, constants, isph, workspace % tmp_pot(:, iproc), x, & + call calcv(params, constants, isph, workspace % tmp_grid(:, isph), x, & & workspace % tmp_work(:, iproc)) - call ddintegrate(1, constants % nbasis, params % ngrid, & - & constants % vwgrid, constants % vgrid_nbasis, & - & workspace % tmp_pot(:, iproc), y(:, isph)) - ! now, fix the sign. - y(:, isph) = - y(:, isph) end do + call ddintegrate(params % nsph, constants % nbasis, params % ngrid, & + & constants % vwgrid, constants % vgrid_nbasis, & + & workspace % tmp_grid, y) + y = - y call time_pull("lx") end if ! ! if required, add the diagonal. ! - if (constants % dodiag) then + if (constants % dodiag) then do isph = 1, params % nsph do l = 0, params % lmax ind = l*l + l + 1 @@ -776,11 +775,11 @@ subroutine bx(params, constants, workspace, x, y, ddx_error) !$omp private(isph,iproc) schedule(dynamic) do isph = 1, params % nsph iproc = omp_get_thread_num() + 1 - call calcv2_lpb(params, constants, isph, workspace % tmp_pot(:, iproc), x) - call ddintegrate(1, constants % nbasis, params % ngrid, constants % vwgrid, & - & constants % vgrid_nbasis, workspace % tmp_pot(:, iproc), y(:,isph)) - y(:,isph) = - y(:,isph) + call calcv2_lpb(params, constants, isph, workspace % tmp_grid(:,isph), x) end do + call ddintegrate(params % nsph, constants % nbasis, params % ngrid, constants % vwgrid, & + & constants % vgrid_nbasis, workspace % tmp_grid, y) + y = - y call time_pull("bx") end if if (constants % dodiag) y = y + x From 57c1fd01fef321065ee77ee33e0d959a59b4216a Mon Sep 17 00:00:00 2001 From: Michele Nottoli Date: Mon, 15 Apr 2024 17:13:44 +0200 Subject: [PATCH 03/21] Further profiling. --- src/ddx_operators.f90 | 13 ++++++++++++- src/ddx_profiling.f90 | 2 +- 2 files changed, 13 insertions(+), 2 deletions(-) diff --git a/src/ddx_operators.f90 b/src/ddx_operators.f90 index 1c4e3188..17a404d4 100644 --- a/src/ddx_operators.f90 +++ b/src/ddx_operators.f90 @@ -36,11 +36,14 @@ subroutine lx(params, constants, workspace, x, y, ddx_error) !! Local variables integer :: isph, jsph, ij, l, ind, iproc + call time_push() ! dummy operation on unused interface arguments if (ddx_error % flag .eq. 0) continue !! Initialize + call time_push() y = zero + call time_pull("zero") ! ! incore code: ! @@ -65,11 +68,16 @@ subroutine lx(params, constants, workspace, x, y, ddx_error) call calcv(params, constants, isph, workspace % tmp_grid(:, isph), x, & & workspace % tmp_work(:, iproc)) end do + call time_pull("calcv") + + call time_push() call ddintegrate(params % nsph, constants % nbasis, params % ngrid, & & constants % vwgrid, constants % vgrid_nbasis, & & workspace % tmp_grid, y) + call time_pull("integrate") + call time_push() y = - y - call time_pull("lx") + call time_pull("sign") end if ! ! if required, add the diagonal. @@ -83,6 +91,7 @@ subroutine lx(params, constants, workspace, x, y, ddx_error) end do end do end if + call time_pull("lx") end subroutine lx !> Adjoint single layer operator matvec @@ -1067,6 +1076,7 @@ subroutine cx(params, constants, workspace, x, y, ddx_error) real(dp), allocatable :: diff_re(:,:), diff0(:,:) + call time_push() allocate(diff_re(constants % nbasis, params % nsph), & & diff0(constants % nbasis0, params % nsph), stat=info) if (info.ne.0) then @@ -1191,6 +1201,7 @@ subroutine cx(params, constants, workspace, x, y, ddx_error) call update_error(ddx_error, "Deallocation failed in cx") return end if + call time_pull("cx") end subroutine cx diff --git a/src/ddx_profiling.f90 b/src/ddx_profiling.f90 index b5799d88..7e056f3c 100644 --- a/src/ddx_profiling.f90 +++ b/src/ddx_profiling.f90 @@ -38,7 +38,7 @@ subroutine time_pull(s) elap = omp_get_wtime() - times(tcnt-1) tcnt = tcnt-1 write(msg, "(3a, ': ', e14.6E2, ' s')") repeat('-', tcnt), '> ', s, elap - write(*, *) '[fmm-time]', trim(msg) + write(*, *) '[ddX-time]', trim(msg) else write(*, *) 'time_pull Cannot pull any value.' stop 1 From 58493f8b64be6b98b422a5dc79e2016958d1df61 Mon Sep 17 00:00:00 2001 From: Michele Nottoli Date: Tue, 16 Apr 2024 10:50:51 +0200 Subject: [PATCH 04/21] Slight improvement. --- src/ddx_operators.f90 | 40 +++++++++++++++++++++++++++++++++------- 1 file changed, 33 insertions(+), 7 deletions(-) diff --git a/src/ddx_operators.f90 b/src/ddx_operators.f90 index 17a404d4..2caefb59 100644 --- a/src/ddx_operators.f90 +++ b/src/ddx_operators.f90 @@ -34,7 +34,8 @@ subroutine lx(params, constants, workspace, x, y, ddx_error) real(dp), intent(out) :: y(constants % nbasis, params % nsph) type(ddx_error_type), intent(inout) :: ddx_error !! Local variables - integer :: isph, jsph, ij, l, ind, iproc + integer :: isph, jsph, ij, l, ind, iproc, its + real(dp) :: vij(3), tij, vvij, xij, oij, thigh call time_push() ! dummy operation on unused interface arguments @@ -60,13 +61,38 @@ subroutine lx(params, constants, workspace, x, y, ddx_error) end do else call time_push() - !$omp parallel do default(none) shared(params,constants,workspace,x,y) & - !$omp private(isph,iproc) schedule(dynamic) + thigh = one + pt5*(params % se + one)*params % eta + !$omp parallel do collapse(2) default(none) shared(params,constants,workspace,x,y,thigh) & + !$omp private(isph,iproc,its,ij,jsph,vij,vvij,tij,xij,oij) schedule(static,1) do isph = 1, params % nsph - iproc = omp_get_thread_num() + 1 - ! Compute NEGATIVE action of off-digonal blocks - call calcv(params, constants, isph, workspace % tmp_grid(:, isph), x, & - & workspace % tmp_work(:, iproc)) + do its = 1, params % ngrid + workspace % tmp_grid(its, isph) = zero + iproc = omp_get_thread_num() + 1 + ! contribution from integration point present + if (constants % ui(its,isph).lt.one) then + ! loop over neighbors of i-sphere + do ij = constants % inl(isph), constants % inl(isph+1)-1 + jsph = constants % nl(ij) + ! compute t_n^ij = | r_i + \rho_i s_n - r_j | / \rho_j + vij = params % csph(:,isph) + params % rsph(isph)* & + & constants % cgrid(:,its) - params % csph(:,jsph) + vvij = sqrt(vij(1)*vij(1) + vij(2)*vij(2) + vij(3)*vij(3)) + tij = vvij / params % rsph(jsph) + ! point is INSIDE j-sphere + if (tij.lt.thigh .and. tij.gt.zero) then + xij = fsw(tij, params % se, params % eta) + if (constants % fi(its,isph).gt.one) then + oij = xij / constants % fi(its,isph) + else + oij = xij + end if + call fmm_l2p_work(vij, params % rsph(jsph), params % lmax, & + & constants % vscales_rel, oij, x(:, jsph), one, & + & workspace % tmp_grid(its, isph), workspace % tmp_work(:, iproc)) + end if + end do + end if + end do end do call time_pull("calcv") From 0535cb269605ece136a8117d5fab8e1adfe07e7b Mon Sep 17 00:00:00 2001 From: Michele Nottoli Date: Tue, 16 Apr 2024 12:45:26 +0200 Subject: [PATCH 05/21] Big improvement by precomputing buried region. --- src/ddx_constants.f90 | 67 ++++++++++++++++++++++++++++++++++++++++++- src/ddx_operators.f90 | 55 +++++++++++++++++------------------ 2 files changed, 93 insertions(+), 29 deletions(-) diff --git a/src/ddx_constants.f90 b/src/ddx_constants.f90 index 66577adc..d798b311 100644 --- a/src/ddx_constants.f90 +++ b/src/ddx_constants.f90 @@ -223,6 +223,12 @@ module ddx_constants !> Whether the diagonal of the matrices has to be used in the mvp for !! ddCOSMO, ddPCM or inner ddLPB iterations logical :: dodiag + !> List of exposed points in a CSR format. + integer, allocatable :: iexposed(:) + integer, allocatable :: exposed(:) + !> List of buried points in a CSR format. + integer, allocatable :: iburied(:) + integer, allocatable :: buried(:) end type ddx_constants_type contains @@ -423,7 +429,7 @@ subroutine constants_init(params, constants, ddx_error) end do end if - ! Generate geometry-related constants (required by the LPB code) + ! Generate geometry-related constants call constants_geometry_init(params, constants, ddx_error) if (ddx_error % flag .ne. 0) then call update_error(ddx_error, "constants_geometry_init returned an " // & @@ -992,6 +998,37 @@ subroutine constants_geometry_init(params, constants, ddx_error) end if enddo enddo + + allocate(constants % iburied(params % nsph + 1), & + & constants % iexposed(params % nsph + 1), & + & constants % buried(params % nsph * params % ngrid), & + & constants % exposed(params % nsph * params % ngrid), stat=info) + if (info .ne. 0) then + call update_error(ddx_error, "Buried exposed lists allocation failed") + return + endif + iwork = 1 + do isph = 1, params % nsph + constants % iburied(isph) = iwork + do igrid = 1, params % ngrid + if (constants % ui(igrid, isph) .lt. one) then + write(7, *) isph, igrid + constants % buried(iwork) = igrid + iwork = iwork + 1 + end if + end do + end do + constants % iburied(params % nsph + 1) = iwork + + !!do isph = 1, params % nsph + !! write(6,*) isph, constants % iburied(isph), constants % iburied(isph + 1) - 1 + !! do iwork = constants % iburied(isph), constants % iburied(isph + 1) - 1 + !! igrid = constants % buried(iwork) + !! write(8, *) isph, igrid + !! end do + !!end do + !!stop 0 + ! Build cavity array. At first get total count for each sphere allocate(constants % ncav_sph(params % nsph), stat=info) if (info .ne. 0) then @@ -2144,6 +2181,34 @@ subroutine constants_free(constants, ddx_error) & "deallocation failed!") end if end if + if (allocated(constants % iburied)) then + deallocate(constants % iburied, stat=istat) + if (istat .ne. 0) then + call update_error(ddx_error, "`iburied` " // & + & "deallocation failed!") + end if + end if + if (allocated(constants % buried)) then + deallocate(constants % buried, stat=istat) + if (istat .ne. 0) then + call update_error(ddx_error, "`buried` " // & + & "deallocation failed!") + end if + end if + if (allocated(constants % iexposed)) then + deallocate(constants % iexposed, stat=istat) + if (istat .ne. 0) then + call update_error(ddx_error, "`iexposed` " // & + & "deallocation failed!") + end if + end if + if (allocated(constants % exposed)) then + deallocate(constants % exposed, stat=istat) + if (istat .ne. 0) then + call update_error(ddx_error, "`exposed` " // & + & "deallocation failed!") + end if + end if end subroutine constants_free end module ddx_constants diff --git a/src/ddx_operators.f90 b/src/ddx_operators.f90 index 2caefb59..9175fce8 100644 --- a/src/ddx_operators.f90 +++ b/src/ddx_operators.f90 @@ -34,7 +34,7 @@ subroutine lx(params, constants, workspace, x, y, ddx_error) real(dp), intent(out) :: y(constants % nbasis, params % nsph) type(ddx_error_type), intent(inout) :: ddx_error !! Local variables - integer :: isph, jsph, ij, l, ind, iproc, its + integer :: isph, jsph, ij, l, ind, iproc, its, i real(dp) :: vij(3), tij, vvij, xij, oij, thigh call time_push() @@ -62,36 +62,35 @@ subroutine lx(params, constants, workspace, x, y, ddx_error) else call time_push() thigh = one + pt5*(params % se + one)*params % eta - !$omp parallel do collapse(2) default(none) shared(params,constants,workspace,x,y,thigh) & - !$omp private(isph,iproc,its,ij,jsph,vij,vvij,tij,xij,oij) schedule(static,1) + workspace % tmp_grid = zero + !$omp parallel do default(none) shared(params, & + !$omp constants,workspace,x,y,thigh) private(isph,iproc,its,ij, & + !$omp jsph,vij,vvij,tij,xij,oij,i) schedule(static,1) do isph = 1, params % nsph - do its = 1, params % ngrid - workspace % tmp_grid(its, isph) = zero + do i = constants % iburied(isph), constants % iburied(isph + 1) - 1 + its = constants % buried(i) iproc = omp_get_thread_num() + 1 - ! contribution from integration point present - if (constants % ui(its,isph).lt.one) then - ! loop over neighbors of i-sphere - do ij = constants % inl(isph), constants % inl(isph+1)-1 - jsph = constants % nl(ij) - ! compute t_n^ij = | r_i + \rho_i s_n - r_j | / \rho_j - vij = params % csph(:,isph) + params % rsph(isph)* & - & constants % cgrid(:,its) - params % csph(:,jsph) - vvij = sqrt(vij(1)*vij(1) + vij(2)*vij(2) + vij(3)*vij(3)) - tij = vvij / params % rsph(jsph) - ! point is INSIDE j-sphere - if (tij.lt.thigh .and. tij.gt.zero) then - xij = fsw(tij, params % se, params % eta) - if (constants % fi(its,isph).gt.one) then - oij = xij / constants % fi(its,isph) - else - oij = xij - end if - call fmm_l2p_work(vij, params % rsph(jsph), params % lmax, & - & constants % vscales_rel, oij, x(:, jsph), one, & - & workspace % tmp_grid(its, isph), workspace % tmp_work(:, iproc)) + write(6,*) "thread", iproc, "working on", isph, its + do ij = constants % inl(isph), constants % inl(isph+1)-1 + jsph = constants % nl(ij) + ! compute t_n^ij = | r_i + \rho_i s_n - r_j | / \rho_j + vij = params % csph(:,isph) + params % rsph(isph)* & + & constants % cgrid(:,its) - params % csph(:,jsph) + vvij = sqrt(vij(1)*vij(1) + vij(2)*vij(2) + vij(3)*vij(3)) + tij = vvij / params % rsph(jsph) + ! point is INSIDE j-sphere + if (tij.lt.thigh .and. tij.gt.zero) then + xij = fsw(tij, params % se, params % eta) + if (constants % fi(its,isph).gt.one) then + oij = xij / constants % fi(its,isph) + else + oij = xij end if - end do - end if + call fmm_l2p_work(vij, params % rsph(jsph), params % lmax, & + & constants % vscales_rel, oij, x(:, jsph), one, & + & workspace % tmp_grid(its, isph), workspace % tmp_work(:, iproc)) + end if + end do end do end do call time_pull("calcv") From a95dc8a5b3c7b2bb7754320d214a4cc1b58ac672 Mon Sep 17 00:00:00 2001 From: Michele Nottoli Date: Mon, 22 Apr 2024 16:07:40 +0200 Subject: [PATCH 06/21] Good parallel scaling. --- src/ddx_operators.f90 | 103 +++++++++++++++++++++++------------------- 1 file changed, 57 insertions(+), 46 deletions(-) diff --git a/src/ddx_operators.f90 b/src/ddx_operators.f90 index 9175fce8..f24e6538 100644 --- a/src/ddx_operators.f90 +++ b/src/ddx_operators.f90 @@ -34,21 +34,22 @@ subroutine lx(params, constants, workspace, x, y, ddx_error) real(dp), intent(out) :: y(constants % nbasis, params % nsph) type(ddx_error_type), intent(inout) :: ddx_error !! Local variables - integer :: isph, jsph, ij, l, ind, iproc, its, i + integer :: isph, jsph, ij, l, ind, its, i real(dp) :: vij(3), tij, vvij, xij, oij, thigh + integer :: vgrid_nbasis, nbasis, ngrid, nsph, lmax + real(dp), allocatable :: tmp_grid(:) + real(dp) :: se, eta, work(params % lmax + 1) + + allocate(tmp_grid(params % ngrid)) + call time_push() ! dummy operation on unused interface arguments if (ddx_error % flag .eq. 0) continue + if (workspace % xs_time .eq. 0) continue - !! Initialize - call time_push() - y = zero - call time_pull("zero") -! -! incore code: -! if (params % matvecmem .eq. 1) then + y = zero !$omp parallel do default(none) shared(params,constants,x,y) & !$omp private(isph,ij,jsph) schedule(dynamic) do isph = 1, params % nsph @@ -61,48 +62,57 @@ subroutine lx(params, constants, workspace, x, y, ddx_error) end do else call time_push() + + ! set up the environment for a good usage of open mp thigh = one + pt5*(params % se + one)*params % eta - workspace % tmp_grid = zero - !$omp parallel do default(none) shared(params, & - !$omp constants,workspace,x,y,thigh) private(isph,iproc,its,ij, & - !$omp jsph,vij,vvij,tij,xij,oij,i) schedule(static,1) - do isph = 1, params % nsph - do i = constants % iburied(isph), constants % iburied(isph + 1) - 1 - its = constants % buried(i) - iproc = omp_get_thread_num() + 1 - write(6,*) "thread", iproc, "working on", isph, its - do ij = constants % inl(isph), constants % inl(isph+1)-1 - jsph = constants % nl(ij) - ! compute t_n^ij = | r_i + \rho_i s_n - r_j | / \rho_j - vij = params % csph(:,isph) + params % rsph(isph)* & - & constants % cgrid(:,its) - params % csph(:,jsph) - vvij = sqrt(vij(1)*vij(1) + vij(2)*vij(2) + vij(3)*vij(3)) - tij = vvij / params % rsph(jsph) - ! point is INSIDE j-sphere - if (tij.lt.thigh .and. tij.gt.zero) then - xij = fsw(tij, params % se, params % eta) - if (constants % fi(its,isph).gt.one) then - oij = xij / constants % fi(its,isph) - else - oij = xij + nsph = params % nsph + se = params % se + eta = params % eta + lmax = params % lmax + nbasis = constants % nbasis + ngrid = params % ngrid + vgrid_nbasis = constants % vgrid_nbasis + + associate(iburied => constants % iburied, buried => constants % buried, & + & inl => constants % inl, nl => constants % nl, & + & csph => params % csph, rsph => params % rsph, & + & cgrid => constants % cgrid, fi => constants % fi, & + & vscales_rel => constants % vscales_rel, & + & vwgrid => constants % vwgrid) + + !$omp parallel do default(none) & + !$omp firstprivate(nsph,thigh,se,eta,lmax,nbasis,ngrid,vgrid_nbasis) & + !$omp shared(iburied,buried,inl,nl,csph,rsph,fi,x,y,vscales_rel,vwgrid) & + !$omp private(isph,its,ij,jsph,vij,vvij,tij,xij,oij,i,work,tmp_grid) & + !$omp schedule(dynamic,1) + do isph = 1, nsph + tmp_grid(:) = 0.0d0 + do i = iburied(isph), iburied(isph + 1) - 1 + its = buried(i) + do ij = inl(isph), inl(isph+1)-1 + jsph = nl(ij) + vij = csph(:,isph) + rsph(isph)*cgrid(:,its) - csph(:,jsph) + vvij = sqrt(vij(1)*vij(1) + vij(2)*vij(2) + vij(3)*vij(3)) + tij = vvij / rsph(jsph) + if (tij.lt.thigh .and. tij.gt.0.0d0) then + xij = fsw(tij, se, eta) + if (fi(its, isph).gt.1.0d0) then + oij = xij / fi(its, isph) + else + oij = xij + end if + call fmm_l2p_work(vij, rsph(jsph), lmax, & + & vscales_rel, oij, x(:, jsph), 1.0d0, & + & tmp_grid(its), work) end if - call fmm_l2p_work(vij, params % rsph(jsph), params % lmax, & - & constants % vscales_rel, oij, x(:, jsph), one, & - & workspace % tmp_grid(its, isph), workspace % tmp_work(:, iproc)) - end if + end do end do + call ddintegrate(1, nbasis, ngrid, vwgrid, vgrid_nbasis, & + & tmp_grid, y(:, isph)) + y(:, isph) = - y(:, isph) end do - end do - call time_pull("calcv") - - call time_push() - call ddintegrate(params % nsph, constants % nbasis, params % ngrid, & - & constants % vwgrid, constants % vgrid_nbasis, & - & workspace % tmp_grid, y) - call time_pull("integrate") - call time_push() - y = - y - call time_pull("sign") + end associate + call time_pull("calcv+integrate+sign") end if ! ! if required, add the diagonal. @@ -117,6 +127,7 @@ subroutine lx(params, constants, workspace, x, y, ddx_error) end do end if call time_pull("lx") + deallocate(tmp_grid) end subroutine lx !> Adjoint single layer operator matvec From 05a61c266e0539026e66e249b18f4007ec7a6df8 Mon Sep 17 00:00:00 2001 From: Michele Nottoli Date: Mon, 22 Apr 2024 16:55:45 +0200 Subject: [PATCH 07/21] Fixed a bug. --- src/ddx_operators.f90 | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/ddx_operators.f90 b/src/ddx_operators.f90 index f24e6538..051c5e8a 100644 --- a/src/ddx_operators.f90 +++ b/src/ddx_operators.f90 @@ -82,8 +82,8 @@ subroutine lx(params, constants, workspace, x, y, ddx_error) !$omp parallel do default(none) & !$omp firstprivate(nsph,thigh,se,eta,lmax,nbasis,ngrid,vgrid_nbasis) & - !$omp shared(iburied,buried,inl,nl,csph,rsph,fi,x,y,vscales_rel,vwgrid) & - !$omp private(isph,its,ij,jsph,vij,vvij,tij,xij,oij,i,work,tmp_grid) & + !$omp shared(buried,iburied,inl,nl,csph,rsph,cgrid,fi,vscales_rel,x,y,vwgrid) & + !$omp private(isph,tmp_grid,i,its,ij,jsph,vij,vvij,tij,xij,oij,work) & !$omp schedule(dynamic,1) do isph = 1, nsph tmp_grid(:) = 0.0d0 From 13566a1a3158d957918797a007aaaf569fdd447e Mon Sep 17 00:00:00 2001 From: Michele Nottoli Date: Tue, 23 Apr 2024 15:47:21 +0200 Subject: [PATCH 08/21] lx completely parallel. --- src/ddx_operators.f90 | 71 +++++++++++++++++++++++-------------------- 1 file changed, 38 insertions(+), 33 deletions(-) diff --git a/src/ddx_operators.f90 b/src/ddx_operators.f90 index 051c5e8a..5367e874 100644 --- a/src/ddx_operators.f90 +++ b/src/ddx_operators.f90 @@ -41,35 +41,36 @@ subroutine lx(params, constants, workspace, x, y, ddx_error) real(dp), allocatable :: tmp_grid(:) real(dp) :: se, eta, work(params % lmax + 1) + call time_push() allocate(tmp_grid(params % ngrid)) - call time_push() ! dummy operation on unused interface arguments if (ddx_error % flag .eq. 0) continue if (workspace % xs_time .eq. 0) continue + nsph = params % nsph + nbasis = constants % nbasis + lmax = params % lmax + if (params % matvecmem .eq. 1) then - y = zero - !$omp parallel do default(none) shared(params,constants,x,y) & - !$omp private(isph,ij,jsph) schedule(dynamic) - do isph = 1, params % nsph - do ij = constants % inl(isph), constants % inl(isph + 1) - 1 - jsph = constants % nl(ij) - call dgemv('n', constants % nbasis, constants % nbasis, one, & - & constants % l(:,:,ij), constants % nbasis, x(:,jsph), 1, & - & one, y(:,isph), 1) + associate(inl => constants % inl, nl => constants % nl, & + & l => constants % l) + !$omp parallel do default(none) shared(inl,nl,l,x,y) & + !$omp firstprivate(nsph,nbasis) & + !$omp private(isph,ij,jsph) schedule(dynamic,1) + do isph = 1, nsph + y(:,isph) = 0.0d0 + do ij = inl(isph), inl(isph + 1) - 1 + jsph = nl(ij) + call dgemv('n', nbasis, nbasis, 1.0d0, l(:,:,ij), & + & nbasis, x(:,jsph), 1, 1.0d0, y(:,isph), 1) + end do end do - end do + end associate else - call time_push() - - ! set up the environment for a good usage of open mp thigh = one + pt5*(params % se + one)*params % eta - nsph = params % nsph se = params % se eta = params % eta - lmax = params % lmax - nbasis = constants % nbasis ngrid = params % ngrid vgrid_nbasis = constants % vgrid_nbasis @@ -80,10 +81,10 @@ subroutine lx(params, constants, workspace, x, y, ddx_error) & vscales_rel => constants % vscales_rel, & & vwgrid => constants % vwgrid) - !$omp parallel do default(none) & - !$omp firstprivate(nsph,thigh,se,eta,lmax,nbasis,ngrid,vgrid_nbasis) & - !$omp shared(buried,iburied,inl,nl,csph,rsph,cgrid,fi,vscales_rel,x,y,vwgrid) & - !$omp private(isph,tmp_grid,i,its,ij,jsph,vij,vvij,tij,xij,oij,work) & + !$omp parallel do default(none) firstprivate(nsph,thigh,se,eta, & + !$omp lmax,nbasis,ngrid,vgrid_nbasis) shared(buried,iburied,inl, & + !$omp nl,csph,rsph,cgrid,fi,vscales_rel,x,y,vwgrid) private( & + !$omp isph,tmp_grid,i,its,ij,jsph,vij,vvij,tij,xij,oij,work) & !$omp schedule(dynamic,1) do isph = 1, nsph tmp_grid(:) = 0.0d0 @@ -91,8 +92,10 @@ subroutine lx(params, constants, workspace, x, y, ddx_error) its = buried(i) do ij = inl(isph), inl(isph+1)-1 jsph = nl(ij) - vij = csph(:,isph) + rsph(isph)*cgrid(:,its) - csph(:,jsph) - vvij = sqrt(vij(1)*vij(1) + vij(2)*vij(2) + vij(3)*vij(3)) + vij = csph(:,isph) + rsph(isph)*cgrid(:,its) & + & - csph(:,jsph) + vvij = sqrt(vij(1)*vij(1) + vij(2)*vij(2) & + & + vij(3)*vij(3)) tij = vvij / rsph(jsph) if (tij.lt.thigh .and. tij.gt.0.0d0) then xij = fsw(tij, se, eta) @@ -112,19 +115,21 @@ subroutine lx(params, constants, workspace, x, y, ddx_error) y(:, isph) = - y(:, isph) end do end associate - call time_pull("calcv+integrate+sign") end if -! -! if required, add the diagonal. -! + + ! if required, add the diagonal. if (constants % dodiag) then - do isph = 1, params % nsph - do l = 0, params % lmax - ind = l*l + l + 1 - y(ind-l:ind+l, isph) = y(ind-l:ind+l, isph) + & - x(ind-l:ind+l, isph) / (constants % vscales(ind)**2) + associate(vscales => constants % vscales) + !$omp parallel do default(none) private(isph,l,ind) & + !$omp firstprivate(nsph,lmax) shared(x,y,vscales) + do isph = 1, nsph + do l = 0, lmax + ind = l*l + l + 1 + y(ind-l:ind+l, isph) = y(ind-l:ind+l, isph) & + & + x(ind-l:ind+l, isph) / (vscales(ind)**2) + end do end do - end do + end associate end if call time_pull("lx") deallocate(tmp_grid) From ca8108c500a3072d98605bc4051c3108b63619eb Mon Sep 17 00:00:00 2001 From: Michele Nottoli Date: Tue, 23 Apr 2024 17:15:39 +0200 Subject: [PATCH 09/21] Parallelized adjoint. --- src/ddx.f90 | 1 + src/ddx_operators.f90 | 169 ++++++++++++++++++++++++++++-------------- src/ddx_solvers.f90 | 2 +- 3 files changed, 117 insertions(+), 55 deletions(-) diff --git a/src/ddx.f90 b/src/ddx.f90 index b9ee78ed..ba424b4d 100644 --- a/src/ddx.f90 +++ b/src/ddx.f90 @@ -411,6 +411,7 @@ subroutine ddrun(ddx_data, state, electrostatics, psi, tol, esolv, & return end if end if + call time_push() call solve(ddx_data % params, ddx_data % constants, & & ddx_data % workspace, state, tol, ddx_error) diff --git a/src/ddx_operators.f90 b/src/ddx_operators.f90 index 5367e874..094b3259 100644 --- a/src/ddx_operators.f90 +++ b/src/ddx_operators.f90 @@ -34,8 +34,8 @@ subroutine lx(params, constants, workspace, x, y, ddx_error) real(dp), intent(out) :: y(constants % nbasis, params % nsph) type(ddx_error_type), intent(inout) :: ddx_error !! Local variables - integer :: isph, jsph, ij, l, ind, its, i - real(dp) :: vij(3), tij, vvij, xij, oij, thigh + integer :: isph, jsph, ij, l, ind, its, i, m + real(dp) :: vij(3), tij, vvij, xij, oij, thigh, fac integer :: vgrid_nbasis, nbasis, ngrid, nsph, lmax real(dp), allocatable :: tmp_grid(:) @@ -120,13 +120,16 @@ subroutine lx(params, constants, workspace, x, y, ddx_error) ! if required, add the diagonal. if (constants % dodiag) then associate(vscales => constants % vscales) - !$omp parallel do default(none) private(isph,l,ind) & - !$omp firstprivate(nsph,lmax) shared(x,y,vscales) + !$omp parallel do collapse(2) default(none) & + !$omp shared(vscales,x,y) firstprivate(nsph,lmax) & + !$omp private(isph,l,ind,m,fac) schedule(static,10) do isph = 1, nsph do l = 0, lmax ind = l*l + l + 1 - y(ind-l:ind+l, isph) = y(ind-l:ind+l, isph) & - & + x(ind-l:ind+l, isph) / (vscales(ind)**2) + fac = vscales(ind)**2 + do m = -l, l + y(ind+m, isph) = y(ind+m, isph) + x(ind+m, isph)/fac + end do end do end do end associate @@ -135,7 +138,7 @@ subroutine lx(params, constants, workspace, x, y, ddx_error) deallocate(tmp_grid) end subroutine lx -!> Adjoint single layer operator matvec +!> Adjoint single layer operator matvec subroutine lstarx(params, constants, workspace, x, y, ddx_error) implicit none !! Inputs @@ -148,55 +151,105 @@ subroutine lstarx(params, constants, workspace, x, y, ddx_error) real(dp), intent(out) :: y(constants % nbasis, params % nsph) type(ddx_error_type), intent(inout) :: ddx_error !! Local variables - integer :: isph, jsph, ij, indmat, igrid, l, ind, iproc + integer :: isph, jsph, ij, indmat, igrid, l, ind, m + real(dp) :: vji(3), vvji, tji, xji, oji, fac + + integer :: nsph, ngrid, nbasis, lmax + real(dp) :: thigh, se, eta, work(params % lmax + 1) + + call time_push() ! dummy operation on unused interface arguments if (ddx_error % flag .eq. 0) continue - y = zero + nsph = params % nsph + nbasis = constants % nbasis + if (params % matvecmem .eq. 1) then - !$omp parallel do default(none) shared(params,constants,x,y) & - !$omp private(isph,ij,jsph,indmat) schedule(dynamic) - do isph = 1, params % nsph - do ij = constants % inl(isph), constants % inl(isph + 1) - 1 - jsph = constants % nl(ij) - indmat = constants % itrnl(ij) - call dgemv('t', constants % nbasis, constants % nbasis, one, & - & constants % l(:,:,indmat), constants % nbasis, x(:,jsph), 1, & - & one, y(:,isph), 1) + associate(inl => constants % inl, nl => constants % nl, & + & itrnl => constants % itrnl, l => constants % l) + !$omp parallel do default(none) schedule(dynamic,1) & + !$omp firstprivate(nsph,nbasis) shared(x,y,inl,nl,itrnl,l) & + !$omp private(isph,ij,jsph,indmat) + do isph = 1, nsph + y(:,isph) = 0.0d0 + do ij = inl(isph), inl(isph + 1) - 1 + jsph = nl(ij) + indmat = itrnl(ij) + call dgemv('t', nbasis, nbasis, 1.0d0, & + & l(:,:,indmat), nbasis, x(:,jsph), 1, & + & 1.0d0, y(:,isph), 1) + end do end do - end do + end associate else + ngrid = params % ngrid + thigh = one + (params % se+one)/two*params % eta + se = params % se + eta = params % eta + lmax = params % lmax ! Expand x over spherical harmonics - !$omp parallel do default(none) shared(params,constants,workspace,x) & - !$omp private(isph,igrid) schedule(dynamic) - do isph = 1, params % nsph - do igrid = 1, params % ngrid - workspace % tmp_grid(igrid, isph) = dot_product(x(:, isph), & - & constants % vgrid(:constants % nbasis, igrid)) + associate(vgrid => constants % vgrid, tmp_grid => workspace % tmp_grid) + !$omp parallel do collapse(2) default(none) shared(x,tmp_grid) & + !$omp firstprivate(nsph,ngrid,nbasis) & + !$omp private(isph,igrid) schedule(static,100) + do isph = 1, nsph + do igrid = 1, ngrid + tmp_grid(igrid,isph) = dot_product(x(:, isph), vgrid(:nbasis, igrid)) + end do end do - end do - ! Compute (negative) action - !$omp parallel do default(none) shared(params,constants,workspace,x,y) & - !$omp private(isph,iproc) schedule(dynamic) - do isph = 1, params % nsph - iproc = omp_get_thread_num() + 1 - call adjrhs(params, constants, isph, workspace % tmp_grid, & - & y(:, isph), workspace % tmp_work(:, iproc)) - ! fix the sign - y(:, isph) = - y(:, isph) - end do + end associate + associate(inl => constants % inl, nl => constants % nl, & + & csph => params % csph, rsph => params % rsph, & + & cgrid => constants % cgrid, fi => constants % fi, & + & wgrid => constants % wgrid, & + & tmp_grid => workspace % tmp_grid, & + & vscales_rel => constants % vscales_rel) + !$omp parallel do default(none) schedule(dynamic,1) & + !$omp shared(inl,nl,csph,rsph,cgrid,fi,wgrid,tmp_grid,y) & + !$omp firstprivate(nsph,ngrid,thigh,se,eta,lmax) & + !$omp private(isph,ij,jsph,igrid,vji,vvji,tji,xji,oji,fac,work) + do isph = 1, nsph + do ij = inl(isph), inl(isph+1)-1 + jsph = nl(ij) + do igrid = 1, ngrid + vji = csph(:,jsph) + rsph(jsph)*cgrid(:,igrid) - csph(:,isph) + vvji = sqrt(vji(1)*vji(1) + vji(2)*vji(2) + vji(3)*vji(3)) + tji = vvji/rsph(isph) + if (tji.lt.thigh) then + xji = fsw(tji, se, eta) + if (fi(igrid,jsph).gt.1.0d0) then + oji = xji/fi(igrid,jsph) + else + oji = xji + endif + fac = wgrid(igrid) * tmp_grid(igrid, jsph) * oji + call fmm_l2p_adj_work(vji, fac, rsph(isph), & + & lmax, vscales_rel, 1.0d0, y(:, isph), work) + endif + end do + end do + y(:, isph) = - y(:, isph) + end do + end associate end if if (constants % dodiag) then - ! Loop over harmonics - do isph = 1, params % nsph - do l = 0, params % lmax - ind = l*l + l + 1 - y(ind-l:ind+l, isph) = y(ind-l:ind+l, isph) + & - & x(ind-l:ind+l, isph) / (constants % vscales(ind)**2) + associate(vscales => constants % vscales) + !$omp parallel do collapse(2) default(none) & + !$omp shared(vscales,x,y) firstprivate(nsph,lmax) & + !$omp private(isph,l,ind,m,fac) schedule(static,10) + do isph = 1, nsph + do l = 0, lmax + ind = l*l + l + 1 + fac = vscales(ind)**2 + do m = -l, l + y(ind+m, isph) = y(ind+m, isph) + x(ind+m, isph)/fac + end do + end do end do - end do + end associate end if + call time_pull("lstarx") end subroutine lstarx !> Diagonal preconditioning for Lx operator @@ -214,22 +267,30 @@ subroutine ldm1x(params, constants, workspace, x, y, ddx_error) real(dp), intent(out) :: y(constants % nbasis, params % nsph) type(ddx_error_type), intent(inout) :: ddx_error !! Local variables - integer :: isph, l, ind + integer :: isph, l, ind, nsph, lmax, m + real(dp) :: fac ! dummy operation on unused interface arguments if ((ddx_error % flag .eq. 0) .or. & - & (allocated(workspace % tmp_pot))) continue + & (allocated(workspace % tmp_pot))) continue - !! Loop over harmonics - !$omp parallel do default(none) shared(params,constants,x,y) & - !$omp private(isph,l,ind) schedule(dynamic) - do isph = 1, params % nsph - do l = 0, params % lmax - ind = l*l + l + 1 - y(ind-l:ind+l, isph) = x(ind-l:ind+l, isph) & - & *(constants % vscales(ind)**2) + nsph = params % nsph + lmax = params % lmax + + associate(vscales => constants % vscales) + !$omp parallel do collapse(2) default(none) & + !$omp shared(vscales,x,y) firstprivate(nsph,lmax) & + !$omp private(isph,l,ind,m,fac) schedule(static,10) + do isph = 1, nsph + do l = 0, lmax + ind = l*l + l + 1 + fac = vscales(ind)**2 + do m = -l, l + y(ind+m, isph) = x(ind+m, isph)*fac + end do + end do end do - end do + end associate end subroutine ldm1x !> Double layer operator matvec without diagonal blocks diff --git a/src/ddx_solvers.f90 b/src/ddx_solvers.f90 index 23ee70b0..7e5806d0 100644 --- a/src/ddx_solvers.f90 +++ b/src/ddx_solvers.f90 @@ -207,7 +207,7 @@ subroutine diis(n, nmat, ndiis, x, e, b, xnew) end do nmat = nmat + 1 - deallocate (bloc, cex, stat=istatus) + deallocate (bloc, cex, ipiv, stat=istatus) end subroutine diis !> DIIS helper routine From 14b884ff3d7ab25ae7ec33fcb04a3f9c6ea4b709 Mon Sep 17 00:00:00 2001 From: Michele Nottoli Date: Tue, 23 Apr 2024 17:18:45 +0200 Subject: [PATCH 10/21] Bug fix. --- src/ddx_constants.f90 | 3 +++ src/ddx_operators.f90 | 1 + 2 files changed, 4 insertions(+) diff --git a/src/ddx_constants.f90 b/src/ddx_constants.f90 index d798b311..ad0a6d18 100644 --- a/src/ddx_constants.f90 +++ b/src/ddx_constants.f90 @@ -229,6 +229,9 @@ module ddx_constants !> List of buried points in a CSR format. integer, allocatable :: iburied(:) integer, allocatable :: buried(:) + !> List of points in overlapped balls + integer, allocatable :: igrid_overlap(:,:,:) + integer, allocatable :: grid_overlap(:) end type ddx_constants_type contains diff --git a/src/ddx_operators.f90 b/src/ddx_operators.f90 index 094b3259..86dbe62c 100644 --- a/src/ddx_operators.f90 +++ b/src/ddx_operators.f90 @@ -210,6 +210,7 @@ subroutine lstarx(params, constants, workspace, x, y, ddx_error) !$omp firstprivate(nsph,ngrid,thigh,se,eta,lmax) & !$omp private(isph,ij,jsph,igrid,vji,vvji,tji,xji,oji,fac,work) do isph = 1, nsph + y(:,isph) = 0.0d0 do ij = inl(isph), inl(isph+1)-1 jsph = nl(ij) do igrid = 1, ngrid From 3ae4892065c81f186fd3570178e906f8ef6d6591 Mon Sep 17 00:00:00 2001 From: Michele Nottoli Date: Mon, 13 May 2024 11:38:44 +0200 Subject: [PATCH 11/21] Update. --- src/ddx_constants.f90 | 5 +---- src/ddx_operators.f90 | 15 +++++++++------ 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/src/ddx_constants.f90 b/src/ddx_constants.f90 index ad0a6d18..cce47822 100644 --- a/src/ddx_constants.f90 +++ b/src/ddx_constants.f90 @@ -229,9 +229,6 @@ module ddx_constants !> List of buried points in a CSR format. integer, allocatable :: iburied(:) integer, allocatable :: buried(:) - !> List of points in overlapped balls - integer, allocatable :: igrid_overlap(:,:,:) - integer, allocatable :: grid_overlap(:) end type ddx_constants_type contains @@ -1015,7 +1012,7 @@ subroutine constants_geometry_init(params, constants, ddx_error) constants % iburied(isph) = iwork do igrid = 1, params % ngrid if (constants % ui(igrid, isph) .lt. one) then - write(7, *) isph, igrid + !write(7, *) isph, igrid constants % buried(iwork) = igrid iwork = iwork + 1 end if diff --git a/src/ddx_operators.f90 b/src/ddx_operators.f90 index 86dbe62c..d2f68348 100644 --- a/src/ddx_operators.f90 +++ b/src/ddx_operators.f90 @@ -97,7 +97,7 @@ subroutine lx(params, constants, workspace, x, y, ddx_error) vvij = sqrt(vij(1)*vij(1) + vij(2)*vij(2) & & + vij(3)*vij(3)) tij = vvij / rsph(jsph) - if (tij.lt.thigh .and. tij.gt.0.0d0) then + if (tij.lt.thigh) then xij = fsw(tij, se, eta) if (fi(its, isph).gt.1.0d0) then oij = xij / fi(its, isph) @@ -151,7 +151,7 @@ subroutine lstarx(params, constants, workspace, x, y, ddx_error) real(dp), intent(out) :: y(constants % nbasis, params % nsph) type(ddx_error_type), intent(inout) :: ddx_error !! Local variables - integer :: isph, jsph, ij, indmat, igrid, l, ind, m + integer :: isph, jsph, ij, indmat, igrid, l, ind, m, i real(dp) :: vji(3), vvji, tji, xji, oji, fac integer :: nsph, ngrid, nbasis, lmax @@ -204,16 +204,19 @@ subroutine lstarx(params, constants, workspace, x, y, ddx_error) & cgrid => constants % cgrid, fi => constants % fi, & & wgrid => constants % wgrid, & & tmp_grid => workspace % tmp_grid, & - & vscales_rel => constants % vscales_rel) + & vscales_rel => constants % vscales_rel, & + & iburied => constants % iburied, & + & buried => constants % buried) !$omp parallel do default(none) schedule(dynamic,1) & - !$omp shared(inl,nl,csph,rsph,cgrid,fi,wgrid,tmp_grid,y) & + !$omp shared(inl,nl,csph,rsph,cgrid,fi,wgrid,tmp_grid,y,buried,iburied) & !$omp firstprivate(nsph,ngrid,thigh,se,eta,lmax) & - !$omp private(isph,ij,jsph,igrid,vji,vvji,tji,xji,oji,fac,work) + !$omp private(isph,ij,jsph,igrid,vji,vvji,tji,xji,oji,fac,work,i) do isph = 1, nsph y(:,isph) = 0.0d0 do ij = inl(isph), inl(isph+1)-1 jsph = nl(ij) - do igrid = 1, ngrid + do i = iburied(jsph), iburied(jsph + 1) - 1 + igrid = buried(i) vji = csph(:,jsph) + rsph(jsph)*cgrid(:,igrid) - csph(:,isph) vvji = sqrt(vji(1)*vji(1) + vji(2)*vji(2) + vji(3)*vji(3)) tji = vvji/rsph(isph) From a87cc545a534c14825a454a7aa24ce7038a9238a Mon Sep 17 00:00:00 2001 From: Michele Nottoli Date: Mon, 13 May 2024 15:33:27 +0200 Subject: [PATCH 12/21] Precomputing overlaps cuts LX time by 1/2. --- src/ddx_constants.f90 | 49 +++++++++++++++++++++++++++++++++++++++++-- src/ddx_operators.f90 | 36 ++++++++++++++++--------------- 2 files changed, 66 insertions(+), 19 deletions(-) diff --git a/src/ddx_constants.f90 b/src/ddx_constants.f90 index cce47822..dca0d29f 100644 --- a/src/ddx_constants.f90 +++ b/src/ddx_constants.f90 @@ -229,6 +229,9 @@ module ddx_constants !> List of buried points in a CSR format. integer, allocatable :: iburied(:) integer, allocatable :: buried(:) + !> List of overlapped points in a CSR format. + integer, allocatable :: ioverlap(:) + integer, allocatable :: overlap(:) end type ddx_constants_type contains @@ -779,9 +782,9 @@ subroutine constants_geometry_init(params, constants, ddx_error) !! Local variables real(dp) :: swthr, v(3), maxv, ssqv, vv, t integer :: i, isph, jsph, inear, igrid, iwork, jwork, lwork, & - & old_lwork, icav, info + & old_lwork, icav, info, ij, ijgrid integer, allocatable :: work(:, :), tmp_work(:, :) - real(dp) :: start_time + real(dp) :: start_time, thigh, vij(3), vvij, tij !! The code ! Prepare FMM structures if needed start_time = omp_get_wtime() @@ -1029,6 +1032,48 @@ subroutine constants_geometry_init(params, constants, ddx_error) !!end do !!stop 0 + allocate(constants % ioverlap(constants % inl(params % nsph+1)), & + & constants % overlap(constants % inl(params % nsph+1) & + & *params % ngrid), stat=info) + if (info .ne. 0) then + call update_error(ddx_error, "Overlapped grid lists allocation failed") + return + end if + thigh = one + pt5*(params % se + one)*params % eta + iwork = 1 + do isph = 1, params % nsph + do ij = constants % inl(isph), constants % inl(isph+1) - 1 + jsph = constants % nl(ij) + constants % ioverlap(ij) = iwork + do igrid = 1, params % ngrid + if (constants % ui(igrid, isph) .lt. one) then + vij = params % csph(:,isph) & + & + params % rsph(isph)*constants % cgrid(:,igrid) & + & - params % csph(:,jsph) + vvij = sqrt(vij(1)*vij(1) + vij(2)*vij(2) & + & + vij(3)*vij(3)) + tij = vvij / params % rsph(jsph) + if (tij.lt.thigh) then + constants % overlap(iwork) = igrid + iwork = iwork + 1 + end if + end if + end do + end do + end do + constants % ioverlap(ij) = iwork + + !do isph = 1, params % nsph + ! do ij = constants % inl(isph), constants % inl(isph+1) - 1 + ! jsph = constants % nl(ij) + ! do ijgrid = constants % ioverlap(ij), constants % ioverlap(ij+1) - 1 + ! igrid = constants % overlap(ijgrid) + ! write(6,*) isph, jsph, igrid + ! end do + ! end do + !end do + !stop 0 + ! Build cavity array. At first get total count for each sphere allocate(constants % ncav_sph(params % nsph), stat=info) if (info .ne. 0) then diff --git a/src/ddx_operators.f90 b/src/ddx_operators.f90 index d2f68348..113391e6 100644 --- a/src/ddx_operators.f90 +++ b/src/ddx_operators.f90 @@ -34,7 +34,7 @@ subroutine lx(params, constants, workspace, x, y, ddx_error) real(dp), intent(out) :: y(constants % nbasis, params % nsph) type(ddx_error_type), intent(inout) :: ddx_error !! Local variables - integer :: isph, jsph, ij, l, ind, its, i, m + integer :: isph, jsph, ij, l, ind, its, i, m, ijgrid real(dp) :: vij(3), tij, vvij, xij, oij, thigh, fac integer :: vgrid_nbasis, nbasis, ngrid, nsph, lmax @@ -79,35 +79,37 @@ subroutine lx(params, constants, workspace, x, y, ddx_error) & csph => params % csph, rsph => params % rsph, & & cgrid => constants % cgrid, fi => constants % fi, & & vscales_rel => constants % vscales_rel, & - & vwgrid => constants % vwgrid) + & vwgrid => constants % vwgrid, & + & ioverlap => constants % ioverlap, & + & overlap => constants % overlap) !$omp parallel do default(none) firstprivate(nsph,thigh,se,eta, & !$omp lmax,nbasis,ngrid,vgrid_nbasis) shared(buried,iburied,inl, & !$omp nl,csph,rsph,cgrid,fi,vscales_rel,x,y,vwgrid) private( & - !$omp isph,tmp_grid,i,its,ij,jsph,vij,vvij,tij,xij,oij,work) & + !$omp isph,tmp_grid,i,its,ij,jsph,vij,vvij,tij,xij,oij,work,ijgrid) & !$omp schedule(dynamic,1) do isph = 1, nsph tmp_grid(:) = 0.0d0 - do i = iburied(isph), iburied(isph + 1) - 1 - its = buried(i) - do ij = inl(isph), inl(isph+1)-1 - jsph = nl(ij) + do ij = inl(isph), inl(isph+1)-1 + jsph = nl(ij) + !do i = iburied(isph), iburied(isph + 1) - 1 + !its = buried(i) + do ijgrid = ioverlap(ij), ioverlap(ij+1) - 1 + its = overlap(ijgrid) vij = csph(:,isph) + rsph(isph)*cgrid(:,its) & & - csph(:,jsph) vvij = sqrt(vij(1)*vij(1) + vij(2)*vij(2) & & + vij(3)*vij(3)) tij = vvij / rsph(jsph) - if (tij.lt.thigh) then - xij = fsw(tij, se, eta) - if (fi(its, isph).gt.1.0d0) then - oij = xij / fi(its, isph) - else - oij = xij - end if - call fmm_l2p_work(vij, rsph(jsph), lmax, & - & vscales_rel, oij, x(:, jsph), 1.0d0, & - & tmp_grid(its), work) + xij = fsw(tij, se, eta) + if (fi(its, isph).gt.1.0d0) then + oij = xij / fi(its, isph) + else + oij = xij end if + call fmm_l2p_work(vij, rsph(jsph), lmax, & + & vscales_rel, oij, x(:, jsph), 1.0d0, & + & tmp_grid(its), work) end do end do call ddintegrate(1, nbasis, ngrid, vwgrid, vgrid_nbasis, & From d68be8f95793a0c6ec460be961152f73b432f8fb Mon Sep 17 00:00:00 2001 From: Michele Nottoli Date: Mon, 13 May 2024 16:04:27 +0200 Subject: [PATCH 13/21] Precomputing adjoint overlaps. --- src/ddx_constants.f90 | 33 ++++++++++++++++++++---- src/ddx_operators.f90 | 58 +++++++++++++++++++++---------------------- 2 files changed, 56 insertions(+), 35 deletions(-) diff --git a/src/ddx_constants.f90 b/src/ddx_constants.f90 index dca0d29f..c1934e6a 100644 --- a/src/ddx_constants.f90 +++ b/src/ddx_constants.f90 @@ -232,6 +232,10 @@ module ddx_constants !> List of overlapped points in a CSR format. integer, allocatable :: ioverlap(:) integer, allocatable :: overlap(:) + !> Adjoint list of overlapped points (the relation is not self + ! adjoint) in a CSR format. + integer, allocatable :: adj_ioverlap(:) + integer, allocatable :: adj_overlap(:) end type ddx_constants_type contains @@ -782,7 +786,7 @@ subroutine constants_geometry_init(params, constants, ddx_error) !! Local variables real(dp) :: swthr, v(3), maxv, ssqv, vv, t integer :: i, isph, jsph, inear, igrid, iwork, jwork, lwork, & - & old_lwork, icav, info, ij, ijgrid + & old_lwork, icav, info, ij, adj_iwork integer, allocatable :: work(:, :), tmp_work(:, :) real(dp) :: start_time, thigh, vij(3), vvij, tij !! The code @@ -1033,18 +1037,22 @@ subroutine constants_geometry_init(params, constants, ddx_error) !!stop 0 allocate(constants % ioverlap(constants % inl(params % nsph+1)), & - & constants % overlap(constants % inl(params % nsph+1) & - & *params % ngrid), stat=info) + & constants % adj_ioverlap(constants % inl(params % nsph+1)), & + & constants % overlap(constants % inl(params % nsph+1)*params % ngrid), & + & constants % adj_overlap(constants % inl(params % nsph+1)*params % ngrid), & + & stat=info) if (info .ne. 0) then call update_error(ddx_error, "Overlapped grid lists allocation failed") return end if thigh = one + pt5*(params % se + one)*params % eta iwork = 1 + adj_iwork = 1 do isph = 1, params % nsph do ij = constants % inl(isph), constants % inl(isph+1) - 1 jsph = constants % nl(ij) constants % ioverlap(ij) = iwork + constants % adj_ioverlap(ij) = adj_iwork do igrid = 1, params % ngrid if (constants % ui(igrid, isph) .lt. one) then vij = params % csph(:,isph) & @@ -1058,16 +1066,31 @@ subroutine constants_geometry_init(params, constants, ddx_error) iwork = iwork + 1 end if end if + if (constants % ui(igrid, jsph) .lt. one) then + vij = params % csph(:,jsph) & + & + params % rsph(jsph)*constants % cgrid(:,igrid) & + & - params % csph(:,isph) + vvij = sqrt(vij(1)*vij(1) + vij(2)*vij(2) & + & + vij(3)*vij(3)) + tij = vvij / params % rsph(isph) + if (tij.lt.thigh) then + constants % adj_overlap(adj_iwork) = igrid + adj_iwork = adj_iwork + 1 + end if + end if end do end do end do constants % ioverlap(ij) = iwork + constants % adj_ioverlap(ij) = adj_iwork !do isph = 1, params % nsph ! do ij = constants % inl(isph), constants % inl(isph+1) - 1 ! jsph = constants % nl(ij) - ! do ijgrid = constants % ioverlap(ij), constants % ioverlap(ij+1) - 1 - ! igrid = constants % overlap(ijgrid) + ! !do ijgrid = constants % ioverlap(ij), constants % ioverlap(ij+1) - 1 + ! ! igrid = constants % overlap(ijgrid) + ! do ijgrid = constants % adj_ioverlap(ij), constants % adj_ioverlap(ij+1) - 1 + ! igrid = constants % adj_overlap(ijgrid) ! write(6,*) isph, jsph, igrid ! end do ! end do diff --git a/src/ddx_operators.f90 b/src/ddx_operators.f90 index 113391e6..592d9a42 100644 --- a/src/ddx_operators.f90 +++ b/src/ddx_operators.f90 @@ -34,7 +34,7 @@ subroutine lx(params, constants, workspace, x, y, ddx_error) real(dp), intent(out) :: y(constants % nbasis, params % nsph) type(ddx_error_type), intent(inout) :: ddx_error !! Local variables - integer :: isph, jsph, ij, l, ind, its, i, m, ijgrid + integer :: isph, jsph, ij, l, ind, its, i, m, ijits real(dp) :: vij(3), tij, vvij, xij, oij, thigh, fac integer :: vgrid_nbasis, nbasis, ngrid, nsph, lmax @@ -74,8 +74,7 @@ subroutine lx(params, constants, workspace, x, y, ddx_error) ngrid = params % ngrid vgrid_nbasis = constants % vgrid_nbasis - associate(iburied => constants % iburied, buried => constants % buried, & - & inl => constants % inl, nl => constants % nl, & + associate(inl => constants % inl, nl => constants % nl, & & csph => params % csph, rsph => params % rsph, & & cgrid => constants % cgrid, fi => constants % fi, & & vscales_rel => constants % vscales_rel, & @@ -83,19 +82,18 @@ subroutine lx(params, constants, workspace, x, y, ddx_error) & ioverlap => constants % ioverlap, & & overlap => constants % overlap) - !$omp parallel do default(none) firstprivate(nsph,thigh,se,eta, & - !$omp lmax,nbasis,ngrid,vgrid_nbasis) shared(buried,iburied,inl, & - !$omp nl,csph,rsph,cgrid,fi,vscales_rel,x,y,vwgrid) private( & - !$omp isph,tmp_grid,i,its,ij,jsph,vij,vvij,tij,xij,oij,work,ijgrid) & - !$omp schedule(dynamic,1) + !$omp parallel do default(none) schedule(dynamic,1) & + !$omp firstprivate(nsph,thigh,se,eta,lmax,nbasis,ngrid, & + !$omp vgrid_nbasis) shared(inl,nl,csph,rsph,cgrid,fi, & + !$omp vscales_rel,x,y,vwgrid,ioverlap,overlap) private( & + !$omp isph,tmp_grid,i,its,ij,jsph,vij,vvij,tij,xij,oij, & + !$omp work,ijits) do isph = 1, nsph tmp_grid(:) = 0.0d0 do ij = inl(isph), inl(isph+1)-1 jsph = nl(ij) - !do i = iburied(isph), iburied(isph + 1) - 1 - !its = buried(i) - do ijgrid = ioverlap(ij), ioverlap(ij+1) - 1 - its = overlap(ijgrid) + do ijits = ioverlap(ij), ioverlap(ij+1) - 1 + its = overlap(ijits) vij = csph(:,isph) + rsph(isph)*cgrid(:,its) & & - csph(:,jsph) vvij = sqrt(vij(1)*vij(1) + vij(2)*vij(2) & @@ -153,7 +151,7 @@ subroutine lstarx(params, constants, workspace, x, y, ddx_error) real(dp), intent(out) :: y(constants % nbasis, params % nsph) type(ddx_error_type), intent(inout) :: ddx_error !! Local variables - integer :: isph, jsph, ij, indmat, igrid, l, ind, m, i + integer :: isph, jsph, ij, indmat, igrid, l, ind, m, i, ijigrid real(dp) :: vji(3), vvji, tji, xji, oji, fac integer :: nsph, ngrid, nbasis, lmax @@ -207,32 +205,32 @@ subroutine lstarx(params, constants, workspace, x, y, ddx_error) & wgrid => constants % wgrid, & & tmp_grid => workspace % tmp_grid, & & vscales_rel => constants % vscales_rel, & - & iburied => constants % iburied, & - & buried => constants % buried) + & adj_ioverlap => constants % adj_ioverlap, & + & adj_overlap => constants % adj_overlap) !$omp parallel do default(none) schedule(dynamic,1) & - !$omp shared(inl,nl,csph,rsph,cgrid,fi,wgrid,tmp_grid,y,buried,iburied) & - !$omp firstprivate(nsph,ngrid,thigh,se,eta,lmax) & - !$omp private(isph,ij,jsph,igrid,vji,vvji,tji,xji,oji,fac,work,i) + !$omp shared(inl,nl,csph,rsph,cgrid,fi,wgrid,tmp_grid,y, & + !$omp adj_overlap,adj_ioverlap) firstprivate(nsph,ngrid, & + !$omp thigh,se,eta,lmax) private(isph,ij,jsph,igrid,vji, & + !$omp vvji,tji,xji,oji,fac,work,i,ijigrid) do isph = 1, nsph y(:,isph) = 0.0d0 do ij = inl(isph), inl(isph+1)-1 jsph = nl(ij) - do i = iburied(jsph), iburied(jsph + 1) - 1 - igrid = buried(i) + do ijigrid = adj_ioverlap(ij), adj_ioverlap(ij+1) - 1 + igrid = adj_overlap(ijigrid) + vji = csph(:,jsph) + rsph(jsph)*cgrid(:,igrid) - csph(:,isph) vvji = sqrt(vji(1)*vji(1) + vji(2)*vji(2) + vji(3)*vji(3)) tji = vvji/rsph(isph) - if (tji.lt.thigh) then - xji = fsw(tji, se, eta) - if (fi(igrid,jsph).gt.1.0d0) then - oji = xji/fi(igrid,jsph) - else - oji = xji - endif - fac = wgrid(igrid) * tmp_grid(igrid, jsph) * oji - call fmm_l2p_adj_work(vji, fac, rsph(isph), & - & lmax, vscales_rel, 1.0d0, y(:, isph), work) + xji = fsw(tji, se, eta) + if (fi(igrid,jsph).gt.1.0d0) then + oji = xji/fi(igrid,jsph) + else + oji = xji endif + fac = wgrid(igrid) * tmp_grid(igrid, jsph) * oji + call fmm_l2p_adj_work(vji, fac, rsph(isph), & + & lmax, vscales_rel, 1.0d0, y(:, isph), work) end do end do y(:, isph) = - y(:, isph) From a86fa2157e997f69f0b107906b5252e33e0ce292 Mon Sep 17 00:00:00 2001 From: Michele Nottoli Date: Mon, 13 May 2024 17:31:15 +0200 Subject: [PATCH 14/21] Bug fix and parallelized bx properly. --- src/ddx_operators.f90 | 95 +++++++++++++++++++++++++++++++++---------- 1 file changed, 73 insertions(+), 22 deletions(-) diff --git a/src/ddx_operators.f90 b/src/ddx_operators.f90 index 592d9a42..4d18d588 100644 --- a/src/ddx_operators.f90 +++ b/src/ddx_operators.f90 @@ -34,8 +34,8 @@ subroutine lx(params, constants, workspace, x, y, ddx_error) real(dp), intent(out) :: y(constants % nbasis, params % nsph) type(ddx_error_type), intent(inout) :: ddx_error !! Local variables - integer :: isph, jsph, ij, l, ind, its, i, m, ijits - real(dp) :: vij(3), tij, vvij, xij, oij, thigh, fac + integer :: isph, jsph, ij, l, ind, its, m, ijits + real(dp) :: vij(3), tij, vvij, xij, oij, fac integer :: vgrid_nbasis, nbasis, ngrid, nsph, lmax real(dp), allocatable :: tmp_grid(:) @@ -68,7 +68,6 @@ subroutine lx(params, constants, workspace, x, y, ddx_error) end do end associate else - thigh = one + pt5*(params % se + one)*params % eta se = params % se eta = params % eta ngrid = params % ngrid @@ -83,10 +82,10 @@ subroutine lx(params, constants, workspace, x, y, ddx_error) & overlap => constants % overlap) !$omp parallel do default(none) schedule(dynamic,1) & - !$omp firstprivate(nsph,thigh,se,eta,lmax,nbasis,ngrid, & + !$omp firstprivate(nsph,se,eta,lmax,nbasis,ngrid, & !$omp vgrid_nbasis) shared(inl,nl,csph,rsph,cgrid,fi, & !$omp vscales_rel,x,y,vwgrid,ioverlap,overlap) private( & - !$omp isph,tmp_grid,i,its,ij,jsph,vij,vvij,tij,xij,oij, & + !$omp isph,tmp_grid,its,ij,jsph,vij,vvij,tij,xij,oij, & !$omp work,ijits) do isph = 1, nsph tmp_grid(:) = 0.0d0 @@ -190,7 +189,7 @@ subroutine lstarx(params, constants, workspace, x, y, ddx_error) lmax = params % lmax ! Expand x over spherical harmonics associate(vgrid => constants % vgrid, tmp_grid => workspace % tmp_grid) - !$omp parallel do collapse(2) default(none) shared(x,tmp_grid) & + !$omp parallel do collapse(2) default(none) shared(x,tmp_grid,vgrid) & !$omp firstprivate(nsph,ngrid,nbasis) & !$omp private(isph,igrid) schedule(static,100) do isph = 1, nsph @@ -209,9 +208,9 @@ subroutine lstarx(params, constants, workspace, x, y, ddx_error) & adj_overlap => constants % adj_overlap) !$omp parallel do default(none) schedule(dynamic,1) & !$omp shared(inl,nl,csph,rsph,cgrid,fi,wgrid,tmp_grid,y, & - !$omp adj_overlap,adj_ioverlap) firstprivate(nsph,ngrid, & - !$omp thigh,se,eta,lmax) private(isph,ij,jsph,igrid,vji, & - !$omp vvji,tji,xji,oji,fac,work,i,ijigrid) + !$omp adj_overlap,adj_ioverlap,vscales_rel) firstprivate( & + !$omp nsph,ngrid,thigh,se,eta,lmax) private(isph,ij,jsph, & + !$omp igrid,vji,vvji,tji,xji,oji,fac,work,i,ijigrid) do isph = 1, nsph y(:,isph) = 0.0d0 do ij = inl(isph), inl(isph+1)-1 @@ -867,12 +866,26 @@ subroutine bx(params, constants, workspace, x, y, ddx_error) real(dp), dimension(constants % nbasis, params % nsph), intent(in) :: x real(dp), dimension(constants % nbasis, params % nsph), intent(out) :: y type(ddx_error_type), intent(inout) :: ddx_error - integer :: isph, jsph, ij, iproc + integer :: isph, jsph, ij, ijits, its + + integer :: nsph, nbasis, lmax, ngrid, vgrid_nbasis + real(dp) :: se, eta, vij(3), vvij, tij, xij, oij, vtij(3), kappa + complex(dp) :: work_complex(params % lmax+1) + real(dp) :: work(params % lmax+1) + + real(dp), allocatable :: tmp_grid(:) + + call time_push() + allocate(tmp_grid(params % ngrid)) ! dummy operation on unused interface arguments if (ddx_error % flag .eq. 0) continue + if (workspace % xs_time .eq. 0) continue + + nsph = params % nsph + nbasis = constants % nbasis + lmax = params % lmax - y = zero if (params % matvecmem .eq. 1) then !$omp parallel do default(none) shared(params,constants,x,y) & !$omp private(isph,ij,jsph) schedule(dynamic) @@ -885,19 +898,57 @@ subroutine bx(params, constants, workspace, x, y, ddx_error) end do end do else - call time_push() - !$omp parallel do default(none) shared(params,constants,workspace,x,y) & - !$omp private(isph,iproc) schedule(dynamic) - do isph = 1, params % nsph - iproc = omp_get_thread_num() + 1 - call calcv2_lpb(params, constants, isph, workspace % tmp_grid(:,isph), x) - end do - call ddintegrate(params % nsph, constants % nbasis, params % ngrid, constants % vwgrid, & - & constants % vgrid_nbasis, workspace % tmp_grid, y) - y = - y - call time_pull("bx") + se = params % se + eta = params % eta + ngrid = params % ngrid + kappa = params % kappa + vgrid_nbasis = constants % vgrid_nbasis + + associate(inl => constants % inl, nl => constants % nl, & + & csph => params % csph, rsph => params % rsph, & + & cgrid => constants % cgrid, fi => constants % fi, & + & vscales => constants % vscales, & + & vwgrid => constants % vwgrid, & + & ioverlap => constants % ioverlap, & + & overlap => constants % overlap, & + & si_ri => constants % si_ri) + !$omp parallel do default(none) schedule(dynamic,1) & + !$omp firstprivate(nsph,se,eta,lmax,nbasis,ngrid, & + !$omp vgrid_nbasis,kappa) shared(inl,nl,csph,rsph,cgrid,fi, & + !$omp vscales,x,y,vwgrid,ioverlap,overlap,si_ri) private( & + !$omp isph,tmp_grid,its,ij,jsph,vij,vvij,tij,xij,oij,vtij, & + !$omp work,work_complex,ijits) + do isph = 1, nsph + tmp_grid(:) = 0.0d0 + do ij = inl(isph), inl(isph+1)-1 + jsph = nl(ij) + do ijits = ioverlap(ij), ioverlap(ij+1) - 1 + its = overlap(ijits) + vij = csph(:,isph) + rsph(isph)*cgrid(:,its) & + & - csph(:,jsph) + vvij = sqrt(vij(1)*vij(1) + vij(2)*vij(2) & + & + vij(3)*vij(3)) + tij = vvij/rsph(jsph) + xij = fsw(tij, se, eta) + if (fi(its,isph).gt.1.0d0) then + oij = xij/fi(its, isph) + else + oij = xij + end if + vtij = vij*kappa + call fmm_l2p_bessel_work(vtij, lmax, vscales, & + & si_ri(:, jsph), oij, x(:, jsph), 1.0d0, & + & tmp_grid(its), work_complex, work) + end do + end do + call ddintegrate(1, nbasis, ngrid, vwgrid, vgrid_nbasis, & + & tmp_grid, y(:, isph)) + y(:, isph) = - y(:, isph) + end do + end associate end if if (constants % dodiag) y = y + x + call time_pull("bx") end subroutine bx !> Adjoint ddLPB matrix-vector product From 67576f5cb7ec13acf6e6c020f48745568e425135 Mon Sep 17 00:00:00 2001 From: Michele Nottoli Date: Tue, 14 May 2024 11:24:15 +0200 Subject: [PATCH 15/21] Reshaped bstarx for parallelization. --- src/ddx.f90 | 2 + src/ddx_operators.f90 | 160 ++++++++++++++++++++++++++++++++++-------- 2 files changed, 134 insertions(+), 28 deletions(-) diff --git a/src/ddx.f90 b/src/ddx.f90 index ba424b4d..f7193965 100644 --- a/src/ddx.f90 +++ b/src/ddx.f90 @@ -455,6 +455,8 @@ subroutine ddrun(ddx_data, state, electrostatics, psi, tol, esolv, & end if end if + stop 0 + ! compute the forces if (ddx_data % params % force .eq. 1) then force = zero diff --git a/src/ddx_operators.f90 b/src/ddx_operators.f90 index 4d18d588..7eb2d542 100644 --- a/src/ddx_operators.f90 +++ b/src/ddx_operators.f90 @@ -152,9 +152,9 @@ subroutine lstarx(params, constants, workspace, x, y, ddx_error) !! Local variables integer :: isph, jsph, ij, indmat, igrid, l, ind, m, i, ijigrid real(dp) :: vji(3), vvji, tji, xji, oji, fac - integer :: nsph, ngrid, nbasis, lmax - real(dp) :: thigh, se, eta, work(params % lmax + 1) + real(dp) :: thigh, se, eta, work(params % lmax + 1), & + & tmp(constants % nbasis) call time_push() @@ -188,13 +188,15 @@ subroutine lstarx(params, constants, workspace, x, y, ddx_error) eta = params % eta lmax = params % lmax ! Expand x over spherical harmonics - associate(vgrid => constants % vgrid, tmp_grid => workspace % tmp_grid) - !$omp parallel do collapse(2) default(none) shared(x,tmp_grid,vgrid) & - !$omp firstprivate(nsph,ngrid,nbasis) & + associate(vgrid => constants % vgrid, & + & tmp_grid => workspace % tmp_grid) + !$omp parallel do collapse(2) default(none) & + !$omp firstprivate(nsph,ngrid,nbasis) shared(x,tmp_grid,vgrid) & !$omp private(isph,igrid) schedule(static,100) do isph = 1, nsph do igrid = 1, ngrid - tmp_grid(igrid,isph) = dot_product(x(:, isph), vgrid(:nbasis, igrid)) + tmp_grid(igrid,isph) = dot_product(x(:, isph), & + & vgrid(:nbasis, igrid)) end do end do end associate @@ -210,29 +212,31 @@ subroutine lstarx(params, constants, workspace, x, y, ddx_error) !$omp shared(inl,nl,csph,rsph,cgrid,fi,wgrid,tmp_grid,y, & !$omp adj_overlap,adj_ioverlap,vscales_rel) firstprivate( & !$omp nsph,ngrid,thigh,se,eta,lmax) private(isph,ij,jsph, & - !$omp igrid,vji,vvji,tji,xji,oji,fac,work,i,ijigrid) + !$omp igrid,vji,vvji,tji,xji,oji,fac,work,i,ijigrid,tmp) do isph = 1, nsph - y(:,isph) = 0.0d0 + tmp = 0.0d0 do ij = inl(isph), inl(isph+1)-1 jsph = nl(ij) do ijigrid = adj_ioverlap(ij), adj_ioverlap(ij+1) - 1 igrid = adj_overlap(ijigrid) - vji = csph(:,jsph) + rsph(jsph)*cgrid(:,igrid) - csph(:,isph) - vvji = sqrt(vji(1)*vji(1) + vji(2)*vji(2) + vji(3)*vji(3)) + vji = csph(:,jsph) + rsph(jsph)*cgrid(:,igrid) & + & - csph(:,isph) + vvji = sqrt(vji(1)*vji(1) + vji(2)*vji(2) & + & + vji(3)*vji(3)) tji = vvji/rsph(isph) xji = fsw(tji, se, eta) if (fi(igrid,jsph).gt.1.0d0) then oji = xji/fi(igrid,jsph) else oji = xji - endif + end if fac = wgrid(igrid) * tmp_grid(igrid, jsph) * oji call fmm_l2p_adj_work(vji, fac, rsph(isph), & - & lmax, vscales_rel, 1.0d0, y(:, isph), work) + & lmax, vscales_rel, 1.0d0, tmp, work) end do end do - y(:, isph) = - y(:, isph) + y(:, isph) = - tmp end do end associate end if @@ -815,16 +819,32 @@ subroutine bstarx(params, constants, workspace, x, y, ddx_error) real(dp), intent(out) :: y(constants % nbasis, params % nsph) type(ddx_error_type), intent(inout) :: ddx_error ! Local variables - integer :: isph, jsph, ij, indmat, iproc + integer :: isph, jsph, ij, indmat, ind, ijigrid, igrid, l, m + + real(dp) :: thigh, se, eta, kappa, vji(3), vvji, sji(3), tji + real(dp) :: rho, ctheta, stheta, cphi, sphi, xji, oji, fac, & + & basloc((params % lmax + 1)**2), vcos(params % lmax + 1), & + & vsin(params % lmax + 1), vplm((params % lmax + 1)**2) + real(dp), dimension(constants % nbasis) :: fac_hsp + real(dp), dimension(0:params % lmax) :: SI_rjin, DI_rjin + complex(dp) :: work_complex(max(2, params % lmax+1)) + integer :: nsph, nbasis, lmax, ngrid + + real(dp) :: y_copy(constants % nbasis, params % nsph) + + call time_push() + + nsph = params % nsph + nbasis = constants % nbasis ! dummy operation on unused interface arguments if (ddx_error % flag .eq. 0) continue - y = zero - if (params % matvecmem .eq. 1) then + !if (params % matvecmem .eq. 1) then !$omp parallel do default(none) shared(params,constants,x,y) & !$omp private(isph,ij,jsph,indmat) schedule(dynamic) do isph = 1, params % nsph + y(:,isph) = zero do ij = constants % inl(isph), constants % inl(isph + 1) - 1 jsph = constants % nl(ij) indmat = constants % itrnl(ij) @@ -833,7 +853,9 @@ subroutine bstarx(params, constants, workspace, x, y, ddx_error) & one, y(:,isph), 1) end do end do - else + y_copy(:,:) = y(:,:) + y = zero + !else !$omp parallel do default(none) shared(params,constants,workspace,x,y) & !$omp private(isph) schedule(static,1) do isph = 1, params % nsph @@ -841,20 +863,101 @@ subroutine bstarx(params, constants, workspace, x, y, ddx_error) & constants % vgrid_nbasis, x(:, isph), 1, zero, & & workspace % tmp_grid(:, isph), 1) end do - !$omp parallel do default(none) shared(params,constants,workspace,x,y) & - !$omp private(isph,iproc) schedule(dynamic) - do isph = 1, params % nsph - iproc = omp_get_thread_num() + 1 - call adjrhs_lpb(params, constants, isph, workspace % tmp_grid, & - & y(:, isph), workspace % tmp_vylm(:, iproc), workspace % tmp_vplm(:, iproc), & - & workspace % tmp_vcos(:, iproc), workspace % tmp_vsin(:, iproc), & - & workspace % tmp_bessel(:, iproc)) - y(:,isph) = - y(:,isph) - end do - end if + + ngrid = params % ngrid + thigh = one + (params % se+one)/two*params % eta + se = params % se + eta = params % eta + lmax = params % lmax + kappa = params % kappa + + associate(inl => constants % inl, nl => constants % nl, & + & csph => params % csph, rsph => params % rsph, & + & cgrid => constants % cgrid, fi => constants % fi, & + & wgrid => constants % wgrid, & + & tmp_grid => workspace % tmp_grid, & + & vscales => constants % vscales, & + & adj_ioverlap => constants % adj_ioverlap, & + & adj_overlap => constants % adj_overlap) + do isph = 1, nsph + !y_copy(:, isph) = 0.0d0 + !call adjrhs_lpb(params, constants, isph, workspace % tmp_grid, & + ! & y_copy(:, isph), workspace % tmp_vylm(:, 1), & + ! & workspace % tmp_vplm(:, 1), & + ! & workspace % tmp_vcos(:, 1), workspace % tmp_vsin(:, 1), & + ! & workspace % tmp_bessel(:, 1)) + !y_copy(:, isph) = - y_copy(:, isph) + + y(:, isph) = 0.0d0 + do ij = inl(isph), inl(isph+1)-1 + jsph = nl(ij) + do ijigrid = adj_ioverlap(ij), adj_ioverlap(ij+1) - 1 + igrid = adj_overlap(ijigrid) + !do igrid = 1, params % ngrid + + vji = csph(:,jsph) + rsph(jsph)*cgrid(:,igrid) & + & - csph(:,isph) + vvji = sqrt(vji(1)*vji(1) + vji(2)*vji(2) & + & + vji(3)*vji(3)) + tji = vvji/rsph(isph) + sji = vji/vvji + + !if ( tji.gt.( one + (se+one)/two*eta ) ) continue + + call ylmbas(sji, rho, ctheta, stheta, cphi, sphi, & + & lmax, vscales, basloc, vplm, vcos, vsin) + + !call inthsp_adj(params, constants, vvji, isph, basloc, fac_hsp, & + ! & work_complex) + + si_rjin = 0.0d0 + di_rjin = 0.0d0 + + call modified_spherical_bessel_first_kind(lmax, & + & vvji*kappa, si_rjin, di_rjin, work_complex) + + do l = 0, lmax + do m = -l, l + ind = l*l + l + 1 + m + fac_hsp(ind) = SI_rjin(l)/constants % SI_ri(l,isph)*basloc(ind) + end do + end do + + xji = fsw(tji, se, eta) + + if (fi(igrid,jsph).gt.1.0d0) then + oji = xji/fi(igrid,jsph) + else + oji = xji + end if + fac = wgrid(igrid) * tmp_grid(igrid,jsph) * oji + do l = 0, lmax + ind = l*l + l + 1 + do m = -l,l + y(ind+m, isph) = y(ind+m, isph) + fac*fac_hsp(ind+m) + end do + end do + ! vji + !call fmm_l2p_adj_bessel_work(vtij, lmax, vscales, & + ! & si_ri(:, isph), oij, y(:, jsph), 1.0d0, & + ! & tmp_grid(its), work_complex, work) + end do + end do + y(:, isph) = - y(:, isph) + end do + end associate + !end if ! add the diagonal if required if (constants % dodiag) y = y + x + call time_pull("bstarx") + + do isph = 1, params % nsph + do l = 1, constants % nbasis + write(6,*) y(l, isph), y_copy(l, isph), y(l, isph) - y_copy(l, isph) + end do + end do + !stop 0 end subroutine bstarx !> Primal HSP matrix vector product @@ -890,6 +993,7 @@ subroutine bx(params, constants, workspace, x, y, ddx_error) !$omp parallel do default(none) shared(params,constants,x,y) & !$omp private(isph,ij,jsph) schedule(dynamic) do isph = 1, params % nsph + y(:, isph) = 0.0d0 do ij = constants % inl(isph), constants % inl(isph + 1) - 1 jsph = constants % nl(ij) call dgemv('n', constants % nbasis, constants % nbasis, one, & From 7d74448ec31859f873871b10ac2f600a0332a658 Mon Sep 17 00:00:00 2001 From: Michele Nottoli Date: Tue, 14 May 2024 13:36:00 +0200 Subject: [PATCH 16/21] Parallelized bstarx. --- src/ddx.f90 | 2 -- src/ddx_operators.f90 | 63 +++++++++++++++---------------------------- 2 files changed, 22 insertions(+), 43 deletions(-) diff --git a/src/ddx.f90 b/src/ddx.f90 index f7193965..ba424b4d 100644 --- a/src/ddx.f90 +++ b/src/ddx.f90 @@ -455,8 +455,6 @@ subroutine ddrun(ddx_data, state, electrostatics, psi, tol, esolv, & end if end if - stop 0 - ! compute the forces if (ddx_data % params % force .eq. 1) then force = zero diff --git a/src/ddx_operators.f90 b/src/ddx_operators.f90 index 7eb2d542..34dc753a 100644 --- a/src/ddx_operators.f90 +++ b/src/ddx_operators.f90 @@ -824,14 +824,13 @@ subroutine bstarx(params, constants, workspace, x, y, ddx_error) real(dp) :: thigh, se, eta, kappa, vji(3), vvji, sji(3), tji real(dp) :: rho, ctheta, stheta, cphi, sphi, xji, oji, fac, & & basloc((params % lmax + 1)**2), vcos(params % lmax + 1), & - & vsin(params % lmax + 1), vplm((params % lmax + 1)**2) + & vsin(params % lmax + 1), vplm((params % lmax + 1)**2), & + & tmp(constants % nbasis) real(dp), dimension(constants % nbasis) :: fac_hsp real(dp), dimension(0:params % lmax) :: SI_rjin, DI_rjin complex(dp) :: work_complex(max(2, params % lmax+1)) integer :: nsph, nbasis, lmax, ngrid - real(dp) :: y_copy(constants % nbasis, params % nsph) - call time_push() nsph = params % nsph @@ -840,7 +839,7 @@ subroutine bstarx(params, constants, workspace, x, y, ddx_error) ! dummy operation on unused interface arguments if (ddx_error % flag .eq. 0) continue - !if (params % matvecmem .eq. 1) then + if (params % matvecmem .eq. 1) then !$omp parallel do default(none) shared(params,constants,x,y) & !$omp private(isph,ij,jsph,indmat) schedule(dynamic) do isph = 1, params % nsph @@ -853,9 +852,7 @@ subroutine bstarx(params, constants, workspace, x, y, ddx_error) & one, y(:,isph), 1) end do end do - y_copy(:,:) = y(:,:) - y = zero - !else + else !$omp parallel do default(none) shared(params,constants,workspace,x,y) & !$omp private(isph) schedule(static,1) do isph = 1, params % nsph @@ -878,22 +875,21 @@ subroutine bstarx(params, constants, workspace, x, y, ddx_error) & tmp_grid => workspace % tmp_grid, & & vscales => constants % vscales, & & adj_ioverlap => constants % adj_ioverlap, & - & adj_overlap => constants % adj_overlap) + & adj_overlap => constants % adj_overlap, & + & si_ri => constants % si_ri) + !$omp parallel do default(none) schedule(dynamic,1) & + !$omp firstprivate(nsph,ngrid,thigh,se,eta,lmax,kappa) & + !$omp shared(inl,nl,csph,rsph,cgrid,fi,wgrid,tmp_grid,y, & + !$omp adj_overlap,adj_ioverlap,vscales,si_ri) & + !$omp private(isph,ij,jsph,ijigrid,igrid,vji,vvji,tji,sji, & + !$omp rho,ctheta,stheta,cphi,sphi,basloc,vplm,vcos,vsin,tmp, & + !$omp si_rjin,di_rjin,work_complex,fac_hsp,xji,oji,fac,ind) do isph = 1, nsph - !y_copy(:, isph) = 0.0d0 - !call adjrhs_lpb(params, constants, isph, workspace % tmp_grid, & - ! & y_copy(:, isph), workspace % tmp_vylm(:, 1), & - ! & workspace % tmp_vplm(:, 1), & - ! & workspace % tmp_vcos(:, 1), workspace % tmp_vsin(:, 1), & - ! & workspace % tmp_bessel(:, 1)) - !y_copy(:, isph) = - y_copy(:, isph) - - y(:, isph) = 0.0d0 + tmp = 0.0d0 do ij = inl(isph), inl(isph+1)-1 jsph = nl(ij) do ijigrid = adj_ioverlap(ij), adj_ioverlap(ij+1) - 1 - igrid = adj_overlap(ijigrid) - !do igrid = 1, params % ngrid + igrid = adj_overlap(ijigrid) vji = csph(:,jsph) + rsph(jsph)*cgrid(:,igrid) & & - csph(:,isph) @@ -902,14 +898,9 @@ subroutine bstarx(params, constants, workspace, x, y, ddx_error) tji = vvji/rsph(isph) sji = vji/vvji - !if ( tji.gt.( one + (se+one)/two*eta ) ) continue - call ylmbas(sji, rho, ctheta, stheta, cphi, sphi, & & lmax, vscales, basloc, vplm, vcos, vsin) - !call inthsp_adj(params, constants, vvji, isph, basloc, fac_hsp, & - ! & work_complex) - si_rjin = 0.0d0 di_rjin = 0.0d0 @@ -917,47 +908,37 @@ subroutine bstarx(params, constants, workspace, x, y, ddx_error) & vvji*kappa, si_rjin, di_rjin, work_complex) do l = 0, lmax + ind = l*l + l + 1 do m = -l, l - ind = l*l + l + 1 + m - fac_hsp(ind) = SI_rjin(l)/constants % SI_ri(l,isph)*basloc(ind) + fac_hsp(ind+m) = SI_rjin(l)/SI_ri(l,isph)*basloc(ind+m) end do end do xji = fsw(tji, se, eta) - if (fi(igrid,jsph).gt.1.0d0) then oji = xji/fi(igrid,jsph) else oji = xji end if + fac = wgrid(igrid) * tmp_grid(igrid,jsph) * oji do l = 0, lmax ind = l*l + l + 1 - do m = -l,l - y(ind+m, isph) = y(ind+m, isph) + fac*fac_hsp(ind+m) + do m = -l, l + tmp(ind+m) = tmp(ind+m) + fac*fac_hsp(ind+m) end do end do - ! vji - !call fmm_l2p_adj_bessel_work(vtij, lmax, vscales, & - ! & si_ri(:, isph), oij, y(:, jsph), 1.0d0, & - ! & tmp_grid(its), work_complex, work) end do end do - y(:, isph) = - y(:, isph) + y(:, isph) = - tmp end do end associate - !end if + end if ! add the diagonal if required if (constants % dodiag) y = y + x call time_pull("bstarx") - do isph = 1, params % nsph - do l = 1, constants % nbasis - write(6,*) y(l, isph), y_copy(l, isph), y(l, isph) - y_copy(l, isph) - end do - end do - !stop 0 end subroutine bstarx !> Primal HSP matrix vector product From d70af7e4443d10e852c1149cc96c0f7e89ed78a8 Mon Sep 17 00:00:00 2001 From: Michele Nottoli Date: Tue, 14 May 2024 15:07:02 +0200 Subject: [PATCH 17/21] Update. --- src/ddx_operators.f90 | 95 +++++++++++++++++++++++++------------------ src/ddx_solvers.f90 | 2 +- 2 files changed, 57 insertions(+), 40 deletions(-) diff --git a/src/ddx_operators.f90 b/src/ddx_operators.f90 index 34dc753a..e39714b5 100644 --- a/src/ddx_operators.f90 +++ b/src/ddx_operators.f90 @@ -189,7 +189,7 @@ subroutine lstarx(params, constants, workspace, x, y, ddx_error) lmax = params % lmax ! Expand x over spherical harmonics associate(vgrid => constants % vgrid, & - & tmp_grid => workspace % tmp_grid) + & tmp_grid => workspace % tmp_grid) !$omp parallel do collapse(2) default(none) & !$omp firstprivate(nsph,ngrid,nbasis) shared(x,tmp_grid,vgrid) & !$omp private(isph,igrid) schedule(static,100) @@ -819,17 +819,15 @@ subroutine bstarx(params, constants, workspace, x, y, ddx_error) real(dp), intent(out) :: y(constants % nbasis, params % nsph) type(ddx_error_type), intent(inout) :: ddx_error ! Local variables - integer :: isph, jsph, ij, indmat, ind, ijigrid, igrid, l, m - - real(dp) :: thigh, se, eta, kappa, vji(3), vvji, sji(3), tji - real(dp) :: rho, ctheta, stheta, cphi, sphi, xji, oji, fac, & + integer :: isph, jsph, ij, indmat, ind, ijigrid, igrid, l, m, & + & nsph, nbasis, lmax, ngrid + real(dp) :: thigh, se, eta, kappa, vji(3), vvji, sji(3), tji, & + & rho, ctheta, stheta, cphi, sphi, xji, oji, fac, & & basloc((params % lmax + 1)**2), vcos(params % lmax + 1), & & vsin(params % lmax + 1), vplm((params % lmax + 1)**2), & - & tmp(constants % nbasis) - real(dp), dimension(constants % nbasis) :: fac_hsp - real(dp), dimension(0:params % lmax) :: SI_rjin, DI_rjin + & tmp(constants % nbasis), fac_hsp(constants % nbasis), & + & si_rjin(0:params % lmax), di_rjin(0:params % lmax) complex(dp) :: work_complex(max(2, params % lmax+1)) - integer :: nsph, nbasis, lmax, ngrid call time_push() @@ -840,27 +838,23 @@ subroutine bstarx(params, constants, workspace, x, y, ddx_error) if (ddx_error % flag .eq. 0) continue if (params % matvecmem .eq. 1) then - !$omp parallel do default(none) shared(params,constants,x,y) & - !$omp private(isph,ij,jsph,indmat) schedule(dynamic) - do isph = 1, params % nsph - y(:,isph) = zero - do ij = constants % inl(isph), constants % inl(isph + 1) - 1 - jsph = constants % nl(ij) - indmat = constants % itrnl(ij) - call dgemv('t', constants % nbasis, constants % nbasis, one, & - & constants % b(:,:,indmat), constants % nbasis, x(:,jsph), 1, & - & one, y(:,isph), 1) + associate(inl => constants % inl, nl => constants % nl, & + & itrnl => constants % itrnl, b => constants % b) + !$omp parallel do default(none) schedule(dynamic,1) & + !$omp firstprivate(nsph,nbasis) shared(x,y,inl,nl,itrnl,b) & + !$omp private(isph,ij,jsph,indmat) + do isph = 1, nsph + y(:,isph) = zero + do ij = inl(isph), inl(isph + 1) - 1 + jsph = nl(ij) + indmat = itrnl(ij) + call dgemv('t', nbasis, nbasis, 1.0d0, & + & b(:,:,indmat), nbasis, x(:,jsph), 1, & + & 1.0d0, y(:,isph), 1) + end do end do - end do + end associate else - !$omp parallel do default(none) shared(params,constants,workspace,x,y) & - !$omp private(isph) schedule(static,1) - do isph = 1, params % nsph - call dgemv('t', constants % nbasis, params % ngrid, one, constants % vgrid, & - & constants % vgrid_nbasis, x(:, isph), 1, zero, & - & workspace % tmp_grid(:, isph), 1) - end do - ngrid = params % ngrid thigh = one + (params % se+one)/two*params % eta se = params % se @@ -868,6 +862,19 @@ subroutine bstarx(params, constants, workspace, x, y, ddx_error) lmax = params % lmax kappa = params % kappa + associate(vgrid => constants % vgrid, & + & tmp_grid => workspace % tmp_grid) + !$omp parallel do collapse(2) default(none) & + !$omp firstprivate(nsph,ngrid,nbasis) shared(x,tmp_grid,vgrid) & + !$omp private(isph,igrid) schedule(static,100) + do isph = 1, nsph + do igrid = 1, ngrid + tmp_grid(igrid,isph) = dot_product(x(:, isph), & + & vgrid(:nbasis, igrid)) + end do + end do + end associate + associate(inl => constants % inl, nl => constants % nl, & & csph => params % csph, rsph => params % rsph, & & cgrid => constants % cgrid, fi => constants % fi, & @@ -1104,9 +1111,12 @@ subroutine prec_tstarx(params, constants, workspace, x, y, ddx_error) y(:,:,1) = x(:,:,1) call convert_ddcosmo(params, constants, 1, y(:,:,1)) n_iter = params % maxiter - call jacobi_diis(params, constants, workspace, constants % inner_tol, & - & y(:,:,1), workspace % ddcosmo_guess, n_iter, x_rel_diff, lstarx, & + ! the empirical 10^-2 factor reduces the number of macro iterations + call jacobi_diis(params, constants, workspace, & + & constants % inner_tol*1.0d-2, y(:,:,1), & + & workspace % ddcosmo_guess, n_iter, x_rel_diff, lstarx, & & ldm1x, hnorm, ddx_error) + write(6,*) "lstarx", x_rel_diff(1:n_iter) if (ddx_error % flag .ne. 0) then call update_error(ddx_error, 'prec_tstarx: ddCOSMO failed to ' // & & 'converge, exiting') @@ -1117,9 +1127,10 @@ subroutine prec_tstarx(params, constants, workspace, x, y, ddx_error) start_time = omp_get_wtime() n_iter = params % maxiter - call jacobi_diis(params, constants, workspace, constants % inner_tol, & - & x(:,:,2), workspace % hsp_guess, n_iter, x_rel_diff, bstarx, & - & bx_prec, hnorm, ddx_error) + call jacobi_diis(params, constants, workspace, & + & constants % inner_tol*1.0d1, x(:,:,2), workspace % hsp_guess, & + & n_iter, x_rel_diff, bstarx, bx_prec, hnorm, ddx_error) + write(6,*) "bstarx", x_rel_diff(1:n_iter) if (ddx_error % flag .ne. 0) then call update_error(ddx_error, 'prec_tstarx: HSP failed to ' // & & 'converge, exiting') @@ -1153,14 +1164,15 @@ subroutine prec_tx(params, constants, workspace, x, y, ddx_error) ! perform A^-1 * Yr start_time = omp_get_wtime() n_iter = params % maxiter - call jacobi_diis(params, constants, workspace, constants % inner_tol, & - & x(:,:,1), workspace % ddcosmo_guess, n_iter, x_rel_diff, lx, & - & ldm1x, hnorm, ddx_error) + call jacobi_diis(params, constants, workspace, & + & constants % inner_tol, x(:,:,1), workspace % ddcosmo_guess, & + & n_iter, x_rel_diff, lx, ldm1x, hnorm, ddx_error) if (ddx_error % flag .ne. 0) then call update_error(ddx_error, 'prec_tx: ddCOSMO failed to ' // & & 'converge, exiting') return end if + write(6,*) "lx", x_rel_diff(1:n_iter) ! Scale by the factor of (2l+1)/4Pi y(:,:,1) = workspace % ddcosmo_guess @@ -1170,11 +1182,12 @@ subroutine prec_tx(params, constants, workspace, x, y, ddx_error) ! perform B^-1 * Ye start_time = omp_get_wtime() n_iter = params % maxiter - call jacobi_diis(params, constants, workspace, constants % inner_tol, & - & x(:,:,2), workspace % hsp_guess, n_iter, x_rel_diff, bx, & - & bx_prec, hnorm, ddx_error) + call jacobi_diis(params, constants, workspace, & + & constants % inner_tol, x(:,:,2), workspace % hsp_guess, & + & n_iter, x_rel_diff, bx, bx_prec, hnorm, ddx_error) y(:,:,2) = workspace % hsp_guess workspace % hsp_time = workspace % hsp_time + omp_get_wtime() - start_time + write(6,*) "bx", x_rel_diff(1:n_iter) if (ddx_error % flag .ne. 0) then call update_error(ddx_error, 'prec_tx: HSP failed to ' // & @@ -1202,6 +1215,8 @@ subroutine cstarx(params, constants, workspace, x, y, ddx_error) real(dp) :: val, epsilon_ratio real(dp), allocatable :: scratch(:,:), scratch0(:,:) + call time_push() + ! dummy operation on unused interface arguments if (ddx_error % flag .eq. 0) continue @@ -1297,6 +1312,8 @@ subroutine cstarx(params, constants, workspace, x, y, ddx_error) end do end do + call time_pull("cstarx") + end subroutine cstarx !> ddLPB matrix-vector product diff --git a/src/ddx_solvers.f90 b/src/ddx_solvers.f90 index 7e5806d0..a58ba1fa 100644 --- a/src/ddx_solvers.f90 +++ b/src/ddx_solvers.f90 @@ -331,7 +331,7 @@ subroutine jacobi_diis_external(params, constants, workspace, n, tol, rhs, x, n_ rel_diff = diff / norm end if x_rel_diff(it) = rel_diff - constants % inner_tol = max(rel_diff*sqrt(tol), tol/100.0d0) + constants % inner_tol = max(rel_diff*1d-2, tol*1d-2) ! update x = x_new ! EXIT Jacobi loop here From 0ec7779cb93dd4b699cbb70b2fe60e2c95aef044 Mon Sep 17 00:00:00 2001 From: Michele Nottoli Date: Tue, 14 May 2024 15:14:09 +0200 Subject: [PATCH 18/21] Removed allocations in lx and bx. --- src/ddx_operators.f90 | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) diff --git a/src/ddx_operators.f90 b/src/ddx_operators.f90 index e39714b5..bbbf4e7e 100644 --- a/src/ddx_operators.f90 +++ b/src/ddx_operators.f90 @@ -38,11 +38,9 @@ subroutine lx(params, constants, workspace, x, y, ddx_error) real(dp) :: vij(3), tij, vvij, xij, oij, fac integer :: vgrid_nbasis, nbasis, ngrid, nsph, lmax - real(dp), allocatable :: tmp_grid(:) - real(dp) :: se, eta, work(params % lmax + 1) + real(dp) :: se, eta, work(params % lmax + 1), tmp_grid(params % ngrid) call time_push() - allocate(tmp_grid(params % ngrid)) ! dummy operation on unused interface arguments if (ddx_error % flag .eq. 0) continue @@ -134,7 +132,6 @@ subroutine lx(params, constants, workspace, x, y, ddx_error) end associate end if call time_pull("lx") - deallocate(tmp_grid) end subroutine lx !> Adjoint single layer operator matvec @@ -962,12 +959,9 @@ subroutine bx(params, constants, workspace, x, y, ddx_error) integer :: nsph, nbasis, lmax, ngrid, vgrid_nbasis real(dp) :: se, eta, vij(3), vvij, tij, xij, oij, vtij(3), kappa complex(dp) :: work_complex(params % lmax+1) - real(dp) :: work(params % lmax+1) - - real(dp), allocatable :: tmp_grid(:) + real(dp) :: work(params % lmax+1), tmp_grid(params % ngrid) call time_push() - allocate(tmp_grid(params % ngrid)) ! dummy operation on unused interface arguments if (ddx_error % flag .eq. 0) continue From ae929288dfa7b3d9098f5024c8446d11ae10d5ae Mon Sep 17 00:00:00 2001 From: Michele Nottoli Date: Tue, 14 May 2024 17:32:59 +0200 Subject: [PATCH 19/21] Parallelized cx. --- src/ddx_core.f90 | 134 ++++++++++++++++++++++++------------------ src/ddx_operators.f90 | 122 +++++++++++++++++++++++++------------- 2 files changed, 157 insertions(+), 99 deletions(-) diff --git a/src/ddx_core.f90 b/src/ddx_core.f90 index 6f8dfa7d..eb6b5b58 100644 --- a/src/ddx_core.f90 +++ b/src/ddx_core.f90 @@ -2112,40 +2112,48 @@ subroutine tree_m2l_bessel_rotation(params, constants, node_m, node_l) real(dp) :: work(6*params % pm**2 + 19*params % pm + 8) complex(dp) :: work_complex(2*params % pm+1) ! Local variables - integer :: i, j, k - real(dp) :: c1(3), c(3), r1, r + integer :: i, j, k, nclusters, pm + real(dp) :: c1(3), c(3), r1, r, kappa ! Any order of this cycle is OK - !$omp parallel do default(none) shared(constants,params,node_m,node_l) & - !$omp private(i,c,r,k,c1,r1,work,work_complex) schedule(dynamic) - do i = 1, constants % nclusters - ! If no far admissible pairs just set output to zero - if (constants % nfar(i) .eq. 0) then - node_l(:, i) = zero - cycle - end if - c = constants % cnode(:, i) - r = constants % rnode(i) - ! Use the first far admissible pair to initialize output - k = constants % far(constants % sfar(i)) - c1 = constants % cnode(:, k) - r1 = constants % rnode(k) - c1 = params % kappa*(c1 - c) - call fmm_m2l_bessel_rotation_work(c1, & - & constants % SK_rnode(:, k), constants % SI_rnode(:, i), & - & params % pm, & - & constants % vscales, one, & - & node_m(:, k), zero, node_l(:, i), work, work_complex) - do j = constants % sfar(i)+1, constants % sfar(i+1)-1 - k = constants % far(j) - c1 = constants % cnode(:, k) - r1 = constants % rnode(k) - c1 = params % kappa*(c1 - c) - call fmm_m2l_bessel_rotation_work(c1, constants % SK_rnode(:, k), & - & constants % SI_rnode(:, i), params % pm, & - & constants % vscales, one, & - & node_m(:, k), one, node_l(:, i), work, work_complex) + nclusters = constants % nclusters + kappa = params % kappa + pm = params % pm + associate(nfar => constants % nfar, cnode => constants % cnode, & + & rnode => constants % rnode, far => constants % far, & + & sfar => constants % sfar, sk_rnode => constants % sk_rnode, & + & si_rnode => constants % si_rnode, & + & vscales => constants % vscales) + !$omp parallel do schedule(dynamic,1) firstprivate(nclusters,kappa,pm) & + !$omp private(i,c,r,k,c1,r1,j,work,work_complex) & + !$omp shared(nfar,node_l,cnode,rnode, & + !$omp far,sfar,sk_rnode,si_rnode,vscales,node_m) + do i = 1, nclusters + ! If no far admissible pairs just set output to zero + if (nfar(i) .eq. 0) then + node_l(:, i) = zero + cycle + end if + c = cnode(:, i) + r = rnode(i) + ! Use the first far admissible pair to initialize output + k = far(sfar(i)) + c1 = cnode(:, k) + r1 = rnode(k) + c1 = kappa*(c1 - c) + call fmm_m2l_bessel_rotation_work(c1, SK_rnode(:, k), & + & SI_rnode(:, i), pm, vscales, 1.0d0, node_m(:, k), & + & 0.0d0, node_l(:, i), work, work_complex) + do j = sfar(i)+1, sfar(i+1)-1 + k = far(j) + c1 = cnode(:, k) + r1 = rnode(k) + c1 = kappa*(c1 - c) + call fmm_m2l_bessel_rotation_work(c1, SK_rnode(:, k), & + & SI_rnode(:, i), pm, vscales, 1.0d0, node_m(:, k), & + & 1.0d0, node_l(:, i), work, work_complex) + end do end do - end do + end associate end subroutine tree_m2l_bessel_rotation !------------------------------------------------------------------------------ !> Adjoint transfer multipole local coefficients into local over a tree @@ -2465,8 +2473,8 @@ subroutine tree_m2p_bessel(params, constants, p, alpha, sph_p, sph_m, beta, grid ! Output real(dp), intent(inout) :: grid_v(params % ngrid, params % nsph) ! Local variables - integer :: isph, inode, jnear, jnode, jsph, igrid - real(dp) :: c(3) + integer :: isph, inode, jnear, jnode, jsph, igrid, nsph, ngrid + real(dp) :: c(3), kappa ! Temporary workspace real(dp) :: work(p+1) complex(dp) :: work_complex(p+1) @@ -2476,32 +2484,44 @@ subroutine tree_m2p_bessel(params, constants, p, alpha, sph_p, sph_m, beta, grid else grid_v = beta * grid_v end if + nsph = params % nsph + ngrid = params % ngrid + kappa = params % kappa ! Cycle over all spheres - !$omp parallel do default(none) shared(params,constants,grid_v,p, & - !$omp alpha,sph_m), private(isph,inode,jnear,jnode,jsph,igrid,c,work, & - !$omp work_complex) schedule(dynamic) - do isph = 1, params % nsph - ! Cycle over all near-field admissible pairs of spheres - inode = constants % snode(isph) - do jnear = constants % snear(inode), constants % snear(inode+1)-1 - ! Near-field interactions are possible only between leaf nodes, - ! which must contain only a single input sphere - jnode = constants % near(jnear) - jsph = constants % order(constants % cluster(1, jnode)) - ! Ignore self-interaction - !if(isph .eq. jsph) cycle - ! Accumulate interaction for external grid points only - do igrid = 1, params % ngrid - if(constants % ui(igrid, isph) .eq. zero) cycle - c = constants % cgrid(:, igrid)*params % rsph(isph) - & - & params % csph(:, jsph) + params % csph(:, isph) - c = c * params % kappa - call fmm_m2p_bessel_work(c, p, constants % vscales, & - & constants % SK_ri(:, jsph), alpha, sph_m(:, jsph), one, & - & grid_v(igrid, isph), work_complex, work) + associate(snode => constants % snode, snear => constants % snear, & + & near => constants % near, order => constants % order, & + & cluster => constants % cluster, ui => constants % ui, & + & cgrid => constants % cgrid, rsph => params % rsph, & + & csph => params % csph, vscales => constants % vscales, & + & sk_ri => constants % sk_ri) + !$omp parallel do schedule(dynamic,1) default(none) & + !$omp firstprivate(nsph,ngrid,kappa,alpha,p) & + !$omp private(isph,inode,jnear,jnode,jsph,igrid,c,work_complex, & + !$omp work) shared(snode,snear,near,order,cluster,ui,cgrid,rsph, & + !$omp vscales,sk_ri,sph_m,grid_v) + do isph = 1, nsph + ! Cycle over all near-field admissible pairs of spheres + inode = snode(isph) + do jnear = snear(inode), snear(inode+1)-1 + ! Near-field interactions are possible only between leaf nodes, + ! which must contain only a single input sphere + jnode = near(jnear) + jsph = order(cluster(1, jnode)) + ! Ignore self-interaction + !if(isph .eq. jsph) cycle + ! Accumulate interaction for external grid points only + do igrid = 1, ngrid + if(ui(igrid, isph) .eq. zero) cycle + c = cgrid(:, igrid)*rsph(isph) - csph(:, jsph) & + & + csph(:, isph) + c = c * kappa + call fmm_m2p_bessel_work(c, p, vscales, & + & sk_ri(:, jsph), alpha, sph_m(:, jsph), 1.0d0, & + & grid_v(igrid, isph), work_complex, work) + end do end do end do - end do + end associate end subroutine tree_m2p_bessel !------------------------------------------------------------------------------ diff --git a/src/ddx_operators.f90 b/src/ddx_operators.f90 index bbbf4e7e..b4ecf8b5 100644 --- a/src/ddx_operators.f90 +++ b/src/ddx_operators.f90 @@ -1110,7 +1110,6 @@ subroutine prec_tstarx(params, constants, workspace, x, y, ddx_error) & constants % inner_tol*1.0d-2, y(:,:,1), & & workspace % ddcosmo_guess, n_iter, x_rel_diff, lstarx, & & ldm1x, hnorm, ddx_error) - write(6,*) "lstarx", x_rel_diff(1:n_iter) if (ddx_error % flag .ne. 0) then call update_error(ddx_error, 'prec_tstarx: ddCOSMO failed to ' // & & 'converge, exiting') @@ -1124,7 +1123,6 @@ subroutine prec_tstarx(params, constants, workspace, x, y, ddx_error) call jacobi_diis(params, constants, workspace, & & constants % inner_tol*1.0d1, x(:,:,2), workspace % hsp_guess, & & n_iter, x_rel_diff, bstarx, bx_prec, hnorm, ddx_error) - write(6,*) "bstarx", x_rel_diff(1:n_iter) if (ddx_error % flag .ne. 0) then call update_error(ddx_error, 'prec_tstarx: HSP failed to ' // & & 'converge, exiting') @@ -1166,7 +1164,6 @@ subroutine prec_tx(params, constants, workspace, x, y, ddx_error) & 'converge, exiting') return end if - write(6,*) "lx", x_rel_diff(1:n_iter) ! Scale by the factor of (2l+1)/4Pi y(:,:,1) = workspace % ddcosmo_guess @@ -1181,7 +1178,6 @@ subroutine prec_tx(params, constants, workspace, x, y, ddx_error) & n_iter, x_rel_diff, bx, bx_prec, hnorm, ddx_error) y(:,:,2) = workspace % hsp_guess workspace % hsp_time = workspace % hsp_time + omp_get_wtime() - start_time - write(6,*) "bx", x_rel_diff(1:n_iter) if (ddx_error % flag .ne. 0) then call update_error(ddx_error, 'prec_tx: HSP failed to ' // & @@ -1326,6 +1322,8 @@ subroutine cx(params, constants, workspace, x, y, ddx_error) complex(dp) :: work_complex(constants % lmax0 + 1) real(dp) :: work(constants % lmax0 + 1) integer :: indl, inode, info + real(dp) :: epsp, eps, fac + integer :: nsph, lmax, nbasis, nbasis0, lmax0, ngrid real(dp), allocatable :: diff_re(:,:), diff0(:,:) @@ -1337,44 +1335,65 @@ subroutine cx(params, constants, workspace, x, y, ddx_error) return end if - ! diff_re = params % epsp/eps*l1/ri*Xr - i'(ri)/i(ri)*Xe, - diff_re = zero - !$omp parallel do default(none) shared(params,diff_re, & - !$omp constants,x) private(jsph,l,m,ind) - do jsph = 1, params % nsph - do l = 0, params % lmax - do m = -l,l - ind = l**2 + l + m + 1 - diff_re(ind,jsph) = (params % epsp/params % eps)* & - & (l/params % rsph(jsph))*x(ind,jsph,1) & - & - constants % termimat(l,jsph)*x(ind,jsph,2) + nsph = params % nsph + lmax = params % lmax + lmax0 = constants % lmax0 + epsp = params % epsp + eps = params % eps + nbasis = constants % nbasis + nbasis0 = constants % nbasis0 + ngrid = params % ngrid + + call time_push() + associate(rsph => params % rsph, termimat => constants % termimat) + !$omp parallel do collapse(2) default(none) schedule(static,100) & + !$omp firstprivate(nsph,lmax,epsp,eps) private(jsph,l,ind,m) & + !$omp shared(diff_re,rsph,x,termimat) + do jsph = 1, nsph + do l = 0, lmax + ind = l**2 + l + 1 + do m = -l, l + diff_re(ind+m,jsph) = (epsp/eps)*(l/rsph(jsph)) & + & *x(ind+m,jsph,1) - termimat(l,jsph)*x(ind+m,jsph,2) + end do + end do end do - end do - end do + end associate + call time_pull("cx1") + call time_push() ! diff0 = Pchi * diff_er, linear scaling - !$omp parallel do default(none) shared(constants,params, & - !$omp diff_re,diff0) private(jsph) - do jsph = 1, params % nsph - call dgemv('t', constants % nbasis, constants % nbasis0, one, & - & constants % pchi(1,1,jsph), constants % nbasis, & - & diff_re(1,jsph), 1, zero, diff0(1,jsph), 1) - end do + associate(pchi => constants % pchi) + !$omp parallel do default(none) schedule(static,10) & + !$omp firstprivate(nsph,nbasis,nbasis0) private(jsph) & + !$omp shared(pchi,diff_re,diff0) + do jsph = 1, nsph + call dgemv('t', nbasis, nbasis0, 1.0d0, pchi(1,1,jsph), nbasis, & + & diff_re(1,jsph), 1, 0.0d0, diff0(1,jsph), 1) + end do + end associate + call time_pull("cx2") + call time_push() ! Multiply diff0 by C_ik inplace - do isph = 1, params % nsph - do l = 0, constants % lmax0 - ind0 = l*l+l+1 - diff0(ind0-l:ind0+l, isph) = diff0(ind0-l:ind0+l, isph) * & - & constants % C_ik(l, isph) + associate(c_ik => constants % c_ik) + !$omp parallel do collapse(2) schedule(static,100) & + !$omp firstprivate(nsph,lmax0) private(isph,l,ind,fac,m) & + !$omp shared(diff0,c_ik) + do isph = 1, nsph + do l = 0, lmax0 + ind = l*l+l+1 + fac = c_ik(l, isph) + do m = -l, l + diff0(ind+m, isph) = diff0(ind+m, isph) * fac + end do + end do end do - end do + end associate + call time_pull("cx3") ! avoiding N^2 storage, this code does not use the cached coefY y(:,:,1) = zero if (params % fmm .eq. 0) then - !$omp parallel do default(none) shared(params,constants, & - !$omp diff0,y) private(isph,igrid,val,vij,work, & - !$omp ind0,ind,vtij,work_complex) do isph = 1, params % nsph do igrid = 1, params % ngrid if (constants % ui(igrid,isph).gt.zero) then @@ -1400,6 +1419,7 @@ subroutine cx(params, constants, workspace, x, y, ddx_error) end do else ! Load input harmonics into tree data + call time_push() workspace % tmp_node_m = zero workspace % tmp_node_l = zero workspace % tmp_sph = zero @@ -1424,37 +1444,55 @@ subroutine cx(params, constants, workspace, x, y, ddx_error) workspace % tmp_node_m(:, inode) = workspace % tmp_sph(1:indl, isph) end do end if + call time_pull("cx-fmmprep") + call time_push() ! Do FMM operations call tree_m2m_bessel_rotation(params, constants, workspace % tmp_node_m) + call time_pull("cx-m2m") + call time_push() call tree_m2l_bessel_rotation(params, constants, workspace % tmp_node_m, & & workspace % tmp_node_l) + call time_pull("cx-m2l") + call time_push() call tree_l2l_bessel_rotation(params, constants, workspace % tmp_node_l) + call time_pull("cx-l2l") + call time_push() workspace % tmp_grid = zero call tree_l2p_bessel(params, constants, one, workspace % tmp_node_l, zero, & & workspace % tmp_grid) + call time_pull("cx-l2p") + call time_push() call tree_m2p_bessel(params, constants, constants % lmax0, one, & & params % lmax, workspace % tmp_sph, one, & & workspace % tmp_grid) - - do isph = 1, params % nsph - do igrid = 1, params % ngrid - do ind = 1, constants % nbasis - y(ind,isph,1) = y(ind,isph,1) + workspace % tmp_grid(igrid, isph)*& - & constants % vwgrid(ind, igrid)*& - & constants % ui(igrid,isph) + call time_pull("cx-m2p") + call time_push() + + associate(tmp_grid => workspace % tmp_grid, & + & vwgrid => constants % vwgrid, ui => constants % ui) + !$omp parallel do collapse(2) schedule(static,100) & + !$omp firstprivate(nsph,ngrid,nbasis), shared(y,tmp_grid, & + !$omp vwgrid,ui) private(isph,igrid,ind) + do isph = 1, nsph + do ind = 1, nbasis + do igrid = 1, ngrid + y(ind,isph,1) = y(ind,isph,1) + tmp_grid(igrid, isph)*& + & vwgrid(ind, igrid)*ui(igrid,isph) + end do end do end do - end do + end associate + call time_pull("cx-fmmend") end if y(:,:,2) = y(:,:,1) + call time_pull("cx") deallocate(diff_re, diff0, stat=info) if (info.ne.0) then call update_error(ddx_error, "Deallocation failed in cx") return end if - call time_pull("cx") end subroutine cx From 7a139aac88b2c9b1c87873e58afadf7acb2d1d6d Mon Sep 17 00:00:00 2001 From: Michele Nottoli Date: Wed, 15 May 2024 11:58:44 +0200 Subject: [PATCH 20/21] Update. --- src/ddx_core.f90 | 15 ++++++---- src/ddx_operators.f90 | 67 +++++++++++++++++++++++-------------------- 2 files changed, 45 insertions(+), 37 deletions(-) diff --git a/src/ddx_core.f90 b/src/ddx_core.f90 index eb6b5b58..42661799 100644 --- a/src/ddx_core.f90 +++ b/src/ddx_core.f90 @@ -2314,7 +2314,7 @@ subroutine tree_l2p_bessel(params, constants, alpha, node_l, beta, grid_v) real(dp), intent(inout) :: grid_v(params % ngrid, params % nsph) ! Local variables real(dp) :: sph_l((params % pl+1)**2, params % nsph) - integer :: isph + integer :: isph, nsph external :: dgemm ! Init output if (beta .eq. zero) then @@ -2322,12 +2322,15 @@ subroutine tree_l2p_bessel(params, constants, alpha, node_l, beta, grid_v) else grid_v = beta * grid_v end if + nsph = params % nsph ! Get data from all clusters to spheres - !$omp parallel do default(none) shared(params,constants,node_l,sph_l) & - !$omp private(isph) schedule(dynamic) - do isph = 1, params % nsph - sph_l(:, isph) = node_l(:, constants % snode(isph)) - end do + associate(snode => constants % snode) + !$omp parallel do default(none) schedule(static,100) & + !$omp shared(node_l,sph_l,snode) private(isph) firstprivate(nsph) + do isph = 1, nsph + sph_l(:, isph) = node_l(:, snode(isph)) + end do + end associate ! Get values at grid points call dgemm('T', 'N', params % ngrid, params % nsph, & & (params % pl+1)**2, alpha, constants % vgrid, & diff --git a/src/ddx_operators.f90 b/src/ddx_operators.f90 index b4ecf8b5..7ea32768 100644 --- a/src/ddx_operators.f90 +++ b/src/ddx_operators.f90 @@ -1316,15 +1316,11 @@ subroutine cx(params, constants, workspace, x, y, ddx_error) real(dp), dimension(constants % nbasis, params % nsph, 2), intent(out) :: y type(ddx_error_type), intent(inout) :: ddx_error - integer :: isph, jsph, igrid, ind, l, m, ind0 - real(dp), dimension(3) :: vij, vtij - real(dp) :: val + integer :: isph, jsph, igrid, ind, l, m, indl, inode, info, & + & nsph, lmax, nbasis, nbasis0, lmax0, ngrid + real(dp) :: vij(3), vtij(3), val, epsp, eps, fac, & + & work(constants % lmax0 + 1) complex(dp) :: work_complex(constants % lmax0 + 1) - real(dp) :: work(constants % lmax0 + 1) - integer :: indl, inode, info - real(dp) :: epsp, eps, fac - integer :: nsph, lmax, nbasis, nbasis0, lmax0, ngrid - real(dp), allocatable :: diff_re(:,:), diff0(:,:) call time_push() @@ -1360,9 +1356,9 @@ subroutine cx(params, constants, workspace, x, y, ddx_error) end do end associate call time_pull("cx1") - call time_push() ! diff0 = Pchi * diff_er, linear scaling + call time_push() associate(pchi => constants % pchi) !$omp parallel do default(none) schedule(static,10) & !$omp firstprivate(nsph,nbasis,nbasis0) private(jsph) & @@ -1374,8 +1370,8 @@ subroutine cx(params, constants, workspace, x, y, ddx_error) end associate call time_pull("cx2") - call time_push() ! Multiply diff0 by C_ik inplace + call time_push() associate(c_ik => constants % c_ik) !$omp parallel do collapse(2) schedule(static,100) & !$omp firstprivate(nsph,lmax0) private(isph,l,ind,fac,m) & @@ -1391,10 +1387,11 @@ subroutine cx(params, constants, workspace, x, y, ddx_error) end do end associate call time_pull("cx3") + ! avoiding N^2 storage, this code does not use the cached coefY - y(:,:,1) = zero if (params % fmm .eq. 0) then do isph = 1, params % nsph + y(:,isph,1) = zero do igrid = 1, params % ngrid if (constants % ui(igrid,isph).gt.zero) then val = zero @@ -1422,27 +1419,33 @@ subroutine cx(params, constants, workspace, x, y, ddx_error) call time_push() workspace % tmp_node_m = zero workspace % tmp_node_l = zero - workspace % tmp_sph = zero - do isph = 1, params % nsph - do l = 0, constants % lmax0 - ind0 = l*l+l+1 - workspace % tmp_sph(ind0-l:ind0+l, isph) = & - & diff0(ind0-l:ind0+l, isph) - end do - end do if(constants % lmax0 .lt. params % pm) then - do isph = 1, params % nsph - inode = constants % snode(isph) - workspace % tmp_node_m(1:constants % nbasis0, inode) = & - & workspace % tmp_sph(1:constants % nbasis0, isph) - workspace % tmp_node_m(constants % nbasis0+1:, inode) = zero - end do + associate(tmp_node_m => workspace % tmp_node_m, & + & snode => constants % snode) + !$omp parallel do schedule(static,1) default(none) & + !$omp firstprivate(nsph,nbasis0) & + !$omp shared(snode,tmp_node_m,diff0) & + !$omp private(isph,inode) + do isph = 1, nsph + inode = snode(isph) + tmp_node_m(1:nbasis0, inode) = & + & diff0(1:nbasis0, isph) + tmp_node_m(nbasis0+1:, inode) = 0.0d0 + end do + end associate else indl = (params % pm+1)**2 - do isph = 1, params % nsph - inode = constants % snode(isph) - workspace % tmp_node_m(:, inode) = workspace % tmp_sph(1:indl, isph) - end do + associate(tmp_node_m => workspace % tmp_node_m, & + & snode => constants % snode) + !$omp parallel do schedule(static,1) default(none) & + !$omp firstprivate(nsph,nbasis0) & + !$omp shared(snode,tmp_node_m,diff0) & + !$omp private(isph,inode,indl) + do isph = 1, nsph + inode = snode(isph) + tmp_node_m(:, inode) = diff0(1:indl, isph) + end do + end associate end if call time_pull("cx-fmmprep") call time_push() @@ -1472,13 +1475,15 @@ subroutine cx(params, constants, workspace, x, y, ddx_error) & vwgrid => constants % vwgrid, ui => constants % ui) !$omp parallel do collapse(2) schedule(static,100) & !$omp firstprivate(nsph,ngrid,nbasis), shared(y,tmp_grid, & - !$omp vwgrid,ui) private(isph,igrid,ind) + !$omp vwgrid,ui) private(isph,igrid,ind,val) do isph = 1, nsph do ind = 1, nbasis + val = 0.0d0 do igrid = 1, ngrid - y(ind,isph,1) = y(ind,isph,1) + tmp_grid(igrid, isph)*& + val = val + tmp_grid(igrid, isph)*& & vwgrid(ind, igrid)*ui(igrid,isph) end do + y(ind,isph,1) = val end do end do end associate From 3e0ba812514202b208924d29ddaeb939feab3afd Mon Sep 17 00:00:00 2001 From: Michele Nottoli Date: Fri, 24 May 2024 12:50:21 +0200 Subject: [PATCH 21/21] Bug fix. --- MANIFEST.in | 3 --- src/ddx_core.f90 | 6 +++--- src/ddx_operators.f90 | 12 ++++++++++++ 3 files changed, 15 insertions(+), 6 deletions(-) delete mode 100644 MANIFEST.in diff --git a/MANIFEST.in b/MANIFEST.in deleted file mode 100644 index 588679d7..00000000 --- a/MANIFEST.in +++ /dev/null @@ -1,3 +0,0 @@ -global-include src/**/*.h src/**/*.cpp src/**/*.f src/**/*.f90 src/**/*.cmake src/**/CMakeLists.txt -include CMakeLists.txt -include README.md diff --git a/src/ddx_core.f90 b/src/ddx_core.f90 index 42661799..15860ef0 100644 --- a/src/ddx_core.f90 +++ b/src/ddx_core.f90 @@ -2499,9 +2499,9 @@ subroutine tree_m2p_bessel(params, constants, p, alpha, sph_p, sph_m, beta, grid & sk_ri => constants % sk_ri) !$omp parallel do schedule(dynamic,1) default(none) & !$omp firstprivate(nsph,ngrid,kappa,alpha,p) & - !$omp private(isph,inode,jnear,jnode,jsph,igrid,c,work_complex, & - !$omp work) shared(snode,snear,near,order,cluster,ui,cgrid,rsph, & - !$omp vscales,sk_ri,sph_m,grid_v) + !$omp private(isph,inode,jnear,jnode,jsph,igrid,c, & + !$omp work_complex,work) shared(snode,snear,near,order, & + !$omp csph,cluster,ui,cgrid,rsph,vscales,sk_ri,sph_m,grid_v) do isph = 1, nsph ! Cycle over all near-field admissible pairs of spheres inode = snode(isph) diff --git a/src/ddx_operators.f90 b/src/ddx_operators.f90 index 7ea32768..89d60296 100644 --- a/src/ddx_operators.f90 +++ b/src/ddx_operators.f90 @@ -1419,6 +1419,18 @@ subroutine cx(params, constants, workspace, x, y, ddx_error) call time_push() workspace % tmp_node_m = zero workspace % tmp_node_l = zero + workspace % tmp_sph = zero + associate(tmp_sph => workspace % tmp_sph) + !$omp parallel do collapse(2) schedule(static,100) & + !$omp firstprivate(nsph,lmax0), shared(tmp_sph,diff0) & + !$omp private(isph,l,ind) + do isph = 1, nsph + do l = 0, lmax0 + ind = l*l+l+1 + tmp_sph(ind-l:ind+l, isph) = diff0(ind-l:ind+l, isph) + end do + end do + end associate if(constants % lmax0 .lt. params % pm) then associate(tmp_node_m => workspace % tmp_node_m, & & snode => constants % snode)