diff --git a/src/ddx_constants.f90 b/src/ddx_constants.f90 index debbe22c..5ff7370b 100644 --- a/src/ddx_constants.f90 +++ b/src/ddx_constants.f90 @@ -219,6 +219,11 @@ module ddx_constants !! computation of forces (gradients). Allocated and used only if !! fmm=1. integer :: grad_nbasis + !> Number of layers in the bisection tree. Defined only if fmm=1. + integer :: nlayers + !> The first and the last clusters of each layer of the tree. The + !! dimension is (2, nlayers). Allocated and used only if fmm=1. + integer, allocatable :: layers(:, :) !> Inner tolerance for microiterations done when using ddLPB real(dp) :: inner_tol !> Whether the diagonal of the matrices has to be used in the mvp for @@ -837,7 +842,11 @@ subroutine constants_geometry_init(params, constants) & constants % parent, constants % cnode, constants % rnode, & & constants % snode, constants % error_message, & & constants % error_flag) - if (params % error_flag .ne. 0) return + if (constants % error_flag .ne. 0) return + + call compute_layers(constants) + if (constants % error_flag .ne. 0) return + ! Get number of far and near admissible pairs iwork = 0 jwork = 1 @@ -1429,6 +1438,73 @@ subroutine mkprec(lmax, nbasis, nsph, ngrid, eps, ui, wgrid, vgrid, & error_message = "" end subroutine mkprec +!> Compute and store information about the layers of the tree used in +!! the fmm operations. +!! +!! @param[inout] constants: Object containing all constants. +!! +subroutine compute_layers(constants) + implicit none + type(ddx_constants_type), intent(inout) :: constants + integer :: i, distance, info, max_distance + integer, allocatable :: distance_from_root(:) + + allocate(distance_from_root(constants % nclusters), stat=info) + if (info.ne.0) then + constants % error_flag = 1 + constants % error_message = "Allocation failed in compute_layers" + return + end if + + ! first we compute the distances from the root of the tree and + ! the number of layers + distance_from_root(1) = 0 + do i = 1, constants % nclusters + distance = distance_from_root(i) + if (constants % children(1, i) .ne. 0) & + & distance_from_root(constants % children(1, i)) = distance + 1 + if (constants % children(2, i) .ne. 0) & + & distance_from_root(constants % children(2, i)) = distance + 1 + if (distance .gt. max_distance) max_distance = distance + end do + constants % nlayers = max_distance + 1 + + ! now we can allocate the layer information using the actual number + ! of layers + allocate(constants % layers(2, constants % nlayers), stat=info) + if (info.ne.0) then + constants % error_flag = 1 + constants % error_message = "Allocation failed in compute_layers" + return + end if + + ! finally we can populate the layer information + distance = 0 + constants % layers(1, 1) = 1 + do i = 1, constants % nclusters + if (distance_from_root(i) .lt. distance) then + constants % error_flag = 1 + constants % error_message = "Error in compute_layers" + return + end if + if (distance_from_root(i) .ne. distance) then + constants % layers(2, distance + 1) = i + if (distance_from_root(i) .lt. constants % nlayers) & + & constants % layers(1, distance + 2) = i + 1 + distance = distance_from_root(i) + end if + end do + constants % layers(2, constants % nlayers) = constants % nclusters + + deallocate(distance_from_root, stat=info) + if (info.ne.0) then + constants % error_flag = 1 + constants % error_message = "Deallocation failed in compute_layers" + return + end if + +end subroutine compute_layers + !> Build a recursive inertial binary tree !! !! Uses inertial bisection in a recursive manner until each leaf node has only @@ -2249,6 +2325,14 @@ subroutine constants_free(constants) return end if end if + if (allocated(constants % layers)) then + deallocate(constants % layers, stat=istat) + if (istat .ne. 0) then + constants % error_message = "`layers` deallocation failed!" + constants % error_flag = 1 + return + end if + end if end subroutine constants_free end module ddx_constants diff --git a/src/ddx_core.f90 b/src/ddx_core.f90 index 73c701a6..56134ea9 100644 --- a/src/ddx_core.f90 +++ b/src/ddx_core.f90 @@ -187,6 +187,8 @@ module ddx_core real(dp), allocatable :: x_adj_r_grid(:, :) real(dp), allocatable :: x_adj_e_grid(:, :) real(dp), allocatable :: phi_n(:, :) + real(dp) :: hsp_time + real(dp) :: hsp_adj_time end type ddx_state_type @@ -1858,7 +1860,7 @@ subroutine tree_m2m_rotation(params, constants, node_m) ! Output real(dp), intent(inout) :: node_m((params % pm+1)**2, constants % nclusters) ! Temporary workspace - real(dp) :: work(6*params % pm**2 + 19*params % pm + 8) + real(dp) :: work(6*params % pm**2 + 19*params % pm + 8, params % nproc) ! Call corresponding work routine call tree_m2m_rotation_work(params, constants, node_m, work) end subroutine tree_m2m_rotation @@ -1874,36 +1876,40 @@ subroutine tree_m2m_rotation_work(params, constants, node_m, work) ! Output real(dp), intent(inout) :: node_m((params % pm+1)**2, constants % nclusters) ! Temporary workspace - real(dp), intent(out) :: work(6*params % pm**2 + 19*params % pm + 8) + real(dp), intent(out) :: work(6*params % pm**2 + 19*params % pm + 8, & + & params % nproc) ! Local variables - integer :: i, j + integer :: i, j, l, iproc real(dp) :: c1(3), c(3), r1, r ! Bottom-to-top pass - !!$omp parallel do default(none) shared(constants,params,node_m) & - !!$omp private(i,j,c1,c,r1,r,work) - do i = constants % nclusters, 1, -1 - ! Leaf node does not need any update - if (constants % children(1, i) == 0) cycle - c = constants % cnode(:, i) - r = constants % rnode(i) - ! First child initializes output - j = constants % children(1, i) - c1 = constants % cnode(:, j) - r1 = constants % rnode(j) - c1 = c1 - c - call fmm_m2m_rotation_work(c1, r1, r, & - & params % pm, & - & constants % vscales, & - & constants % vcnk, one, & - & node_m(:, j), zero, node_m(:, i), work) - ! All other children update the same output - do j = constants % children(1, i)+1, constants % children(2, i) + do l = constants % nlayers, 1, -1 + !$omp parallel do default(none) shared(constants,params,node_m,l, & + !$omp work) private(i,j,c1,c,r1,r,iproc) + do i = constants % layers(1, l), constants % layers(2, l) + iproc = omp_get_thread_num() + 1 + ! Leaf node does not need any update + if (constants % children(1, i) == 0) cycle + c = constants % cnode(:, i) + r = constants % rnode(i) + ! First child initializes output + j = constants % children(1, i) c1 = constants % cnode(:, j) r1 = constants % rnode(j) c1 = c1 - c - call fmm_m2m_rotation_work(c1, r1, r, params % pm, & - & constants % vscales, constants % vcnk, one, & - & node_m(:, j), one, node_m(:, i), work) + call fmm_m2m_rotation_work(c1, r1, r, & + & params % pm, & + & constants % vscales, & + & constants % vcnk, one, & + & node_m(:, j), zero, node_m(:, i), work(:, iproc)) + ! All other children update the same output + do j = constants % children(1, i)+1, constants % children(2, i) + c1 = constants % cnode(:, j) + r1 = constants % rnode(j) + c1 = c1 - c + call fmm_m2m_rotation_work(c1, r1, r, params % pm, & + & constants % vscales, constants % vcnk, one, & + & node_m(:, j), one, node_m(:, i), work(:, iproc)) + end do end do end do end subroutine tree_m2m_rotation_work @@ -1939,32 +1945,36 @@ subroutine tree_m2m_bessel_rotation_work(params, constants, node_m) real(dp) :: work(6*params % pm**2 + 19*params % pm + 8) complex(dp) :: work_complex(2*params % pm+1) ! Local variables - integer :: i, j + integer :: i, j, l real(dp) :: c1(3), c(3), r1, r ! Bottom-to-top pass - do i = constants % nclusters, 1, -1 - ! Leaf node does not need any update - if (constants % children(1, i) == 0) cycle - c = constants % cnode(:, i) - r = constants % rnode(i) - ! First child initializes output - j = constants % children(1, i) - c1 = constants % cnode(:, j) - r1 = constants % rnode(j) - c1 = params % kappa*(c1 - c) - call fmm_m2m_bessel_rotation_work(c1, & - & constants % SK_rnode(:, j), constants % SK_rnode(:, i), & - & params % pm, constants % vscales, one, node_m(:, j), zero, & - & node_m(:, i), work, work_complex) - ! All other children update the same output - do j = constants % children(1, i)+1, constants % children(2, i) + do l = constants % nlayers, 1, -1 + !$omp parallel do default(none) shared(constants,params,node_m,l) & + !$omp private(i,j,c1,c,r1,r,work,work_complex) + do i = constants % layers(1, l), constants % layers(2, l) + ! Leaf node does not need any update + if (constants % children(1, i) == 0) cycle + c = constants % cnode(:, i) + r = constants % rnode(i) + ! First child initializes output + j = constants % children(1, i) c1 = constants % cnode(:, j) r1 = constants % rnode(j) c1 = params % kappa*(c1 - c) call fmm_m2m_bessel_rotation_work(c1, & & constants % SK_rnode(:, j), constants % SK_rnode(:, i), & - & params % pm, constants % vscales, one, node_m(:, j), one, & + & params % pm, constants % vscales, one, node_m(:, j), zero, & & node_m(:, i), work, work_complex) + ! All other children update the same output + do j = constants % children(1, i)+1, constants % children(2, i) + c1 = constants % cnode(:, j) + r1 = constants % rnode(j) + c1 = params % kappa*(c1 - c) + call fmm_m2m_bessel_rotation_work(c1, & + & constants % SK_rnode(:, j), constants % SK_rnode(:, i), & + & params % pm, constants % vscales, one, node_m(:, j), one, & + & node_m(:, i), work, work_complex) + end do end do end do end subroutine tree_m2m_bessel_rotation_work @@ -1980,7 +1990,7 @@ subroutine tree_m2m_rotation_adj(params, constants, node_m) ! Output real(dp), intent(inout) :: node_m((params % pm+1)**2, constants % nclusters) ! Temporary workspace - real(dp) :: work(6*params % pm**2 + 19*params % pm + 8) + real(dp) :: work(6*params % pm**2 + 19*params % pm + 8, params % nproc) ! Call corresponding work routine call tree_m2m_rotation_adj_work(params, constants, node_m, work) end subroutine tree_m2m_rotation_adj @@ -1996,21 +2006,27 @@ subroutine tree_m2m_rotation_adj_work(params, constants, node_m, work) ! Output real(dp), intent(inout) :: node_m((params % pm+1)**2, constants % nclusters) ! Temporary workspace - real(dp), intent(out) :: work(6*params % pm**2 + 19*params % pm + 8) + real(dp), intent(out) :: work(6*params % pm**2 + 19*params % pm + 8, & + & params % nproc) ! Local variables - integer :: i, j + integer :: i, j, l, iproc real(dp) :: c1(3), c(3), r1, r ! Top-to-bottom pass - do i = 2, constants % nclusters - j = constants % parent(i) - c = constants % cnode(:, j) - r = constants % rnode(j) - c1 = constants % cnode(:, i) - r1 = constants % rnode(i) - c1 = c - c1 - call fmm_m2m_rotation_adj_work(c1, r, r1, params % pm, & - & constants % vscales, constants % vcnk, one, node_m(:, j), one, & - & node_m(:, i), work) + do l = 2, constants % nlayers + !$omp parallel do default(none) shared(constants,params,node_m,l, & + !$omp work) private(i,j,c1,c,r1,r,iproc) + do i = constants % layers(1, l), constants % layers(2, l) + iproc = omp_get_thread_num() + 1 + j = constants % parent(i) + c = constants % cnode(:, j) + r = constants % rnode(j) + c1 = constants % cnode(:, i) + r1 = constants % rnode(i) + c1 = c - c1 + call fmm_m2m_rotation_adj_work(c1, r, r1, params % pm, & + & constants % vscales, constants % vcnk, one, node_m(:, j), one, & + & node_m(:, i), work(:, iproc)) + end do end do end subroutine tree_m2m_rotation_adj_work !------------------------------------------------------------------------------ @@ -2042,18 +2058,21 @@ subroutine tree_m2m_bessel_rotation_adj_work(params, constants, node_m) real(dp) :: work(6*params % pm**2 + 19*params % pm + 8) complex(dp) :: work_complex(2*params % pm+1) ! Local variables - integer :: i, j + integer :: i, j, l real(dp) :: c1(3), c(3) ! Top-to-bottom pass - do i = 2, constants % nclusters - j = constants % parent(i) - c = constants % cnode(:, j) - c1 = constants % cnode(:, i) - c1 = params % kappa*(c1 - c) - ! Little bit confusion about the i and j indices - call fmm_m2m_bessel_rotation_adj_work(c1, constants % SK_rnode(:, i), & - & constants % SK_rnode(:, j), params % pm, constants % vscales, & - & one, node_m(:, j), one, node_m(:, i), work, work_complex) + do l = 2, constants % nlayers + !$omp parallel do default(none) shared(constants,params,node_m,l) & + !$omp private(i,j,c1,c,work,work_complex) + do i = constants % layers(1, l), constants % layers(2, l) + j = constants % parent(i) + c = constants % cnode(:, j) + c1 = constants % cnode(:, i) + c1 = params % kappa*(c1 - c) + call fmm_m2m_bessel_rotation_adj_work(c1, constants % SK_rnode(:, i), & + & constants % SK_rnode(:, j), params % pm, constants % vscales, & + & one, node_m(:, j), one, node_m(:, i), work, work_complex) + end do end do end subroutine tree_m2m_bessel_rotation_adj_work @@ -2068,7 +2087,7 @@ subroutine tree_l2l_rotation(params, constants, node_l) ! Output real(dp), intent(inout) :: node_l((params % pl+1)**2, constants % nclusters) ! Temporary workspace - real(dp) :: work(6*params % pl**2 + 19*params % pl + 8) + real(dp) :: work(6*params % pl**2 + 19*params % pl + 8, params % nproc) ! Call corresponding work routine call tree_l2l_rotation_work(params, constants, node_l, work) end subroutine tree_l2l_rotation @@ -2084,23 +2103,27 @@ subroutine tree_l2l_rotation_work(params, constants, node_l, work) ! Output real(dp), intent(inout) :: node_l((params % pl+1)**2, constants % nclusters) ! Temporary workspace - real(dp), intent(out) :: work(6*params % pl**2 + 19*params % pl + 8) + real(dp), intent(out) :: work(6*params % pl**2 + 19*params % pl + 8, & + & params % nproc) ! Local variables - integer :: i, j + integer :: i, j, l, iproc real(dp) :: c1(3), c(3), r1, r ! Top-to-bottom pass - !!$omp parallel do default(none) shared(constants,params,node_l) & - !!$omp private(i,j,c1,c,r1,r,work) - do i = 2, constants % nclusters - j = constants % parent(i) - c = constants % cnode(:, j) - r = constants % rnode(j) - c1 = constants % cnode(:, i) - r1 = constants % rnode(i) - c1 = c - c1 - call fmm_l2l_rotation_work(c1, r, r1, params % pl, & - & constants % vscales, constants % vfact, one, & - & node_l(:, j), one, node_l(:, i), work) + do l = 2, constants % nlayers + !$omp parallel do default(none) shared(constants,params,node_l,l, & + !$omp work) private(i,j,c1,c,r1,r,iproc) + do i = constants % layers(1, l), constants % layers(2, l) + iproc = omp_get_thread_num() + 1 + j = constants % parent(i) + c = constants % cnode(:, j) + r = constants % rnode(j) + c1 = constants % cnode(:, i) + r1 = constants % rnode(i) + c1 = c - c1 + call fmm_l2l_rotation_work(c1, r, r1, params % pl, & + & constants % vscales, constants % vfact, one, & + & node_l(:, j), one, node_l(:, i), work(:, iproc)) + end do end do end subroutine tree_l2l_rotation_work @@ -2135,18 +2158,22 @@ subroutine tree_l2l_bessel_rotation_work(params, constants, node_l) real(dp) :: work(6*params % pl**2 + 19*params % pl + 8) complex(dp) :: work_complex(2*params % pl+1) ! Local variables - integer :: i, j + integer :: i, j, l real(dp) :: c_child(3), c_parent(3), c_diff(3) ! Top-to-bottom pass - do i = 2, constants % nclusters - j = constants % parent(i) - c_child = constants % cnode(:, j) - c_parent = constants % cnode(:, i) - c_diff = params % kappa*(c_child - c_parent) - call fmm_l2l_bessel_rotation_work(c_diff, & - & constants % si_rnode(:, j), constants % si_rnode(:, i), & - & params % pl, constants % vscales, one, & - & node_l(:, j), one, node_l(:, i), work, work_complex) + do l = 2, constants % nlayers + !$omp parallel do default(none) shared(constants,params,node_l,l) & + !$omp private(i,j,c_child,c_parent,c_diff,work,work_complex) + do i = constants % layers(1, l), constants % layers(2, l) + j = constants % parent(i) + c_child = constants % cnode(:, j) + c_parent = constants % cnode(:, i) + c_diff = params % kappa*(c_child - c_parent) + call fmm_l2l_bessel_rotation_work(c_diff, & + & constants % si_rnode(:, j), constants % si_rnode(:, i), & + & params % pl, constants % vscales, one, & + & node_l(:, j), one, node_l(:, i), work, work_complex) + end do end do end subroutine tree_l2l_bessel_rotation_work @@ -2161,7 +2188,7 @@ subroutine tree_l2l_rotation_adj(params, constants, node_l) ! Output real(dp), intent(inout) :: node_l((params % pl+1)**2, constants % nclusters) ! Temporary workspace - real(dp) :: work(6*params % pl**2 + 19*params % pl + 8) + real(dp) :: work(6*params % pl**2 + 19*params % pl + 8, params % nproc) ! Call corresponding work routine call tree_l2l_rotation_adj_work(params, constants, node_l, work) end subroutine tree_l2l_rotation_adj @@ -2177,32 +2204,38 @@ subroutine tree_l2l_rotation_adj_work(params, constants, node_l, work) ! Output real(dp), intent(inout) :: node_l((params % pl+1)**2, constants % nclusters) ! Temporary workspace - real(dp), intent(out) :: work(6*params % pl**2 + 19*params % pl + 8) + real(dp), intent(out) :: work(6*params % pl**2 + 19*params % pl + 8, & + & params % nproc) ! Local variables - integer :: i, j + integer :: i, j, l, iproc real(dp) :: c1(3), c(3), r1, r ! Bottom-to-top pass - do i = constants % nclusters, 1, -1 - ! Leaf node does not need any update - if (constants % children(1, i) == 0) cycle - c = constants % cnode(:, i) - r = constants % rnode(i) - ! First child initializes output - j = constants % children(1, i) - c1 = constants % cnode(:, j) - r1 = constants % rnode(j) - c1 = c1 - c - call fmm_l2l_rotation_adj_work(c1, r1, r, params % pl, & - & constants % vscales, constants % vfact, one, & - & node_l(:, j), zero, node_l(:, i), work) - ! All other children update the same output - do j = constants % children(1, i)+1, constants % children(2, i) + do l = constants % nlayers, 1, -1 + !$omp parallel do default(none) shared(constants,params,node_l,l, & + !$omp work) private(i,j,c1,c,r1,r,iproc) + do i = constants % layers(1, l), constants % layers(2, l) + iproc = omp_get_thread_num() + 1 + ! Leaf node does not need any update + if (constants % children(1, i) == 0) cycle + c = constants % cnode(:, i) + r = constants % rnode(i) + ! First child initializes output + j = constants % children(1, i) c1 = constants % cnode(:, j) r1 = constants % rnode(j) c1 = c1 - c call fmm_l2l_rotation_adj_work(c1, r1, r, params % pl, & & constants % vscales, constants % vfact, one, & - & node_l(:, j), one, node_l(:, i), work) + & node_l(:, j), zero, node_l(:, i), work) + ! All other children update the same output + do j = constants % children(1, i)+1, constants % children(2, i) + c1 = constants % cnode(:, j) + r1 = constants % rnode(j) + c1 = c1 - c + call fmm_l2l_rotation_adj_work(c1, r1, r, params % pl, & + & constants % vscales, constants % vfact, one, & + & node_l(:, j), one, node_l(:, i), work(:, iproc)) + end do end do end do end subroutine tree_l2l_rotation_adj_work @@ -2221,19 +2254,23 @@ subroutine tree_l2l_bessel_rotation_adj(params, constants, node_l) real(dp) :: work(6*params % pl**2 + 19*params % pl + 8) complex(dp) :: work_complex(2*params % pl+1) ! Local variables - integer :: i, j + integer :: i, j, l real(dp) :: c_parent(3), c_child(3), c_diff(3) ! Bottom-to-top pass - do i = constants % nclusters, 1, -1 - c_child = constants % cnode(:, i) - j = constants % parent(i) - if (j == 0) cycle - c_parent = constants % cnode(:, j) - c_diff = params % kappa*(c_parent - c_child) - call fmm_l2l_bessel_rotation_adj_work(c_diff, & - & constants % si_rnode(:, i), constants % si_rnode(:, j), & - & params % pl, constants % vscales, one, & - & node_l(:, i), one, node_l(:, j), work, work_complex) + do l = constants % nlayers, 1, -1 + !$omp parallel do default(none) shared(constants,params,node_l,l) & + !$omp private(i,j,c_parent,c_child,c_diff,work,work_complex) + do i = constants % layers(1, l), constants % layers(2, l) + c_child = constants % cnode(:, i) + j = constants % parent(i) + if (j == 0) cycle + c_parent = constants % cnode(:, j) + c_diff = params % kappa*(c_parent - c_child) + call fmm_l2l_bessel_rotation_adj_work(c_diff, & + & constants % si_rnode(:, i), constants % si_rnode(:, j), & + & params % pl, constants % vscales, one, & + & node_l(:, i), one, node_l(:, j), work, work_complex) + end do end do end subroutine tree_l2l_bessel_rotation_adj @@ -2370,10 +2407,9 @@ subroutine tree_m2l_bessel_rotation_adj_work(params, constants, node_l, node_m) ! Local variables integer :: i, j, k real(dp) :: c1(3), c(3), r - ! Any order of this cycle is OK node_m = zero - !!$omp parallel do shared(constants,params,node_l,node_m) & - !!$omp private(i,c,r,k,c1,work,work_complex,j) schedule(dynamic) + !$omp parallel do shared(constants,params,node_l,node_m) & + !$omp private(i,c,r,k,c1,work,work_complex,j) schedule(dynamic) do i = 1, constants % nclusters ! If no far admissible pairs just set output to zero if (constants % nfar(i) .eq. 0) then @@ -2384,19 +2420,18 @@ subroutine tree_m2l_bessel_rotation_adj_work(params, constants, node_l, node_m) ! Use the first far admissible pair to initialize output k = constants % far(constants % sfar(i)) c1 = constants % cnode(:, k) - c1 = params % kappa*(c1 - c) - call fmm_m2l_bessel_rotation_adj_work(c1, constants % SI_rnode(:, i), & - & constants % SK_rnode(:, k), params % pm, & - & constants % vscales, one, & - & node_l(:, i), one, node_m(:, k), work, work_complex) + c1 = params % kappa*(c - c1) + call fmm_m2l_bessel_rotation_adj_work(c1, constants % SI_rnode(:, k), & + & constants % SK_rnode(:, i), params % pm, constants % vscales, & + & one, node_l(:, k), one, node_m(:, i), work, work_complex) do j = constants % sfar(i)+1, constants % sfar(i+1)-1 k = constants % far(j) c1 = constants % cnode(:, k) - c1 = params % kappa*(c1 - c) - call fmm_m2l_bessel_rotation_adj_work(c1, constants % SI_rnode(:, i), & - & constants % SK_rnode(:, k), params % pm, & - & constants % vscales, one, & - & node_l(:, i), one, node_m(:, k), work, work_complex) + c1 = params % kappa*(c - c1) + call fmm_m2l_bessel_rotation_adj_work(c1, & + & constants % SI_rnode(:, k), constants % SK_rnode(:, i), & + & params % pm, constants % vscales, one, node_l(:, k), & + & one, node_m(:, i), work, work_complex) end do end do end subroutine tree_m2l_bessel_rotation_adj_work @@ -2415,8 +2450,9 @@ subroutine tree_m2l_rotation_adj(params, constants, node_l, node_m) ! Local variables integer :: i, j, k real(dp) :: c1(3), c(3), r1, r - ! Any order of this cycle is OK node_m = zero + !$omp parallel do default(none) shared(constants,params,node_m,node_l) & + !$omp private(i,c,r,k,c1,r1,j) schedule(dynamic) do i = 1, constants % nclusters ! If no far admissible pairs just set output to zero if (constants % nfar(i) .eq. 0) then @@ -2428,18 +2464,18 @@ subroutine tree_m2l_rotation_adj(params, constants, node_l, node_m) k = constants % far(constants % sfar(i)) c1 = constants % cnode(:, k) r1 = constants % rnode(k) - c1 = c - c1 - call fmm_m2l_rotation_adj(c1, r, r1, params % pl, params % pm, & + c1 = c1 - c + call fmm_m2l_rotation_adj(c1, r1, r, params % pl, params % pm, & & constants % vscales, constants % m2l_ztranslate_adj_coef, one, & - & node_l(:, i), one, node_m(:, k)) + & node_l(:, k), one, node_m(:, i)) do j = constants % sfar(i)+1, constants % sfar(i+1)-1 k = constants % far(j) c1 = constants % cnode(:, k) r1 = constants % rnode(k) - c1 = c - c1 - call fmm_m2l_rotation_adj(c1, r, r1, params % pl, params % pm, & + c1 = c1 - c + call fmm_m2l_rotation_adj(c1, r1, r, params % pl, params % pm, & & constants % vscales, constants % m2l_ztranslate_adj_coef, one, & - & node_l(:, i), one, node_m(:, k)) + & node_l(:, k), one, node_m(:, i)) end do end do end subroutine tree_m2l_rotation_adj @@ -2720,9 +2756,9 @@ subroutine tree_m2p_adj(params, constants, p, alpha, grid_v, beta, sph_m) sph_m = beta * sph_m end if ! Cycle over all spheres - !!$omp parallel do default(none) shared(params,constants,grid_v,p, & - !!$omp alpha,sph_m), private(isph,inode,jnear,jnode,jsph,igrid,c,work) & - !!$omp schedule(dynamic) + !$omp parallel do default(none) shared(params,constants,grid_v,p, & + !$omp alpha,sph_m), private(isph,inode,jnear,jnode,jsph,igrid,c,work) & + !$omp schedule(dynamic) do isph = 1, params % nsph ! Cycle over all near-field admissible pairs of spheres inode = constants % snode(isph) @@ -2735,12 +2771,12 @@ subroutine tree_m2p_adj(params, constants, p, alpha, grid_v, beta, sph_m) if(isph .eq. jsph) cycle ! Accumulate interaction for external grid points only do igrid = 1, params % ngrid - if(constants % ui(igrid, isph) .eq. zero) cycle - c = constants % cgrid(:, igrid)*params % rsph(isph) - & - & params % csph(:, jsph) + params % csph(:, isph) - call fmm_m2p_adj_work(c, alpha*grid_v(igrid, isph), & - & params % rsph(jsph), p, constants % vscales_rel, one, & - & sph_m(:, jsph), work) + if(constants % ui(igrid, jsph) .eq. zero) cycle + c = constants % cgrid(:, igrid)*params % rsph(jsph) - & + & params % csph(:, isph) + params % csph(:, jsph) + call fmm_m2p_adj_work(c, alpha*grid_v(igrid, jsph), & + & params % rsph(isph), p, constants % vscales_rel, one, & + & sph_m(:, isph), work) end do end do end do @@ -2771,28 +2807,25 @@ subroutine tree_m2p_bessel_adj(params, constants, p, alpha, grid_v, beta, sph_p, sph_m = beta * sph_m end if ! Cycle over all spheres - !!$omp parallel do default(none) shared(params,constants,p,sph_m,tmp, & - !!$omp alpha,grid_v) private(isph,inode,jnear,jnode,jsph,igrid,c, & - !!$omp iproc) schedule(dynamic) + !$omp parallel do default(none) shared(params,constants,p,sph_m, & + !$omp alpha,grid_v) private(isph,inode,jnear,jnode,jsph,igrid,c) & + !$omp schedule(dynamic) do isph = 1, params % nsph ! Cycle over all near-field admissible pairs of spheres inode = constants % snode(isph) - ! iproc = omp_get_thread_num() + 1 do jnear = constants % snear(inode), constants % snear(inode+1)-1 ! Near-field interactions are possible only between leaf nodes, ! which must contain only a single input sphere jnode = constants % near(jnear) jsph = constants % order(constants % cluster(1, jnode)) - ! Ignore self-interaction - !if(isph .eq. jsph) cycle ! Accumulate interaction for external grid points only do igrid = 1, params % ngrid - if(constants % ui(igrid, isph) .eq. zero) cycle - c = constants % cgrid(:, igrid)*params % rsph(isph) - & - & params % csph(:, jsph) + params % csph(:, isph) - call fmm_m2p_bessel_adj(c, alpha*grid_v(igrid, isph), & - & params % rsph(jsph), params % kappa, p, & - & constants % vscales, one, sph_m(:, jsph)) + if(constants % ui(igrid, jsph) .eq. zero) cycle + c = constants % cgrid(:, igrid)*params % rsph(jsph) - & + & params % csph(:, isph) + params % csph(:, jsph) + call fmm_m2p_bessel_adj(c, alpha*grid_v(igrid, jsph), & + & params % rsph(isph), params % kappa, p, & + & constants % vscales, one, sph_m(:, isph)) end do end do end do diff --git a/src/ddx_driver.f90 b/src/ddx_driver.f90 index b3fcbe61..3157e3da 100644 --- a/src/ddx_driver.f90 +++ b/src/ddx_driver.f90 @@ -298,7 +298,12 @@ program main & state % x_lpb_rel_diff(i) end do ! Print number of iterations and time - write(6, 100) "ddlpb step time: ", state % x_lpb_time, " seconds" + write(6, 100) "ddlpb step time: ", state % x_lpb_time, & + & " seconds, of which:" + write(6, 100) " ddcosmo: ", state % xs_time, " seconds" + write(6, 100) " hsp: ", state % hsp_time, " seconds" + write(6, 100) " coupling terms: ", state % x_lpb_time & + & - state % xs_time - state % hsp_time, " seconds" write(6, 300) "ddlpb step iterations: ", state % x_lpb_niter end if @@ -348,7 +353,11 @@ program main end do ! Print number of iterations and time write(6, 100) "adjoint ddlpb step time: ", & - & state % x_adj_lpb_time, " seconds" + & state % x_adj_lpb_time, " seconds, of which:" + write(6, 100) " adjoint ddcosmo: ", state % s_time, " seconds" + write(6, 100) " adjoint hsp: ", state % hsp_adj_time, " seconds" + write(6, 100) " adjoint coupling terms: ", state % x_adj_lpb_time & + & - state % s_time - state % hsp_adj_time, " seconds" write(6, 200) "adjoint ddlpb step iterations: ", & & state % x_adj_lpb_niter end if diff --git a/src/ddx_lpb.f90 b/src/ddx_lpb.f90 index c6b15f34..1805293c 100644 --- a/src/ddx_lpb.f90 +++ b/src/ddx_lpb.f90 @@ -233,6 +233,8 @@ subroutine ddlpb_solve(params, constants, workspace, state, tol) real(dp) :: start_time state % x_lpb_niter = params % maxiter + workspace % xs_time = zero + workspace % hsp_time = zero ! solve LS using Jacobi/DIIS start_time = omp_get_wtime() @@ -251,6 +253,10 @@ subroutine ddlpb_solve(params, constants, workspace, state, tol) ! the density so that it is consistent with the ddCOSMO and ddPCM ! ones. call convert_ddcosmo(params, constants, -1, state % x_lpb(:,:,1)) + + ! put the timings in the right places + state % xs_time = workspace % xs_time + state % hsp_time = workspace % hsp_time end subroutine ddlpb_solve @@ -274,7 +280,8 @@ subroutine ddlpb_solve_adjoint(params, constants, workspace, state, tol) real(dp) :: start_time state % x_adj_lpb_niter = params % maxiter - constants % inner_tol = sqrt(tol) + workspace % s_time = zero + workspace % hsp_adj_time = zero ! solve adjoint LS using Jacobi/DIIS start_time = omp_get_wtime() @@ -291,6 +298,10 @@ subroutine ddlpb_solve_adjoint(params, constants, workspace, state, tol) state % q = state % x_adj_lpb(:, :, 1) + ! put the timings in the right places + state % s_time = workspace % s_time + state % hsp_adj_time = workspace % hsp_adj_time + call ddlpb_derivative_setup(params, constants, workspace, state) end subroutine ddlpb_solve_adjoint diff --git a/src/ddx_operators.f90 b/src/ddx_operators.f90 index d5ef9a1b..50c0e73c 100644 --- a/src/ddx_operators.f90 +++ b/src/ddx_operators.f90 @@ -370,11 +370,14 @@ subroutine dstarx_dense(params, constants, workspace, do_diag, x, y) ! Local variables real(dp) :: vji(3), sji(3) real(dp) :: vvji, tji, tt, f, rho, ctheta, stheta, cphi, sphi - integer :: its, isph, jsph, l, m, ind + integer :: its, isph, jsph, l, m, ind, iproc real(dp), external :: dnrm2 y = zero - ! this loop is easily parallelizable + !$omp parallel do default(none) shared(do_diag,params,constants, & + !$omp workspace,x,y) private(isph,jsph,its,vji,vvji,tji,sji,rho, & + !$omp ctheta,stheta,cphi,sphi,tt,l,ind,f,m,iproc) schedule(dynamic) do isph = 1, params % nsph + iproc = omp_get_thread_num() + 1 do jsph = 1, params % nsph if (jsph.ne.isph) then do its = 1, params % ngrid @@ -389,8 +392,10 @@ subroutine dstarx_dense(params, constants, workspace, do_diag, x, y) ! build the local basis call ylmbas(sji, rho, ctheta, stheta, cphi, sphi, & & params % lmax, constants % vscales, & - & workspace % tmp_vylm, workspace % tmp_vplm, & - & workspace % tmp_vcos, workspace % tmp_vsin) + & workspace % tmp_vylm(:(params % lmax + 1)**2, iproc), & + & workspace % tmp_vplm(:(params % lmax + 1)**2, iproc), & + & workspace % tmp_vcos(:(params % lmax + 1), iproc), & + & workspace % tmp_vsin(:(params % lmax + 1), iproc)) tt = constants % ui(its,jsph) & & *dot_product(constants % vwgrid(:constants % nbasis, its), & & x(:,jsph))/tji @@ -399,7 +404,7 @@ subroutine dstarx_dense(params, constants, workspace, do_diag, x, y) f = dble(l)*tt/ constants % vscales(ind)**2 do m = -l, l y(ind+m,isph) = y(ind+m,isph) + & - & f*workspace % tmp_vylm(ind+m, 1) + & f*workspace % tmp_vylm(ind+m, iproc) end do tt = tt/tji end do @@ -1355,26 +1360,34 @@ subroutine prec_tstarx(params, constants, workspace, x, y) real(dp), intent(inout) :: y(constants % nbasis, params % nsph, 2) integer :: n_iter real(dp), dimension(params % maxiter) :: x_rel_diff + real(dp) :: start_time + start_time = omp_get_wtime() y(:,:,1) = x(:,:,1) call convert_ddcosmo(params, constants, 1, y(:,:,1)) n_iter = params % maxiter - call jacobi_diis(params, constants, workspace, constants % inner_tol, y(:,:,1), & - & workspace % ddcosmo_guess, n_iter, x_rel_diff, lstarx, ldm1x, hnorm) + call jacobi_diis(params, constants, workspace, constants % inner_tol, & + & y(:,:,1), workspace % ddcosmo_guess, n_iter, x_rel_diff, lstarx, & + & ldm1x, hnorm) if (workspace % error_flag.ne.0) then workspace % error_message = 'prec_tstarx: ddCOSMO failed to converge' return end if y(:,:,1) = workspace % ddcosmo_guess + workspace % s_time = workspace % s_time + omp_get_wtime() - start_time + start_time = omp_get_wtime() n_iter = params % maxiter - call jacobi_diis(params, constants, workspace, constants % inner_tol, x(:,:,2), workspace % hsp_guess, & - & n_iter, x_rel_diff, bstarx, bx_prec, hnorm) + call jacobi_diis(params, constants, workspace, constants % inner_tol, & + & x(:,:,2), workspace % hsp_guess, n_iter, x_rel_diff, bstarx, & + & bx_prec, hnorm) if (workspace % error_flag.ne.0) then workspace % error_message = 'prec_tstarx: HSP failed to converge' return end if y(:,:,2) = workspace % hsp_guess + workspace % hsp_adj_time = workspace % hsp_adj_time & + & + omp_get_wtime() - start_time end subroutine prec_tstarx @@ -1393,11 +1406,14 @@ subroutine prec_tx(params, constants, workspace, x, y) real(dp), intent(inout) :: y(constants % nbasis, params % nsph, 2) integer :: n_iter real(dp), dimension(params % maxiter) :: x_rel_diff + real(dp) :: start_time ! perform A^-1 * Yr + start_time = omp_get_wtime() n_iter = params % maxiter - call jacobi_diis(params, constants, workspace, constants % inner_tol, x(:,:,1), & - & workspace % ddcosmo_guess, n_iter, x_rel_diff, lx, ldm1x, hnorm) + call jacobi_diis(params, constants, workspace, constants % inner_tol, & + & x(:,:,1), workspace % ddcosmo_guess, n_iter, x_rel_diff, lx, & + & ldm1x, hnorm) if (workspace % error_flag.ne.0) then workspace % error_message = 'prec_tx: ddCOSMO failed to converge' return @@ -1406,12 +1422,16 @@ subroutine prec_tx(params, constants, workspace, x, y) ! Scale by the factor of (2l+1)/4Pi y(:,:,1) = workspace % ddcosmo_guess call convert_ddcosmo(params, constants, 1, y(:,:,1)) + workspace % xs_time = workspace % xs_time + omp_get_wtime() - start_time ! perform B^-1 * Ye + start_time = omp_get_wtime() n_iter = params % maxiter - call jacobi_diis(params, constants, workspace, constants % inner_tol, x(:,:,2), workspace % hsp_guess, & - & n_iter, x_rel_diff, bx, bx_prec, hnorm) + call jacobi_diis(params, constants, workspace, constants % inner_tol, & + & x(:,:,2), workspace % hsp_guess, n_iter, x_rel_diff, bx, & + & bx_prec, hnorm) y(:,:,2) = workspace % hsp_guess + workspace % hsp_time = workspace % hsp_time + omp_get_wtime() - start_time if (workspace % error_flag.ne.0) then workspace % error_message = 'prec_tx: HSP failed to converge' diff --git a/src/ddx_workspace.f90 b/src/ddx_workspace.f90 index c59c20db..ae01160f 100644 --- a/src/ddx_workspace.f90 +++ b/src/ddx_workspace.f90 @@ -98,6 +98,8 @@ module ddx_workspace real(dp), allocatable :: tmp_bmat(:, :) !> ddLPB solutions for the microiterations real(dp), allocatable :: ddcosmo_guess(:,:), hsp_guess(:,:) + !> ddLPB temporary timings + real(dp) :: xs_time, s_time, hsp_time, hsp_adj_time !> Flag if there were an error integer :: error_flag = 2 !> Last error message