MODULE poisfft_hybrid_mod !------------------------------------------------------------------------------ ! ! Current revisions: ! ----------------- ! ! ! Former revisions: ! ----------------- ! $Id: poisfft_hybrid.f90 810 2012-01-30 13:40:12Z gryschka $ ! ! 809 2012-01-30 13:32:58Z maronga ! Bugfix: replaced .AND. and .NOT. with && and ! in the preprocessor directives ! ! 807 2012-01-25 11:53:51Z maronga ! New cpp directive "__check" implemented which is used by check_namelist_files ! (most of the code is unneeded by check_namelist_files). ! ! 667 2010-12-23 12:06:00Z suehring/gryschka ! ddzu replaced by ddzu_pres due to changes in zu(0) ! ! 415 2009-12-15 10:26:23Z raasch ! Dimension of array stat in cascade change to prevent type problems with___ ! mpi2 libraries ! ! 274 2009-03-26 15:11:21Z heinze ! Output of messages replaced by message handling routine. ! ! Feb. 2007 ! RCS Log replace by Id keyword, revision history cleaned up ! ! Revision 1.11 2004/04/30 12:43:14 raasch ! Renaming of fft routines, additional argument in calls of fft_y_m ! ! Revision 1.2 2002/12/19 16:08:31 raasch ! Preprocessor directive KKMP introduced (OMP does NOT work), ! array tri will be a shared array in OpenMP loop, to get better cache ! utilization, the i index (x-direction) will be executed in stride ! "istride" as outer loop and in a shorter inner loop, ! overlapping of computation and communication realized by new routine ! poisfft_hybrid_nodes, name of old routine poisfft_hybrid changed to ! poisfft_hybrid_omp, STOP statement replaced by call of subroutine local_stop ! ! ! Description: ! ------------ ! Solution of the Poisson equation with a 2D spectral method. ! Hybrid version for parallel computers using a 1D domain decomposition, ! realized with MPI, along x and parallelization with OPEN-MP along y ! (routine poisfft_hybrid_omp). In a second version (poisfft_hybrid_nodes), ! optimization is realized by overlapping of computation and communication ! and by simultaneously executing as many communication calls as switches ! per logical partition (LPAR) are available. This version comes into ! effect if more than one node is used and if the environment variable ! tasks_per_node is set in a way that it can be devided by switch_per_lpar ! without any rest. ! ! WARNING: In case of OpenMP, there are problems with allocating large ! arrays in parallel regions. ! ! Copyright Klaus Ketelsen / Siegfried Raasch May 2002 !------------------------------------------------------------------------------! USE fft_xy USE indices USE pegrid IMPLICIT NONE INTEGER, PARAMETER :: switch_per_lpar = 2 INTEGER, SAVE :: nxl_a, nxr_a, & ! total x dimension nxl_p, nxr_p, & ! partial x dimension nys_a, nyn_a, & ! total y dimension nys_p, nyn_p, & ! partial y dimension npe_s, & ! total number of PEs for solver nwords, & ! number of points to be exchanged ! with MPI_ALLTOALL n_omp_threads ! number of OpenMP threads ! !-- Variables for multi node version (cluster version) using routine !-- poisfft_hybrid_nodes INTEGER, SAVE :: comm_nodes, & ! communicater nodes comm_node_all, & ! communicater all PEs node version comm_tasks, & ! communicater tasks me, me_node, me_task,& ! identity of this PE nodes, & ! number of nodes tasks_per_logical_node = -1 ! default no cluster PRIVATE #if ! defined ( __check ) PUBLIC poisfft_hybrid, poisfft_hybrid_ini ! !-- Public interfaces INTERFACE poisfft_hybrid_ini MODULE PROCEDURE poisfft_hybrid_ini END INTERFACE poisfft_hybrid_ini INTERFACE poisfft_hybrid MODULE PROCEDURE poisfft_hybrid END INTERFACE poisfft_hybrid ! !-- Private interfaces INTERFACE poisfft_hybrid_omp MODULE PROCEDURE poisfft_hybrid_omp END INTERFACE poisfft_hybrid_omp INTERFACE poisfft_hybrid_omp_vec MODULE PROCEDURE poisfft_hybrid_omp_vec END INTERFACE poisfft_hybrid_omp_vec INTERFACE poisfft_hybrid_nodes MODULE PROCEDURE poisfft_hybrid_nodes END INTERFACE poisfft_hybrid_nodes INTERFACE tridia_hybrid MODULE PROCEDURE tridia_hybrid END INTERFACE tridia_hybrid INTERFACE cascade MODULE PROCEDURE cascade END INTERFACE cascade #else PUBLIC poisfft_hybrid_ini ! !-- Public interfaces INTERFACE poisfft_hybrid_ini MODULE PROCEDURE poisfft_hybrid_ini END INTERFACE poisfft_hybrid_ini #endif CONTAINS SUBROUTINE poisfft_hybrid_ini USE control_parameters USE pegrid IMPLICIT NONE CHARACTER(LEN=8) :: cdummy INTEGER :: idummy, istat INTEGER, DIMENSION(2) :: coords, dims LOGICAL, DIMENSION(2) :: period = .false., re_dims ! !-- Set the internal index values for the hybrid solver #if defined( __parallel ) npe_s = pdims(1) #else npe_s = 1 #endif nxl_a = 0 nxr_a = nx nxl_p = 0 nxr_p = ( ( nx+1 ) / npe_s ) - 1 nys_a = nys nyn_a = nyn nys_p = 0 nyn_p = ( ( ny+1 ) / npe_s ) - 1 nwords = ( nxr_p-nxl_p+1 ) * nz * ( nyn_p-nys_p+1 ) #if defined( __KKMP ) && ! defined ( __check ) CALL LOCAL_GETENV( 'OMP_NUM_THREADS', 15, cdummy, idummy ) READ ( cdummy, '(I8)' ) n_omp_threads IF ( n_omp_threads > 1 ) THEN WRITE( message_string, * ) 'Number of OpenMP threads = ', & n_omp_threads CALL message( 'poisfft_hybrid_ini', 'PA0280', 0, 0, 0, 6, 0 ) ENDIF #else n_omp_threads = 1 #endif ! !-- Initialize the one-dimensional FFT routines CALL fft_init ! !-- Setup for multi node version (poisfft_hybrid_nodes) IF ( n_omp_threads == 1 .AND. & ( host(1:4) == 'ibmh' .OR. host(1:4) == 'ibmb' ) ) THEN IF ( tasks_per_node /= -9999 ) THEN ! !-- Multi node version requires that the available number of !-- switches per logical partition must be an integral divisor !-- of the chosen number of tasks per node IF ( MOD( tasks_per_node, switch_per_lpar ) == 0 ) THEN ! !-- Set the switch which decides about usage of the multi node !-- version IF ( tasks_per_node / switch_per_lpar > 1 .AND. & numprocs > tasks_per_node ) THEN tasks_per_logical_node = tasks_per_node / switch_per_lpar ENDIF IF ( tasks_per_logical_node > -1 ) THEN WRITE( message_string, * ) 'running optimized ', & 'multinode version', & '&switch_per_lpar = ', & switch_per_lpar, & '&tasks_per_lpar = ', & tasks_per_node, & 'tasks_per_logical_node = ', & tasks_per_logical_node CALL message( 'poisfft_hybrid_ini', 'PA0281', 0, 0, 0, 6, 0 ) ENDIF ENDIF ENDIF ENDIF ! !-- Determine sub-topologies for multi node version IF ( tasks_per_logical_node >= 2 ) THEN #if defined( __parallel ) && ! defined ( __check ) nodes = ( numprocs + tasks_per_logical_node - 1 ) / & tasks_per_logical_node dims(1) = nodes dims(2) = tasks_per_logical_node CALL MPI_CART_CREATE( comm2d, 2, dims, period, .FALSE., & comm_node_all, istat ) CALL MPI_COMM_RANK( comm_node_all, me, istat ) re_dims(1) = .TRUE. re_dims(2) = .FALSE. CALL MPI_CART_SUB( comm_node_all, re_dims, comm_nodes, istat ) CALL MPI_COMM_RANK( comm_nodes, me_node, istat ) re_dims(1) = .FALSE. re_dims(2) = .TRUE. CALL MPI_CART_SUB( comm_node_all, re_dims, comm_tasks, istat ) CALL MPI_COMM_RANK( comm_tasks, me_task, istat ) ! write(0,*) 'who am i',myid,me,me_node,me_task,nodes,& ! tasks_per_logical_node #elif ! defined( __parallel ) message_string = 'parallel environment (MPI) required' CALL message( 'poisfft_hybrid_ini', 'PA0282', 1, 2, 0, 6, 0 ) #endif ENDIF END SUBROUTINE poisfft_hybrid_ini #if ! defined ( __check ) SUBROUTINE poisfft_hybrid( ar ) USE control_parameters USE interfaces IMPLICIT NONE REAL, DIMENSION(1:nz,nys:nyn,nxl:nxr) :: ar IF ( host(1:3) == 'nec' ) THEN CALL poisfft_hybrid_omp_vec( ar ) ELSE IF ( tasks_per_logical_node == -1 ) THEN CALL poisfft_hybrid_omp( ar ) ELSE CALL poisfft_hybrid_nodes( ar ) ENDIF ENDIF END SUBROUTINE poisfft_hybrid SUBROUTINE poisfft_hybrid_omp ( ar ) USE cpulog USE interfaces IMPLICIT NONE INTEGER, PARAMETER :: istride = 4 ! stride of i loop INTEGER :: i, ii, ir, iei, iouter, istat, j, jj, k, m, n, jthread REAL, DIMENSION(1:nz,nys:nyn,nxl:nxr) :: ar REAL, DIMENSION(0:nx) :: fftx_ar REAL, DIMENSION(0:ny,istride) :: ffty_ar REAL, DIMENSION(0:nx,nz) :: tri_ar REAL, DIMENSION(nxl_p:nxr_p,nz,nys_p:nyn_p,npe_s) :: work1, work2 #if defined( __KKMP ) INTEGER :: omp_get_thread_num REAL, DIMENSION(:,:,:,:), ALLOCATABLE :: tri ALLOCATE( tri(5,0:nx,0:nz-1,n_omp_threads ) ) #else REAL, DIMENSION(5,0:nx,0:nz-1,1) :: tri #endif CALL cpu_log( log_point_s(30), 'poisfft_hybrid_omp', 'start' ) CALL cpu_log( log_point_s(7), 'fft_y', 'start' ) !$OMP PARALLEL PRIVATE (i,iouter,ii,ir,iei,j,k,m,n,ffty_ar) !$OMP DO ! !-- Store grid points to be transformed on a 1d-array, do the fft !-- and sample the results on a 4d-array DO iouter = nxl_p, nxr_p, istride ! stride loop, better cache iei = MIN( iouter+istride-1, nxr_p ) DO k = 1, nz DO i = iouter, iei ii = nxl + i ir = i - iouter + 1 DO j = nys_a, nyn_a ffty_ar(j,ir) = ar(k,j,ii) ENDDO CALL fft_y( ffty_ar(:,ir), 'forward' ) ENDDO m = nys_a DO n = 1, npe_s DO j = nys_p, nyn_p DO i = iouter, iei ir = i - iouter + 1 work1(i,k,j,n) = ffty_ar(m,ir) ENDDO m = m+1 ENDDO ENDDO ENDDO ENDDO !$OMP END PARALLEL CALL cpu_log( log_point_s(7), 'fft_y', 'pause' ) #if defined( __parallel ) CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' ) CALL MPI_ALLTOALL( work1(nxl_p,1,nys_p,1), nwords, MPI_REAL, & work2(nxl_p,1,nys_p,1), nwords, MPI_REAL, & comm2d, istat ) CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' ) #else work2 = work1 #endif CALL cpu_log( log_point_s(33), 'fft_x + tridia', 'start' ) #if defined( __KKMP ) !$OMP PARALLEL PRIVATE (i,j,jj,k,m,n,fftx_ar,tri_ar,jthread) !$OMP DO DO j = nys_p, nyn_p jthread = omp_get_thread_num() + 1 #else DO j = nys_p, nyn_p jthread = 1 #endif DO k = 1, nz m = nxl_a DO n = 1, npe_s DO i = nxl_p, nxr_p fftx_ar(m) = work2(i,k,j,n) m = m+1 ENDDO ENDDO CALL fft_x( fftx_ar, 'forward' ) DO i = nxl_a, nxr_a tri_ar(i,k) = fftx_ar(i) ENDDO ENDDO jj = myid * (nyn_p-nys_p+1) + j CALL tridia_hybrid( jj, tri_ar, tri(:,:,:,jthread)) DO k = 1, nz DO i = nxl_a, nxr_a fftx_ar(i) = tri_ar (i,k) ENDDO CALL fft_x( fftx_ar, 'backward' ) m = nxl_a DO n = 1, npe_s DO i = nxl_p, nxr_p work2(i,k,j,n) = fftx_ar(m) m = m+1 ENDDO ENDDO ENDDO ENDDO #if defined( __KKMP ) !$OMP END PARALLEL #endif CALL cpu_log( log_point_s(33), 'fft_x + tridia', 'stop' ) #if defined( __parallel ) CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' ) nwords = (nxr_p-nxl_p+1) * nz * (nyn_p-nys_p+1) CALL MPI_ALLTOALL( work2(nxl_p,1,nys_p,1), nwords, MPI_REAL, & work1(nxl_p,1,nys_p,1), nwords, MPI_REAL, & comm2d, istat ) CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' ) #else work1 = work2 #endif CALL cpu_log( log_point_s(7), 'fft_y', 'continue' ) !$OMP PARALLEL PRIVATE (i,iouter,ii,ir,iei,j,k,m,n,ffty_ar) !$OMP DO DO iouter = nxl_p, nxr_p, istride iei = MIN( iouter+istride-1, nxr_p ) DO k = 1, nz m = nys_a DO n = 1, npe_s DO j = nys_p, nyn_p DO i = iouter, iei ir = i - iouter + 1 ffty_ar(m,ir) = work1 (i,k,j,n) ENDDO m = m+1 ENDDO ENDDO DO i = iouter, iei ii = nxl + i ir = i - iouter + 1 CALL fft_y( ffty_ar(:,ir), 'backward' ) DO j = nys_a, nyn_a ar(k,j,ii) = ffty_ar(j,ir) ENDDO ENDDO ENDDO ENDDO !$OMP END PARALLEL CALL cpu_log( log_point_s(7), 'fft_y', 'stop' ) CALL cpu_log( log_point_s(30), 'poisfft_hybrid_omp', 'stop' ) #if defined( __KKMP ) DEALLOCATE( tri ) #endif END SUBROUTINE poisfft_hybrid_omp SUBROUTINE poisfft_hybrid_omp_vec ( ar ) USE cpulog USE interfaces IMPLICIT NONE INTEGER, PARAMETER :: istride = 4 ! stride of i loop INTEGER :: i, ii, ir, iei, iouter, istat, j, jj, k, m, n, jthread REAL, DIMENSION(0:nx,nz) :: tri_ar REAL, DIMENSION(1:nz,nys:nyn,nxl:nxr) :: ar REAL, DIMENSION(0:ny+3,nz,nxl_p:nxr_p) :: ffty_ar3 REAL, DIMENSION(0:nx+3,nz,nys_p:nyn_p) :: fftx_ar3 REAL, DIMENSION(nxl_p:nxr_p,nz,nys_p:nyn_p,npe_s) :: work1, work2 #if defined( __KKMP ) INTEGER :: omp_get_thread_num REAL, DIMENSION(:,:,:,:), ALLOCATABLE :: tri ALLOCATE( tri(5,0:nx,0:nz-1,n_omp_threads ) ) #else REAL, DIMENSION(5,0:nx,0:nz-1,1) :: tri #endif CALL cpu_log( log_point_s(30), 'poisfft_hybrid_vec', 'start' ) CALL cpu_log( log_point_s(7), 'fft_y_m', 'start' ) !$OMP PARALLEL PRIVATE (i,j,k,m,n) !$OMP DO ! !-- Store grid points to be transformed on a 1d-array, do the fft !-- and sample the results on a 4d-array DO i = nxl_p, nxr_p DO j = nys_a, nyn_a DO k = 1, nz ffty_ar3(j,k,i) = ar(k,j,i+nxl) ENDDO ENDDO CALL fft_y_m( ffty_ar3(:,:,i), ny+3, 'forward' ) ENDDO !$OMP DO DO k = 1, nz m = nys_a DO n = 1, npe_s DO j = nys_p, nyn_p DO i = nxl_p, nxr_p work1(i,k,j,n) = ffty_ar3(m,k,i) ENDDO m = m+1 ENDDO ENDDO ENDDO !$OMP END PARALLEL CALL cpu_log( log_point_s(7), 'fft_y_m', 'pause' ) #if defined( __parallel ) CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' ) CALL MPI_ALLTOALL( work1(nxl_p,1,nys_p,1), nwords, MPI_REAL, & work2(nxl_p,1,nys_p,1), nwords, MPI_REAL, & comm2d, istat ) CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' ) #else work2 = work1 #endif CALL cpu_log( log_point_s(33), 'fft_x_m + tridia', 'start' ) #if defined( __KKMP ) !$OMP PARALLEL PRIVATE (i,j,jj,k,m,n,tri_ar,jthread) !$OMP DO DO j = nys_p, nyn_p jthread = omp_get_thread_num() + 1 #else DO j = nys_p, nyn_p jthread = 1 #endif DO k = 1, nz m = nxl_a DO n = 1, npe_s DO i = nxl_p, nxr_p fftx_ar3(m,k,j) = work2(i,k,j,n) m = m+1 ENDDO ENDDO ENDDO CALL fft_x_m( fftx_ar3(:,:,j), 'forward' ) DO k = 1, nz DO i = nxl_a, nxr_a tri_ar(i,k) = fftx_ar3(i,k,j) ENDDO ENDDO jj = myid * (nyn_p-nys_p+1) + j CALL tridia_hybrid( jj, tri_ar, tri(:,:,:,jthread)) DO k = 1, nz DO i = nxl_a, nxr_a fftx_ar3(i,k,j) = tri_ar (i,k) ENDDO ENDDO CALL fft_x_m( fftx_ar3(:,:,j), 'backward' ) DO k = 1, nz m = nxl_a DO n = 1, npe_s DO i = nxl_p, nxr_p work2(i,k,j,n) = fftx_ar3(m,k,j) m = m+1 ENDDO ENDDO ENDDO ENDDO #if defined( __KKMP ) !$OMP END PARALLEL #endif CALL cpu_log( log_point_s(33), 'fft_x_m + tridia', 'stop' ) #if defined( __parallel ) CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' ) nwords = (nxr_p-nxl_p+1) * nz * (nyn_p-nys_p+1) CALL MPI_ALLTOALL( work2(nxl_p,1,nys_p,1), nwords, MPI_REAL, & work1(nxl_p,1,nys_p,1), nwords, MPI_REAL, & comm2d, istat ) CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' ) #else work1 = work2 #endif CALL cpu_log( log_point_s(7), 'fft_y_m', 'continue' ) !$OMP PARALLEL PRIVATE (i,iouter,ii,ir,iei,j,k,m,n) !$OMP DO DO k = 1, nz m = nys_a DO n = 1, npe_s DO j = nys_p, nyn_p DO i = nxl_p, nxr_p ffty_ar3(m,k,i) = work1(i,k,j,n) ENDDO m = m+1 ENDDO ENDDO ENDDO !$OMP DO DO i = nxl_p, nxr_p CALL fft_y_m( ffty_ar3(:,:,i), ny+3, 'backward' ) DO j = nys_a, nyn_a DO k = 1, nz ar(k,j,i+nxl) = ffty_ar3(j,k,i) ENDDO ENDDO ENDDO !$OMP END PARALLEL CALL cpu_log( log_point_s(7), 'fft_y_m', 'stop' ) CALL cpu_log( log_point_s(30), 'poisfft_hybrid_vec', 'stop' ) #if defined( __KKMP ) DEALLOCATE( tri ) #endif END SUBROUTINE poisfft_hybrid_omp_vec SUBROUTINE poisfft_hybrid_nodes ( ar ) USE cpulog USE interfaces IMPLICIT NONE INTEGER, PARAMETER :: istride = 4 ! stride of i loop INTEGER :: i, iei, ii, iouter, ir, istat, j, jj, k, m, & n, nn, nt, nw1, nw2 REAL, DIMENSION(1:nz,nys:nyn,nxl:nxr) :: ar REAL, DIMENSION(0:nx) :: fftx_ar REAL, DIMENSION(0:ny,istride) :: ffty_ar REAL, DIMENSION(0:nx,nz) :: tri_ar REAL, DIMENSION(nxl_p:nxr_p,nz,tasks_per_logical_node, & nodes,nys_p:nyn_p) :: work1,work2 REAL, DIMENSION(5,0:nx,0:nz-1) :: tri CALL cpu_log( log_point_s(30), 'poisfft_hybrid_nodes', 'start' ) CALL cpu_log( log_point_s(7), 'fft_y', 'start' ) ! !-- Store grid points to be transformed on a 1d-array, do the fft !-- and sample the results on a 4d-array DO iouter = nxl_p, nxr_p, istride ! stride loop, better cache iei = MIN( iouter+istride-1, nxr_p ) DO k = 1, nz DO i = iouter, iei ii = nxl + i ir = i - iouter + 1 DO j = nys_a, nyn_a ffty_ar(j,ir) = ar(k,j,ii) ENDDO CALL fft_y( ffty_ar(:,ir), 'forward' ) ENDDO m = nys_a DO nn = 1, nodes DO nt = 1, tasks_per_logical_node DO j = nys_p, nyn_p DO i = iouter, iei ir = i - iouter + 1 work1(i,k,nt,nn,j) = ffty_ar(m,ir) ENDDO m = m+1 ENDDO ENDDO ENDDO ENDDO ENDDO CALL cpu_log( log_point_s(7), 'fft_y', 'pause' ) CALL cpu_log( log_point_s(32), 'alltoall_task', 'start' ) nw1 = SIZE( work1, 1 ) * SIZE( work1, 2 ) DO nn = 1, nodes DO j = nys_p, nyn_p #if defined( __parallel ) CALL MPI_ALLTOALL( work1(nxl_p,1,1,nn,j), nw1, MPI_REAL, & work2(nxl_p,1,1,nn,j), nw1, MPI_REAL, & comm_tasks, istat ) #endif ENDDO ENDDO CALL cpu_log( log_point_s(32), 'alltoall_task', 'stop' ) DO j = nys_p, nyn_p CALL cascade( 1, j, nys_p, nyn_p ) nw2 = nw1 * SIZE( work1, 3 ) CALL cpu_log( log_point_s(37), 'alltoall_node', 'start' ) #if defined( __parallel ) CALL MPI_ALLTOALL( work2(nxl_p,1,1,1,j), nw2, MPI_REAL, & work1(nxl_p,1,1,1,j), nw2, MPI_REAL, & comm_nodes, istat ) #endif CALL cpu_log( log_point_s(37), 'alltoall_node', 'pause' ) CALL cascade( 2, j, nys_p, nyn_p ) CALL cpu_log( log_point_s(33), 'fft_x + tridia', 'start' ) DO k = 1, nz m = nxl_a DO nn = 1, nodes DO nt = 1, tasks_per_logical_node DO i = nxl_p, nxr_p fftx_ar(m) = work1(i,k,nt,nn,j) m = m+1 ENDDO ENDDO ENDDO CALL fft_x( fftx_ar, 'forward' ) DO i = nxl_a, nxr_a tri_ar(i,k) = fftx_ar(i) ENDDO ENDDO jj = myid * (nyn_p-nys_p+1) + j CALL tridia_hybrid( jj, tri_ar, tri(:,:,:) ) DO k = 1, nz DO i = nxl_a, nxr_a fftx_ar(i) = tri_ar(i,k) ENDDO CALL fft_x( fftx_ar, 'backward' ) m = nxl_a DO nn = 1, nodes DO nt = 1, tasks_per_logical_node DO i = nxl_p, nxr_p work1(i,k,nt,nn,j) = fftx_ar(m) m = m+1 ENDDO ENDDO ENDDO ENDDO CALL cpu_log( log_point_s(33), 'fft_x + tridia', 'stop' ) nw2 = nw1 * SIZE( work1, 3 ) CALL cpu_log( log_point_s(37), 'alltoall_node', 'continue' ) #if defined( __parallel ) CALL MPI_ALLTOALL( work1(nxl_p,1,1,1,j), nw2, MPI_REAL, & work2(nxl_p,1,1,1,j), nw2, MPI_REAL, & comm_nodes, istat ) #endif CALL cpu_log( log_point_s(37), 'alltoall_node', 'stop' ) ENDDO CALL cpu_log( log_point_s(32), 'alltoall_task', 'start' ) DO nn = 1, nodes DO j = nys_p, nyn_p #if defined( __parallel ) CALL MPI_ALLTOALL( work2(nxl_p,1,1,nn,j), nw1, MPI_REAL, & work1(nxl_p,1,1,nn,j), nw1, MPI_REAL, & comm_tasks, istat ) #endif ENDDO ENDDO CALL cpu_log( log_point_s(32), 'alltoall_task', 'stop' ) CALL cpu_log( log_point_s(7), 'fft_y', 'continue' ) DO iouter = nxl_p, nxr_p, istride iei = MIN( iouter+istride-1, nxr_p ) DO k = 1, nz m = nys_a DO nn = 1, nodes DO nt = 1, tasks_per_logical_node DO j = nys_p, nyn_p DO i = iouter, iei ir = i - iouter + 1 ffty_ar(m,ir) = work1(i,k,nt,nn,j) ENDDO m = m+1 ENDDO ENDDO ENDDO DO i = iouter, iei ii = nxl + i ir = i - iouter + 1 CALL fft_y( ffty_ar(:,ir), 'backward' ) DO j = nys_a, nyn_a ar(k,j,ii) = ffty_ar(j,ir) ENDDO ENDDO ENDDO ENDDO CALL cpu_log( log_point_s(7), 'fft_y', 'stop' ) CALL cpu_log( log_point_s(30), 'poisfft_hybrid_nodes', 'stop' ) END SUBROUTINE poisfft_hybrid_nodes SUBROUTINE tridia_hybrid( j, ar, tri ) USE arrays_3d USE control_parameters USE grid_variables IMPLICIT NONE INTEGER :: i, j, k, nnyh REAL, DIMENSION(0:nx,nz) :: ar REAL, DIMENSION(0:nx,0:nz-1) :: ar1 REAL, DIMENSION(5,0:nx,0:nz-1) :: tri nnyh = (ny+1) / 2 tri = 0.0 ! !-- Define constant elements of the tridiagonal matrix. DO k = 0, nz-1 DO i = 0,nx tri(2,i,k) = ddzu_pres(k+1) * ddzw(k+1) tri(3,i,k) = ddzu_pres(k+2) * ddzw(k+1) ENDDO ENDDO IF ( j <= nnyh ) THEN CALL maketri_hybrid( j ) ELSE CALL maketri_hybrid( ny+1-j) ENDIF CALL zerleg_hybrid CALL substi_hybrid( ar, tri ) CONTAINS SUBROUTINE maketri_hybrid( j ) !----------------------------------------------------------------------! ! maketri ! ! ! ! computes the i- and j-dependent component of the matrix ! !----------------------------------------------------------------------! USE constants IMPLICIT NONE INTEGER :: i, j, k, nnxh REAL :: a, c REAL, DIMENSION(0:nx) :: l nnxh = (nx+1) / 2 ! !-- Provide the tridiagonal matrix for solution of the Poisson equation !-- in Fourier space. The coefficients are computed following the method !-- of Schmidt et al. (DFVLR-Mitteilung 84-15) --> departs from Stephan !-- Siano's original version. DO i = 0,nx IF ( i >= 0 .AND. i < nnxh ) THEN l(i) = 2.0 * ( 1.0 - COS( ( 2.0 * pi * i ) / & FLOAT( nx+1 ) ) ) / ( dx * dx ) + & 2.0 * ( 1.0 - COS( ( 2.0 * pi * j ) / & FLOAT( ny+1 ) ) ) / ( dy * dy ) ELSEIF ( i == nnxh ) THEN l(i) = 2.0 * ( 1.0 - COS( ( 2.0 * pi * ( nx+1-i ) ) / & FLOAT( nx+1 ) ) ) / ( dx * dx ) + & 2.0 * ( 1.0 - COS( ( 2.0 * pi * j ) / & FLOAT(ny+1) ) ) / ( dy * dy ) ELSE l(i) = 2.0 * ( 1.0 - COS( ( 2.0 * pi * ( nx+1-i ) ) / & FLOAT( nx+1 ) ) ) / ( dx * dx ) + & 2.0 * ( 1.0 - COS( ( 2.0 * pi * j ) / & FLOAT( ny+1 ) ) ) / ( dy * dy ) ENDIF ENDDO DO k = 0,nz-1 DO i = 0, nx a = -1.0 * ddzu_pres(k+2) * ddzw(k+1) c = -1.0 * ddzu_pres(k+1) * ddzw(k+1) tri(1,i,k) = a + c - l(i) ENDDO ENDDO IF ( ibc_p_b == 1 .OR. ibc_p_b == 2 ) THEN DO i = 0,nx tri(1,i,0) = tri(1,i,0) + tri(2,i,0) ENDDO ENDIF IF ( ibc_p_t == 1 ) THEN DO i = 0,nx tri(1,i,nz-1) = tri(1,i,nz-1) + tri(3,i,nz-1) ENDDO ENDIF END SUBROUTINE maketri_hybrid SUBROUTINE zerleg_hybrid !----------------------------------------------------------------------! ! zerleg ! ! ! ! Splitting of the tridiagonal matrix (Thomas algorithm) ! !----------------------------------------------------------------------! USE indices IMPLICIT NONE INTEGER :: i, k ! !-- Splitting DO i = 0, nx tri(4,i,0) = tri(1,i,0) ENDDO DO k = 1, nz-1 DO i = 0,nx tri(5,i,k) = tri(2,i,k) / tri(4,i,k-1) tri(4,i,k) = tri(1,i,k) - tri(3,i,k-1) * tri(5,i,k) ENDDO ENDDO END SUBROUTINE zerleg_hybrid SUBROUTINE substi_hybrid( ar, tri ) !----------------------------------------------------------------------! ! substi ! ! ! ! Substitution (Forward and Backward) (Thomas algorithm) ! !----------------------------------------------------------------------! IMPLICIT NONE INTEGER :: i, j, k REAL, DIMENSION(0:nx,nz) :: ar REAL, DIMENSION(0:nx,0:nz-1) :: ar1 REAL, DIMENSION(5,0:nx,0:nz-1) :: tri ! !-- Forward substitution DO i = 0, nx ar1(i,0) = ar(i,1) ENDDO DO k = 1, nz - 1 DO i = 0,nx ar1(i,k) = ar(i,k+1) - tri(5,i,k) * ar1(i,k-1) ENDDO ENDDO ! !-- Backward substitution DO i = 0,nx ar(i,nz) = ar1(i,nz-1) / tri(4,i,nz-1) ENDDO DO k = nz-2, 0, -1 DO i = 0,nx ar(i,k+1) = ( ar1(i,k) - tri(3,i,k) * ar(i,k+2) ) & / tri(4,i,k) ENDDO ENDDO END SUBROUTINE substi_hybrid END SUBROUTINE tridia_hybrid SUBROUTINE cascade( loca, j, nys_p, nyn_p ) USE cpulog IMPLICIT NONE INTEGER :: ier, j, loca, nyn_p, nys_p, req, reqa(1) INTEGER, SAVE :: tag = 10 #if defined( __parallel ) INTEGER, DIMENSION(MPI_STATUS_SIZE) :: stat INTEGER, DIMENSION(MPI_STATUS_SIZE,1) :: stata #endif REAL :: buf, buf1 buf = 1.0 buf1 = 1.1 IF ( me_node == 0 ) THEN ! first node only SELECT CASE ( loca ) CASE ( 1 ) ! before alltoall IF( me_task > 0 ) THEN ! first task does not wait #if defined( __parallel ) CALL MPI_SENDRECV( buf, 1, MPI_REAL, me_task-1, 0, & buf1, 1, MPI_REAL, me_task-1, 0, & comm_tasks, stat, ierr ) #endif ELSEIF ( j > nys_p ) THEN req = 0 tag = MOD( tag-10, 10 ) + 10 #if defined( __parallel ) CALL MPI_IRECV( buf, 1, MPI_REAL, tasks_per_logical_node-1,& tag, comm_tasks, req, ierr ) reqa = req CALL MPI_WAITALL( 1, reqa, stata, ierr ) #endif ENDIF CASE ( 2 ) ! after alltoall IF ( me_task < tasks_per_logical_node-1 ) THEN ! last task #if defined( __parallel ) CALL MPI_SENDRECV( buf, 1, MPI_REAL, me_task+1, 0, & buf1, 1, MPI_REAL, me_task+1, 0, & comm_tasks, stat, ierr) #endif ELSEIF ( j < nyn_p ) THEN req = 0 tag = MOD( tag-10, 10 ) + 10 #if defined( __parallel ) CALL MPI_ISEND( buf, 1, MPI_REAL, 0, tag, comm_tasks, req, & ierr ) #endif ENDIF END SELECT ENDIF END SUBROUTINE cascade #endif END MODULE poisfft_hybrid_mod