MODULE poisfft_mod !------------------------------------------------------------------------------! ! Current revisions: ! ----------------- ! ! ! Former revisions: ! ----------------- ! $Id: poisfft.f90 697 2011-03-18 07:10:52Z helmke $ ! ! 696 2011-03-18 07:03:49Z raasch ! work_fftx removed from PRIVATE clauses in fftx_tr_xy and tr_yx_fftx ! ! 683 2011-02-09 14:25:15Z raasch ! openMP parallelization for 2d-domain-decomposition ! ! 667 2010-12-23 12:06:00Z suehring/gryschka ! ddzu replaced by ddzu_pres due to changes in zu(0) ! ! 622 2010-12-10 08:08:13Z raasch ! optional barriers included in order to speed up collective operations ! ! 377 2009-09-04 11:09:00Z raasch ! __lcmuk changed to __lc to avoid problems with Intel compiler on sgi-ice ! ! 164 2008-05-15 08:46:15Z raasch ! Arguments removed from transpose routines ! ! 128 2007-10-26 13:11:14Z raasch ! Bugfix: wavenumber calculation for even nx in routines maketri ! ! 85 2007-05-11 09:35:14Z raasch ! Bugfix: work_fft*_vec removed from some PRIVATE-declarations ! ! 76 2007-03-29 00:58:32Z raasch ! Tridiagonal coefficients adjusted for Neumann boundary conditions both at ! the bottom and the top. ! ! RCS Log replace by Id keyword, revision history cleaned up ! ! Revision 1.24 2006/08/04 15:00:24 raasch ! Default setting of the thread number tn in case of not using OpenMP ! ! Revision 1.23 2006/02/23 12:48:38 raasch ! Additional compiler directive in routine tridia_1dd for preventing loop ! exchange on NEC-SX6 ! ! Revision 1.20 2004/04/30 12:38:09 raasch ! Parts of former poisfft_hybrid moved to this subroutine, ! former subroutine changed to a module, renaming of FFT-subroutines and ! -module, FFTs completely substituted by calls of fft_x and fft_y, ! NAG fft used in the non-parallel case completely removed, l in maketri ! is now a 1d-array, variables passed by modules instead of using parameter ! lists, enlarged transposition arrays introduced ! ! Revision 1.1 1997/07/24 11:24:14 raasch ! Initial revision ! ! ! Description: ! ------------ ! See below. !------------------------------------------------------------------------------! !--------------------------------------------------------------------------! ! poisfft ! ! ! ! Original version: Stephan Siano (pois3d) ! ! ! ! Institute of Meteorology and Climatology, University of Hannover ! ! Germany ! ! ! ! Version as of July 23,1996 ! ! ! ! ! ! Version for parallel computers: Siegfried Raasch ! ! ! ! Version as of July 03,1997 ! ! ! ! Solves the Poisson equation with a 2D spectral method ! ! d^2 p / dx^2 + d^2 p / dy^2 + d^2 p / dz^2 = s ! ! ! ! Input: ! ! real ar contains in the (nnx,nny,nnz) elements, ! ! starting from the element (1,nys,nxl), the ! ! values for s ! ! real work Temporary array ! ! ! ! Output: ! ! real ar contains the solution for p ! !--------------------------------------------------------------------------! USE fft_xy USE indices USE transpose_indices IMPLICIT NONE PRIVATE PUBLIC poisfft, poisfft_init INTERFACE poisfft MODULE PROCEDURE poisfft END INTERFACE poisfft INTERFACE poisfft_init MODULE PROCEDURE poisfft_init END INTERFACE poisfft_init CONTAINS SUBROUTINE poisfft_init CALL fft_init END SUBROUTINE poisfft_init SUBROUTINE poisfft( ar, work ) USE cpulog USE interfaces USE pegrid IMPLICIT NONE REAL, DIMENSION(1:nza,nys:nyna,nxl:nxra) :: ar, work CALL cpu_log( log_point_s(3), 'poisfft', 'start' ) ! !-- Two-dimensional Fourier Transformation in x- and y-direction. #if defined( __parallel ) IF ( pdims(2) == 1 ) THEN ! !-- 1d-domain-decomposition along x: !-- FFT along y and transposition y --> x CALL ffty_tr_yx( ar, work, ar ) ! !-- FFT along x, solving the tridiagonal system and backward FFT CALL fftx_tri_fftx( ar ) ! !-- Transposition x --> y and backward FFT along y CALL tr_xy_ffty( ar, work, ar ) ELSEIF ( pdims(1) == 1 ) THEN ! !-- 1d-domain-decomposition along y: !-- FFT along x and transposition x --> y CALL fftx_tr_xy( ar, work, ar ) ! !-- FFT along y, solving the tridiagonal system and backward FFT CALL ffty_tri_ffty( ar ) ! !-- Transposition y --> x and backward FFT along x CALL tr_yx_fftx( ar, work, ar ) ELSE ! !-- 2d-domain-decomposition !-- Transposition z --> x CALL cpu_log( log_point_s(5), 'transpo forward', 'start' ) CALL transpose_zx( ar, work, ar ) CALL cpu_log( log_point_s(5), 'transpo forward', 'pause' ) CALL cpu_log( log_point_s(4), 'fft_x', 'start' ) CALL fftxp( ar, 'forward' ) CALL cpu_log( log_point_s(4), 'fft_x', 'pause' ) ! !-- Transposition x --> y CALL cpu_log( log_point_s(5), 'transpo forward', 'continue' ) CALL transpose_xy( ar, work, ar ) CALL cpu_log( log_point_s(5), 'transpo forward', 'pause' ) CALL cpu_log( log_point_s(7), 'fft_y', 'start' ) CALL fftyp( ar, 'forward' ) CALL cpu_log( log_point_s(7), 'fft_y', 'pause' ) ! !-- Transposition y --> z CALL cpu_log( log_point_s(5), 'transpo forward', 'continue' ) CALL transpose_yz( ar, work, ar ) CALL cpu_log( log_point_s(5), 'transpo forward', 'stop' ) ! !-- Solve the Poisson equation in z-direction in cartesian space. CALL cpu_log( log_point_s(6), 'tridia', 'start' ) CALL tridia( ar ) CALL cpu_log( log_point_s(6), 'tridia', 'stop' ) ! !-- Inverse Fourier Transformation !-- Transposition z --> y CALL cpu_log( log_point_s(8), 'transpo invers', 'start' ) CALL transpose_zy( ar, work, ar ) CALL cpu_log( log_point_s(8), 'transpo invers', 'pause' ) CALL cpu_log( log_point_s(7), 'fft_y', 'continue' ) CALL fftyp( ar, 'backward' ) CALL cpu_log( log_point_s(7), 'fft_y', 'stop' ) ! !-- Transposition y --> x CALL cpu_log( log_point_s(8), 'transpo invers', 'continue' ) CALL transpose_yx( ar, work, ar ) CALL cpu_log( log_point_s(8), 'transpo invers', 'pause' ) CALL cpu_log( log_point_s(4), 'fft_x', 'continue' ) CALL fftxp( ar, 'backward' ) CALL cpu_log( log_point_s(4), 'fft_x', 'stop' ) ! !-- Transposition x --> z CALL cpu_log( log_point_s(8), 'transpo invers', 'continue' ) CALL transpose_xz( ar, work, ar ) CALL cpu_log( log_point_s(8), 'transpo invers', 'stop' ) ENDIF #else ! !-- Two-dimensional Fourier Transformation along x- and y-direction. CALL cpu_log( log_point_s(4), 'fft_x', 'start' ) CALL fftx( ar, 'forward' ) CALL cpu_log( log_point_s(4), 'fft_x', 'pause' ) CALL cpu_log( log_point_s(7), 'fft_y', 'start' ) CALL ffty( ar, 'forward' ) CALL cpu_log( log_point_s(7), 'fft_y', 'pause' ) ! !-- Solve the Poisson equation in z-direction in cartesian space. CALL cpu_log( log_point_s(6), 'tridia', 'start' ) CALL tridia( ar ) CALL cpu_log( log_point_s(6), 'tridia', 'stop' ) ! !-- Inverse Fourier Transformation. CALL cpu_log( log_point_s(7), 'fft_y', 'continue' ) CALL ffty( ar, 'backward' ) CALL cpu_log( log_point_s(7), 'fft_y', 'stop' ) CALL cpu_log( log_point_s(4), 'fft_x', 'continue' ) CALL fftx( ar, 'backward' ) CALL cpu_log( log_point_s(4), 'fft_x', 'stop' ) #endif CALL cpu_log( log_point_s(3), 'poisfft', 'stop' ) END SUBROUTINE poisfft SUBROUTINE tridia( ar ) !------------------------------------------------------------------------------! ! solves the linear system of equations: ! ! -(4 pi^2(i^2/(dx^2*nnx^2)+j^2/(dy^2*nny^2))+ ! 1/(dzu(k)*dzw(k))+1/(dzu(k-1)*dzw(k)))*p(i,j,k)+ ! 1/(dzu(k)*dzw(k))*p(i,j,k+1)+1/(dzu(k-1)*dzw(k))*p(i,j,k-1)=d(i,j,k) ! ! by using the Thomas algorithm !------------------------------------------------------------------------------! USE arrays_3d IMPLICIT NONE INTEGER :: i, j, k, nnyh REAL, DIMENSION(nxl_z:nxr_z,0:nz-1) :: ar1 REAL, DIMENSION(5,nxl_z:nxr_z,0:nz-1) :: tri #if defined( __parallel ) REAL :: ar(nxl_z:nxr_za,nys_z:nyn_za,1:nza) #else REAL :: ar(1:nz,nys_z:nyn_z,nxl_z:nxr_z) #endif nnyh = (ny+1) / 2 ! !-- Define constant elements of the tridiagonal matrix. !$OMP PARALLEL PRIVATE ( k, i ) !$OMP DO DO k = 0, nz-1 DO i = nxl_z, nxr_z tri(2,i,k) = ddzu_pres(k+1) * ddzw(k+1) tri(3,i,k) = ddzu_pres(k+2) * ddzw(k+1) ENDDO ENDDO !$OMP END PARALLEL #if defined( __parallel ) ! !-- Repeat for all y-levels. !$OMP PARALLEL FIRSTPRIVATE( tri ) PRIVATE ( ar1, j ) !$OMP DO DO j = nys_z, nyn_z IF ( j <= nnyh ) THEN CALL maketri( tri, j ) ELSE CALL maketri( tri, ny+1-j ) ENDIF CALL split( tri ) CALL substi( ar, ar1, tri, j ) ENDDO !$OMP END PARALLEL #else ! !-- First y-level. CALL maketri( tri, nys_z ) CALL split( tri ) CALL substi( ar, ar1, tri, 0 ) ! !-- Further y-levels. DO j = 1, nnyh - 1 CALL maketri( tri, j ) CALL split( tri ) CALL substi( ar, ar1, tri, j ) CALL substi( ar, ar1, tri, ny+1-j ) ENDDO CALL maketri( tri, nnyh ) CALL split( tri ) CALL substi( ar, ar1, tri, nnyh+nys ) #endif CONTAINS SUBROUTINE maketri( tri, j ) !------------------------------------------------------------------------------! ! Computes the i- and j-dependent component of the matrix !------------------------------------------------------------------------------! USE arrays_3d USE constants USE control_parameters USE grid_variables IMPLICIT NONE INTEGER :: i, j, k, nnxh REAL :: a, c REAL :: ll(nxl_z:nxr_z) REAL :: tri(5,nxl_z:nxr_z,0:nz-1) nnxh = ( nx + 1 ) / 2 ! !-- Provide the tridiagonal matrix for solution of the Poisson equation in !-- Fourier space. The coefficients are computed following the method of !-- Schmidt et al. (DFVLR-Mitteilung 84-15), which departs from Stephan !-- Siano's original version by discretizing the Poisson equation, !-- before it is Fourier-transformed #if defined( __parallel ) DO i = nxl_z, nxr_z IF ( i >= 0 .AND. i <= nnxh ) THEN ll(i) = 2.0 * ( 1.0 - COS( ( 2.0 * pi * i ) / & FLOAT( nx+1 ) ) ) / ( dx * dx ) + & 2.0 * ( 1.0 - COS( ( 2.0 * pi * j ) / & FLOAT( ny+1 ) ) ) / ( dy * dy ) ELSE ll(i) = 2.0 * ( 1.0 - COS( ( 2.0 * pi * ( nx+1-i ) ) / & FLOAT( nx+1 ) ) ) / ( dx * dx ) + & 2.0 * ( 1.0 - COS( ( 2.0 * pi * j ) / & FLOAT( ny+1 ) ) ) / ( dy * dy ) ENDIF DO k = 0,nz-1 a = -1.0 * ddzu_pres(k+2) * ddzw(k+1) c = -1.0 * ddzu_pres(k+1) * ddzw(k+1) tri(1,i,k) = a + c - ll(i) ENDDO ENDDO #else DO i = 0, nnxh ll(i) = 2.0 * ( 1.0 - COS( ( 2.0 * pi * i ) / FLOAT( nx+1 ) ) ) / & ( dx * dx ) + & 2.0 * ( 1.0 - COS( ( 2.0 * pi * j ) / FLOAT( ny+1 ) ) ) / & ( dy * dy ) DO k = 0, nz-1 a = -1.0 * ddzu_pres(k+2) * ddzw(k+1) c = -1.0 * ddzu_pres(k+1) * ddzw(k+1) tri(1,i,k) = a + c - ll(i) IF ( i >= 1 .and. i < nnxh ) THEN tri(1,nx+1-i,k) = tri(1,i,k) ENDIF ENDDO ENDDO #endif IF ( ibc_p_b == 1 .OR. ibc_p_b == 2 ) THEN DO i = nxl_z, nxr_z tri(1,i,0) = tri(1,i,0) + tri(2,i,0) ENDDO ENDIF IF ( ibc_p_t == 1 ) THEN DO i = nxl_z, nxr_z tri(1,i,nz-1) = tri(1,i,nz-1) + tri(3,i,nz-1) ENDDO ENDIF END SUBROUTINE maketri SUBROUTINE substi( ar, ar1, tri, j ) !------------------------------------------------------------------------------! ! Substitution (Forward and Backward) (Thomas algorithm) !------------------------------------------------------------------------------! USE control_parameters IMPLICIT NONE INTEGER :: i, j, k REAL :: ar1(nxl_z:nxr_z,0:nz-1) REAL :: tri(5,nxl_z:nxr_z,0:nz-1) #if defined( __parallel ) REAL :: ar(nxl_z:nxr_za,nys_z:nyn_za,1:nza) #else REAL :: ar(1:nz,nys_z:nyn_z,nxl_z:nxr_z) #endif ! !-- Forward substitution. DO i = nxl_z, nxr_z #if defined( __parallel ) ar1(i,0) = ar(i,j,1) #else ar1(i,0) = ar(1,j,i) #endif ENDDO DO k = 1, nz - 1 DO i = nxl_z, nxr_z #if defined( __parallel ) ar1(i,k) = ar(i,j,k+1) - tri(5,i,k) * ar1(i,k-1) #else ar1(i,k) = ar(k+1,j,i) - tri(5,i,k) * ar1(i,k-1) #endif ENDDO ENDDO ! !-- Backward substitution. DO i = nxl_z, nxr_z #if defined( __parallel ) ar(i,j,nz) = ar1(i,nz-1) / tri(4,i,nz-1) #else ar(nz,j,i) = ar1(i,nz-1) / tri(4,i,nz-1) #endif ENDDO DO k = nz-2, 0, -1 DO i = nxl_z, nxr_z #if defined( __parallel ) ar(i,j,k+1) = ( ar1(i,k) - tri(3,i,k) * ar(i,j,k+2) ) & / tri(4,i,k) #else ar(k+1,j,i) = ( ar1(i,k) - tri(3,i,k) * ar(k+2,j,i) ) & / tri(4,i,k) #endif ENDDO ENDDO ! !-- Indices i=0, j=0 correspond to horizontally averaged pressure. !-- The respective values of ar should be zero at all k-levels if !-- acceleration of horizontally averaged vertical velocity is zero. IF ( ibc_p_b == 1 .AND. ibc_p_t == 1 ) THEN IF ( j == 0 .AND. nxl_z == 0 ) THEN #if defined( __parallel ) DO k = 1, nz ar(nxl_z,j,k) = 0.0 ENDDO #else DO k = 1, nz ar(k,j,nxl_z) = 0.0 ENDDO #endif ENDIF ENDIF END SUBROUTINE substi SUBROUTINE split( tri ) !------------------------------------------------------------------------------! ! Splitting of the tridiagonal matrix (Thomas algorithm) !------------------------------------------------------------------------------! IMPLICIT NONE INTEGER :: i, k REAL :: tri(5,nxl_z:nxr_z,0:nz-1) ! !-- Splitting. DO i = nxl_z, nxr_z tri(4,i,0) = tri(1,i,0) ENDDO DO k = 1, nz-1 DO i = nxl_z, nxr_z tri(5,i,k) = tri(2,i,k) / tri(4,i,k-1) tri(4,i,k) = tri(1,i,k) - tri(3,i,k-1) * tri(5,i,k) ENDDO ENDDO END SUBROUTINE split END SUBROUTINE tridia #if defined( __parallel ) SUBROUTINE fftxp( ar, direction ) !------------------------------------------------------------------------------! ! Fourier-transformation along x-direction Parallelized version !------------------------------------------------------------------------------! IMPLICIT NONE CHARACTER (LEN=*) :: direction INTEGER :: j, k REAL :: ar(0:nxa,nys_x:nyn_xa,nzb_x:nzt_xa) ! !-- Performing the fft with one of the methods implemented !$OMP PARALLEL PRIVATE ( j, k ) !$OMP DO DO k = nzb_x, nzt_x DO j = nys_x, nyn_x CALL fft_x( ar(0:nx,j,k), direction ) ENDDO ENDDO !$OMP END PARALLEL END SUBROUTINE fftxp #else SUBROUTINE fftx( ar, direction ) !------------------------------------------------------------------------------! ! Fourier-transformation along x-direction Non parallel version !------------------------------------------------------------------------------! IMPLICIT NONE CHARACTER (LEN=*) :: direction INTEGER :: i, j, k REAL :: ar(1:nz,0:ny,0:nx) ! !-- Performing the fft with one of the methods implemented !$OMP PARALLEL PRIVATE ( j, k ) !$OMP DO DO k = 1, nz DO j = 0, ny CALL fft_x( ar(k,j,0:nx), direction ) ENDDO ENDDO !$OMP END PARALLEL END SUBROUTINE fftx #endif #if defined( __parallel ) SUBROUTINE fftyp( ar, direction ) !------------------------------------------------------------------------------! ! Fourier-transformation along y-direction Parallelized version !------------------------------------------------------------------------------! IMPLICIT NONE CHARACTER (LEN=*) :: direction INTEGER :: i, k REAL :: ar(0:nya,nxl_y:nxr_ya,nzb_y:nzt_ya) ! !-- Performing the fft with one of the methods implemented !$OMP PARALLEL PRIVATE ( i, k ) !$OMP DO DO k = nzb_y, nzt_y DO i = nxl_y, nxr_y CALL fft_y( ar(0:ny,i,k), direction ) ENDDO ENDDO !$OMP END PARALLEL END SUBROUTINE fftyp #else SUBROUTINE ffty( ar, direction ) !------------------------------------------------------------------------------! ! Fourier-transformation along y-direction Non parallel version !------------------------------------------------------------------------------! IMPLICIT NONE CHARACTER (LEN=*) :: direction INTEGER :: i, k REAL :: ar(1:nz,0:ny,0:nx) ! !-- Performing the fft with one of the methods implemented !$OMP PARALLEL PRIVATE ( i, k ) !$OMP DO DO k = 1, nz DO i = 0, nx CALL fft_y( ar(k,0:ny,i), direction ) ENDDO ENDDO !$OMP END PARALLEL END SUBROUTINE ffty #endif #if defined( __parallel ) SUBROUTINE ffty_tr_yx( f_in, work, f_out ) !------------------------------------------------------------------------------! ! Fourier-transformation along y with subsequent transposition y --> x for ! a 1d-decomposition along x ! ! ATTENTION: The performance of this routine is much faster on the NEC-SX6, ! if the first index of work_ffty_vec is odd. Otherwise ! memory bank conflicts may occur (especially if the index is a ! multiple of 128). That's why work_ffty_vec is dimensioned as ! 0:ny+1. ! Of course, this will not work if users are using an odd number ! of gridpoints along y. !------------------------------------------------------------------------------! USE control_parameters USE cpulog USE indices USE interfaces USE pegrid USE transpose_indices IMPLICIT NONE INTEGER :: i, iend, iouter, ir, j, k INTEGER, PARAMETER :: stridex = 4 REAL, DIMENSION(0:ny,stridex) :: work_ffty #if defined( __nec ) REAL, DIMENSION(0:ny+1,1:nz,nxl:nxr) :: work_ffty_vec #endif REAL, DIMENSION(1:nza,0:nya,nxl:nxra) :: f_in REAL, DIMENSION(nnx,1:nza,nys_x:nyn_xa,pdims(1)) :: f_out REAL, DIMENSION(nxl:nxra,1:nza,0:nya) :: work ! !-- Carry out the FFT along y, where all data are present due to the !-- 1d-decomposition along x. Resort the data in a way that x becomes !-- the first index. CALL cpu_log( log_point_s(7), 'fft_y', 'start' ) IF ( host(1:3) == 'nec' ) THEN #if defined( __nec ) ! !-- Code optimized for vector processors !$OMP PARALLEL PRIVATE ( i, j, k ) !$OMP DO DO i = nxl, nxr DO j = 0, ny DO k = 1, nz work_ffty_vec(j,k,i) = f_in(k,j,i) ENDDO ENDDO CALL fft_y_m( work_ffty_vec(:,:,i), ny+1, 'forward' ) ENDDO !$OMP DO DO k = 1, nz DO j = 0, ny DO i = nxl, nxr work(i,k,j) = work_ffty_vec(j,k,i) ENDDO ENDDO ENDDO !$OMP END PARALLEL #endif ELSE ! !-- Cache optimized code. !-- The i-(x-)direction is split into a strided outer loop and an inner !-- loop for better cache performance !$OMP PARALLEL PRIVATE (i,iend,iouter,ir,j,k,work_ffty) !$OMP DO DO iouter = nxl, nxr, stridex iend = MIN( iouter+stridex-1, nxr ) ! Upper bound for inner i loop DO k = 1, nz DO i = iouter, iend ir = i-iouter+1 ! counter within a stride DO j = 0, ny work_ffty(j,ir) = f_in(k,j,i) ENDDO ! !-- FFT along y CALL fft_y( work_ffty(:,ir), 'forward' ) ENDDO ! !-- Resort DO j = 0, ny DO i = iouter, iend work(i,k,j) = work_ffty(j,i-iouter+1) ENDDO ENDDO ENDDO ENDDO !$OMP END PARALLEL ENDIF CALL cpu_log( log_point_s(7), 'fft_y', 'pause' ) ! !-- Transpose array CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' ) IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr ) CALL MPI_ALLTOALL( work(nxl,1,0), sendrecvcount_xy, MPI_REAL, & f_out(1,1,nys_x,1), sendrecvcount_xy, MPI_REAL, & comm1dx, ierr ) CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' ) END SUBROUTINE ffty_tr_yx SUBROUTINE tr_xy_ffty( f_in, work, f_out ) !------------------------------------------------------------------------------! ! Transposition x --> y with a subsequent backward Fourier transformation for ! a 1d-decomposition along x !------------------------------------------------------------------------------! USE control_parameters USE cpulog USE indices USE interfaces USE pegrid USE transpose_indices IMPLICIT NONE INTEGER :: i, iend, iouter, ir, j, k INTEGER, PARAMETER :: stridex = 4 REAL, DIMENSION(0:ny,stridex) :: work_ffty #if defined( __nec ) REAL, DIMENSION(0:ny+1,1:nz,nxl:nxr) :: work_ffty_vec #endif REAL, DIMENSION(nnx,1:nza,nys_x:nyn_xa,pdims(1)) :: f_in REAL, DIMENSION(1:nza,0:nya,nxl:nxra) :: f_out REAL, DIMENSION(nxl:nxra,1:nza,0:nya) :: work ! !-- Transpose array CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' ) IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr ) CALL MPI_ALLTOALL( f_in(1,1,nys_x,1), sendrecvcount_xy, MPI_REAL, & work(nxl,1,0), sendrecvcount_xy, MPI_REAL, & comm1dx, ierr ) CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' ) ! !-- Resort the data in a way that y becomes the first index and carry out the !-- backward fft along y. CALL cpu_log( log_point_s(7), 'fft_y', 'continue' ) IF ( host(1:3) == 'nec' ) THEN #if defined( __nec ) ! !-- Code optimized for vector processors !$OMP PARALLEL PRIVATE ( i, j, k ) !$OMP DO DO k = 1, nz DO j = 0, ny DO i = nxl, nxr work_ffty_vec(j,k,i) = work(i,k,j) ENDDO ENDDO ENDDO !$OMP DO DO i = nxl, nxr CALL fft_y_m( work_ffty_vec(:,:,i), ny+1, 'backward' ) DO j = 0, ny DO k = 1, nz f_out(k,j,i) = work_ffty_vec(j,k,i) ENDDO ENDDO ENDDO !$OMP END PARALLEL #endif ELSE ! !-- Cache optimized code. !-- The i-(x-)direction is split into a strided outer loop and an inner !-- loop for better cache performance !$OMP PARALLEL PRIVATE ( i, iend, iouter, ir, j, k, work_ffty ) !$OMP DO DO iouter = nxl, nxr, stridex iend = MIN( iouter+stridex-1, nxr ) ! Upper bound for inner i loop DO k = 1, nz ! !-- Resort DO j = 0, ny DO i = iouter, iend work_ffty(j,i-iouter+1) = work(i,k,j) ENDDO ENDDO DO i = iouter, iend ! !-- FFT along y ir = i-iouter+1 ! counter within a stride CALL fft_y( work_ffty(:,ir), 'backward' ) DO j = 0, ny f_out(k,j,i) = work_ffty(j,ir) ENDDO ENDDO ENDDO ENDDO !$OMP END PARALLEL ENDIF CALL cpu_log( log_point_s(7), 'fft_y', 'stop' ) END SUBROUTINE tr_xy_ffty SUBROUTINE fftx_tri_fftx( ar ) !------------------------------------------------------------------------------! ! FFT along x, solution of the tridiagonal system and backward FFT for ! a 1d-decomposition along x ! ! WARNING: this subroutine may still not work for hybrid parallelization ! with OpenMP (for possible necessary changes see the original ! routine poisfft_hybrid, developed by Klaus Ketelsen, May 2002) !------------------------------------------------------------------------------! USE control_parameters USE cpulog USE grid_variables USE indices USE interfaces USE pegrid USE transpose_indices IMPLICIT NONE character(len=3) :: myth_char INTEGER :: i, j, k, m, n, omp_get_thread_num, tn REAL, DIMENSION(0:nx) :: work_fftx REAL, DIMENSION(0:nx,1:nz) :: work_trix REAL, DIMENSION(nnx,1:nza,nys_x:nyn_xa,pdims(1)) :: ar REAL, DIMENSION(:,:,:,:), ALLOCATABLE :: tri CALL cpu_log( log_point_s(33), 'fft_x + tridia', 'start' ) ALLOCATE( tri(5,0:nx,0:nz-1,0:threads_per_task-1) ) tn = 0 ! Default thread number in case of one thread !$OMP PARALLEL DO PRIVATE ( i, j, k, m, n, tn, work_fftx, work_trix ) DO j = nys_x, nyn_x !$ tn = omp_get_thread_num() IF ( host(1:3) == 'nec' ) THEN ! !-- Code optimized for vector processors DO k = 1, nz m = 0 DO n = 1, pdims(1) DO i = 1, nnx_pe( n-1 ) ! WARN: pcoord(i) should be used!! work_trix(m,k) = ar(i,k,j,n) m = m + 1 ENDDO ENDDO ENDDO CALL fft_x_m( work_trix, 'forward' ) ELSE ! !-- Cache optimized code DO k = 1, nz m = 0 DO n = 1, pdims(1) DO i = 1, nnx_pe( n-1 ) ! WARN: pcoord(i) should be used!! work_fftx(m) = ar(i,k,j,n) m = m + 1 ENDDO ENDDO CALL fft_x( work_fftx, 'forward' ) DO i = 0, nx work_trix(i,k) = work_fftx(i) ENDDO ENDDO ENDIF ! !-- Solve the linear equation system CALL tridia_1dd( ddx2, ddy2, nx, ny, j, work_trix, tri(:,:,:,tn) ) IF ( host(1:3) == 'nec' ) THEN ! !-- Code optimized for vector processors CALL fft_x_m( work_trix, 'backward' ) DO k = 1, nz m = 0 DO n = 1, pdims(1) DO i = 1, nnx_pe( n-1 ) ! WARN: pcoord(i) should be used!! ar(i,k,j,n) = work_trix(m,k) m = m + 1 ENDDO ENDDO ENDDO ELSE ! !-- Cache optimized code DO k = 1, nz DO i = 0, nx work_fftx(i) = work_trix(i,k) ENDDO CALL fft_x( work_fftx, 'backward' ) m = 0 DO n = 1, pdims(1) DO i = 1, nnx_pe( n-1 ) ! WARN: pcoord(i) should be used!! ar(i,k,j,n) = work_fftx(m) m = m + 1 ENDDO ENDDO ENDDO ENDIF ENDDO DEALLOCATE( tri ) CALL cpu_log( log_point_s(33), 'fft_x + tridia', 'stop' ) END SUBROUTINE fftx_tri_fftx SUBROUTINE fftx_tr_xy( f_in, work, f_out ) !------------------------------------------------------------------------------! ! Fourier-transformation along x with subsequent transposition x --> y for ! a 1d-decomposition along y ! ! ATTENTION: The NEC-branch of this routine may significantly profit from ! further optimizations. So far, performance is much worse than ! for routine ffty_tr_yx (more than three times slower). !------------------------------------------------------------------------------! USE control_parameters USE cpulog USE indices USE interfaces USE pegrid USE transpose_indices IMPLICIT NONE INTEGER :: i, j, k REAL, DIMENSION(0:nx,1:nz,nys:nyn) :: work_fftx REAL, DIMENSION(1:nza,nys:nyna,0:nxa) :: f_in REAL, DIMENSION(nny,1:nza,nxl_y:nxr_ya,pdims(2)) :: f_out REAL, DIMENSION(nys:nyna,1:nza,0:nxa) :: work ! !-- Carry out the FFT along x, where all data are present due to the !-- 1d-decomposition along y. Resort the data in a way that y becomes !-- the first index. CALL cpu_log( log_point_s(4), 'fft_x', 'start' ) IF ( host(1:3) == 'nec' ) THEN ! !-- Code for vector processors !$OMP PARALLEL PRIVATE ( i, j, k ) !$OMP DO DO i = 0, nx DO j = nys, nyn DO k = 1, nz work_fftx(i,k,j) = f_in(k,j,i) ENDDO ENDDO ENDDO !$OMP DO DO j = nys, nyn CALL fft_x_m( work_fftx(:,:,j), 'forward' ) DO k = 1, nz DO i = 0, nx work(j,k,i) = work_fftx(i,k,j) ENDDO ENDDO ENDDO !$OMP END PARALLEL ELSE ! !-- Cache optimized code (there might be still a potential for better !-- optimization). !$OMP PARALLEL PRIVATE (i,j,k) !$OMP DO DO i = 0, nx DO j = nys, nyn DO k = 1, nz work_fftx(i,k,j) = f_in(k,j,i) ENDDO ENDDO ENDDO !$OMP DO DO j = nys, nyn DO k = 1, nz CALL fft_x( work_fftx(0:nx,k,j), 'forward' ) DO i = 0, nx work(j,k,i) = work_fftx(i,k,j) ENDDO ENDDO ENDDO !$OMP END PARALLEL ENDIF CALL cpu_log( log_point_s(4), 'fft_x', 'pause' ) ! !-- Transpose array CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' ) IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr ) CALL MPI_ALLTOALL( work(nys,1,0), sendrecvcount_xy, MPI_REAL, & f_out(1,1,nxl_y,1), sendrecvcount_xy, MPI_REAL, & comm1dy, ierr ) CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' ) END SUBROUTINE fftx_tr_xy SUBROUTINE tr_yx_fftx( f_in, work, f_out ) !------------------------------------------------------------------------------! ! Transposition y --> x with a subsequent backward Fourier transformation for ! a 1d-decomposition along x !------------------------------------------------------------------------------! USE control_parameters USE cpulog USE indices USE interfaces USE pegrid USE transpose_indices IMPLICIT NONE INTEGER :: i, j, k REAL, DIMENSION(0:nx,1:nz,nys:nyn) :: work_fftx REAL, DIMENSION(nny,1:nza,nxl_y:nxr_ya,pdims(2)) :: f_in REAL, DIMENSION(1:nza,nys:nyna,0:nxa) :: f_out REAL, DIMENSION(nys:nyna,1:nza,0:nxa) :: work ! !-- Transpose array CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' ) IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr ) CALL MPI_ALLTOALL( f_in(1,1,nxl_y,1), sendrecvcount_xy, MPI_REAL, & work(nys,1,0), sendrecvcount_xy, MPI_REAL, & comm1dy, ierr ) CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' ) ! !-- Carry out the FFT along x, where all data are present due to the !-- 1d-decomposition along y. Resort the data in a way that y becomes !-- the first index. CALL cpu_log( log_point_s(4), 'fft_x', 'continue' ) IF ( host(1:3) == 'nec' ) THEN ! !-- Code optimized for vector processors !$OMP PARALLEL PRIVATE ( i, j, k ) !$OMP DO DO j = nys, nyn DO k = 1, nz DO i = 0, nx work_fftx(i,k,j) = work(j,k,i) ENDDO ENDDO CALL fft_x_m( work_fftx(:,:,j), 'backward' ) ENDDO !$OMP DO DO i = 0, nx DO j = nys, nyn DO k = 1, nz f_out(k,j,i) = work_fftx(i,k,j) ENDDO ENDDO ENDDO !$OMP END PARALLEL ELSE ! !-- Cache optimized code (there might be still a potential for better !-- optimization). !$OMP PARALLEL PRIVATE (i,j,k) !$OMP DO DO j = nys, nyn DO k = 1, nz DO i = 0, nx work_fftx(i,k,j) = work(j,k,i) ENDDO CALL fft_x( work_fftx(0:nx,k,j), 'backward' ) ENDDO ENDDO !$OMP DO DO i = 0, nx DO j = nys, nyn DO k = 1, nz f_out(k,j,i) = work_fftx(i,k,j) ENDDO ENDDO ENDDO !$OMP END PARALLEL ENDIF CALL cpu_log( log_point_s(4), 'fft_x', 'stop' ) END SUBROUTINE tr_yx_fftx SUBROUTINE ffty_tri_ffty( ar ) !------------------------------------------------------------------------------! ! FFT along y, solution of the tridiagonal system and backward FFT for ! a 1d-decomposition along y ! ! WARNING: this subroutine may still not work for hybrid parallelization ! with OpenMP (for possible necessary changes see the original ! routine poisfft_hybrid, developed by Klaus Ketelsen, May 2002) !------------------------------------------------------------------------------! USE control_parameters USE cpulog USE grid_variables USE indices USE interfaces USE pegrid USE transpose_indices IMPLICIT NONE INTEGER :: i, j, k, m, n, omp_get_thread_num, tn REAL, DIMENSION(0:ny) :: work_ffty REAL, DIMENSION(0:ny,1:nz) :: work_triy REAL, DIMENSION(nny,1:nza,nxl_y:nxr_ya,pdims(2)) :: ar REAL, DIMENSION(:,:,:,:), ALLOCATABLE :: tri CALL cpu_log( log_point_s(39), 'fft_y + tridia', 'start' ) ALLOCATE( tri(5,0:ny,0:nz-1,0:threads_per_task-1) ) tn = 0 ! Default thread number in case of one thread !$OMP PARALLEL DO PRIVATE ( i, j, k, m, n, tn, work_ffty, work_triy ) DO i = nxl_y, nxr_y !$ tn = omp_get_thread_num() IF ( host(1:3) == 'nec' ) THEN ! !-- Code optimized for vector processors DO k = 1, nz m = 0 DO n = 1, pdims(2) DO j = 1, nny_pe( n-1 ) ! WARN: pcoord(j) should be used!! work_triy(m,k) = ar(j,k,i,n) m = m + 1 ENDDO ENDDO ENDDO CALL fft_y_m( work_triy, ny, 'forward' ) ELSE ! !-- Cache optimized code DO k = 1, nz m = 0 DO n = 1, pdims(2) DO j = 1, nny_pe( n-1 ) ! WARN: pcoord(j) should be used!! work_ffty(m) = ar(j,k,i,n) m = m + 1 ENDDO ENDDO CALL fft_y( work_ffty, 'forward' ) DO j = 0, ny work_triy(j,k) = work_ffty(j) ENDDO ENDDO ENDIF ! !-- Solve the linear equation system CALL tridia_1dd( ddy2, ddx2, ny, nx, i, work_triy, tri(:,:,:,tn) ) IF ( host(1:3) == 'nec' ) THEN ! !-- Code optimized for vector processors CALL fft_y_m( work_triy, ny, 'backward' ) DO k = 1, nz m = 0 DO n = 1, pdims(2) DO j = 1, nny_pe( n-1 ) ! WARN: pcoord(j) should be used!! ar(j,k,i,n) = work_triy(m,k) m = m + 1 ENDDO ENDDO ENDDO ELSE ! !-- Cache optimized code DO k = 1, nz DO j = 0, ny work_ffty(j) = work_triy(j,k) ENDDO CALL fft_y( work_ffty, 'backward' ) m = 0 DO n = 1, pdims(2) DO j = 1, nny_pe( n-1 ) ! WARN: pcoord(j) should be used!! ar(j,k,i,n) = work_ffty(m) m = m + 1 ENDDO ENDDO ENDDO ENDIF ENDDO DEALLOCATE( tri ) CALL cpu_log( log_point_s(39), 'fft_y + tridia', 'stop' ) END SUBROUTINE ffty_tri_ffty SUBROUTINE tridia_1dd( ddx2, ddy2, nx, ny, j, ar, tri ) !------------------------------------------------------------------------------! ! Solves the linear system of equations for a 1d-decomposition along x (see ! tridia) ! ! Attention: when using the intel compiler, array tri must be passed as an ! argument to the contained subroutines. Otherwise addres faults ! will occur. ! On NEC, tri should not be passed (except for routine substi_1dd) ! because this causes very bad performance. !------------------------------------------------------------------------------! USE arrays_3d USE control_parameters USE pegrid IMPLICIT NONE INTEGER :: i, j, k, nnyh, nx, ny, omp_get_thread_num, tn REAL :: ddx2, ddy2 REAL, DIMENSION(0:nx,1:nz) :: ar REAL, DIMENSION(0:nx,0:nz-1) :: ar1 REAL, DIMENSION(5,0:nx,0:nz-1) :: tri nnyh = ( ny + 1 ) / 2 ! !-- Define constant elements of the tridiagonal matrix. !-- The compiler on SX6 does loop exchange. If 0:nx is a high power of 2, !-- the exchanged loops create bank conflicts. The following directive !-- prohibits loop exchange and the loops perform much better. ! tn = omp_get_thread_num() ! WRITE( 120+tn, * ) '+++ id=',myid,' nx=',nx,' thread=', omp_get_thread_num() ! CALL local_flush( 120+tn ) !CDIR NOLOOPCHG DO k = 0, nz-1 DO i = 0,nx tri(2,i,k) = ddzu_pres(k+1) * ddzw(k+1) tri(3,i,k) = ddzu_pres(k+2) * ddzw(k+1) ENDDO ENDDO ! WRITE( 120+tn, * ) '+++ id=',myid,' end of first tridia loop thread=', omp_get_thread_num() ! CALL local_flush( 120+tn ) IF ( j <= nnyh ) THEN #if defined( __lc ) CALL maketri_1dd( j, tri ) #else CALL maketri_1dd( j ) #endif ELSE #if defined( __lc ) CALL maketri_1dd( ny+1-j, tri ) #else CALL maketri_1dd( ny+1-j ) #endif ENDIF #if defined( __lc ) CALL split_1dd( tri ) #else CALL split_1dd #endif CALL substi_1dd( ar, tri ) CONTAINS #if defined( __lc ) SUBROUTINE maketri_1dd( j, tri ) #else SUBROUTINE maketri_1dd( j ) #endif !------------------------------------------------------------------------------! ! computes the i- and j-dependent component of the matrix !------------------------------------------------------------------------------! USE constants IMPLICIT NONE INTEGER :: i, j, k, nnxh REAL :: a, c REAL, DIMENSION(0:nx) :: l #if defined( __lc ) REAL, DIMENSION(5,0:nx,0:nz-1) :: tri #endif nnxh = ( nx + 1 ) / 2 ! !-- Provide the tridiagonal matrix for solution of the Poisson equation in !-- Fourier space. The coefficients are computed following the method of !-- Schmidt et al. (DFVLR-Mitteilung 84-15), which departs from Stephan !-- Siano's original version by discretizing the Poisson equation, !-- before it is Fourier-transformed DO i = 0, nx IF ( i >= 0 .AND. i <= nnxh ) THEN l(i) = 2.0 * ( 1.0 - COS( ( 2.0 * pi * i ) / & FLOAT( nx+1 ) ) ) * ddx2 + & 2.0 * ( 1.0 - COS( ( 2.0 * pi * j ) / & FLOAT( ny+1 ) ) ) * ddy2 ELSE l(i) = 2.0 * ( 1.0 - COS( ( 2.0 * pi * ( nx+1-i ) ) / & FLOAT( nx+1 ) ) ) * ddx2 + & 2.0 * ( 1.0 - COS( ( 2.0 * pi * j ) / & FLOAT( ny+1 ) ) ) * ddy2 ENDIF ENDDO DO k = 0, nz-1 DO i = 0, nx a = -1.0 * ddzu_pres(k+2) * ddzw(k+1) c = -1.0 * ddzu_pres(k+1) * ddzw(k+1) tri(1,i,k) = a + c - l(i) ENDDO ENDDO IF ( ibc_p_b == 1 .OR. ibc_p_b == 2 ) THEN DO i = 0, nx tri(1,i,0) = tri(1,i,0) + tri(2,i,0) ENDDO ENDIF IF ( ibc_p_t == 1 ) THEN DO i = 0, nx tri(1,i,nz-1) = tri(1,i,nz-1) + tri(3,i,nz-1) ENDDO ENDIF END SUBROUTINE maketri_1dd #if defined( __lc ) SUBROUTINE split_1dd( tri ) #else SUBROUTINE split_1dd #endif !------------------------------------------------------------------------------! ! Splitting of the tridiagonal matrix (Thomas algorithm) !------------------------------------------------------------------------------! IMPLICIT NONE INTEGER :: i, k #if defined( __lc ) REAL, DIMENSION(5,0:nx,0:nz-1) :: tri #endif ! !-- Splitting DO i = 0, nx tri(4,i,0) = tri(1,i,0) ENDDO DO k = 1, nz-1 DO i = 0, nx tri(5,i,k) = tri(2,i,k) / tri(4,i,k-1) tri(4,i,k) = tri(1,i,k) - tri(3,i,k-1) * tri(5,i,k) ENDDO ENDDO END SUBROUTINE split_1dd SUBROUTINE substi_1dd( ar, tri ) !------------------------------------------------------------------------------! ! Substitution (Forward and Backward) (Thomas algorithm) !------------------------------------------------------------------------------! IMPLICIT NONE INTEGER :: i, k REAL, DIMENSION(0:nx,nz) :: ar REAL, DIMENSION(0:nx,0:nz-1) :: ar1 REAL, DIMENSION(5,0:nx,0:nz-1) :: tri ! !-- Forward substitution DO i = 0, nx ar1(i,0) = ar(i,1) ENDDO DO k = 1, nz-1 DO i = 0, nx ar1(i,k) = ar(i,k+1) - tri(5,i,k) * ar1(i,k-1) ENDDO ENDDO ! !-- Backward substitution DO i = 0, nx ar(i,nz) = ar1(i,nz-1) / tri(4,i,nz-1) ENDDO DO k = nz-2, 0, -1 DO i = 0, nx ar(i,k+1) = ( ar1(i,k) - tri(3,i,k) * ar(i,k+2) ) & / tri(4,i,k) ENDDO ENDDO ! !-- Indices i=0, j=0 correspond to horizontally averaged pressure. !-- The respective values of ar should be zero at all k-levels if !-- acceleration of horizontally averaged vertical velocity is zero. IF ( ibc_p_b == 1 .AND. ibc_p_t == 1 ) THEN IF ( j == 0 ) THEN DO k = 1, nz ar(0,k) = 0.0 ENDDO ENDIF ENDIF END SUBROUTINE substi_1dd END SUBROUTINE tridia_1dd #endif END MODULE poisfft_mod