MODULE poisfft_mod

!------------------------------------------------------------------------------!
! Actual revisions:
! -----------------
! 
!
! Former revisions:
! -----------------
! $Id: poisfft.f90 198 2008-09-17 08:55:28Z steinfeld $
!
! 164 2008-05-15 08:46:15Z raasch
! Arguments removed from transpose routines
!
! 128 2007-10-26 13:11:14Z raasch
! Bugfix: wavenumber calculation for even nx in routines maketri
!
! 85 2007-05-11 09:35:14Z raasch
! Bugfix: work_fft*_vec removed from some PRIVATE-declarations
!
! 76 2007-03-29 00:58:32Z raasch
! Tridiagonal coefficients adjusted for Neumann boundary conditions both at
! the bottom and the top.
!
! RCS Log replace by Id keyword, revision history cleaned up
!
! Revision 1.24  2006/08/04 15:00:24  raasch
! Default setting of the thread number tn in case of not using OpenMP
!
! Revision 1.23  2006/02/23 12:48:38  raasch
! Additional compiler directive in routine tridia_1dd for preventing loop
! exchange on NEC-SX6
!
! Revision 1.20  2004/04/30 12:38:09  raasch
! Parts of former poisfft_hybrid moved to this subroutine,
! former subroutine changed to a module, renaming of FFT-subroutines and
! -module, FFTs completely substituted by calls of fft_x and fft_y,
! NAG fft used in the non-parallel case completely removed, l in maketri
! is now a 1d-array, variables passed by modules instead of using parameter
! lists, enlarged transposition arrays introduced
!
! Revision 1.1  1997/07/24 11:24:14  raasch
! Initial revision
!
!
! Description:
! ------------
! See below.
!------------------------------------------------------------------------------!

!--------------------------------------------------------------------------!
!                             poisfft                                      !
!                                                                          !
!                Original version: Stephan Siano (pois3d)                  !
!                                                                          !
!  Institute of Meteorology and Climatology, University of Hannover        !
!                             Germany                                      !
!                                                                          !
!  Version as of July 23,1996                                              !
!                                                                          !
!                                                                          !
!        Version for parallel computers: Siegfried Raasch                  !
!                                                                          !
!  Version as of July 03,1997                                              !
!                                                                          !
!  Solves the Poisson equation with a 2D spectral method                   !
!        d^2 p / dx^2 + d^2 p / dy^2 + d^2 p / dz^2 = s                    !
!                                                                          !
!  Input:                                                                  !
!  real    ar                 contains in the (nnx,nny,nnz) elements,      !
!                             starting from the element (1,nys,nxl), the   !
!                             values for s                                 !
!  real    work               Temporary array                              !
!                                                                          !
!  Output:                                                                 !
!  real    ar                 contains the solution for p                  !
!--------------------------------------------------------------------------!

    USE fft_xy
    USE indices
    USE transpose_indices

    IMPLICIT NONE

    PRIVATE
    PUBLIC  poisfft, poisfft_init

    INTERFACE poisfft
       MODULE PROCEDURE poisfft
    END INTERFACE poisfft

    INTERFACE poisfft_init
       MODULE PROCEDURE poisfft_init
    END INTERFACE poisfft_init

 CONTAINS

    SUBROUTINE poisfft_init

       CALL fft_init

    END SUBROUTINE poisfft_init


    SUBROUTINE poisfft( ar, work )

       USE cpulog
       USE interfaces
       USE pegrid

       IMPLICIT NONE

       REAL, DIMENSION(1:nza,nys:nyna,nxl:nxra) ::  ar, work


       CALL cpu_log( log_point_s(3), 'poisfft', 'start' )

!
!--    Two-dimensional Fourier Transformation in x- and y-direction.
#if defined( __parallel )
       IF ( pdims(2) == 1 )  THEN

!
!--       1d-domain-decomposition along x:
!--       FFT along y and transposition y --> x
          CALL ffty_tr_yx( ar, work, ar )

!
!--       FFT along x, solving the tridiagonal system and backward FFT
          CALL fftx_tri_fftx( ar )

!
!--       Transposition x --> y and backward FFT along y
          CALL tr_xy_ffty( ar, work, ar )

       ELSEIF ( pdims(1) == 1 )  THEN

!
!--       1d-domain-decomposition along y:
!--       FFT along x and transposition x --> y
          CALL fftx_tr_xy( ar, work, ar )

!
!--       FFT along y, solving the tridiagonal system and backward FFT
          CALL ffty_tri_ffty( ar )

!
!--       Transposition y --> x and backward FFT along x
          CALL tr_yx_fftx( ar, work, ar )

       ELSE

!
!--       2d-domain-decomposition
!--       Transposition z --> x
          CALL cpu_log( log_point_s(5), 'transpo forward', 'start' )
          CALL transpose_zx( ar, work, ar )
          CALL cpu_log( log_point_s(5), 'transpo forward', 'pause' )

          CALL cpu_log( log_point_s(4), 'fft_x', 'start' )
          CALL fftxp( ar, 'forward' )
          CALL cpu_log( log_point_s(4), 'fft_x', 'pause' )

!
!--       Transposition x --> y
          CALL cpu_log( log_point_s(5), 'transpo forward', 'continue' )
          CALL transpose_xy( ar, work, ar )
          CALL cpu_log( log_point_s(5), 'transpo forward', 'pause' )

          CALL cpu_log( log_point_s(7), 'fft_y', 'start' )
          CALL fftyp( ar, 'forward' )
          CALL cpu_log( log_point_s(7), 'fft_y', 'pause' )

!
!--       Transposition y --> z
          CALL cpu_log( log_point_s(5), 'transpo forward', 'continue' )
          CALL transpose_yz( ar, work, ar )
          CALL cpu_log( log_point_s(5), 'transpo forward', 'stop' )

!
!--       Solve the Poisson equation in z-direction in cartesian space.
          CALL cpu_log( log_point_s(6), 'tridia', 'start' )
          CALL tridia( ar )
          CALL cpu_log( log_point_s(6), 'tridia', 'stop' )

!
!--       Inverse Fourier Transformation
!--       Transposition z --> y
          CALL cpu_log( log_point_s(8), 'transpo invers', 'start' )
          CALL transpose_zy( ar, work, ar )
          CALL cpu_log( log_point_s(8), 'transpo invers', 'pause' )

          CALL cpu_log( log_point_s(7), 'fft_y', 'continue' )
          CALL fftyp( ar, 'backward' )
          CALL cpu_log( log_point_s(7), 'fft_y', 'stop' )

!
!--       Transposition y --> x
          CALL cpu_log( log_point_s(8), 'transpo invers', 'continue' )
          CALL transpose_yx( ar, work, ar )
          CALL cpu_log( log_point_s(8), 'transpo invers', 'pause' )

          CALL cpu_log( log_point_s(4), 'fft_x', 'continue' )
          CALL fftxp( ar, 'backward' )
          CALL cpu_log( log_point_s(4), 'fft_x', 'stop' )

!
!--       Transposition x --> z
          CALL cpu_log( log_point_s(8), 'transpo invers', 'continue' )
          CALL transpose_xz( ar, work, ar )
          CALL cpu_log( log_point_s(8), 'transpo invers', 'stop' )

       ENDIF

#else

!
!--    Two-dimensional Fourier Transformation along x- and y-direction.
       CALL cpu_log( log_point_s(4), 'fft_x', 'start' )
       CALL fftx( ar, 'forward' )
       CALL cpu_log( log_point_s(4), 'fft_x', 'pause' )
       CALL cpu_log( log_point_s(7), 'fft_y', 'start' )
       CALL ffty( ar, 'forward' )
       CALL cpu_log( log_point_s(7), 'fft_y', 'pause' )

!
!--    Solve the Poisson equation in z-direction in cartesian space.
       CALL cpu_log( log_point_s(6), 'tridia', 'start' )
       CALL tridia( ar )
       CALL cpu_log( log_point_s(6), 'tridia', 'stop' )

!
!--    Inverse Fourier Transformation.
       CALL cpu_log( log_point_s(7), 'fft_y', 'continue' )
       CALL ffty( ar, 'backward' )
       CALL cpu_log( log_point_s(7), 'fft_y', 'stop' )
       CALL cpu_log( log_point_s(4), 'fft_x', 'continue' )
       CALL fftx( ar, 'backward' )
       CALL cpu_log( log_point_s(4), 'fft_x', 'stop' )

#endif

       CALL cpu_log( log_point_s(3), 'poisfft', 'stop' )

    END SUBROUTINE poisfft


    SUBROUTINE tridia( ar )

!------------------------------------------------------------------------------!
! solves the linear system of equations:
!
! -(4 pi^2(i^2/(dx^2*nnx^2)+j^2/(dy^2*nny^2))+
!   1/(dzu(k)*dzw(k))+1/(dzu(k-1)*dzw(k)))*p(i,j,k)+
! 1/(dzu(k)*dzw(k))*p(i,j,k+1)+1/(dzu(k-1)*dzw(k))*p(i,j,k-1)=d(i,j,k)
!
! by using the Thomas algorithm
!------------------------------------------------------------------------------!

       USE arrays_3d

       IMPLICIT NONE

       INTEGER ::  i, j, k, nnyh

       REAL, DIMENSION(nxl_z:nxr_z,0:nz-1)   ::  ar1
       REAL, DIMENSION(5,nxl_z:nxr_z,0:nz-1) ::  tri

#if defined( __parallel )
       REAL    ::  ar(nxl_z:nxr_za,nys_z:nyn_za,1:nza)
#else
       REAL    ::  ar(1:nz,nys_z:nyn_z,nxl_z:nxr_z)
#endif


       nnyh = (ny+1) / 2

!
!--    Define constant elements of the tridiagonal matrix.
       DO  k = 0, nz-1
          DO  i = nxl_z, nxr_z
             tri(2,i,k) = ddzu(k+1) * ddzw(k+1)
             tri(3,i,k) = ddzu(k+2) * ddzw(k+1)
          ENDDO
       ENDDO

#if defined( __parallel )
!
!--    Repeat for all y-levels.
       DO  j = nys_z, nyn_z
          IF ( j <= nnyh )  THEN
             CALL maketri( tri, j )
          ELSE
             CALL maketri( tri, ny+1-j )
          ENDIF
          CALL split( tri )
          CALL substi( ar, ar1, tri, j )
       ENDDO
#else
!
!--    First y-level.
       CALL maketri( tri, nys_z )
       CALL split( tri )
       CALL substi( ar, ar1, tri, 0 )

!
!--    Further y-levels.
       DO  j = 1, nnyh - 1
          CALL maketri( tri, j )
          CALL split( tri )
          CALL substi( ar, ar1, tri, j )
          CALL substi( ar, ar1, tri, ny+1-j )
       ENDDO
       CALL maketri( tri, nnyh )
       CALL split( tri )
       CALL substi( ar, ar1, tri, nnyh+nys )
#endif

    CONTAINS

       SUBROUTINE maketri( tri, j )

!------------------------------------------------------------------------------!
! Computes the i- and j-dependent component of the matrix
!------------------------------------------------------------------------------!

          USE arrays_3d
          USE constants
          USE control_parameters
          USE grid_variables

          IMPLICIT NONE

          INTEGER ::  i, j, k, nnxh
          REAL    ::  a, c
          REAL    ::  ll(nxl_z:nxr_z)
          REAL    ::  tri(5,nxl_z:nxr_z,0:nz-1)


          nnxh = ( nx + 1 ) / 2

!
!--       Provide the tridiagonal matrix for solution of the Poisson equation in
!--       Fourier space. The coefficients are computed following the method of
!--       Schmidt et al. (DFVLR-Mitteilung 84-15), which departs from Stephan
!--       Siano's original version by discretizing the Poisson equation,
!--       before it is Fourier-transformed
#if defined( __parallel )
          DO  i = nxl_z, nxr_z
             IF ( i >= 0 .AND. i <= nnxh )  THEN
                ll(i) = 2.0 * ( 1.0 - COS( ( 2.0 * pi * i ) / &
                                          FLOAT( nx+1 ) ) ) / ( dx * dx ) + &
                        2.0 * ( 1.0 - COS( ( 2.0 * pi * j ) / &
                                          FLOAT( ny+1 ) ) ) / ( dy * dy )
             ELSE
                ll(i) = 2.0 * ( 1.0 - COS( ( 2.0 * pi * ( nx+1-i ) ) / &
                                          FLOAT( nx+1 ) ) ) / ( dx * dx ) + &
                        2.0 * ( 1.0 - COS( ( 2.0 * pi * j ) / &
                                          FLOAT( ny+1 ) ) ) / ( dy * dy )
             ENDIF
             DO  k = 0,nz-1
                a = -1.0 * ddzu(k+2) * ddzw(k+1)
                c = -1.0 * ddzu(k+1) * ddzw(k+1)
                tri(1,i,k) = a + c - ll(i)
             ENDDO
          ENDDO
#else
          DO  i = 0, nnxh
             ll(i) = 2.0 * ( 1.0 - COS( ( 2.0 * pi * i ) / FLOAT( nx+1 ) ) ) / &
                           ( dx * dx ) + &
                     2.0 * ( 1.0 - COS( ( 2.0 * pi * j ) / FLOAT( ny+1 ) ) ) / &
                           ( dy * dy )
             DO  k = 0, nz-1
                a = -1.0 * ddzu(k+2) * ddzw(k+1)
                c = -1.0 * ddzu(k+1) * ddzw(k+1)
                tri(1,i,k) = a + c - ll(i)
                IF ( i >= 1 .and. i < nnxh )  THEN
                   tri(1,nx+1-i,k) = tri(1,i,k)
                ENDIF
             ENDDO
          ENDDO
#endif
          IF ( ibc_p_b == 1  .OR.  ibc_p_b == 2 )  THEN
             DO  i = nxl_z, nxr_z
                tri(1,i,0) = tri(1,i,0) + tri(2,i,0)
             ENDDO
          ENDIF
          IF ( ibc_p_t == 1 )  THEN
             DO  i = nxl_z, nxr_z
                tri(1,i,nz-1) = tri(1,i,nz-1) + tri(3,i,nz-1)
             ENDDO
          ENDIF

       END SUBROUTINE maketri


       SUBROUTINE substi( ar, ar1, tri, j )

!------------------------------------------------------------------------------!
! Substitution (Forward and Backward) (Thomas algorithm)
!------------------------------------------------------------------------------!

          USE control_parameters

          IMPLICIT NONE

          INTEGER ::  i, j, k
          REAL    ::  ar1(nxl_z:nxr_z,0:nz-1)
          REAL    ::  tri(5,nxl_z:nxr_z,0:nz-1)
#if defined( __parallel )
          REAL    ::  ar(nxl_z:nxr_za,nys_z:nyn_za,1:nza)
#else
          REAL    ::  ar(1:nz,nys_z:nyn_z,nxl_z:nxr_z)
#endif

!
!--       Forward substitution.
          DO  i = nxl_z, nxr_z
#if defined( __parallel )
             ar1(i,0) = ar(i,j,1)
#else
             ar1(i,0) = ar(1,j,i)
#endif
          ENDDO
          DO  k = 1, nz - 1
             DO  i = nxl_z, nxr_z
#if defined( __parallel )
                ar1(i,k) = ar(i,j,k+1) - tri(5,i,k) * ar1(i,k-1)
#else
                ar1(i,k) = ar(k+1,j,i) - tri(5,i,k) * ar1(i,k-1)
#endif
             ENDDO
          ENDDO

!
!--       Backward substitution.
          DO  i = nxl_z, nxr_z
#if defined( __parallel )
             ar(i,j,nz) = ar1(i,nz-1) / tri(4,i,nz-1)
#else
             ar(nz,j,i) = ar1(i,nz-1) / tri(4,i,nz-1)
#endif
          ENDDO
          DO  k = nz-2, 0, -1
             DO  i = nxl_z, nxr_z
#if defined( __parallel )
                ar(i,j,k+1) = ( ar1(i,k) - tri(3,i,k) * ar(i,j,k+2) ) &
                              / tri(4,i,k)
#else
                ar(k+1,j,i) = ( ar1(i,k) - tri(3,i,k) * ar(k+2,j,i) ) &
                              / tri(4,i,k)
#endif
             ENDDO
          ENDDO

!
!--       Indices i=0, j=0 correspond to horizontally averaged pressure.
!--       The respective values of ar should be zero at all k-levels if
!--       acceleration of horizontally averaged vertical velocity is zero.
          IF ( ibc_p_b == 1  .AND.  ibc_p_t == 1 )  THEN
             IF ( j == 0  .AND.  nxl_z == 0 )  THEN
#if defined( __parallel )
                DO  k = 1, nz
                   ar(nxl_z,j,k) = 0.0
                ENDDO
#else
                DO  k = 1, nz
                   ar(k,j,nxl_z) = 0.0
                ENDDO
#endif
             ENDIF
          ENDIF

       END SUBROUTINE substi


       SUBROUTINE split( tri )

!------------------------------------------------------------------------------!
! Splitting of the tridiagonal matrix (Thomas algorithm)
!------------------------------------------------------------------------------!

          IMPLICIT NONE

          INTEGER ::  i, k
          REAL    ::  tri(5,nxl_z:nxr_z,0:nz-1)

!
!--       Splitting.
          DO  i = nxl_z, nxr_z
             tri(4,i,0) = tri(1,i,0)
          ENDDO
          DO  k = 1, nz-1
             DO  i = nxl_z, nxr_z
                tri(5,i,k) = tri(2,i,k) / tri(4,i,k-1)
                tri(4,i,k) = tri(1,i,k) - tri(3,i,k-1) * tri(5,i,k)
             ENDDO
          ENDDO

       END SUBROUTINE split

    END SUBROUTINE tridia


#if defined( __parallel )
    SUBROUTINE fftxp( ar, direction )

!------------------------------------------------------------------------------!
! Fourier-transformation along x-direction                 Parallelized version
!------------------------------------------------------------------------------!

       IMPLICIT NONE

       CHARACTER (LEN=*) ::  direction
       INTEGER           ::  j, k
       REAL              ::  ar(0:nxa,nys_x:nyn_xa,nzb_x:nzt_xa)

!
!--    Performing the fft with one of the methods implemented
       DO  k = nzb_x, nzt_x
          DO  j = nys_x, nyn_x
             CALL fft_x( ar(0:nx,j,k), direction )
          ENDDO
       ENDDO

    END SUBROUTINE fftxp

#else
    SUBROUTINE fftx( ar, direction )

!------------------------------------------------------------------------------!
! Fourier-transformation along x-direction                 Non parallel version
!------------------------------------------------------------------------------!

       IMPLICIT NONE

       CHARACTER (LEN=*) ::  direction
       INTEGER           ::  i, j, k
       REAL              ::  ar(1:nz,0:ny,0:nx)

!
!--    Performing the fft with one of the methods implemented
       DO  k = 1, nz
          DO  j = 0, ny
             CALL fft_x( ar(k,j,0:nx), direction )
          ENDDO
       ENDDO

    END SUBROUTINE fftx
#endif


#if defined( __parallel )
    SUBROUTINE fftyp( ar, direction )

!------------------------------------------------------------------------------!
! Fourier-transformation along y-direction                 Parallelized version
!------------------------------------------------------------------------------!

       IMPLICIT NONE

       CHARACTER (LEN=*) ::  direction
       INTEGER           ::  i, k
       REAL              ::  ar(0:nya,nxl_y:nxr_ya,nzb_y:nzt_ya)

!
!--    Performing the fft with one of the methods implemented
       DO  k = nzb_y, nzt_y
          DO  i = nxl_y, nxr_y
             CALL fft_y( ar(0:ny,i,k), direction )
          ENDDO
       ENDDO

    END SUBROUTINE fftyp

#else
    SUBROUTINE ffty( ar, direction )

!------------------------------------------------------------------------------!
! Fourier-transformation along y-direction                 Non parallel version
!------------------------------------------------------------------------------!

       IMPLICIT NONE

       CHARACTER (LEN=*) ::  direction
       INTEGER           ::  i, k
       REAL              ::  ar(1:nz,0:ny,0:nx)

!
!--    Performing the fft with one of the methods implemented
       DO  k = 1, nz
          DO  i = 0, nx
             CALL fft_y( ar(k,0:ny,i), direction )
          ENDDO
       ENDDO

    END SUBROUTINE ffty
#endif

#if defined( __parallel )
    SUBROUTINE ffty_tr_yx( f_in, work, f_out )

!------------------------------------------------------------------------------!
!  Fourier-transformation along y with subsequent transposition y --> x for
!  a 1d-decomposition along x
!
!  ATTENTION: The performance of this routine is much faster on the NEC-SX6,
!             if the first index of work_ffty_vec is odd. Otherwise
!             memory bank conflicts may occur (especially if the index is a
!             multiple of 128). That's why work_ffty_vec is dimensioned as
!             0:ny+1.
!             Of course, this will not work if users are using an odd number
!             of gridpoints along y.
!------------------------------------------------------------------------------!

       USE control_parameters
       USE cpulog
       USE indices
       USE interfaces
       USE pegrid
       USE transpose_indices

       IMPLICIT NONE

       INTEGER            ::  i, iend, iouter, ir, j, k
       INTEGER, PARAMETER ::  stridex = 4

       REAL, DIMENSION(0:ny,stridex)                    ::  work_ffty
#if defined( __nec )
       REAL, DIMENSION(0:ny+1,1:nz,nxl:nxr)             ::  work_ffty_vec
#endif
       REAL, DIMENSION(1:nza,0:nya,nxl:nxra)            ::  f_in
       REAL, DIMENSION(nnx,1:nza,nys_x:nyn_xa,pdims(1)) ::  f_out
       REAL, DIMENSION(nxl:nxra,1:nza,0:nya)            ::  work

!
!--    Carry out the FFT along y, where all data are present due to the
!--    1d-decomposition along x. Resort the data in a way that x becomes
!--    the first index.
       CALL cpu_log( log_point_s(7), 'fft_y', 'start' )

       IF ( host(1:3) == 'nec' )  THEN
#if defined( __nec )
!
!--       Code optimized for vector processors
!$OMP     PARALLEL PRIVATE ( i, j, k )
!$OMP     DO
          DO  i = nxl, nxr

             DO  j = 0, ny
                DO  k = 1, nz
                   work_ffty_vec(j,k,i) = f_in(k,j,i)
                ENDDO
             ENDDO

             CALL fft_y_m( work_ffty_vec(:,:,i), ny+1, 'forward' )

          ENDDO

!$OMP     DO
          DO  k = 1, nz
             DO  j = 0, ny
                DO  i = nxl, nxr
                   work(i,k,j) = work_ffty_vec(j,k,i)
                ENDDO
             ENDDO
          ENDDO
!$OMP     END PARALLEL
#endif

       ELSE

!
!--       Cache optimized code.
!--       The i-(x-)direction is split into a strided outer loop and an inner
!--       loop for better cache performance
!$OMP     PARALLEL PRIVATE (i,iend,iouter,ir,j,k,work_ffty)
!$OMP     DO
          DO  iouter = nxl, nxr, stridex

             iend = MIN( iouter+stridex-1, nxr )  ! Upper bound for inner i loop

             DO  k = 1, nz

                DO  i = iouter, iend

                   ir = i-iouter+1  ! counter within a stride
                   DO  j = 0, ny
                      work_ffty(j,ir) = f_in(k,j,i)
                   ENDDO
!
!--                FFT along y
                   CALL fft_y( work_ffty(:,ir), 'forward' )

                ENDDO

!
!--             Resort
                DO  j = 0, ny
                   DO  i = iouter, iend
                      work(i,k,j) = work_ffty(j,i-iouter+1)
                   ENDDO
                ENDDO

             ENDDO

          ENDDO
!$OMP     END PARALLEL

       ENDIF
       CALL cpu_log( log_point_s(7), 'fft_y', 'pause' )

!
!--    Transpose array
       CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
       CALL MPI_ALLTOALL( work(nxl,1,0),      sendrecvcount_xy, MPI_REAL, &
                          f_out(1,1,nys_x,1), sendrecvcount_xy, MPI_REAL, &
                          comm1dx, ierr )
       CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )

    END SUBROUTINE ffty_tr_yx


    SUBROUTINE tr_xy_ffty( f_in, work, f_out )

!------------------------------------------------------------------------------!
!  Transposition x --> y with a subsequent backward Fourier transformation for
!  a 1d-decomposition along x
!------------------------------------------------------------------------------!

       USE control_parameters
       USE cpulog
       USE indices
       USE interfaces
       USE pegrid
       USE transpose_indices

       IMPLICIT NONE

       INTEGER            ::  i, iend, iouter, ir, j, k
       INTEGER, PARAMETER ::  stridex = 4

       REAL, DIMENSION(0:ny,stridex)                    ::  work_ffty
#if defined( __nec )
       REAL, DIMENSION(0:ny+1,1:nz,nxl:nxr)             ::  work_ffty_vec
#endif
       REAL, DIMENSION(nnx,1:nza,nys_x:nyn_xa,pdims(1)) ::  f_in
       REAL, DIMENSION(1:nza,0:nya,nxl:nxra)            ::  f_out
       REAL, DIMENSION(nxl:nxra,1:nza,0:nya)            ::  work

!
!--    Transpose array
       CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
       CALL MPI_ALLTOALL( f_in(1,1,nys_x,1), sendrecvcount_xy, MPI_REAL, &
                          work(nxl,1,0),     sendrecvcount_xy, MPI_REAL, &
                          comm1dx, ierr )
       CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )

!
!--    Resort the data in a way that y becomes the first index and carry out the
!--    backward fft along y.
       CALL cpu_log( log_point_s(7), 'fft_y', 'continue' )

       IF ( host(1:3) == 'nec' )  THEN
#if defined( __nec )
!
!--       Code optimized for vector processors
!$OMP     PARALLEL PRIVATE ( i, j, k )
!$OMP     DO
          DO  k = 1, nz
             DO  j = 0, ny
                DO  i = nxl, nxr
                   work_ffty_vec(j,k,i) = work(i,k,j)
                ENDDO
             ENDDO
          ENDDO

!$OMP     DO
          DO  i = nxl, nxr

             CALL fft_y_m( work_ffty_vec(:,:,i), ny+1, 'backward' )

             DO  j = 0, ny
                DO  k = 1, nz
                   f_out(k,j,i) = work_ffty_vec(j,k,i)
                ENDDO
             ENDDO

          ENDDO
!$OMP     END PARALLEL
#endif

       ELSE

!
!--       Cache optimized code.
!--       The i-(x-)direction is split into a strided outer loop and an inner
!--       loop for better cache performance
!$OMP     PARALLEL PRIVATE ( i, iend, iouter, ir, j, k, work_ffty )
!$OMP     DO
          DO  iouter = nxl, nxr, stridex

             iend = MIN( iouter+stridex-1, nxr )  ! Upper bound for inner i loop

             DO  k = 1, nz
!
!--             Resort
                DO  j = 0, ny
                   DO  i = iouter, iend
                      work_ffty(j,i-iouter+1) = work(i,k,j)
                   ENDDO
                ENDDO

                DO  i = iouter, iend

!
!--                FFT along y
                   ir = i-iouter+1  ! counter within a stride
                   CALL fft_y( work_ffty(:,ir), 'backward' )

                   DO  j = 0, ny
                      f_out(k,j,i) = work_ffty(j,ir)
                   ENDDO
                ENDDO

             ENDDO

          ENDDO
!$OMP     END PARALLEL

       ENDIF

       CALL cpu_log( log_point_s(7), 'fft_y', 'stop' )

    END SUBROUTINE tr_xy_ffty


    SUBROUTINE fftx_tri_fftx( ar )

!------------------------------------------------------------------------------!
!  FFT along x, solution of the tridiagonal system and backward FFT for
!  a 1d-decomposition along x
!
!  WARNING: this subroutine may still not work for hybrid parallelization
!           with OpenMP (for possible necessary changes see the original
!           routine poisfft_hybrid, developed by Klaus Ketelsen, May 2002)
!------------------------------------------------------------------------------!

       USE control_parameters
       USE cpulog
       USE grid_variables
       USE indices
       USE interfaces
       USE pegrid
       USE transpose_indices

       IMPLICIT NONE

       character(len=3) ::  myth_char

       INTEGER ::  i, j, k, m, n, omp_get_thread_num, tn

       REAL, DIMENSION(0:nx)                            ::  work_fftx
       REAL, DIMENSION(0:nx,1:nz)                       ::  work_trix
       REAL, DIMENSION(nnx,1:nza,nys_x:nyn_xa,pdims(1)) ::  ar
       REAL, DIMENSION(:,:,:,:), ALLOCATABLE            ::  tri


       CALL cpu_log( log_point_s(33), 'fft_x + tridia', 'start' )

       ALLOCATE( tri(5,0:nx,0:nz-1,0:threads_per_task-1) )

       tn = 0              ! Default thread number in case of one thread
!$OMP  PARALLEL DO PRIVATE ( i, j, k, m, n, tn, work_fftx, work_trix )
       DO  j = nys_x, nyn_x

!$        tn = omp_get_thread_num()

          IF ( host(1:3) == 'nec' )  THEN
!
!--          Code optimized for vector processors
             DO  k = 1, nz

                m = 0
                DO  n = 1, pdims(1)
                   DO  i = 1, nnx_pe( n-1 ) ! WARN: pcoord(i) should be used!!
                      work_trix(m,k) = ar(i,k,j,n)
                      m = m + 1
                   ENDDO
                ENDDO

             ENDDO

             CALL fft_x_m( work_trix, 'forward' )

          ELSE
!
!--          Cache optimized code
             DO  k = 1, nz

                m = 0
                DO  n = 1, pdims(1)
                   DO  i = 1, nnx_pe( n-1 ) ! WARN: pcoord(i) should be used!!
                      work_fftx(m) = ar(i,k,j,n)
                      m = m + 1
                   ENDDO
                ENDDO

                CALL fft_x( work_fftx, 'forward' )

                DO  i = 0, nx
                   work_trix(i,k) = work_fftx(i)
                ENDDO

             ENDDO

          ENDIF

!
!--       Solve the linear equation system
          CALL tridia_1dd( ddx2, ddy2, nx, ny, j, work_trix, tri(:,:,:,tn) )

          IF ( host(1:3) == 'nec' )  THEN
!
!--          Code optimized for vector processors
             CALL fft_x_m( work_trix, 'backward' )

             DO  k = 1, nz

                m = 0
                DO  n = 1, pdims(1)
                   DO  i = 1, nnx_pe( n-1 ) ! WARN: pcoord(i) should be used!!
                      ar(i,k,j,n) = work_trix(m,k)
                      m = m + 1
                   ENDDO
                ENDDO

             ENDDO

          ELSE
!
!--          Cache optimized code
             DO  k = 1, nz

                DO  i = 0, nx
                   work_fftx(i) = work_trix(i,k)
                ENDDO

                CALL fft_x( work_fftx, 'backward' )

                m = 0
                DO  n = 1, pdims(1)
                   DO  i = 1, nnx_pe( n-1 ) ! WARN: pcoord(i) should be used!!
                      ar(i,k,j,n) = work_fftx(m)
                      m = m + 1
                   ENDDO
                ENDDO

             ENDDO

          ENDIF

       ENDDO

       DEALLOCATE( tri )

       CALL cpu_log( log_point_s(33), 'fft_x + tridia', 'stop' )

    END SUBROUTINE fftx_tri_fftx


    SUBROUTINE fftx_tr_xy( f_in, work, f_out )

!------------------------------------------------------------------------------!
!  Fourier-transformation along x with subsequent transposition x --> y for
!  a 1d-decomposition along y
!
!  ATTENTION: The NEC-branch of this routine may significantly profit from
!             further optimizations. So far, performance is much worse than
!             for routine ffty_tr_yx (more than three times slower).
!------------------------------------------------------------------------------!

       USE control_parameters
       USE cpulog
       USE indices
       USE interfaces
       USE pegrid
       USE transpose_indices

       IMPLICIT NONE

       INTEGER            ::  i, j, k

       REAL, DIMENSION(0:nx,1:nz,nys:nyn)               ::  work_fftx
       REAL, DIMENSION(1:nza,nys:nyna,0:nxa)            ::  f_in
       REAL, DIMENSION(nny,1:nza,nxl_y:nxr_ya,pdims(2)) ::  f_out
       REAL, DIMENSION(nys:nyna,1:nza,0:nxa)            ::  work

!
!--    Carry out the FFT along x, where all data are present due to the
!--    1d-decomposition along y. Resort the data in a way that y becomes
!--    the first index.
       CALL cpu_log( log_point_s(4), 'fft_x', 'start' )

       IF ( host(1:3) == 'nec' )  THEN
!
!--       Code for vector processors
!$OMP     PARALLEL PRIVATE ( i, j, k )
!$OMP     DO
          DO  i = 0, nx

             DO  j = nys, nyn
                DO  k = 1, nz
                   work_fftx(i,k,j) = f_in(k,j,i)
                ENDDO
             ENDDO

          ENDDO

!$OMP     DO
          DO  j = nys, nyn

             CALL fft_x_m( work_fftx(:,:,j), 'forward' )

             DO  k = 1, nz
                DO  i = 0, nx
                   work(j,k,i) = work_fftx(i,k,j)
                ENDDO
             ENDDO

          ENDDO
!$OMP     END PARALLEL

       ELSE

!
!--       Cache optimized code (there might be still a potential for better
!--       optimization).
!$OMP     PARALLEL PRIVATE (i,j,k,work_fftx)
!$OMP     DO
          DO  i = 0, nx

             DO  j = nys, nyn
                DO  k = 1, nz
                   work_fftx(i,k,j) = f_in(k,j,i)
                ENDDO
             ENDDO

          ENDDO

!$OMP     DO
          DO  j = nys, nyn
             DO  k = 1, nz

                CALL fft_x( work_fftx(0:nx,k,j), 'forward' )

                DO  i = 0, nx
                   work(j,k,i) = work_fftx(i,k,j)
                ENDDO
             ENDDO

          ENDDO
!$OMP     END PARALLEL

       ENDIF
       CALL cpu_log( log_point_s(4), 'fft_x', 'pause' )

!
!--    Transpose array
       CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
       CALL MPI_ALLTOALL( work(nys,1,0),      sendrecvcount_xy, MPI_REAL, &
                          f_out(1,1,nxl_y,1), sendrecvcount_xy, MPI_REAL, &
                          comm1dy, ierr )
       CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )

    END SUBROUTINE fftx_tr_xy


    SUBROUTINE tr_yx_fftx( f_in, work, f_out )

!------------------------------------------------------------------------------!
!  Transposition y --> x with a subsequent backward Fourier transformation for
!  a 1d-decomposition along x
!------------------------------------------------------------------------------!

       USE control_parameters
       USE cpulog
       USE indices
       USE interfaces
       USE pegrid
       USE transpose_indices

       IMPLICIT NONE

       INTEGER            ::  i, j, k

       REAL, DIMENSION(0:nx,1:nz,nys:nyn)               ::  work_fftx
       REAL, DIMENSION(nny,1:nza,nxl_y:nxr_ya,pdims(2)) ::  f_in
       REAL, DIMENSION(1:nza,nys:nyna,0:nxa)            ::  f_out
       REAL, DIMENSION(nys:nyna,1:nza,0:nxa)            ::  work

!
!--    Transpose array
       CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
       CALL MPI_ALLTOALL( f_in(1,1,nxl_y,1), sendrecvcount_xy, MPI_REAL, &
                          work(nys,1,0),     sendrecvcount_xy, MPI_REAL, &
                          comm1dy, ierr )
       CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )

!
!--    Carry out the FFT along x, where all data are present due to the
!--    1d-decomposition along y. Resort the data in a way that y becomes
!--    the first index.
       CALL cpu_log( log_point_s(4), 'fft_x', 'continue' )

       IF ( host(1:3) == 'nec' )  THEN
!
!--       Code optimized for vector processors
!$OMP     PARALLEL PRIVATE ( i, j, k )
!$OMP     DO
          DO  j = nys, nyn

             DO  k = 1, nz
                DO  i = 0, nx
                   work_fftx(i,k,j) = work(j,k,i)
                ENDDO
             ENDDO

             CALL fft_x_m( work_fftx(:,:,j), 'backward' )

          ENDDO

!$OMP     DO
          DO  i = 0, nx
             DO  j = nys, nyn
                DO  k = 1, nz
                   f_out(k,j,i) = work_fftx(i,k,j)
                ENDDO
             ENDDO
          ENDDO
!$OMP     END PARALLEL

       ELSE

!
!--       Cache optimized code (there might be still a potential for better
!--       optimization).
!$OMP     PARALLEL PRIVATE (i,j,k,work_fftx)
!$OMP     DO
          DO  j = nys, nyn
             DO  k = 1, nz

                DO  i = 0, nx
                   work_fftx(i,k,j) = work(j,k,i)
                ENDDO

                CALL fft_x( work_fftx(0:nx,k,j), 'backward' )

             ENDDO
          ENDDO

!$OMP     DO
          DO  i = 0, nx
             DO  j = nys, nyn
                DO  k = 1, nz
                   f_out(k,j,i) = work_fftx(i,k,j)
                ENDDO
             ENDDO
          ENDDO
!$OMP     END PARALLEL

       ENDIF
       CALL cpu_log( log_point_s(4), 'fft_x', 'stop' )

    END SUBROUTINE tr_yx_fftx


    SUBROUTINE ffty_tri_ffty( ar )

!------------------------------------------------------------------------------!
!  FFT along y, solution of the tridiagonal system and backward FFT for
!  a 1d-decomposition along y
!
!  WARNING: this subroutine may still not work for hybrid parallelization
!           with OpenMP (for possible necessary changes see the original
!           routine poisfft_hybrid, developed by Klaus Ketelsen, May 2002)
!------------------------------------------------------------------------------!

       USE control_parameters
       USE cpulog
       USE grid_variables
       USE indices
       USE interfaces
       USE pegrid
       USE transpose_indices

       IMPLICIT NONE

       INTEGER ::  i, j, k, m, n, omp_get_thread_num, tn

       REAL, DIMENSION(0:ny)                            ::  work_ffty
       REAL, DIMENSION(0:ny,1:nz)                       ::  work_triy
       REAL, DIMENSION(nny,1:nza,nxl_y:nxr_ya,pdims(2)) ::  ar
       REAL, DIMENSION(:,:,:,:), ALLOCATABLE            ::  tri


       CALL cpu_log( log_point_s(39), 'fft_y + tridia', 'start' )

       ALLOCATE( tri(5,0:ny,0:nz-1,0:threads_per_task-1) )

       tn = 0           ! Default thread number in case of one thread
!$OMP  PARALLEL PRIVATE ( i, j, k, m, n, tn, work_ffty, work_triy )
!$OMP  DO
       DO  i = nxl_y, nxr_y

!$        tn = omp_get_thread_num()

          IF ( host(1:3) == 'nec' )  THEN
!
!--          Code optimized for vector processors
             DO  k = 1, nz

                m = 0
                DO  n = 1, pdims(2)
                   DO  j = 1, nny_pe( n-1 ) ! WARN: pcoord(j) should be used!!
                      work_triy(m,k) = ar(j,k,i,n)
                      m = m + 1
                   ENDDO
                ENDDO

             ENDDO

             CALL fft_y_m( work_triy, ny, 'forward' )

          ELSE
!
!--          Cache optimized code
             DO  k = 1, nz

                m = 0
                DO  n = 1, pdims(2)
                   DO  j = 1, nny_pe( n-1 ) ! WARN: pcoord(j) should be used!!
                      work_ffty(m) = ar(j,k,i,n)
                      m = m + 1
                   ENDDO
                ENDDO

                CALL fft_y( work_ffty, 'forward' )

                DO  j = 0, ny
                   work_triy(j,k) = work_ffty(j)
                ENDDO

             ENDDO

          ENDIF

!
!--       Solve the linear equation system
          CALL tridia_1dd( ddy2, ddx2, ny, nx, i, work_triy, tri(:,:,:,tn) )

          IF ( host(1:3) == 'nec' )  THEN
!
!--          Code optimized for vector processors
             CALL fft_y_m( work_triy, ny, 'backward' )

             DO  k = 1, nz

                m = 0
                DO  n = 1, pdims(2)
                   DO  j = 1, nny_pe( n-1 ) ! WARN: pcoord(j) should be used!!
                      ar(j,k,i,n) = work_triy(m,k)
                      m = m + 1
                   ENDDO
                ENDDO

             ENDDO

          ELSE
!
!--          Cache optimized code
             DO  k = 1, nz

                DO  j = 0, ny
                   work_ffty(j) = work_triy(j,k)
                ENDDO

                CALL fft_y( work_ffty, 'backward' )

                m = 0
                DO  n = 1, pdims(2)
                   DO  j = 1, nny_pe( n-1 ) ! WARN: pcoord(j) should be used!!
                      ar(j,k,i,n) = work_ffty(m)
                      m = m + 1
                   ENDDO
                ENDDO

             ENDDO

          ENDIF

       ENDDO
!$OMP  END PARALLEL

       DEALLOCATE( tri )

       CALL cpu_log( log_point_s(39), 'fft_y + tridia', 'stop' )

    END SUBROUTINE ffty_tri_ffty


    SUBROUTINE tridia_1dd( ddx2, ddy2, nx, ny, j, ar, tri )

!------------------------------------------------------------------------------!
! Solves the linear system of equations for a 1d-decomposition along x (see
! tridia)
!
! Attention:  when using the intel compiler, array tri must be passed as an
!             argument to the contained subroutines. Otherwise addres faults
!             will occur.
!             On NEC, tri should not be passed (except for routine substi_1dd)
!             because this causes very bad performance.
!------------------------------------------------------------------------------!

       USE arrays_3d
       USE control_parameters

       USE pegrid

       IMPLICIT NONE

       INTEGER ::  i, j, k, nnyh, nx, ny, omp_get_thread_num, tn

       REAL    ::  ddx2, ddy2

       REAL, DIMENSION(0:nx,1:nz)     ::  ar
       REAL, DIMENSION(0:nx,0:nz-1)   ::  ar1
       REAL, DIMENSION(5,0:nx,0:nz-1) ::  tri


       nnyh = ( ny + 1 ) / 2

!
!--    Define constant elements of the tridiagonal matrix.
!--    The compiler on SX6 does loop exchange. If 0:nx is a high power of 2, 
!--    the exchanged loops create bank conflicts. The following directive
!--    prohibits loop exchange and the loops perform much better.
!       tn = omp_get_thread_num()
!       WRITE( 120+tn, * ) '+++ id=',myid,' nx=',nx,' thread=', omp_get_thread_num()
!       CALL local_flush( 120+tn )
!CDIR NOLOOPCHG
       DO  k = 0, nz-1
          DO  i = 0,nx
             tri(2,i,k) = ddzu(k+1) * ddzw(k+1)
             tri(3,i,k) = ddzu(k+2) * ddzw(k+1)
          ENDDO
       ENDDO
!       WRITE( 120+tn, * ) '+++ id=',myid,' end of first tridia loop   thread=', omp_get_thread_num()
!       CALL local_flush( 120+tn )

       IF ( j <= nnyh )  THEN
#if defined( __lcmuk )
          CALL maketri_1dd( j, tri )
#else
          CALL maketri_1dd( j )
#endif
       ELSE
#if defined( __lcmuk )
          CALL maketri_1dd( ny+1-j, tri )
#else
          CALL maketri_1dd( ny+1-j )
#endif
       ENDIF
#if defined( __lcmuk )
       CALL split_1dd( tri )
#else
       CALL split_1dd
#endif
       CALL substi_1dd( ar, tri )

    CONTAINS

#if defined( __lcmuk )
       SUBROUTINE maketri_1dd( j, tri )
#else
       SUBROUTINE maketri_1dd( j )
#endif

!------------------------------------------------------------------------------!
! computes the i- and j-dependent component of the matrix
!------------------------------------------------------------------------------!

          USE constants

          IMPLICIT NONE

          INTEGER ::  i, j, k, nnxh
          REAL    ::  a, c

          REAL, DIMENSION(0:nx) ::  l

#if defined( __lcmuk )
          REAL, DIMENSION(5,0:nx,0:nz-1) ::  tri
#endif


          nnxh = ( nx + 1 ) / 2
!
!--       Provide the tridiagonal matrix for solution of the Poisson equation in
!--       Fourier space. The coefficients are computed following the method of
!--       Schmidt et al. (DFVLR-Mitteilung 84-15), which departs from Stephan
!--       Siano's original version by discretizing the Poisson equation,
!--       before it is Fourier-transformed
          DO  i = 0, nx
             IF ( i >= 0 .AND. i <= nnxh ) THEN
                l(i) = 2.0 * ( 1.0 - COS( ( 2.0 * pi * i ) / &
                                         FLOAT( nx+1 ) ) ) * ddx2 + &
                       2.0 * ( 1.0 - COS( ( 2.0 * pi * j ) / &
                                         FLOAT( ny+1 ) ) ) * ddy2
             ELSE
                l(i) = 2.0 * ( 1.0 - COS( ( 2.0 * pi * ( nx+1-i ) ) / &
                                         FLOAT( nx+1 ) ) ) * ddx2 + &
                       2.0 * ( 1.0 - COS( ( 2.0 * pi * j ) / &
                                         FLOAT( ny+1 ) ) ) * ddy2
             ENDIF
          ENDDO

          DO  k = 0, nz-1
             DO  i = 0, nx
                a = -1.0 * ddzu(k+2) * ddzw(k+1)
                c = -1.0 * ddzu(k+1) * ddzw(k+1)
                tri(1,i,k) = a + c - l(i)
             ENDDO
          ENDDO
          IF ( ibc_p_b == 1  .OR.  ibc_p_b == 2 )  THEN
             DO  i = 0, nx
                tri(1,i,0) = tri(1,i,0) + tri(2,i,0)
             ENDDO
          ENDIF
          IF ( ibc_p_t == 1 )  THEN
             DO  i = 0, nx
                tri(1,i,nz-1) = tri(1,i,nz-1) + tri(3,i,nz-1)
             ENDDO
          ENDIF

       END SUBROUTINE maketri_1dd


#if defined( __lcmuk )
       SUBROUTINE split_1dd( tri )
#else
       SUBROUTINE split_1dd
#endif

!------------------------------------------------------------------------------!
! Splitting of the tridiagonal matrix (Thomas algorithm)
!------------------------------------------------------------------------------!

          IMPLICIT NONE

          INTEGER ::  i, k

#if defined( __lcmuk )
          REAL, DIMENSION(5,0:nx,0:nz-1) ::  tri
#endif


!
!--       Splitting
          DO  i = 0, nx
             tri(4,i,0) = tri(1,i,0)
          ENDDO
          DO  k = 1, nz-1
             DO  i = 0, nx
                tri(5,i,k) = tri(2,i,k) / tri(4,i,k-1)
                tri(4,i,k) = tri(1,i,k) - tri(3,i,k-1) * tri(5,i,k)
             ENDDO
          ENDDO

       END SUBROUTINE split_1dd


       SUBROUTINE substi_1dd( ar, tri )

!------------------------------------------------------------------------------!
! Substitution (Forward and Backward) (Thomas algorithm)
!------------------------------------------------------------------------------!

          IMPLICIT NONE

          INTEGER ::  i, k

          REAL, DIMENSION(0:nx,nz)       ::  ar
          REAL, DIMENSION(0:nx,0:nz-1)   ::  ar1
          REAL, DIMENSION(5,0:nx,0:nz-1) ::  tri

!
!--       Forward substitution
          DO  i = 0, nx
             ar1(i,0) = ar(i,1)
          ENDDO
          DO  k = 1, nz-1
             DO  i = 0, nx
                ar1(i,k) = ar(i,k+1) - tri(5,i,k) * ar1(i,k-1)
             ENDDO
          ENDDO

!
!--       Backward substitution
          DO  i = 0, nx
             ar(i,nz) = ar1(i,nz-1) / tri(4,i,nz-1)
          ENDDO
          DO  k = nz-2, 0, -1
             DO  i = 0, nx
                ar(i,k+1) = ( ar1(i,k) - tri(3,i,k) * ar(i,k+2) ) &
                            / tri(4,i,k)
             ENDDO
          ENDDO

!
!--       Indices i=0, j=0 correspond to horizontally averaged pressure.
!--       The respective values of ar should be zero at all k-levels if
!--       acceleration of horizontally averaged vertical velocity is zero.
          IF ( ibc_p_b == 1  .AND.  ibc_p_t == 1 )  THEN
             IF ( j == 0 )  THEN
                DO  k = 1, nz
                   ar(0,k) = 0.0
                ENDDO
             ENDIF
          ENDIF

       END SUBROUTINE substi_1dd

    END SUBROUTINE tridia_1dd

#endif

 END MODULE poisfft_mod