Home

Context Navigation

← Previous Change
Next Change →

poisfft_mod.f90

Timestamp:

Aug 25, 2020 12:11:17 PM (4 years ago)

Author:

raasch

Message:

files re-formatted to follow the PALM coding standard

File:

: 1 edited

palm/trunk/SOURCE/poisfft_mod.f90 (modified) (5 diffs)

Legend:

: Unmodified
: Added
: Removed

palm/trunk/SOURCE/poisfft_mod.f90

-                      r4429
+                      r4649
 !> @file poisfft_mod.f90
 !------------------------------------------------------------------------------!
+!--------------------------------------------------------------------------------------------------!
 ! This file is part of the PALM model system.
+!
+! PALM is free software: you can redistribute it and/or modify it under the
+! terms of the GNU General Public License as published by the Free Software
+! Foundation, either version 3 of the License, or (at your option) any later
+! version.
+!
+! PALM is distributed in the hope that it will be useful, but WITHOUT ANY
+! WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
+! A PARTICULAR PURPOSE.  See the GNU General Public License for more details.
+!
+! You should have received a copy of the GNU General Public License along with
+! PALM. If not, see <http://www.gnu.org/licenses/>.
+! PALM is free software: you can redistribute it and/or modify it under the terms of the GNU General
+! Public License as published by the Free Software Foundation, either version 3 of the License, or
+! (at your option) any later version.
+!
+! PALM is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the
+! implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General
+! Public License for more details.
+!
+! You should have received a copy of the GNU General Public License along with PALM. If not, see
+! <http://www.gnu.org/licenses/>.
+!
 ! Copyright 1997-2020 Leibniz Universitaet Hannover
+!------------------------------------------------------------------------------!
+!--------------------------------------------------------------------------------------------------!
+!
+!
 ! Current revisions:
 ! -----------------
+!
+!
+!
+!
 ! Former revisions:
 ! -----------------
 ! $Id$
+! File re-formatted to follow the PALM coding standard
+!
+!
+! 4429 2020-02-27 15:24:30Z raasch
 ! Statements added to avoid compile errors due to unused dummy arguments in serial mode
+!
+!
 ! 4366 2020-01-09 08:12:43Z raasch
 ! modification concerning NEC vectorizatio
+!
+! Modification concerning NEC vectorizatio
+!
 ! 4360 2020-01-07 11:25:50Z suehring
 ! Corrected "Former revisions" section
+!
+!
 ! 3690 2019-01-22 22:56:42Z knoop
 ! OpenACC port for SPEC
 …
+!
+!
+!--------------------------------------------------------------------------------------------------!
 ! Description:
 ! ------------
 …
 !>
 !> Input:
+!> real    ar   contains (nnz,nny,nnx) elements of the velocity divergence,
+!>              starting from (1,nys,nxl)
+!> real   ar   contains (nnz,nny,nnx) elements of the velocity divergence, starting from (1,nys,nxl)
 !>
 !> Output:
 !> real    ar   contains the solution for perturbation pressure p
 !------------------------------------------------------------------------------!
+!> real   ar   contains the solution for perturbation pressure p
+!--------------------------------------------------------------------------------------------------!
  MODULE poisfft_mod
+    USE fft_xy,                                                                &
+        ONLY:  fft_init, fft_y, fft_y_1d, fft_y_m, fft_x, fft_x_1d, fft_x_m,   &
+    USE fft_xy,                                                                                    &
+        ONLY:  fft_init,                                                                           &
+               fft_y,                                                                              &
+               fft_y_1d,                                                                           &
+               fft_y_m,                                                                            &
+               fft_x,                                                                              &
+               fft_x_1d,                                                                           &
+               fft_x_m,                                                                            &
                temperton_fft_vec
+    USE indices,                                                               &
+        ONLY:  nnx, nny, nx, nxl, nxr, ny, nys, nyn, nz
+    USE transpose_indices,                                                     &
+        ONLY:  nxl_y, nxl_z, nxr_y, nxr_z, nys_x, nys_z, nyn_x, nyn_z, nzb_x,  &
+               nzb_y, nzt_x, nzt_y
+    USE tridia_solver,                                                         &
+        ONLY:  tridia_1dd, tridia_init, tridia_substi, tridia_substi_overlap
+    USE indices,                                                                                   &
+        ONLY:  nnx,                                                                                &
+               nny,                                                                                &
+               nx,                                                                                 &
+               nxl,                                                                                &
+               nxr,                                                                                &
+               ny,                                                                                 &
+               nys,                                                                                &
+               nyn,                                                                                &
+               nz
+    USE transpose_indices,                                                                         &
+        ONLY:  nxl_y,                                                                              &
+               nxl_z,                                                                              &
+               nxr_y,                                                                              &
+               nxr_z,                                                                              &
+               nys_x,                                                                              &
+               nys_z,                                                                              &
+               nyn_x,                                                                              &
+               nyn_z,                                                                              &
+               nzb_x,                                                                              &
+               nzb_y,                                                                              &
+               nzt_x,                                                                              &
+               nzt_y
+    USE tridia_solver,                                                                             &
+        ONLY:  tridia_1dd,                                                                         &
+               tridia_init,                                                                        &
+               tridia_substi,                                                                      &
+               tridia_substi_overlap
     IMPLICIT NONE
     LOGICAL, SAVE ::  poisfft_initialized = .FALSE.
+    LOGICAL, SAVE ::  poisfft_initialized = .FALSE.  !<
     PRIVATE
 …
  CONTAINS
 !------------------------------------------------------------------------------!
+!--------------------------------------------------------------------------------------------------!
 ! Description:
 ! ------------
 !> Setup coefficients for FFT and the tridiagonal solver
 !------------------------------------------------------------------------------!
     SUBROUTINE poisfft_init
        IMPLICIT NONE
        CALL fft_init
        CALL tridia_init
        poisfft_initialized = .TRUE.
     END SUBROUTINE poisfft_init
 !------------------------------------------------------------------------------!
+!--------------------------------------------------------------------------------------------------!
+ SUBROUTINE poisfft_init
+    IMPLICIT NONE
+    CALL fft_init
+    CALL tridia_init
+    poisfft_initialized = .TRUE.
+ END SUBROUTINE poisfft_init
+!--------------------------------------------------------------------------------------------------!
 ! Description:
 ! ------------
 !> Two-dimensional Fourier Transformation in x- and y-direction.
+!------------------------------------------------------------------------------!
+    SUBROUTINE poisfft( ar )
+       USE control_parameters,                                                 &
+           ONLY:  transpose_compute_overlap
+       USE cpulog,                                                             &
+           ONLY:  cpu_log, cpu_log_nowait, log_point_s
+       USE kinds
+       USE pegrid
+       IMPLICIT NONE
+       INTEGER(iwp) ::  ii           !<
+       INTEGER(iwp) ::  iind         !<
+       INTEGER(iwp) ::  inew         !<
+       INTEGER(iwp) ::  jj           !<
+       INTEGER(iwp) ::  jind         !<
+       INTEGER(iwp) ::  jnew         !<
+       INTEGER(iwp) ::  ki           !<
+       INTEGER(iwp) ::  kk           !<
+       INTEGER(iwp) ::  knew         !<
+       INTEGER(iwp) ::  n            !<
+       INTEGER(iwp) ::  nblk         !<
+       INTEGER(iwp) ::  nnx_y        !<
+       INTEGER(iwp) ::  nny_z        !<
+       INTEGER(iwp) ::  nnz_x        !<
+       INTEGER(iwp) ::  nxl_y_bound  !<
+       INTEGER(iwp) ::  nxr_y_bound  !<
+       INTEGER(iwp), DIMENSION(4) ::  isave  !<
+       REAL(wp), DIMENSION(1:nz,nys:nyn,nxl:nxr) ::  ar      !<
+       REAL(wp), DIMENSION(nys:nyn,nxl:nxr,1:nz) ::  ar_inv  !<
+!--------------------------------------------------------------------------------------------------!
+ SUBROUTINE poisfft( ar )
+    USE control_parameters,                                                                        &
+        ONLY:  transpose_compute_overlap
+    USE cpulog,                                                                                    &
+        ONLY:  cpu_log,                                                                            &
+               cpu_log_nowait,                                                                     &
+               log_point_s
+    USE kinds
+    USE pegrid
+    IMPLICIT NONE
+    INTEGER(iwp) ::  ii           !<
+    INTEGER(iwp) ::  iind         !<
+    INTEGER(iwp) ::  inew         !<
+    INTEGER(iwp) ::  jj           !<
+    INTEGER(iwp) ::  jind         !<
+    INTEGER(iwp) ::  jnew         !<
+    INTEGER(iwp) ::  ki           !<
+    INTEGER(iwp) ::  kk           !<
+    INTEGER(iwp) ::  knew         !<
+    INTEGER(iwp) ::  n            !<
+    INTEGER(iwp) ::  nblk         !<
+    INTEGER(iwp) ::  nnx_y        !<
+    INTEGER(iwp) ::  nny_z        !<
+    INTEGER(iwp) ::  nnz_x        !<
+    INTEGER(iwp) ::  nxl_y_bound  !<
+    INTEGER(iwp) ::  nxr_y_bound  !<
+    INTEGER(iwp), DIMENSION(4) ::  isave  !<
+    REAL(wp), DIMENSION(1:nz,nys:nyn,nxl:nxr) ::  ar      !<
+    REAL(wp), DIMENSION(nys:nyn,nxl:nxr,1:nz) ::  ar_inv  !<
 #define __acc_fft_device ( defined( _OPENACC ) && ( defined ( __cuda_fft ) ) )
 #if __acc_fft_device
        !$ACC DECLARE CREATE(ar_inv)
+    !$ACC DECLARE CREATE(ar_inv)
 #endif
        REAL(wp), DIMENSION(:,:,:),   ALLOCATABLE ::  ar1      !<
        REAL(wp), DIMENSION(:,:,:),   ALLOCATABLE ::  f_in     !<
        REAL(wp), DIMENSION(:,:,:),   ALLOCATABLE ::  f_inv    !<
        REAL(wp), DIMENSION(:,:,:),   ALLOCATABLE ::  f_out_y  !<
        REAL(wp), DIMENSION(:,:,:),   ALLOCATABLE ::  f_out_z  !<
        CALL cpu_log( log_point_s(3), 'poisfft', 'start' )
        IF ( .NOT. poisfft_initialized )  CALL poisfft_init
+    REAL(wp), DIMENSION(:,:,:),   ALLOCATABLE ::  ar1      !<
+    REAL(wp), DIMENSION(:,:,:),   ALLOCATABLE ::  f_in     !<
+    REAL(wp), DIMENSION(:,:,:),   ALLOCATABLE ::  f_inv    !<
+    REAL(wp), DIMENSION(:,:,:),   ALLOCATABLE ::  f_out_y  !<
+    REAL(wp), DIMENSION(:,:,:),   ALLOCATABLE ::  f_out_z  !<
+    CALL cpu_log( log_point_s(3), 'poisfft', 'start' )
+    IF ( .NOT. poisfft_initialized )  CALL poisfft_init
 #if !__acc_fft_device
        !$ACC UPDATE HOST(ar)
+    !$ACC UPDATE HOST(ar)
 #endif
 #ifndef _OPENACC
+!
 !--    Two-dimensional Fourier Transformation in x- and y-direction.
        IF ( pdims(2) == 1  .AND.  pdims(1) > 1 )  THEN
+!
 !--       1d-domain-decomposition along x:
 !--       FFT along y and transposition y --> x
           CALL ffty_tr_yx( ar, ar )
+!
 !--       FFT along x, solving the tridiagonal system and backward FFT
           CALL fftx_tri_fftx( ar )
+!
 !--       Transposition x --> y and backward FFT along y
           CALL tr_xy_ffty( ar, ar )
        ELSEIF ( pdims(1) == 1 .AND. pdims(2) > 1 )  THEN
+!
 !--       1d-domain-decomposition along y:
 !--       FFT along x and transposition x --> y
           CALL fftx_tr_xy( ar, ar )
+!
 !--       FFT along y, solving the tridiagonal system and backward FFT
           CALL ffty_tri_ffty( ar )
+!
 !--       Transposition y --> x and backward FFT along x
           CALL tr_yx_fftx( ar, ar )
        ELSEIF ( .NOT. transpose_compute_overlap )  THEN
+!-- Two-dimensional Fourier Transformation in x- and y-direction.
+    IF ( pdims(2) == 1  .AND.  pdims(1) > 1 )  THEN
+!
+!--    1d-domain-decomposition along x:
+!--    FFT along y and transposition y --> x
+       CALL ffty_tr_yx( ar, ar )
+!
+!--    FFT along x, solving the tridiagonal system and backward FFT
+       CALL fftx_tri_fftx( ar )
+!
+!--    Transposition x --> y and backward FFT along y
+       CALL tr_xy_ffty( ar, ar )
+    ELSEIF ( pdims(1) == 1 .AND. pdims(2) > 1 )  THEN
+!
+!--    1d-domain-decomposition along y:
+!--    FFT along x and transposition x --> y
+       CALL fftx_tr_xy( ar, ar )
+!
+!--    FFT along y, solving the tridiagonal system and backward FFT
+       CALL ffty_tri_ffty( ar )
+!
+!--    Transposition y --> x and backward FFT along x
+       CALL tr_yx_fftx( ar, ar )
+    ELSEIF ( .NOT. transpose_compute_overlap )  THEN
 #endif
+!
+!--       2d-domain-decomposition or no decomposition (1 PE run)
+!--       Transposition z --> x
+          CALL cpu_log( log_point_s(5), 'transpo forward', 'start' )
+          CALL resort_for_zx( ar, ar_inv )
+          CALL transpose_zx( ar_inv, ar )
+!--    2d-domain-decomposition or no decomposition (1 PE run)
+!--    Transposition z --> x
+       CALL cpu_log( log_point_s(5), 'transpo forward', 'start' )
+       CALL resort_for_zx( ar, ar_inv )
+       CALL transpose_zx( ar_inv, ar )
+       CALL cpu_log( log_point_s(5), 'transpo forward', 'pause' )
+       CALL cpu_log( log_point_s(4), 'fft_x', 'start' )
+       IF ( temperton_fft_vec )  THEN
+!
+!--       Vector version outputs a transformed array ar_inv that does not require resorting
+!--       (which is done for ar further below)
+          CALL fft_x( ar, 'forward',  ar_inv=ar_inv)
+       ELSE
+          CALL fft_x( ar, 'forward')
+       ENDIF
+       CALL cpu_log( log_point_s(4), 'fft_x', 'pause' )
+!
+!--    Transposition x --> y
+       CALL cpu_log( log_point_s(5), 'transpo forward', 'continue' )
+       IF( .NOT. temperton_fft_vec )  CALL resort_for_xy( ar, ar_inv )
+       CALL transpose_xy( ar_inv, ar )
+       CALL cpu_log( log_point_s(5), 'transpo forward', 'pause' )
+       CALL cpu_log( log_point_s(7), 'fft_y', 'start' )
+       IF ( temperton_fft_vec )  THEN
+!
+!--       Input array ar_inv from fft_x can be directly used here.
+!--       The output (also in array ar_inv) does not require resorting below.
+          CALL fft_y( ar, 'forward', ar_inv = ar_inv, nxl_y_bound = nxl_y, nxr_y_bound = nxr_y,    &
+                      nxl_y_l = nxl_y, nxr_y_l = nxr_y )
+       ELSE
+          CALL fft_y( ar, 'forward', ar_tr = ar, nxl_y_bound = nxl_y, nxr_y_bound = nxr_y,         &
+                      nxl_y_l = nxl_y, nxr_y_l = nxr_y )
+       ENDIF
+       CALL cpu_log( log_point_s(7), 'fft_y', 'pause' )
+!
+!--    Transposition y --> z
+       CALL cpu_log( log_point_s(5), 'transpo forward', 'continue' )
+       IF ( .NOT. temperton_fft_vec )  CALL resort_for_yz( ar, ar_inv )
+       CALL transpose_yz( ar_inv, ar )
+       CALL cpu_log( log_point_s(5), 'transpo forward', 'stop' )
+!
+!--    Solve the tridiagonal equation system along z
+       CALL cpu_log( log_point_s(6), 'tridia', 'start' )
+       CALL tridia_substi( ar )
+       CALL cpu_log( log_point_s(6), 'tridia', 'stop' )
+!
+!--    Inverse Fourier Transformation
+!--    Transposition z --> y
+       CALL cpu_log( log_point_s(8), 'transpo invers', 'start' )
+       CALL transpose_zy( ar, ar_inv )
+!
+!--    The fft_y below (vector branch) can directly process ar_inv (i.e. does not require a
+!--    resorting)
+       IF ( .NOT. temperton_fft_vec )  CALL resort_for_zy( ar_inv, ar )
+       CALL cpu_log( log_point_s(8), 'transpo invers', 'pause' )
+       CALL cpu_log( log_point_s(7), 'fft_y', 'continue' )
+       IF ( temperton_fft_vec )  THEN
+!
+!--       Output array ar_inv can be used as input to the below fft_x routine without resorting
+          CALL fft_y( ar, 'backward', ar_inv = ar_inv, nxl_y_bound = nxl_y, nxr_y_bound = nxr_y,   &
+                      nxl_y_l = nxl_y, nxr_y_l = nxr_y )
+       ELSE
+          CALL fft_y( ar, 'backward', ar_tr = ar, nxl_y_bound = nxl_y, nxr_y_bound = nxr_y,        &
+                      nxl_y_l = nxl_y, nxr_y_l = nxr_y )
+       ENDIF
+       CALL cpu_log( log_point_s(7), 'fft_y', 'stop' )
+!
+!--    Transposition y --> x
+       CALL cpu_log( log_point_s(8), 'transpo invers', 'continue' )
+       CALL transpose_yx( ar, ar_inv )
+       IF ( .NOT. temperton_fft_vec )  CALL resort_for_yx( ar_inv, ar )
+       CALL cpu_log( log_point_s(8), 'transpo invers', 'pause' )
+       CALL cpu_log( log_point_s(4), 'fft_x', 'continue' )
+       IF ( temperton_fft_vec )  THEN
+          CALL fft_x( ar, 'backward',  ar_inv=ar_inv )
+       ELSE
+          CALL fft_x( ar, 'backward' )
+       ENDIF
+       CALL cpu_log( log_point_s(4), 'fft_x', 'stop' )
+!
+!--    Transposition x --> z
+       CALL cpu_log( log_point_s(8), 'transpo invers', 'continue' )
+       CALL transpose_xz( ar, ar_inv )
+       CALL resort_for_xz( ar_inv, ar )
+       CALL cpu_log( log_point_s(8), 'transpo invers', 'stop' )
+#ifndef _OPENACC
+    ELSE
+!
+!--    2d-domain-decomposition or no decomposition (1 PE run) with overlapping transposition / fft
+!--    cputime logging must not use barriers, which would prevent overlapping
+       ALLOCATE( f_out_y(0:ny,nxl_y:nxr_y,nzb_y:nzt_y), &
+                 f_out_z(0:nx,nys_x:nyn_x,nzb_x:nzt_x) )
+!
+!--    Transposition z --> x + subsequent fft along x
+       ALLOCATE( f_inv(nys:nyn,nxl:nxr,1:nz) )
+       CALL resort_for_zx( ar, f_inv )
+!
+!--    Save original indices and gridpoint counter
+       isave(1) = nz
+       isave(2) = nzb_x
+       isave(3) = nzt_x
+       isave(4) = sendrecvcount_zx
+!
+!--    Set new indices for transformation
+       nblk  = nz / pdims(1)
+       nz    = pdims(1)
+       nnz_x = 1
+       nzb_x = 1 + myidx * nnz_x
+       nzt_x = ( myidx + 1 ) * nnz_x
+       sendrecvcount_zx = nnx * nny * nnz_x
+       ALLOCATE( ar1(0:nx,nys_x:nyn_x,nzb_x:nzt_x) )
+       ALLOCATE( f_in(nys:nyn,nxl:nxr,1:nz) )
+       DO  kk = 1, nblk
+          IF ( kk == 1 )  THEN
+             CALL cpu_log( log_point_s(5), 'transpo forward', 'start', cpu_log_nowait )
+          ELSE
+             CALL cpu_log( log_point_s(5), 'transpo forward', 'continue', cpu_log_nowait )
+          ENDIF
+          DO  knew = 1, nz
+             ki = kk + nblk * ( knew - 1 )
+             f_in(:,:,knew) = f_inv(:,:,ki)
+          ENDDO
+          CALL transpose_zx( f_in, ar1(:,:,:))
           CALL cpu_log( log_point_s(5), 'transpo forward', 'pause' )
+          CALL cpu_log( log_point_s(4), 'fft_x', 'start' )
+          IF ( temperton_fft_vec )  THEN
+!
+!--          Vector version outputs a transformed array ar_inv that does not require resorting
+!--          (which is done for ar further below)
+             CALL fft_x( ar, 'forward',  ar_inv=ar_inv)
+          IF ( kk == 1 )  THEN
+             CALL cpu_log( log_point_s(4), 'fft_x', 'start', cpu_log_nowait )
           ELSE
              CALL fft_x( ar, 'forward')
+             CALL cpu_log( log_point_s(4), 'fft_x', 'continue', cpu_log_nowait )
           ENDIF
+          n = isave(2) + kk - 1
+          CALL fft_x( ar1(:,:,:), 'forward',  ar_2d = f_out_z(:,:,n) )
           CALL cpu_log( log_point_s(4), 'fft_x', 'pause' )
+!
+!--       Transposition x --> y
+          CALL cpu_log( log_point_s(5), 'transpo forward', 'continue' )
+          IF( .NOT. temperton_fft_vec )  CALL resort_for_xy( ar, ar_inv )
+          CALL transpose_xy( ar_inv, ar )
+       ENDDO
+!
+!--    Restore original indices/counters
+       nz               = isave(1)
+       nzb_x            = isave(2)
+       nzt_x            = isave(3)
+       sendrecvcount_zx = isave(4)
+       DEALLOCATE( ar1, f_in, f_inv )
+!
+!--    Transposition x --> y + subsequent fft along y
+       ALLOCATE( f_inv(nys_x:nyn_x,nzb_x:nzt_x,0:nx) )
+       CALL resort_for_xy( f_out_z, f_inv )
+!
+!--    Save original indices and gridpoint counter
+       isave(1) = nx
+       isave(2) = nxl_y
+       isave(3) = nxr_y
+       isave(4) = sendrecvcount_xy
+!
+!--    Set new indices for transformation
+       nblk  = ( ( nx+1 ) / pdims(2) ) - 1
+       nx    = pdims(2)
+       nnx_y = 1
+       nxl_y = myidy * nnx_y
+       nxr_y = ( myidy + 1 ) * nnx_y - 1
+       sendrecvcount_xy = nnx_y * ( nyn_x-nys_x+1 ) * ( nzt_x-nzb_x+1 )
+       ALLOCATE( ar1(0:ny,nxl_y:nxr_y,nzb_y:nzt_y) )
+       ALLOCATE( f_in(nys_x:nyn_x,nzb_x:nzt_x,0:nx) )
+       DO  ii = 0, nblk
+          CALL cpu_log( log_point_s(5), 'transpo forward', 'continue', cpu_log_nowait )
+          DO  inew = 0, nx-1
+             iind = ii + ( nblk + 1 ) * inew
+             f_in(:,:,inew) = f_inv(:,:,iind)
+          ENDDO
+          CALL transpose_xy( f_in, ar1(:,:,:) )
           CALL cpu_log( log_point_s(5), 'transpo forward', 'pause' )
+          CALL cpu_log( log_point_s(7), 'fft_y', 'start' )
+          IF ( temperton_fft_vec )  THEN
+!
+!--          Input array ar_inv from fft_x can be directly used here.
+!--          The output (also in array ar_inv) does not require resorting below.
+             CALL fft_y( ar, 'forward', ar_inv = ar_inv, nxl_y_bound = nxl_y, nxr_y_bound = nxr_y, &
+                         nxl_y_l = nxl_y, nxr_y_l = nxr_y )
+          IF ( ii == 1 )  THEN
+             CALL cpu_log( log_point_s(7), 'fft_y', 'start', cpu_log_nowait )
           ELSE
+             CALL fft_y( ar, 'forward', ar_tr = ar, nxl_y_bound = nxl_y, nxr_y_bound = nxr_y,      &
+                         nxl_y_l = nxl_y, nxr_y_l = nxr_y )
+             CALL cpu_log( log_point_s(7), 'fft_y', 'continue', cpu_log_nowait )
           ENDIF
+          nxl_y_bound = isave(2)
+          nxr_y_bound = isave(3)
+          n           = isave(2) + ii
+          CALL fft_y( ar1(:,:,:), 'forward', ar_tr = f_out_y, nxl_y_bound = nxl_y_bound,           &
+                      nxr_y_bound = nxr_y_bound, nxl_y_l = n, nxr_y_l = n )
           CALL cpu_log( log_point_s(7), 'fft_y', 'pause' )
+!
+       ENDDO
+!
+!--    Restore original indices/counters
+       nx               = isave(1)
+       nxl_y            = isave(2)
+       nxr_y            = isave(3)
+       sendrecvcount_xy = isave(4)
+       DEALLOCATE( ar1, f_in, f_inv )
+!
+!--    Transposition y --> z + subsequent tridia + resort for z --> y
+       ALLOCATE( f_inv(nxl_y:nxr_y,nzb_y:nzt_y,0:ny) )
+       CALL resort_for_yz( f_out_y, f_inv )
+!
+!--    Save original indices and gridpoint counter
+       isave(1) = ny
+       isave(2) = nys_z
+       isave(3) = nyn_z
+       isave(4) = sendrecvcount_yz
+!
+!--    Set new indices for transformation
+       nblk             = ( ( ny+1 ) / pdims(1) ) - 1
+       ny               = pdims(1)
+       nny_z            = 1
+       nys_z            = myidx * nny_z
+       nyn_z            = ( myidx + 1 ) * nny_z - 1
+       sendrecvcount_yz = ( nxr_y-nxl_y+1 ) * nny_z * ( nzt_y-nzb_y+1 )
+       ALLOCATE( ar1(nxl_z:nxr_z,nys_z:nyn_z,1:nz) )
+       ALLOCATE( f_in(nxl_y:nxr_y,nzb_y:nzt_y,0:ny) )
+       DO  jj = 0, nblk
+!
+!--       Forward Fourier Transformation
 !--       Transposition y --> z
+          CALL cpu_log( log_point_s(5), 'transpo forward', 'continue' )
+          IF ( .NOT. temperton_fft_vec )  CALL resort_for_yz( ar, ar_inv )
+          CALL transpose_yz( ar_inv, ar )
+          CALL cpu_log( log_point_s(5), 'transpo forward', 'stop' )
+          CALL cpu_log( log_point_s(5), 'transpo forward', 'continue', cpu_log_nowait )
+          DO  jnew = 0, ny-1
+             jind = jj + ( nblk + 1 ) * jnew
+             f_in(:,:,jnew) = f_inv(:,:,jind)
+          ENDDO
+          CALL transpose_yz( f_in, ar1(:,:,:) )
+          IF ( jj == nblk )  THEN
+             CALL cpu_log( log_point_s(5), 'transpo forward', 'stop' )
+          ELSE
+             CALL cpu_log( log_point_s(5), 'transpo forward', 'pause' )
+          ENDIF
+!
 !--       Solve the tridiagonal equation system along z
+          CALL cpu_log( log_point_s(6), 'tridia', 'start' )
+          CALL tridia_substi( ar )
+          CALL cpu_log( log_point_s(6), 'tridia', 'start', cpu_log_nowait )
+          n = isave(2) + jj
+          CALL tridia_substi_overlap( ar1(:,:,:), n )
           CALL cpu_log( log_point_s(6), 'tridia', 'stop' )
 …
 !--       Inverse Fourier Transformation
 !--       Transposition z --> y
+          CALL cpu_log( log_point_s(8), 'transpo invers', 'start' )
+          CALL transpose_zy( ar, ar_inv )
+!
+!--       The fft_y below (vector branch) can directly process ar_inv (i.e. does not require a
+!--       resorting)
+          IF ( .NOT. temperton_fft_vec )  CALL resort_for_zy( ar_inv, ar )
+!--       Only one thread should call MPI routines, therefore forward and backward tranpose are in
+!--       the same section
+          IF ( jj == 0 )  THEN
+             CALL cpu_log( log_point_s(8), 'transpo invers', 'start', cpu_log_nowait )
+          ELSE
+             CALL cpu_log( log_point_s(8), 'transpo invers', 'continue', cpu_log_nowait )
+          ENDIF
+          CALL transpose_zy( ar1(:,:,:), f_in )
+          DO  jnew = 0, ny-1
+             jind = jj + ( nblk + 1 ) * jnew
+             f_inv(:,:,jind) = f_in(:,:,jnew)
+          ENDDO
           CALL cpu_log( log_point_s(8), 'transpo invers', 'pause' )
+          CALL cpu_log( log_point_s(7), 'fft_y', 'continue' )
+          IF ( temperton_fft_vec )  THEN
+!
+!--          Output array ar_inv can be used as input to the below fft_x routine without resorting
+             CALL fft_y( ar, 'backward', ar_inv = ar_inv, nxl_y_bound = nxl_y, nxr_y_bound = nxr_y,&
+                         nxl_y_l = nxl_y, nxr_y_l = nxr_y )
+       ENDDO
+!
+!--    Restore original indices/counters
+       ny               = isave(1)
+       nys_z            = isave(2)
+       nyn_z            = isave(3)
+       sendrecvcount_yz = isave(4)
+       CALL resort_for_zy( f_inv, f_out_y )
+       DEALLOCATE( ar1, f_in, f_inv )
+!
+!--    fft along y backward + subsequent transposition y --> x
+       ALLOCATE( f_inv(nys_x:nyn_x,nzb_x:nzt_x,0:nx) )
+!
+!--    Save original indices and gridpoint counter
+       isave(1) = nx
+       isave(2) = nxl_y
+       isave(3) = nxr_y
+       isave(4) = sendrecvcount_xy
+!
+!--    Set new indices for transformation
+       nblk             = ( ( nx+1 ) / pdims(2) ) - 1
+       nx               = pdims(2)
+       nnx_y            = 1
+       nxl_y            = myidy * nnx_y
+       nxr_y            = ( myidy + 1 ) * nnx_y - 1
+       sendrecvcount_xy = nnx_y * ( nyn_x-nys_x+1 ) * ( nzt_x-nzb_x+1 )
+       ALLOCATE( ar1(0:ny,nxl_y:nxr_y,nzb_y:nzt_y) )
+       ALLOCATE( f_in(nys_x:nyn_x,nzb_x:nzt_x,0:nx) )
+       DO  ii = 0, nblk
+          CALL cpu_log( log_point_s(7), 'fft_y', 'continue', cpu_log_nowait )
+          n = isave(2) + ii
+          nxl_y_bound = isave(2)
+          nxr_y_bound = isave(3)
+          CALL fft_y( ar1(:,:,:), 'backward', ar_tr = f_out_y, nxl_y_bound = nxl_y_bound,          &
+                      nxr_y_bound = nxr_y_bound, nxl_y_l = n, nxr_y_l = n )
+          IF ( ii == nblk )  THEN
+             CALL cpu_log( log_point_s(7), 'fft_y', 'stop' )
           ELSE
+             CALL fft_y( ar, 'backward', ar_tr = ar, nxl_y_bound = nxl_y, nxr_y_bound = nxr_y,     &
+                         nxl_y_l = nxl_y, nxr_y_l = nxr_y )
+             CALL cpu_log( log_point_s(7), 'fft_y', 'pause' )
           ENDIF
+          CALL cpu_log( log_point_s(7), 'fft_y', 'stop' )
+!
+!--       Transposition y --> x
+          CALL cpu_log( log_point_s(8), 'transpo invers', 'continue' )
+          CALL transpose_yx( ar, ar_inv )
+          IF ( .NOT. temperton_fft_vec )  CALL resort_for_yx( ar_inv, ar )
+          CALL cpu_log( log_point_s(8), 'transpo invers', 'continue', cpu_log_nowait )
+          CALL transpose_yx( ar1(:,:,:), f_in )
+          DO  inew = 0, nx-1
+             iind = ii + (nblk+1) * inew
+             f_inv(:,:,iind) = f_in(:,:,inew)
+          ENDDO
           CALL cpu_log( log_point_s(8), 'transpo invers', 'pause' )
+          CALL cpu_log( log_point_s(4), 'fft_x', 'continue' )
+          IF ( temperton_fft_vec )  THEN
+             CALL fft_x( ar, 'backward',  ar_inv=ar_inv )
+       ENDDO
+!
+!--    Restore original indices/counters
+       nx               = isave(1)
+       nxl_y            = isave(2)
+       nxr_y            = isave(3)
+       sendrecvcount_xy = isave(4)
+       CALL resort_for_yx( f_inv, f_out_z )
+       DEALLOCATE( ar1, f_in, f_inv )
+!
+!--    fft along x backward + subsequent final transposition x --> z
+       ALLOCATE( f_inv(nys:nyn,nxl:nxr,1:nz) )
+!
+!--    Save original indices and gridpoint counter
+       isave(1) = nz
+       isave(2) = nzb_x
+       isave(3) = nzt_x
+       isave(4) = sendrecvcount_zx
+!
+!--    Set new indices for transformation
+       nblk             = nz / pdims(1)
+       nz               = pdims(1)
+       nnz_x            = 1
+       nzb_x            = 1 + myidx * nnz_x
+       nzt_x            = ( myidx + 1 ) * nnz_x
+       sendrecvcount_zx = nnx * nny * nnz_x
+       ALLOCATE( ar1(0:nx,nys_x:nyn_x,nzb_x:nzt_x) )
+       ALLOCATE( f_in(nys:nyn,nxl:nxr,1:nz) )
+       DO  kk = 1, nblk
+          CALL cpu_log( log_point_s(4), 'fft_x', 'continue', cpu_log_nowait )
+          n = isave(2) + kk - 1
+          CALL fft_x( ar1(:,:,:), 'backward', f_out_z(:,:,n) )
+          IF ( kk == nblk )  THEN
+             CALL cpu_log( log_point_s(4), 'fft_x', 'stop' )
           ELSE
              CALL fft_x( ar, 'backward' )
+             CALL cpu_log( log_point_s(4), 'fft_x', 'pause' )
           ENDIF
+          CALL cpu_log( log_point_s(4), 'fft_x', 'stop' )
+!
+!--       Transposition x --> z
+          CALL cpu_log( log_point_s(8), 'transpo invers', 'continue' )
+          CALL transpose_xz( ar, ar_inv )
+          CALL resort_for_xz( ar_inv, ar )
+          CALL cpu_log( log_point_s(8), 'transpo invers', 'stop' )
+#ifndef _OPENACC
+       ELSE
+!
+!--       2d-domain-decomposition or no decomposition (1 PE run) with
+!--       overlapping transposition / fft
+!--       cputime logging must not use barriers, which would prevent overlapping
+          ALLOCATE( f_out_y(0:ny,nxl_y:nxr_y,nzb_y:nzt_y), &
+                    f_out_z(0:nx,nys_x:nyn_x,nzb_x:nzt_x) )
+!
+!--       Transposition z --> x + subsequent fft along x
+          ALLOCATE( f_inv(nys:nyn,nxl:nxr,1:nz) )
+          CALL resort_for_zx( ar, f_inv )
+!
+!--       Save original indices and gridpoint counter
+          isave(1) = nz
+          isave(2) = nzb_x
+          isave(3) = nzt_x
+          isave(4) = sendrecvcount_zx
+!
+!--       Set new indices for transformation
+          nblk  = nz / pdims(1)
+          nz    = pdims(1)
+          nnz_x = 1
+          nzb_x = 1 + myidx * nnz_x
+          nzt_x = ( myidx + 1 ) * nnz_x
+          sendrecvcount_zx = nnx * nny * nnz_x
+          ALLOCATE( ar1(0:nx,nys_x:nyn_x,nzb_x:nzt_x) )
+          ALLOCATE( f_in(nys:nyn,nxl:nxr,1:nz) )
+          DO  kk = 1, nblk
+             IF ( kk == 1 )  THEN
+                CALL cpu_log( log_point_s(5), 'transpo forward', 'start', cpu_log_nowait )
+             ELSE
+                CALL cpu_log( log_point_s(5), 'transpo forward', 'continue', cpu_log_nowait )
+             ENDIF
+             DO  knew = 1, nz
+                ki = kk + nblk * ( knew - 1 )
+                f_in(:,:,knew) = f_inv(:,:,ki)
+             ENDDO
+             CALL transpose_zx( f_in, ar1(:,:,:))
+             CALL cpu_log( log_point_s(5), 'transpo forward', 'pause' )
+             IF ( kk == 1 )  THEN
+                CALL cpu_log( log_point_s(4), 'fft_x', 'start', cpu_log_nowait )
+             ELSE
+                CALL cpu_log( log_point_s(4), 'fft_x', 'continue', cpu_log_nowait )
+             ENDIF
+             n = isave(2) + kk - 1
+             CALL fft_x( ar1(:,:,:), 'forward',  ar_2d = f_out_z(:,:,n))
+             CALL cpu_log( log_point_s(4), 'fft_x', 'pause' )
+          ENDDO
+!
+!--       Restore original indices/counters
+          nz               = isave(1)
+          nzb_x            = isave(2)
+          nzt_x            = isave(3)
+          sendrecvcount_zx = isave(4)
+          DEALLOCATE( ar1, f_in, f_inv )
+!
+!--       Transposition x --> y + subsequent fft along y
+          ALLOCATE( f_inv(nys_x:nyn_x,nzb_x:nzt_x,0:nx) )
+          CALL resort_for_xy( f_out_z, f_inv )
+!
+!--       Save original indices and gridpoint counter
+          isave(1) = nx
+          isave(2) = nxl_y
+          isave(3) = nxr_y
+          isave(4) = sendrecvcount_xy
+!
+!--       Set new indices for transformation
+          nblk  = ( ( nx+1 ) / pdims(2) ) - 1
+          nx    = pdims(2)
+          nnx_y = 1
+          nxl_y = myidy * nnx_y
+          nxr_y = ( myidy + 1 ) * nnx_y - 1
+          sendrecvcount_xy = nnx_y * ( nyn_x-nys_x+1 ) * ( nzt_x-nzb_x+1 )
+          ALLOCATE( ar1(0:ny,nxl_y:nxr_y,nzb_y:nzt_y) )
+          ALLOCATE( f_in(nys_x:nyn_x,nzb_x:nzt_x,0:nx) )
+          DO  ii = 0, nblk
+             CALL cpu_log( log_point_s(5), 'transpo forward', 'continue', cpu_log_nowait )
+             DO  inew = 0, nx-1
+                iind = ii + ( nblk + 1 ) * inew
+                f_in(:,:,inew) = f_inv(:,:,iind)
+             ENDDO
+             CALL transpose_xy( f_in, ar1(:,:,:) )
+             CALL cpu_log( log_point_s(5), 'transpo forward', 'pause' )
+             IF ( ii == 1 )  THEN
+                CALL cpu_log( log_point_s(7), 'fft_y', 'start', cpu_log_nowait )
+             ELSE
+                CALL cpu_log( log_point_s(7), 'fft_y', 'continue', cpu_log_nowait )
+             ENDIF
+             nxl_y_bound = isave(2)
+             nxr_y_bound = isave(3)
+             n           = isave(2) + ii
+             CALL fft_y( ar1(:,:,:), 'forward', ar_tr = f_out_y,               &
+                         nxl_y_bound = nxl_y_bound, nxr_y_bound = nxr_y_bound, &
+                         nxl_y_l = n, nxr_y_l = n )
+             CALL cpu_log( log_point_s(7), 'fft_y', 'pause' )
+          ENDDO
+!
+!--       Restore original indices/counters
+          nx               = isave(1)
+          nxl_y            = isave(2)
+          nxr_y            = isave(3)
+          sendrecvcount_xy = isave(4)
+          DEALLOCATE( ar1, f_in, f_inv )
+!
+!--       Transposition y --> z + subsequent tridia + resort for z --> y
+          ALLOCATE( f_inv(nxl_y:nxr_y,nzb_y:nzt_y,0:ny) )
+          CALL resort_for_yz( f_out_y, f_inv )
+!
+!--       Save original indices and gridpoint counter
+          isave(1) = ny
+          isave(2) = nys_z
+          isave(3) = nyn_z
+          isave(4) = sendrecvcount_yz
+!
+!--       Set new indices for transformation
+          nblk             = ( ( ny+1 ) / pdims(1) ) - 1
+          ny               = pdims(1)
+          nny_z            = 1
+          nys_z            = myidx * nny_z
+          nyn_z            = ( myidx + 1 ) * nny_z - 1
+          sendrecvcount_yz = ( nxr_y-nxl_y+1 ) * nny_z * ( nzt_y-nzb_y+1 )
+          ALLOCATE( ar1(nxl_z:nxr_z,nys_z:nyn_z,1:nz) )
+          ALLOCATE( f_in(nxl_y:nxr_y,nzb_y:nzt_y,0:ny) )
+          DO  jj = 0, nblk
+!
+!--          Forward Fourier Transformation
+!--          Transposition y --> z
+             CALL cpu_log( log_point_s(5), 'transpo forward', 'continue', cpu_log_nowait )
+             DO  jnew = 0, ny-1
+                jind = jj + ( nblk + 1 ) * jnew
+                f_in(:,:,jnew) = f_inv(:,:,jind)
+             ENDDO
+             CALL transpose_yz( f_in, ar1(:,:,:) )
+             IF ( jj == nblk )  THEN
+                CALL cpu_log( log_point_s(5), 'transpo forward', 'stop' )
+             ELSE
+                CALL cpu_log( log_point_s(5), 'transpo forward', 'pause' )
+             ENDIF
+!
+!--          Solve the tridiagonal equation system along z
+             CALL cpu_log( log_point_s(6), 'tridia', 'start', cpu_log_nowait )
+             n = isave(2) + jj
+             CALL tridia_substi_overlap( ar1(:,:,:), n )
+             CALL cpu_log( log_point_s(6), 'tridia', 'stop' )
+!
+!--          Inverse Fourier Transformation
+!--          Transposition z --> y
+!--          Only one thread should call MPI routines, therefore forward and
+!--          backward tranpose are in the same section
+             IF ( jj == 0 )  THEN
+                CALL cpu_log( log_point_s(8), 'transpo invers', 'start', cpu_log_nowait )
+             ELSE
+                CALL cpu_log( log_point_s(8), 'transpo invers', 'continue', cpu_log_nowait )
+             ENDIF
+             CALL transpose_zy( ar1(:,:,:), f_in )
+             DO  jnew = 0, ny-1
+                jind = jj + ( nblk + 1 ) * jnew
+                f_inv(:,:,jind) = f_in(:,:,jnew)
+             ENDDO
+          CALL cpu_log( log_point_s(8), 'transpo invers', 'continue', cpu_log_nowait )
+          CALL transpose_xz( ar1(:,:,:), f_in )
+          DO  knew = 1, nz
+             ki = kk + nblk * ( knew - 1 )
+             f_inv(:,:,ki) = f_in(:,:,knew)
+          ENDDO
+          IF ( kk == nblk )  THEN
+             CALL cpu_log( log_point_s(8), 'transpo invers', 'stop' )
+          ELSE
              CALL cpu_log( log_point_s(8), 'transpo invers', 'pause' )
+          ENDDO
+!
+!--       Restore original indices/counters
+          ny               = isave(1)
+          nys_z            = isave(2)
+          nyn_z            = isave(3)
+          sendrecvcount_yz = isave(4)
+          CALL resort_for_zy( f_inv, f_out_y )
+          DEALLOCATE( ar1, f_in, f_inv )
+!
+!--       fft along y backward + subsequent transposition y --> x
+          ALLOCATE( f_inv(nys_x:nyn_x,nzb_x:nzt_x,0:nx) )
+!
+!--       Save original indices and gridpoint counter
+          isave(1) = nx
+          isave(2) = nxl_y
+          isave(3) = nxr_y
+          isave(4) = sendrecvcount_xy
+!
+!--       Set new indices for transformation
+          nblk             = (( nx+1 ) / pdims(2) ) - 1
+          nx               = pdims(2)
+          nnx_y            = 1
+          nxl_y            = myidy * nnx_y
+          nxr_y            = ( myidy + 1 ) * nnx_y - 1
+          sendrecvcount_xy = nnx_y * ( nyn_x-nys_x+1 ) * ( nzt_x-nzb_x+1 )
+          ALLOCATE( ar1(0:ny,nxl_y:nxr_y,nzb_y:nzt_y) )
+          ALLOCATE( f_in(nys_x:nyn_x,nzb_x:nzt_x,0:nx) )
+          DO  ii = 0, nblk
+             CALL cpu_log( log_point_s(7), 'fft_y', 'continue', cpu_log_nowait )
+             n = isave(2) + ii
+             nxl_y_bound = isave(2)
+             nxr_y_bound = isave(3)
+             CALL fft_y( ar1(:,:,:), 'backward', ar_tr = f_out_y,              &
+                         nxl_y_bound = nxl_y_bound, nxr_y_bound = nxr_y_bound, &
+                         nxl_y_l = n, nxr_y_l = n )
+             IF ( ii == nblk )  THEN
+                CALL cpu_log( log_point_s(7), 'fft_y', 'stop' )
+             ELSE
+                CALL cpu_log( log_point_s(7), 'fft_y', 'pause' )
+             ENDIF
+             CALL cpu_log( log_point_s(8), 'transpo invers', 'continue', cpu_log_nowait )
+             CALL transpose_yx( ar1(:,:,:), f_in )
+             DO  inew = 0, nx-1
+                iind = ii + (nblk+1) * inew
+                f_inv(:,:,iind) = f_in(:,:,inew)
+             ENDDO
+             CALL cpu_log( log_point_s(8), 'transpo invers', 'pause' )
+          ENDDO
+!
+!--       Restore original indices/counters
+          nx               = isave(1)
+          nxl_y            = isave(2)
+          nxr_y            = isave(3)
+          sendrecvcount_xy = isave(4)
+          CALL resort_for_yx( f_inv, f_out_z )
+          DEALLOCATE( ar1, f_in, f_inv )
+!
+!--       fft along x backward + subsequent final transposition x --> z
+          ALLOCATE( f_inv(nys:nyn,nxl:nxr,1:nz) )
+!
+!--       Save original indices and gridpoint counter
+          isave(1) = nz
+          isave(2) = nzb_x
+          isave(3) = nzt_x
+          isave(4) = sendrecvcount_zx
+!
+!--       Set new indices for transformation
+          nblk             = nz / pdims(1)
+          nz               = pdims(1)
+          nnz_x            = 1
+          nzb_x            = 1 + myidx * nnz_x
+          nzt_x            = ( myidx + 1 ) * nnz_x
+          sendrecvcount_zx = nnx * nny * nnz_x
+          ALLOCATE( ar1(0:nx,nys_x:nyn_x,nzb_x:nzt_x) )
+          ALLOCATE( f_in(nys:nyn,nxl:nxr,1:nz) )
+          DO  kk = 1, nblk
+             CALL cpu_log( log_point_s(4), 'fft_x', 'continue', cpu_log_nowait )
+             n = isave(2) + kk - 1
+             CALL fft_x( ar1(:,:,:), 'backward', f_out_z(:,:,n))
+             IF ( kk == nblk )  THEN
+                CALL cpu_log( log_point_s(4), 'fft_x', 'stop' )
+             ELSE
+                CALL cpu_log( log_point_s(4), 'fft_x', 'pause' )
+             ENDIF
+             CALL cpu_log( log_point_s(8), 'transpo invers', 'continue', cpu_log_nowait )
+             CALL transpose_xz( ar1(:,:,:), f_in )
+             DO  knew = 1, nz
+                ki = kk + nblk * (knew-1)
+                f_inv(:,:,ki) = f_in(:,:,knew)
+             ENDDO
+             IF ( kk == nblk )  THEN
+                CALL cpu_log( log_point_s(8), 'transpo invers', 'stop' )
+             ELSE
+                CALL cpu_log( log_point_s(8), 'transpo invers', 'pause' )
+             ENDIF
+          ENDDO
+!
+!--       Restore original indices/counters
+          nz               = isave(1)
+          nzb_x            = isave(2)
+          nzt_x            = isave(3)
+          sendrecvcount_zx = isave(4)
+          CALL resort_for_xz( f_inv, ar )
+          DEALLOCATE( ar1, f_in, f_inv )
+       ENDIF
+          ENDIF
+       ENDDO
+!
+!--    Restore original indices/counters
+       nz               = isave(1)
+       nzb_x            = isave(2)
+       nzt_x            = isave(3)
+       sendrecvcount_zx = isave(4)
+       CALL resort_for_xz( f_inv, ar )
+       DEALLOCATE( ar1, f_in, f_inv )
+    ENDIF
 #endif
 #if !__acc_fft_device
        !$ACC UPDATE DEVICE(ar)
+    !$ACC UPDATE DEVICE(ar)
 #endif
        CALL cpu_log( log_point_s(3), 'poisfft', 'stop' )
     END SUBROUTINE poisfft
 !------------------------------------------------------------------------------!
+    CALL cpu_log( log_point_s(3), 'poisfft', 'stop' )
+ END SUBROUTINE poisfft
+!--------------------------------------------------------------------------------------------------!
 ! Description:
 ! ------------
 !> Fourier-transformation along y with subsequent transposition y --> x for
 !> a 1d-decomposition along x.
+!> Fourier-transformation along y with subsequent transposition y --> x for a 1d-decomposition
+!> along x.
 !>
+!> @attention The performance of this routine is much faster on the NEC-SX6,
+!>            if the first index of work_ffty_vec is odd. Otherwise
+!>            memory bank conflicts may occur (especially if the index is a
+!>            multiple of 128). That's why work_ffty_vec is dimensioned as
+!>            0:ny+1.
+!>            Of course, this will not work if users are using an odd number
+!>            of gridpoints along y.
+!------------------------------------------------------------------------------!
+    SUBROUTINE ffty_tr_yx( f_in, f_out )
+       USE control_parameters,                                                 &
+           ONLY:  loop_optimization
+       USE cpulog,                                                             &
+           ONLY:  cpu_log, log_point_s
+       USE kinds
+       USE pegrid
+       IMPLICIT NONE
+       INTEGER(iwp)            ::  i            !<
+       INTEGER(iwp)            ::  iend         !<
+       INTEGER(iwp)            ::  iouter       !<
+       INTEGER(iwp)            ::  ir           !<
+       INTEGER(iwp)            ::  j            !<
+       INTEGER(iwp)            ::  k            !<
+       INTEGER(iwp), PARAMETER ::  stridex = 4  !<
+       REAL(wp), DIMENSION(1:nz,0:ny,nxl:nxr)             ::  f_in   !<
+       REAL(wp), DIMENSION(nnx,1:nz,nys_x:nyn_x,pdims(1)) ::  f_out  !<
+       REAL(wp), DIMENSION(nxl:nxr,1:nz,0:ny)             ::  work   !<
+       REAL(wp), DIMENSION(:,:), ALLOCATABLE   ::  work_ffty      !<
+       REAL(wp), DIMENSION(:,:,:), ALLOCATABLE ::  work_ffty_vec  !<
+!
+!--    Carry out the FFT along y, where all data are present due to the
+!--    1d-decomposition along x. Resort the data in a way that x becomes
+!--    the first index.
+       CALL cpu_log( log_point_s(7), 'fft_y_1d', 'start' )
+       IF ( loop_optimization == 'vector' )  THEN
+          ALLOCATE( work_ffty_vec(0:ny+1,1:nz,nxl:nxr) )
+!
+!--       Code optimized for vector processors
+          !$OMP PARALLEL PRIVATE ( i, j, k )
+          !$OMP DO
+          DO  i = nxl, nxr
+!> @attention The performance of this routine is much faster on the NEC-SX6, if the first index of
+!>            work_ffty_vec is odd. Otherwise memory bank conflicts may occur (especially if the
+!>            index is a multiple of 128). That's why work_ffty_vec is dimensioned as 0:ny+1.
+!>            Of course, this will not work if users are using an odd number of gridpoints along y.
+!--------------------------------------------------------------------------------------------------!
+ SUBROUTINE ffty_tr_yx( f_in, f_out )
+    USE control_parameters,                                                                        &
+        ONLY:  loop_optimization
+    USE cpulog,                                                                                    &
+        ONLY:  cpu_log,                                                                            &
+               log_point_s
+    USE kinds
+    USE pegrid
+    IMPLICIT NONE
+    INTEGER(iwp) ::  i       !<
+    INTEGER(iwp) ::  iend    !<
+    INTEGER(iwp) ::  iouter  !<
+    INTEGER(iwp) ::  ir      !<
+    INTEGER(iwp) ::  j       !<
+    INTEGER(iwp) ::  k       !<
+    INTEGER(iwp), PARAMETER ::  stridex = 4  !<
+    REAL(wp), DIMENSION(1:nz,0:ny,nxl:nxr)             ::  f_in   !<
+    REAL(wp), DIMENSION(nnx,1:nz,nys_x:nyn_x,pdims(1)) ::  f_out  !<
+    REAL(wp), DIMENSION(nxl:nxr,1:nz,0:ny)             ::  work   !<
+    REAL(wp), DIMENSION(:,:), ALLOCATABLE   ::  work_ffty      !<
+    REAL(wp), DIMENSION(:,:,:), ALLOCATABLE ::  work_ffty_vec  !<
+!
+!-- Carry out the FFT along y, where all data are present due to the 1d-decomposition along x.
+!-- Resort the data in a way that x becomes the first index.
+    CALL cpu_log( log_point_s(7), 'fft_y_1d', 'start' )
+    IF ( loop_optimization == 'vector' )  THEN
+       ALLOCATE( work_ffty_vec(0:ny+1,1:nz,nxl:nxr) )
+!
+!--    Code optimized for vector processors
+       !$OMP PARALLEL PRIVATE ( i, j, k )
+       !$OMP DO
+       DO  i = nxl, nxr
+          DO  j = 0, ny
+             DO  k = 1, nz
+                work_ffty_vec(j,k,i) = f_in(k,j,i)
+             ENDDO
+          ENDDO
+          CALL fft_y_m( work_ffty_vec(:,:,i), ny+1, 'forward' )
+       ENDDO
+       !$OMP DO
+       DO  k = 1, nz
+          DO  j = 0, ny
+             DO  i = nxl, nxr
+                work(i,k,j) = work_ffty_vec(j,k,i)
+             ENDDO
+          ENDDO
+       ENDDO
+       !$OMP END PARALLEL
+       DEALLOCATE( work_ffty_vec )
+    ELSE
+!
+!--    Cache optimized code.
+       ALLOCATE( work_ffty(0:ny,stridex) )
+!
+!--    The i-(x-)direction is split into a strided outer loop and an inner loop for better cache
+!--    performance
+       !$OMP PARALLEL PRIVATE (i,iend,iouter,ir,j,k,work_ffty)
+       !$OMP DO
+       DO  iouter = nxl, nxr, stridex
+          iend = MIN( iouter+stridex-1, nxr )  ! Upper bound for inner i loop
+          DO  k = 1, nz
+             DO  i = iouter, iend
+                ir = i-iouter+1  ! Counter within a stride
+                DO  j = 0, ny
+                   work_ffty(j,ir) = f_in(k,j,i)
+                ENDDO
+!
+!--             FFT along y
+                CALL fft_y_1d( work_ffty(:,ir), 'forward' )
+             ENDDO
+!
+!--          Resort
              DO  j = 0, ny
                 DO  k = 1, nz
                    work_ffty_vec(j,k,i) = f_in(k,j,i)
+                DO  i = iouter, iend
+                   work(i,k,j) = work_ffty(j,i-iouter+1)
                 ENDDO
              ENDDO
+             CALL fft_y_m( work_ffty_vec(:,:,i), ny+1, 'forward' )
+          ENDDO
+          !$OMP DO
+          DO  k = 1, nz
+             DO  j = 0, ny
+                DO  i = nxl, nxr
+                   work(i,k,j) = work_ffty_vec(j,k,i)
+                ENDDO
+             ENDDO
+          ENDDO
+          !$OMP END PARALLEL
+          DEALLOCATE( work_ffty_vec )
+       ELSE
+!
+!--       Cache optimized code.
+          ALLOCATE( work_ffty(0:ny,stridex) )
+!
+!--       The i-(x-)direction is split into a strided outer loop and an inner
+!--       loop for better cache performance
+          !$OMP PARALLEL PRIVATE (i,iend,iouter,ir,j,k,work_ffty)
+          !$OMP DO
+          DO  iouter = nxl, nxr, stridex
+             iend = MIN( iouter+stridex-1, nxr )  ! Upper bound for inner i loop
+             DO  k = 1, nz
+                DO  i = iouter, iend
+                   ir = i-iouter+1  ! counter within a stride
+                   DO  j = 0, ny
+                      work_ffty(j,ir) = f_in(k,j,i)
+                   ENDDO
+!
+!--                FFT along y
+                   CALL fft_y_1d( work_ffty(:,ir), 'forward' )
+                ENDDO
+!
+!--             Resort
+                DO  j = 0, ny
+                   DO  i = iouter, iend
+                      work(i,k,j) = work_ffty(j,i-iouter+1)
+                   ENDDO
+                ENDDO
+             ENDDO
+          ENDDO
+          !$OMP END PARALLEL
+          DEALLOCATE( work_ffty )
+       ENDIF
+       CALL cpu_log( log_point_s(7), 'fft_y_1d', 'pause' )
+!
+!--    Transpose array
+          ENDDO
+       ENDDO
+       !$OMP END PARALLEL
+       DEALLOCATE( work_ffty )
+    ENDIF
+    CALL cpu_log( log_point_s(7), 'fft_y_1d', 'pause' )
+!
+!-- Transpose array
 #if defined( __parallel )
+       CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
+       IF ( collective_wait )  CALL MPI_BARRIER( comm2d, ierr )
+       CALL MPI_ALLTOALL( work(nxl,1,0),      sendrecvcount_xy, MPI_REAL, &
+                          f_out(1,1,nys_x,1), sendrecvcount_xy, MPI_REAL, &
+                          comm1dx, ierr )
+       CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
+    CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
+    IF ( collective_wait )  CALL MPI_BARRIER( comm2d, ierr )
+    CALL MPI_ALLTOALL( work(nxl,1,0), sendrecvcount_xy, MPI_REAL, f_out(1,1,nys_x,1),              &
+                       sendrecvcount_xy, MPI_REAL, comm1dx, ierr )
+    CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
 #else
+!
 !--    Next line required to avoid compile error about unused dummy argument in serial mode
        i = SIZE( f_out )
+!-- Next line required to avoid compile error about unused dummy argument in serial mode
+    i = SIZE( f_out )
 #endif
     END SUBROUTINE ffty_tr_yx
 !------------------------------------------------------------------------------!
+ END SUBROUTINE ffty_tr_yx
+!--------------------------------------------------------------------------------------------------!
 ! Description:
 ! ------------
+!>  Transposition x --> y with a subsequent backward Fourier transformation for
+!>  a 1d-decomposition along x
+!------------------------------------------------------------------------------!
+    SUBROUTINE tr_xy_ffty( f_in, f_out )
+       USE control_parameters,                                                 &
+           ONLY:  loop_optimization
+       USE cpulog,                                                             &
+           ONLY:  cpu_log, log_point_s
+       USE kinds
+       USE pegrid
+       IMPLICIT NONE
+       INTEGER(iwp)            ::  i            !<
+       INTEGER(iwp)            ::  iend         !<
+       INTEGER(iwp)            ::  iouter       !<
+       INTEGER(iwp)            ::  ir           !<
+       INTEGER(iwp)            ::  j            !<
+       INTEGER(iwp)            ::  k            !<
+       INTEGER(iwp), PARAMETER ::  stridex = 4  !<
+       REAL(wp), DIMENSION(nnx,1:nz,nys_x:nyn_x,pdims(1)) ::  f_in   !<
+       REAL(wp), DIMENSION(1:nz,0:ny,nxl:nxr)             ::  f_out  !<
+       REAL(wp), DIMENSION(nxl:nxr,1:nz,0:ny)             ::  work   !<
+       REAL(wp), DIMENSION(:,:), ALLOCATABLE   ::  work_ffty         !<
+       REAL(wp), DIMENSION(:,:,:), ALLOCATABLE ::  work_ffty_vec     !<
+!
+!--    Transpose array
+!> Transposition x --> y with a subsequent backward Fourier transformation for a 1d-decomposition
+!> along x
+!--------------------------------------------------------------------------------------------------!
+ SUBROUTINE tr_xy_ffty( f_in, f_out )
+    USE control_parameters,                                                                        &
+        ONLY:  loop_optimization
+    USE cpulog,                                                                                    &
+        ONLY:  cpu_log,                                                                            &
+               log_point_s
+    USE kinds
+    USE pegrid
+    IMPLICIT NONE
+    INTEGER(iwp) ::  i       !<
+    INTEGER(iwp) ::  iend    !<
+    INTEGER(iwp) ::  iouter  !<
+    INTEGER(iwp) ::  ir      !<
+    INTEGER(iwp) ::  j       !<
+    INTEGER(iwp) ::  k       !<
+    INTEGER(iwp), PARAMETER ::  stridex = 4  !<
+    REAL(wp), DIMENSION(nnx,1:nz,nys_x:nyn_x,pdims(1)) ::  f_in   !<
+    REAL(wp), DIMENSION(1:nz,0:ny,nxl:nxr)             ::  f_out  !<
+    REAL(wp), DIMENSION(nxl:nxr,1:nz,0:ny)             ::  work   !<
+    REAL(wp), DIMENSION(:,:),   ALLOCATABLE ::  work_ffty      !<
+    REAL(wp), DIMENSION(:,:,:), ALLOCATABLE ::  work_ffty_vec  !<
+!
+!-- Transpose array
 #if defined( __parallel )
+       CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
+       IF ( collective_wait )  CALL MPI_BARRIER( comm2d, ierr )
+       CALL MPI_ALLTOALL( f_in(1,1,nys_x,1), sendrecvcount_xy, MPI_REAL, &
+                          work(nxl,1,0),     sendrecvcount_xy, MPI_REAL, &
+                          comm1dx, ierr )
+       CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
+    CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
+    IF ( collective_wait )  CALL MPI_BARRIER( comm2d, ierr )
+    CALL MPI_ALLTOALL( f_in(1,1,nys_x,1), sendrecvcount_xy, MPI_REAL, work(nxl,1,0),               &
+                       sendrecvcount_xy, MPI_REAL, comm1dx, ierr )
+    CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
 #else
+!
 !--    Next line required to avoid compile error about unused dummy argument in serial mode
        i = SIZE( f_in )
+!-- Next line required to avoid compile error about unused dummy argument in serial mode
+    i = SIZE( f_in )
 #endif
+!
+!--    Resort the data in a way that y becomes the first index and carry out the
+!--    backward fft along y.
+       CALL cpu_log( log_point_s(7), 'fft_y_1d', 'continue' )
+       IF ( loop_optimization == 'vector' )  THEN
+          ALLOCATE( work_ffty_vec(0:ny+1,1:nz,nxl:nxr) )
+!
+!--       Code optimized for vector processors
+          !$OMP PARALLEL PRIVATE ( i, j, k )
+          !$OMP DO
+!-- Resort the data in a way that y becomes the first index and carry out the backward fft along y.
+    CALL cpu_log( log_point_s(7), 'fft_y_1d', 'continue' )
+    IF ( loop_optimization == 'vector' )  THEN
+       ALLOCATE( work_ffty_vec(0:ny+1,1:nz,nxl:nxr) )
+!
+!--    Code optimized for vector processors
+       !$OMP PARALLEL PRIVATE ( i, j, k )
+       !$OMP DO
+       DO  k = 1, nz
+          DO  j = 0, ny
+             DO  i = nxl, nxr
+                work_ffty_vec(j,k,i) = work(i,k,j)
+             ENDDO
+          ENDDO
+       ENDDO
+       !$OMP DO
+       DO  i = nxl, nxr
+          CALL fft_y_m( work_ffty_vec(:,:,i), ny+1, 'backward' )
+          DO  j = 0, ny
+             DO  k = 1, nz
+                f_out(k,j,i) = work_ffty_vec(j,k,i)
+             ENDDO
+          ENDDO
+       ENDDO
+       !$OMP END PARALLEL
+       DEALLOCATE( work_ffty_vec )
+    ELSE
+!
+!--    Cache optimized code.
+       ALLOCATE( work_ffty(0:ny,stridex) )
+!
+!--    The i-(x-)direction is split into a strided outer loop and an inner loop for better cache
+!--    performance
+       !$OMP PARALLEL PRIVATE ( i, iend, iouter, ir, j, k, work_ffty )
+       !$OMP DO
+       DO  iouter = nxl, nxr, stridex
+          iend = MIN( iouter+stridex-1, nxr )  ! Upper bound for inner i loop
           DO  k = 1, nz
+!
+!--          Resort
              DO  j = 0, ny
                 DO  i = nxl, nxr
                    work_ffty_vec(j,k,i) = work(i,k,j)
+                DO  i = iouter, iend
+                   work_ffty(j,i-iouter+1) = work(i,k,j)
                 ENDDO
              ENDDO
+          ENDDO
+          !$OMP DO
+          DO  i = nxl, nxr
              CALL fft_y_m( work_ffty_vec(:,:,i), ny+1, 'backward' )
+             DO  j = 0, ny
                 DO  k = 1, nz
                    f_out(k,j,i) = work_ffty_vec(j,k,i)
+             DO  i = iouter, iend
+!
+!--             FFT along y
+                ir = i-iouter+1  ! Counter within a stride
+                CALL fft_y_1d( work_ffty(:,ir), 'backward' )
+                DO  j = 0, ny
+                   f_out(k,j,i) = work_ffty(j,ir)
                 ENDDO
              ENDDO
           ENDDO
+          !$OMP END PARALLEL
+          DEALLOCATE( work_ffty_vec )
+       ELSE
+!
+!--       Cache optimized code.
+          ALLOCATE( work_ffty(0:ny,stridex) )
+!
+!--       The i-(x-)direction is split into a strided outer loop and an inner
+!--       loop for better cache performance
+          !$OMP PARALLEL PRIVATE ( i, iend, iouter, ir, j, k, work_ffty )
+          !$OMP DO
+          DO  iouter = nxl, nxr, stridex
+             iend = MIN( iouter+stridex-1, nxr )  ! Upper bound for inner i loop
+             DO  k = 1, nz
+!
+!--             Resort
+                DO  j = 0, ny
+                   DO  i = iouter, iend
+                      work_ffty(j,i-iouter+1) = work(i,k,j)
+                   ENDDO
+                ENDDO
+                DO  i = iouter, iend
+!
+!--                FFT along y
+                   ir = i-iouter+1  ! counter within a stride
+                   CALL fft_y_1d( work_ffty(:,ir), 'backward' )
+                   DO  j = 0, ny
+                      f_out(k,j,i) = work_ffty(j,ir)
+                   ENDDO
+                ENDDO
+             ENDDO
+          ENDDO
+          !$OMP END PARALLEL
+          DEALLOCATE( work_ffty )
+       ENDIF
+       CALL cpu_log( log_point_s(7), 'fft_y_1d', 'stop' )
+    END SUBROUTINE tr_xy_ffty
+!------------------------------------------------------------------------------!
+       ENDDO
+       !$OMP END PARALLEL
+       DEALLOCATE( work_ffty )
+    ENDIF
+    CALL cpu_log( log_point_s(7), 'fft_y_1d', 'stop' )
+ END SUBROUTINE tr_xy_ffty
+!--------------------------------------------------------------------------------------------------!
 ! Description:
 ! ------------
+!> FFT along x, solution of the tridiagonal system and backward FFT for
+!> a 1d-decomposition along x
+!> FFT along x, solution of the tridiagonal system and backward FFT for a 1d-decomposition along x
 !>
+!> @warning this subroutine may still not work for hybrid parallelization
+!>          with OpenMP (for possible necessary changes see the original
+!>          routine poisfft_hybrid, developed by Klaus Ketelsen, May 2002)
+!------------------------------------------------------------------------------!
+    SUBROUTINE fftx_tri_fftx( ar )
+       USE control_parameters,                                                 &
+           ONLY:  loop_optimization
+       USE cpulog,                                                             &
+           ONLY:  cpu_log, log_point_s
+       USE grid_variables,                                                     &
+           ONLY:  ddx2, ddy2
+       USE kinds
+       USE pegrid
+       IMPLICIT NONE
+       INTEGER(iwp) ::  i                   !<
+       INTEGER(iwp) ::  j                   !<
+       INTEGER(iwp) ::  k                   !<
+       INTEGER(iwp) ::  m                   !<
+       INTEGER(iwp) ::  n                   !<
+!$     INTEGER(iwp) ::  omp_get_thread_num  !<
+       INTEGER(iwp) ::  tn                  !<
+       REAL(wp), DIMENSION(0:nx)                          ::  work_fftx  !<
+       REAL(wp), DIMENSION(0:nx,1:nz)                     ::  work_trix  !<
+       REAL(wp), DIMENSION(nnx,1:nz,nys_x:nyn_x,pdims(1)) ::  ar         !<
+       REAL(wp), DIMENSION(:,:,:,:), ALLOCATABLE          ::  tri        !<
+       CALL cpu_log( log_point_s(33), 'fft_x_1d + tridia', 'start' )
+       ALLOCATE( tri(5,0:nx,0:nz-1,0:threads_per_task-1) )
+       tn = 0              ! Default thread number in case of one thread
+!> @warning This subroutine may still not work for hybrid parallelization with OpenMP (for possible
+!>          necessary changes see the original routine poisfft_hybrid, developed by Klaus Ketelsen,
+!>          May 2002)
+!--------------------------------------------------------------------------------------------------!
+ SUBROUTINE fftx_tri_fftx( ar )
+    USE control_parameters,                                                                        &
+        ONLY:  loop_optimization
+    USE cpulog,                                                                                    &
+        ONLY:  cpu_log,                                                                            &
+               log_point_s
+    USE grid_variables,                                                                            &
+        ONLY:  ddx2,                                                                               &
+               ddy2
+    USE kinds
+    USE pegrid
+    IMPLICIT NONE
+    INTEGER(iwp) ::  i                   !<
+    INTEGER(iwp) ::  j                   !<
+    INTEGER(iwp) ::  k                   !<
+    INTEGER(iwp) ::  m                   !<
+    INTEGER(iwp) ::  n                   !<
+!   INTEGER(iwp) ::  omp_get_thread_num  !<
+    INTEGER(iwp) ::  tn                  !<
+    REAL(wp), DIMENSION(0:nx)                          ::  work_fftx  !<
+    REAL(wp), DIMENSION(0:nx,1:nz)                     ::  work_trix  !<
+    REAL(wp), DIMENSION(nnx,1:nz,nys_x:nyn_x,pdims(1)) ::  ar         !<
+    REAL(wp), DIMENSION(:,:,:,:), ALLOCATABLE          ::  tri        !<
+    CALL cpu_log( log_point_s(33), 'fft_x_1d + tridia', 'start' )
+    ALLOCATE( tri(5,0:nx,0:nz-1,0:threads_per_task-1) )
+    tn = 0              ! Default thread number in case of one thread
 !$OMP  PARALLEL DO PRIVATE ( i, j, k, m, n, tn, work_fftx, work_trix )
+       DO  j = nys_x, nyn_x
+!$        tn = omp_get_thread_num()
+          IF ( loop_optimization == 'vector' )  THEN
+!
+!--          Code optimized for vector processors
+             DO  k = 1, nz
+                m = 0
+                DO  n = 1, pdims(1)
+                   DO  i = 1, nnx
+                      work_trix(m,k) = ar(i,k,j,n)
+                      m = m + 1
+                   ENDDO
+    DO  j = nys_x, nyn_x
+!$     tn = omp_get_thread_num()
+       IF ( loop_optimization == 'vector' )  THEN
+!
+!--       Code optimized for vector processors
+          DO  k = 1, nz
+             m = 0
+             DO  n = 1, pdims(1)
+                DO  i = 1, nnx
+                   work_trix(m,k) = ar(i,k,j,n)
+                   m = m + 1
                 ENDDO
+             ENDDO
+             CALL fft_x_m( work_trix, 'forward' )
+          ELSE
+!
 !--          Cache optimized code
+             DO  k = 1, nz
+                m = 0
                 DO  n = 1, pdims(1)
                    DO  i = 1, nnx
                       work_fftx(m) = ar(i,k,j,n)
                       m = m + 1
                    ENDDO
+             ENDDO
+          ENDDO
+          CALL fft_x_m( work_trix, 'forward' )
+       ELSE
+!
+!--       Cache optimized code
+          DO  k = 1, nz
+             m = 0
+             DO  n = 1, pdims(1)
+                DO  i = 1, nnx
+                   work_fftx(m) = ar(i,k,j,n)
+                   m = m + 1
                 ENDDO
+                CALL fft_x_1d( work_fftx, 'forward' )
+                DO  i = 0, nx
+                   work_trix(i,k) = work_fftx(i)
+             ENDDO
+             CALL fft_x_1d( work_fftx, 'forward' )
+             DO  i = 0, nx
+                work_trix(i,k) = work_fftx(i)
+             ENDDO
+          ENDDO
+       ENDIF
+!
+!--    Solve the linear equation system
+       CALL tridia_1dd( ddx2, ddy2, nx, ny, j, work_trix, tri(:,:,:,tn) )
+       IF ( loop_optimization == 'vector' )  THEN
+!
+!--       Code optimized for vector processors
+          CALL fft_x_m( work_trix, 'backward' )
+          DO  k = 1, nz
+             m = 0
+             DO  n = 1, pdims(1)
+                DO  i = 1, nnx
+                   ar(i,k,j,n) = work_trix(m,k)
+                   m = m + 1
                 ENDDO
+             ENDDO
+          ENDIF
+!
+!--       Solve the linear equation system
+          CALL tridia_1dd( ddx2, ddy2, nx, ny, j, work_trix, tri(:,:,:,tn) )
+          IF ( loop_optimization == 'vector' )  THEN
+!
+!--          Code optimized for vector processors
+             CALL fft_x_m( work_trix, 'backward' )
+             DO  k = 1, nz
+                m = 0
+                DO  n = 1, pdims(1)
+                   DO  i = 1, nnx
+                      ar(i,k,j,n) = work_trix(m,k)
+                      m = m + 1
+                   ENDDO
+             ENDDO
+          ENDDO
+       ELSE
+!
+!--       Cache optimized code
+          DO  k = 1, nz
+             DO  i = 0, nx
+                work_fftx(i) = work_trix(i,k)
+             ENDDO
+             CALL fft_x_1d( work_fftx, 'backward' )
+             m = 0
+             DO  n = 1, pdims(1)
+                DO  i = 1, nnx
+                   ar(i,k,j,n) = work_fftx(m)
+                   m = m + 1
                 ENDDO
+             ENDDO
+          ELSE
+!
+!--          Cache optimized code
+             DO  k = 1, nz
+                DO  i = 0, nx
+                   work_fftx(i) = work_trix(i,k)
+                ENDDO
+                CALL fft_x_1d( work_fftx, 'backward' )
+                m = 0
+                DO  n = 1, pdims(1)
+                   DO  i = 1, nnx
+                      ar(i,k,j,n) = work_fftx(m)
+                      m = m + 1
+                   ENDDO
+                ENDDO
+             ENDDO
+          ENDIF
+       ENDDO
+       DEALLOCATE( tri )
+       CALL cpu_log( log_point_s(33), 'fft_x_1d + tridia', 'stop' )
+    END SUBROUTINE fftx_tri_fftx
+!------------------------------------------------------------------------------!
+             ENDDO
+          ENDDO
+       ENDIF
+    ENDDO
+    DEALLOCATE( tri )
+    CALL cpu_log( log_point_s(33), 'fft_x_1d + tridia', 'stop' )
+ END SUBROUTINE fftx_tri_fftx
+!--------------------------------------------------------------------------------------------------!
 ! Description:
 ! ------------
 !> Fourier-transformation along x with subsequent transposition x --> y for
 !> a 1d-decomposition along y.
+!> Fourier-transformation along x with subsequent transposition x --> y for a 1d-decomposition
+!> along y.
 !>
+!> @attention NEC-branch of this routine may significantly profit from
+!>            further optimizations. So far, performance is much worse than
+!>            for routine ffty_tr_yx (more than three times slower).
+!------------------------------------------------------------------------------!
+    SUBROUTINE fftx_tr_xy( f_in, f_out )
+       USE control_parameters,                                                 &
+           ONLY:  loop_optimization
+       USE cpulog,                                                             &
+           ONLY:  cpu_log, log_point_s
+       USE kinds
+       USE pegrid
+       IMPLICIT NONE
+       INTEGER(iwp) ::  i  !<
+       INTEGER(iwp) ::  j  !<
+       INTEGER(iwp) ::  k  !<
+       REAL(wp), DIMENSION(0:nx,1:nz,nys:nyn)             ::  work_fftx  !<
+       REAL(wp), DIMENSION(1:nz,nys:nyn,0:nx)             ::  f_in       !<
+       REAL(wp), DIMENSION(nny,1:nz,nxl_y:nxr_y,pdims(2)) ::  f_out      !<
+       REAL(wp), DIMENSION(nys:nyn,1:nz,0:nx)             ::  work       !<
+!
+!--    Carry out the FFT along x, where all data are present due to the
+!--    1d-decomposition along y. Resort the data in a way that y becomes
+!--    the first index.
+       CALL cpu_log( log_point_s(4), 'fft_x_1d', 'start' )
+       IF ( loop_optimization == 'vector' )  THEN
+!
+!--       Code for vector processors
+!$OMP     PARALLEL PRIVATE ( i, j, k )
+!$OMP     DO
+          DO  i = 0, nx
+             DO  j = nys, nyn
+                DO  k = 1, nz
+                   work_fftx(i,k,j) = f_in(k,j,i)
+                ENDDO
+             ENDDO
+          ENDDO
+!$OMP     DO
+          DO  j = nys, nyn
+             CALL fft_x_m( work_fftx(:,:,j), 'forward' )
+             DO  k = 1, nz
+                DO  i = 0, nx
+                   work(j,k,i) = work_fftx(i,k,j)
+                ENDDO
+             ENDDO
+          ENDDO
+!$OMP     END PARALLEL
+       ELSE
+!
+!--       Cache optimized code (there might be still a potential for better
+!--       optimization).
+!$OMP     PARALLEL PRIVATE (i,j,k)
+!$OMP     DO
+          DO  i = 0, nx
+             DO  j = nys, nyn
+                DO  k = 1, nz
+                   work_fftx(i,k,j) = f_in(k,j,i)
+                ENDDO
+             ENDDO
+          ENDDO
+!$OMP     DO
+!> @attention NEC-branch of this routine may significantly profit from further optimizations. So
+!>            far, performance is much worse than for routine ffty_tr_yx (more than three times
+!>            slower).
+!--------------------------------------------------------------------------------------------------!
+ SUBROUTINE fftx_tr_xy( f_in, f_out )
+    USE control_parameters,                                                                        &
+        ONLY:  loop_optimization
+    USE cpulog,                                                                                    &
+        ONLY:  cpu_log,                                                                            &
+               log_point_s
+    USE kinds
+    USE pegrid
+    IMPLICIT NONE
+    INTEGER(iwp) ::  i  !<
+    INTEGER(iwp) ::  j  !<
+    INTEGER(iwp) ::  k  !<
+    REAL(wp), DIMENSION(0:nx,1:nz,nys:nyn)             ::  work_fftx  !<
+    REAL(wp), DIMENSION(1:nz,nys:nyn,0:nx)             ::  f_in       !<
+    REAL(wp), DIMENSION(nny,1:nz,nxl_y:nxr_y,pdims(2)) ::  f_out      !<
+    REAL(wp), DIMENSION(nys:nyn,1:nz,0:nx)             ::  work       !<
+!
+!--  Carry out the FFT along x, where all data are present due to the 1d-decomposition along y.
+!--  Resort the data in a way that y becomes the first index.
+     CALL cpu_log( log_point_s(4), 'fft_x_1d', 'start' )
+     IF ( loop_optimization == 'vector' )  THEN
+!
+!--    Code for vector processors
+!$OMP  PARALLEL PRIVATE ( i, j, k )
+!$OMP  DO
+       DO  i = 0, nx
           DO  j = nys, nyn
              DO  k = 1, nz
+                CALL fft_x_1d( work_fftx(0:nx,k,j), 'forward' )
+                DO  i = 0, nx
+                   work(j,k,i) = work_fftx(i,k,j)
+                ENDDO
+             ENDDO
+          ENDDO
+!$OMP     END PARALLEL
+       ENDIF
+       CALL cpu_log( log_point_s(4), 'fft_x_1d', 'pause' )
+!
+!--    Transpose array
+                work_fftx(i,k,j) = f_in(k,j,i)
+             ENDDO
+          ENDDO
+       ENDDO
+!$OMP  DO
+       DO  j = nys, nyn
+          CALL fft_x_m( work_fftx(:,:,j), 'forward' )
+          DO  k = 1, nz
+             DO  i = 0, nx
+                work(j,k,i) = work_fftx(i,k,j)
+             ENDDO
+          ENDDO
+       ENDDO
+!$OMP  END PARALLEL
+    ELSE
+!
+!--    Cache optimized code (there might still be a potential for better optimization).
+!$OMP  PARALLEL PRIVATE (i,j,k)
+!$OMP  DO
+       DO  i = 0, nx
+          DO  j = nys, nyn
+             DO  k = 1, nz
+                work_fftx(i,k,j) = f_in(k,j,i)
+             ENDDO
+          ENDDO
+       ENDDO
+!$OMP  DO
+       DO  j = nys, nyn
+          DO  k = 1, nz
+             CALL fft_x_1d( work_fftx(0:nx,k,j), 'forward' )
+             DO  i = 0, nx
+                work(j,k,i) = work_fftx(i,k,j)
+             ENDDO
+          ENDDO
+       ENDDO
+!$OMP  END PARALLEL
+    ENDIF
+    CALL cpu_log( log_point_s(4), 'fft_x_1d', 'pause' )
+!
+!-- Transpose array
 #if defined( __parallel )
+       CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
+       IF ( collective_wait )  CALL MPI_BARRIER( comm2d, ierr )
+       CALL MPI_ALLTOALL( work(nys,1,0),      sendrecvcount_xy, MPI_REAL, &
+                          f_out(1,1,nxl_y,1), sendrecvcount_xy, MPI_REAL, &
+                          comm1dy, ierr )
+       CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
+    CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
+    IF ( collective_wait )  CALL MPI_BARRIER( comm2d, ierr )
+    CALL MPI_ALLTOALL( work(nys,1,0), sendrecvcount_xy, MPI_REAL, f_out(1,1,nxl_y,1),              &
+                       sendrecvcount_xy, MPI_REAL, comm1dy, ierr )
+    CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
 #else
+!
 !--    Next line required to avoid compile error about unused dummy argument in serial mode
        i = SIZE( f_out )
+!-- Next line required to avoid compile error about unused dummy argument in serial mode
+    i = SIZE( f_out )
 #endif
     END SUBROUTINE fftx_tr_xy
 !------------------------------------------------------------------------------!
+ END SUBROUTINE fftx_tr_xy
+!--------------------------------------------------------------------------------------------------!
 ! Description:
 ! ------------
+!> Transposition y --> x with a subsequent backward Fourier transformation for
+!> a 1d-decomposition along x.
+!------------------------------------------------------------------------------!
+    SUBROUTINE tr_yx_fftx( f_in, f_out )
+       USE control_parameters,                                                 &
+           ONLY:  loop_optimization
+       USE cpulog,                                                             &
+           ONLY:  cpu_log, log_point_s
+       USE kinds
+       USE pegrid
+       IMPLICIT NONE
+       INTEGER(iwp) ::  i  !<
+       INTEGER(iwp) ::  j  !<
+       INTEGER(iwp) ::  k  !<
+       REAL(wp), DIMENSION(0:nx,1:nz,nys:nyn)             ::  work_fftx  !<
+       REAL(wp), DIMENSION(nny,1:nz,nxl_y:nxr_y,pdims(2)) ::  f_in       !<
+       REAL(wp), DIMENSION(1:nz,nys:nyn,0:nx)             ::  f_out      !<
+       REAL(wp), DIMENSION(nys:nyn,1:nz,0:nx)             ::  work       !<
+!
+!--    Transpose array
+!> Transposition y --> x with a subsequent backward Fourier transformation for a 1d-decomposition
+!> along x.
+!--------------------------------------------------------------------------------------------------!
+ SUBROUTINE tr_yx_fftx( f_in, f_out )
+    USE control_parameters,                                                                        &
+        ONLY:  loop_optimization
+    USE cpulog,                                                                                    &
+        ONLY:  cpu_log,                                                                            &
+               log_point_s
+    USE kinds
+    USE pegrid
+    IMPLICIT NONE
+    INTEGER(iwp) ::  i  !<
+    INTEGER(iwp) ::  j  !<
+    INTEGER(iwp) ::  k  !<
+    REAL(wp), DIMENSION(nny,1:nz,nxl_y:nxr_y,pdims(2)) ::  f_in       !<
+    REAL(wp), DIMENSION(1:nz,nys:nyn,0:nx)             ::  f_out      !<
+    REAL(wp), DIMENSION(nys:nyn,1:nz,0:nx)             ::  work       !<
+    REAL(wp), DIMENSION(0:nx,1:nz,nys:nyn)             ::  work_fftx  !<
+!
+!-- Transpose array
 #if defined( __parallel )
+       CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
+       IF ( collective_wait )  CALL MPI_BARRIER( comm2d, ierr )
+       CALL MPI_ALLTOALL( f_in(1,1,nxl_y,1), sendrecvcount_xy, MPI_REAL, &
+                          work(nys,1,0),     sendrecvcount_xy, MPI_REAL, &
+                          comm1dy, ierr )
+       CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
+    CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
+    IF ( collective_wait )  CALL MPI_BARRIER( comm2d, ierr )
+    CALL MPI_ALLTOALL( f_in(1,1,nxl_y,1), sendrecvcount_xy, MPI_REAL, work(nys,1,0),               &
+                       sendrecvcount_xy, MPI_REAL, comm1dy, ierr )
+    CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
 #else
+!
 !--    Next line required to avoid compile error about unused dummy argument in serial mode
        i = SIZE( f_in )
+!-- Next line required to avoid compile error about unused dummy argument in serial mode
+    i = SIZE( f_in )
 #endif
+!
+!--    Carry out the FFT along x, where all data are present due to the
+!--    1d-decomposition along y. Resort the data in a way that y becomes
+!--    the first index.
+       CALL cpu_log( log_point_s(4), 'fft_x_1d', 'continue' )
+       IF ( loop_optimization == 'vector' )  THEN
+!
+!--       Code optimized for vector processors
+!$OMP     PARALLEL PRIVATE ( i, j, k )
+!$OMP     DO
+          DO  j = nys, nyn
+             DO  k = 1, nz
+                DO  i = 0, nx
+                   work_fftx(i,k,j) = work(j,k,i)
+                ENDDO
+             ENDDO
+             CALL fft_x_m( work_fftx(:,:,j), 'backward' )
+          ENDDO
+!$OMP     DO
+          DO  i = 0, nx
+             DO  j = nys, nyn
+                DO  k = 1, nz
+                   f_out(k,j,i) = work_fftx(i,k,j)
+                ENDDO
+             ENDDO
+          ENDDO
+!$OMP     END PARALLEL
+       ELSE
+!
+!--       Cache optimized code (there might be still a potential for better
+!--       optimization).
+!$OMP     PARALLEL PRIVATE (i,j,k)
+!$OMP     DO
+!-- Carry out the FFT along x, where all data are present due to the 1d-decomposition along y.
+!-- Resort the data in a way that y becomes the first index.
+    CALL cpu_log( log_point_s(4), 'fft_x_1d', 'continue' )
+    IF ( loop_optimization == 'vector' )  THEN
+!
+!--    Code optimized for vector processors
+!$OMP  PARALLEL PRIVATE ( i, j, k )
+!$OMP  DO
+       DO  j = nys, nyn
+          DO  k = 1, nz
+             DO  i = 0, nx
+                work_fftx(i,k,j) = work(j,k,i)
+             ENDDO
+          ENDDO
+          CALL fft_x_m( work_fftx(:,:,j), 'backward' )
+       ENDDO
+!$OMP  DO
+       DO  i = 0, nx
           DO  j = nys, nyn
              DO  k = 1, nz
+                DO  i = 0, nx
+                   work_fftx(i,k,j) = work(j,k,i)
+                ENDDO
+                CALL fft_x_1d( work_fftx(0:nx,k,j), 'backward' )
+             ENDDO
+          ENDDO
+!$OMP     DO
+          DO  i = 0, nx
+             DO  j = nys, nyn
+                DO  k = 1, nz
+                   f_out(k,j,i) = work_fftx(i,k,j)
+                ENDDO
+             ENDDO
+          ENDDO
+!$OMP     END PARALLEL
+       ENDIF
+       CALL cpu_log( log_point_s(4), 'fft_x_1d', 'stop' )
+    END SUBROUTINE tr_yx_fftx
+!------------------------------------------------------------------------------!
+                f_out(k,j,i) = work_fftx(i,k,j)
+             ENDDO
+          ENDDO
+       ENDDO
+!$OMP  END PARALLEL
+    ELSE
+!
+!--    Cache optimized code (there might be still a potential for better optimization).
+!$OMP  PARALLEL PRIVATE (i,j,k)
+!$OMP  DO
+       DO  j = nys, nyn
+          DO  k = 1, nz
+             DO  i = 0, nx
+                work_fftx(i,k,j) = work(j,k,i)
+             ENDDO
+             CALL fft_x_1d( work_fftx(0:nx,k,j), 'backward' )
+          ENDDO
+       ENDDO
+!$OMP  DO
+       DO  i = 0, nx
+          DO  j = nys, nyn
+             DO  k = 1, nz
+                f_out(k,j,i) = work_fftx(i,k,j)
+             ENDDO
+          ENDDO
+       ENDDO
+!$OMP  END PARALLEL
+    ENDIF
+    CALL cpu_log( log_point_s(4), 'fft_x_1d', 'stop' )
+ END SUBROUTINE tr_yx_fftx
+!--------------------------------------------------------------------------------------------------!
 ! Description:
 ! ------------
+!> FFT along y, solution of the tridiagonal system and backward FFT for
+!> a 1d-decomposition along y.
+!> FFT along y, solution of the tridiagonal system and backward FFT for a 1d-decomposition along y.
 !>
+!> @warning this subroutine may still not work for hybrid parallelization
+!>          with OpenMP (for possible necessary changes see the original
+!>          routine poisfft_hybrid, developed by Klaus Ketelsen, May 2002)
+!------------------------------------------------------------------------------!
+    SUBROUTINE ffty_tri_ffty( ar )
+       USE control_parameters,                                                 &
+           ONLY:  loop_optimization
+       USE cpulog,                                                             &
+           ONLY:  cpu_log, log_point_s
+       USE grid_variables,                                                     &
+           ONLY:  ddx2, ddy2
+       USE kinds
+       USE pegrid
+       IMPLICIT NONE
+       INTEGER(iwp) ::  i                   !<
+       INTEGER(iwp) ::  j                   !<
+       INTEGER(iwp) ::  k                   !<
+       INTEGER(iwp) ::  m                   !<
+       INTEGER(iwp) ::  n                   !<
+!$     INTEGER(iwp) ::  omp_get_thread_num  !<
+       INTEGER(iwp) ::  tn                  !<
+       REAL(wp), DIMENSION(0:ny)                          ::  work_ffty  !<
+       REAL(wp), DIMENSION(0:ny,1:nz)                     ::  work_triy  !<
+       REAL(wp), DIMENSION(nny,1:nz,nxl_y:nxr_y,pdims(2)) ::  ar         !<
+       REAL(wp), DIMENSION(:,:,:,:), ALLOCATABLE          ::  tri        !<
+       CALL cpu_log( log_point_s(39), 'fft_y_1d + tridia', 'start' )
+       ALLOCATE( tri(5,0:ny,0:nz-1,0:threads_per_task-1) )
+       tn = 0           ! Default thread number in case of one thread
+!$OMP  PARALLEL DO PRIVATE ( i, j, k, m, n, tn, work_ffty, work_triy )
+       DO  i = nxl_y, nxr_y
+!$        tn = omp_get_thread_num()
+          IF ( loop_optimization == 'vector' )  THEN
+!
+!--          Code optimized for vector processors
+             DO  k = 1, nz
+                m = 0
+                DO  n = 1, pdims(2)
+                   DO  j = 1, nny
+                      work_triy(m,k) = ar(j,k,i,n)
+                      m = m + 1
+                   ENDDO
+!> @warning This subroutine may still not work for hybrid parallelization with OpenMP (for possible
+!>          necessary changes see the original routine poisfft_hybrid, developed by Klaus Ketelsen,
+!>          May 2002)
+!--------------------------------------------------------------------------------------------------!
+ SUBROUTINE ffty_tri_ffty( ar )
+    USE control_parameters,                                                                        &
+        ONLY:  loop_optimization
+    USE cpulog,                                                                                    &
+        ONLY:  cpu_log,                                                                            &
+               log_point_s
+    USE grid_variables,                                                                            &
+        ONLY:  ddx2,                                                                               &
+               ddy2
+    USE kinds
+    USE pegrid
+    IMPLICIT NONE
+    INTEGER(iwp) ::  i                   !<
+    INTEGER(iwp) ::  j                   !<
+    INTEGER(iwp) ::  k                   !<
+    INTEGER(iwp) ::  m                   !<
+    INTEGER(iwp) ::  n                   !<
+!$  INTEGER(iwp) ::  omp_get_thread_num  !<
+    INTEGER(iwp) ::  tn                  !<
+    REAL(wp), DIMENSION(0:ny)                          ::  work_ffty  !<
+    REAL(wp), DIMENSION(0:ny,1:nz)                     ::  work_triy  !<
+    REAL(wp), DIMENSION(nny,1:nz,nxl_y:nxr_y,pdims(2)) ::  ar         !<
+    REAL(wp), DIMENSION(:,:,:,:), ALLOCATABLE          ::  tri        !<
+    CALL cpu_log( log_point_s(39), 'fft_y_1d + tridia', 'start' )
+    ALLOCATE( tri(5,0:ny,0:nz-1,0:threads_per_task-1) )
+    tn = 0           ! Default thread number in case of one thread
+!$OMP PARALLEL DO PRIVATE ( i, j, k, m, n, tn, work_ffty, work_triy )
+    DO  i = nxl_y, nxr_y
+!$     tn = omp_get_thread_num()
+       IF ( loop_optimization == 'vector' )  THEN
+!
+!--       Code optimized for vector processors
+          DO  k = 1, nz
+             m = 0
+             DO  n = 1, pdims(2)
+                DO  j = 1, nny
+                   work_triy(m,k) = ar(j,k,i,n)
+                   m = m + 1
                 ENDDO
+             ENDDO
+             CALL fft_y_m( work_triy, ny, 'forward' )
+          ELSE
+!
 !--          Cache optimized code
+             DO  k = 1, nz
+                m = 0
                 DO  n = 1, pdims(2)
                    DO  j = 1, nny
                       work_ffty(m) = ar(j,k,i,n)
                       m = m + 1
                    ENDDO
+             ENDDO
+          ENDDO
+          CALL fft_y_m( work_triy, ny, 'forward' )
+       ELSE
+!
+!--       Cache optimized code
+          DO  k = 1, nz
+             m = 0
+             DO  n = 1, pdims(2)
+                DO  j = 1, nny
+                   work_ffty(m) = ar(j,k,i,n)
+                   m = m + 1
                 ENDDO
+                CALL fft_y_1d( work_ffty, 'forward' )
+                DO  j = 0, ny
+                   work_triy(j,k) = work_ffty(j)
+             ENDDO
+             CALL fft_y_1d( work_ffty, 'forward' )
+             DO  j = 0, ny
+                work_triy(j,k) = work_ffty(j)
+             ENDDO
+          ENDDO
+       ENDIF
+!
+!--    Solve the linear equation system
+       CALL tridia_1dd( ddy2, ddx2, ny, nx, i, work_triy, tri(:,:,:,tn) )
+       IF ( loop_optimization == 'vector' )  THEN
+!
+!--       Code optimized for vector processors
+          CALL fft_y_m( work_triy, ny, 'backward' )
+          DO  k = 1, nz
+             m = 0
+             DO  n = 1, pdims(2)
+                DO  j = 1, nny
+                   ar(j,k,i,n) = work_triy(m,k)
+                   m = m + 1
                 ENDDO
+             ENDDO
+          ENDIF
+!
+!--       Solve the linear equation system
+          CALL tridia_1dd( ddy2, ddx2, ny, nx, i, work_triy, tri(:,:,:,tn) )
+          IF ( loop_optimization == 'vector' )  THEN
+!
+!--          Code optimized for vector processors
+             CALL fft_y_m( work_triy, ny, 'backward' )
+             DO  k = 1, nz
+                m = 0
+                DO  n = 1, pdims(2)
+                   DO  j = 1, nny
+                      ar(j,k,i,n) = work_triy(m,k)
+                      m = m + 1
+                   ENDDO
+             ENDDO
+          ENDDO
+       ELSE
+!
+!--       Cache optimized code
+          DO  k = 1, nz
+             DO  j = 0, ny
+                work_ffty(j) = work_triy(j,k)
+             ENDDO
+             CALL fft_y_1d( work_ffty, 'backward' )
+             m = 0
+             DO  n = 1, pdims(2)
+                DO  j = 1, nny
+                   ar(j,k,i,n) = work_ffty(m)
+                   m = m + 1
                 ENDDO
+             ENDDO
+          ELSE
+!
+!--          Cache optimized code
+             DO  k = 1, nz
+                DO  j = 0, ny
+                   work_ffty(j) = work_triy(j,k)
+                ENDDO
+                CALL fft_y_1d( work_ffty, 'backward' )
+                m = 0
+                DO  n = 1, pdims(2)
+                   DO  j = 1, nny
+                      ar(j,k,i,n) = work_ffty(m)
+                      m = m + 1
+                   ENDDO
+                ENDDO
+             ENDDO
+          ENDIF
+       ENDDO
+       DEALLOCATE( tri )
+       CALL cpu_log( log_point_s(39), 'fft_y_1d + tridia', 'stop' )
+    END SUBROUTINE ffty_tri_ffty
+             ENDDO
+          ENDDO
+       ENDIF
+    ENDDO
+    DEALLOCATE( tri )
+    CALL cpu_log( log_point_s(39), 'fft_y_1d + tridia', 'stop' )
+ END SUBROUTINE ffty_tri_ffty
  END MODULE poisfft_mod

Note: See TracChangeset for help on using the changeset viewer.

Context Navigation

Changeset 4649 for palm/trunk/SOURCE/poisfft_mod.f90

Legend:

palm/trunk/SOURCE/poisfft_mod.f90

Download in other formats: