 MODULE poisfft_hybrid_mod

!------------------------------------------------------------------------------!
! Current revisions:
! -----------------
! Output of messages replaced by message handling routine.
!
!
! Former revisions:
! -----------------
! $Id: poisfft_hybrid.f90 257 2009-03-11 15:17:42Z weinreis $
!
! RCS Log replaced by Id keyword, revision history cleaned up
!
! Revision 1.11 2004/04/30 12:43:14 raasch
! Renaming of fft routines, additional argument in calls of fft_y_m
!
! Revision 1.2 2002/12/19 16:08:31 raasch
! Preprocessor directive KKMP introduced (OMP does NOT work), array tri will
! be a shared array in the OpenMP loop. To get better cache utilization, the
! i index (x-direction) is executed in stride "istride" as outer loop and in
! a shorter inner loop. Overlapping of computation and communication realized
! by new routine poisfft_hybrid_nodes, name of old routine poisfft_hybrid
! changed to poisfft_hybrid_omp, STOP statement replaced by call of
! subroutine local_stop
!
!
! Description:
! ------------
! Solution of the Poisson equation with a 2D spectral method.
! This hybrid version for parallel computers uses a 1D domain decomposition
! along x, realized with MPI, combined with OpenMP parallelization along y
! (routine poisfft_hybrid_omp). A second version (poisfft_hybrid_nodes)
! additionally overlaps computation and communication and simultaneously
! executes as many communication calls as there are switches per logical
! partition (LPAR). This version takes effect if more than one node is used
! and if the environment variable tasks_per_node is set such that it can be
! divided by switch_per_lpar without remainder.
!
! WARNING: In case of OpenMP, there are problems with allocating large
! arrays in parallel regions.
!
! Copyright   Klaus Ketelsen / Siegfried Raasch   May 2002
!------------------------------------------------------------------------------!

    USE fft_xy
    USE indices
    USE pegrid

    IMPLICIT NONE

    PRIVATE
    PUBLIC poisfft_hybrid, poisfft_hybrid_ini

    INTEGER, PARAMETER ::  switch_per_lpar = 2

    INTEGER, SAVE ::  nxl_a, nxr_a, &  ! total   x dimension
                      nxl_p, nxr_p, &  ! partial x dimension
                      nys_a, nyn_a, &  ! total   y dimension
                      nys_p, nyn_p, &  ! partial y dimension
                      npe_s,        &  ! total number of PEs for solver
                      nwords,       &  ! number of points to be exchanged
                                       ! with MPI_ALLTOALL
                      n_omp_threads    ! number of OpenMP threads

!
!-- Variables for the multi node version (cluster version) using routine
!-- poisfft_hybrid_nodes
    INTEGER, SAVE ::  comm_nodes,          & ! communicator nodes
                      comm_node_all,       & ! communicator all PEs node version
                      comm_tasks,          & ! communicator tasks
                      me, me_node, me_task,& ! identity of this PE
                      nodes,               & ! number of nodes
                      tasks_per_logical_node = -1  ! default no cluster

!
!-- Public interfaces
    INTERFACE poisfft_hybrid_ini
       MODULE PROCEDURE poisfft_hybrid_ini
    END INTERFACE poisfft_hybrid_ini

    INTERFACE poisfft_hybrid
       MODULE PROCEDURE poisfft_hybrid
    END INTERFACE poisfft_hybrid
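
!
!-- Solver outline: on entry each PE holds the full y range for its x
!-- subdomain of ar(1:nz,nys:nyn,nxl:nxr). The solver (1) FFTs along y,
!-- (2) transposes the data with MPI_ALLTOALL so that each PE holds the
!-- full x range for a subset of y wavenumbers, (3) FFTs along x and solves
!-- a tridiagonal system along z for every horizontal wavenumber, and then
!-- reverses steps (2) and (1).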
!
!-- Private interfaces
    INTERFACE poisfft_hybrid_omp
       MODULE PROCEDURE poisfft_hybrid_omp
    END INTERFACE poisfft_hybrid_omp

    INTERFACE poisfft_hybrid_omp_vec
       MODULE PROCEDURE poisfft_hybrid_omp_vec
    END INTERFACE poisfft_hybrid_omp_vec

    INTERFACE poisfft_hybrid_nodes
       MODULE PROCEDURE poisfft_hybrid_nodes
    END INTERFACE poisfft_hybrid_nodes

    INTERFACE tridia_hybrid
       MODULE PROCEDURE tridia_hybrid
    END INTERFACE tridia_hybrid

    INTERFACE cascade
       MODULE PROCEDURE cascade
    END INTERFACE cascade

 CONTAINS

    SUBROUTINE poisfft_hybrid_ini

       USE control_parameters
       USE pegrid

       IMPLICIT NONE

       CHARACTER(LEN=8)      ::  cdummy
       INTEGER               ::  idummy, istat
       INTEGER, DIMENSION(2) ::  coords, dims

       LOGICAL, DIMENSION(2) ::  period = .FALSE., re_dims

!
!--    Set the internal index values for the hybrid solver
#if defined( __parallel )
       npe_s = pdims(1)
#else
       npe_s = 1
#endif
       nxl_a = 0
       nxr_a = nx
       nxl_p = 0
       nxr_p = ( ( nx+1 ) / npe_s ) - 1
       nys_a = nys
       nyn_a = nyn
       nys_p = 0
       nyn_p = ( ( ny+1 ) / npe_s ) - 1

       nwords = ( nxr_p-nxl_p+1 ) * nz * ( nyn_p-nys_p+1 )

#if defined( __KKMP )
       CALL LOCAL_GETENV( 'OMP_NUM_THREADS', 15, cdummy, idummy )
       READ ( cdummy, '(I8)' )  n_omp_threads
       IF ( n_omp_threads > 1 )  THEN
          WRITE( message_string, * ) 'number of OpenMP threads = ', &
                                     n_omp_threads
          CALL message( 'poisfft_hybrid_ini', 'PA0280', 0, 0, 0, 6, 0 )
       ENDIF
#else
       n_omp_threads = 1
#endif
!
!--    Initialize the one-dimensional FFT routines
       CALL fft_init

!
!--    Setup for the multi node version (poisfft_hybrid_nodes)
       IF ( n_omp_threads == 1  .AND.  &
            ( host(1:4) == 'ibmh'  .OR.  host(1:4) == 'ibmb' ) )  THEN

          IF ( tasks_per_node /= -9999 )  THEN
!
!--          The multi node version requires that the available number of
!--          switches per logical partition is an integral divisor of the
!--          chosen number of tasks per node
             IF ( MOD( tasks_per_node, switch_per_lpar ) == 0 )  THEN
!
!--             Set the switch which decides about usage of the multi node
!--             version
                IF ( tasks_per_node / switch_per_lpar > 1  .AND. &
                     numprocs > tasks_per_node )  THEN
                   tasks_per_logical_node = tasks_per_node / switch_per_lpar
                ENDIF

                IF ( tasks_per_logical_node > -1 )  THEN
                   WRITE( message_string, * ) 'running optimized ',       &
                                              'multinode version',        &
                                              '&switch_per_lpar = ',      &
                                              switch_per_lpar,            &
                                              '&tasks_per_lpar  = ',      &
                                              tasks_per_node,             &
                                              'tasks_per_logical_node = ',&
                                              tasks_per_logical_node
                   CALL message( 'poisfft_hybrid_ini', 'PA0281', 0, 0, 0, 6, 0 )
                ENDIF

             ENDIF
          ENDIF
       ENDIF

!
!--    Determine the sub-topologies for the multi node version
       IF ( tasks_per_logical_node >= 2 )  THEN

#if defined( __parallel )
          nodes   = ( numprocs + tasks_per_logical_node - 1 ) / &
                    tasks_per_logical_node
          dims(1) = nodes
          dims(2) = tasks_per_logical_node

          CALL MPI_CART_CREATE( comm2d, 2, dims, period, .FALSE., &
                                comm_node_all, istat )
          CALL MPI_COMM_RANK( comm_node_all, me, istat )

          re_dims(1) = .TRUE.
          re_dims(2) = .FALSE.
          CALL MPI_CART_SUB( comm_node_all, re_dims, comm_nodes, istat )
          CALL MPI_COMM_RANK( comm_nodes, me_node, istat )

          re_dims(1) = .FALSE.
          re_dims(2) = .TRUE.
          CALL MPI_CART_SUB( comm_node_all, re_dims, comm_tasks, istat )
          CALL MPI_COMM_RANK( comm_tasks, me_task, istat )
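
!
!--       The process grid created above has shape
!--       (nodes, tasks_per_logical_node): comm_nodes connects the PEs that
!--       have the same task index on different logical nodes, comm_tasks
!--       connects the PEs within one logical node. Hypothetical example:
!--       numprocs = 8 and tasks_per_logical_node = 4 give nodes = 2, so
!--       every comm_tasks has 4 members and every comm_nodes has 2.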
!         write(0,*) 'who am i', myid, me, me_node, me_task, nodes, &
!                    tasks_per_logical_node
#else
          message_string = 'parallel environment (MPI) required'
          CALL message( 'poisfft_hybrid_ini', 'PA0282', 1, 2, 0, 6, 0 )
#endif

       ENDIF

    END SUBROUTINE poisfft_hybrid_ini


    SUBROUTINE poisfft_hybrid( ar )

       USE control_parameters
       USE interfaces

       IMPLICIT NONE

       REAL, DIMENSION(1:nz,nys:nyn,nxl:nxr) ::  ar

       IF ( host(1:3) == 'nec' )  THEN
          CALL poisfft_hybrid_omp_vec( ar )
       ELSE
          IF ( tasks_per_logical_node == -1 )  THEN
             CALL poisfft_hybrid_omp( ar )
          ELSE
             CALL poisfft_hybrid_nodes( ar )
          ENDIF
       ENDIF

    END SUBROUTINE poisfft_hybrid


    SUBROUTINE poisfft_hybrid_omp( ar )

       USE cpulog
       USE interfaces

       IMPLICIT NONE

       INTEGER, PARAMETER ::  istride = 4  ! stride of i loop

       INTEGER ::  i, ii, ir, iei, iouter, istat, j, jj, k, m, n, jthread

       REAL, DIMENSION(1:nz,nys:nyn,nxl:nxr) ::  ar
       REAL, DIMENSION(0:nx)                 ::  fftx_ar
       REAL, DIMENSION(0:ny,istride)         ::  ffty_ar
       REAL, DIMENSION(0:nx,nz)              ::  tri_ar

       REAL, DIMENSION(nxl_p:nxr_p,nz,nys_p:nyn_p,npe_s) ::  work1, work2
#if defined( __KKMP )
       INTEGER ::  omp_get_thread_num
       REAL, DIMENSION(:,:,:,:), ALLOCATABLE ::  tri
       ALLOCATE( tri(5,0:nx,0:nz-1,n_omp_threads) )
#else
       REAL, DIMENSION(5,0:nx,0:nz-1,1) ::  tri
#endif

       CALL cpu_log( log_point_s(30), 'poisfft_hybrid_omp', 'start' )
       CALL cpu_log( log_point_s(7), 'fft_y', 'start' )

!$OMP  PARALLEL PRIVATE ( i, iouter, ii, ir, iei, j, k, m, n, ffty_ar )
!$OMP  DO
!
!--    Store grid points to be transformed on a 1d-array, do the fft
!--    and sample the results on a 4d-array
       DO  iouter = nxl_p, nxr_p, istride  ! stride loop, better cache
          iei = MIN( iouter+istride-1, nxr_p )
          DO  k = 1, nz

             DO  i = iouter, iei
                ii = nxl + i
                ir = i - iouter + 1

                DO  j = nys_a, nyn_a
                   ffty_ar(j,ir) = ar(k,j,ii)
                ENDDO

                CALL fft_y( ffty_ar(:,ir), 'forward' )
             ENDDO

             m = nys_a
             DO  n = 1, npe_s
                DO  j = nys_p, nyn_p
                   DO  i = iouter, iei
                      ir = i - iouter + 1
                      work1(i,k,j,n) = ffty_ar(m,ir)
                   ENDDO
                   m = m+1
                ENDDO
             ENDDO

          ENDDO
       ENDDO
!$OMP  END PARALLEL

       CALL cpu_log( log_point_s(7), 'fft_y', 'pause' )

#if defined( __parallel )
       CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
       CALL MPI_ALLTOALL( work1(nxl_p,1,nys_p,1), nwords, MPI_REAL, &
                          work2(nxl_p,1,nys_p,1), nwords, MPI_REAL, &
                          comm2d, istat )
       CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
#else
       work2 = work1
#endif
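
!
!--    The exchange above transposes from y-pencils to x-pencils: each PE
!--    sends block n of work1 (the nwords values destined for PE n) and
!--    receives the corresponding block from every PE into work2. Afterwards
!--    a PE holds all nx+1 x points for its (ny+1)/npe_s y wavenumbers, so
!--    the x-FFT and the tridiagonal solve need no further communication.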
       CALL cpu_log( log_point_s(33), 'fft_x + tridia', 'start' )

#if defined( __KKMP )
!$OMP  PARALLEL PRIVATE ( i, j, jj, k, m, n, fftx_ar, tri_ar, jthread )
!$OMP  DO
       DO  j = nys_p, nyn_p
          jthread = omp_get_thread_num() + 1
#else
       DO  j = nys_p, nyn_p
          jthread = 1
#endif
          DO  k = 1, nz

             m = nxl_a
             DO  n = 1, npe_s
                DO  i = nxl_p, nxr_p
                   fftx_ar(m) = work2(i,k,j,n)
                   m = m+1
                ENDDO
             ENDDO

             CALL fft_x( fftx_ar, 'forward' )

             DO  i = nxl_a, nxr_a
                tri_ar(i,k) = fftx_ar(i)
             ENDDO

          ENDDO

          jj = myid * (nyn_p-nys_p+1) + j   ! global index of this y wavenumber
          CALL tridia_hybrid( jj, tri_ar, tri(:,:,:,jthread) )

          DO  k = 1, nz

             DO  i = nxl_a, nxr_a
                fftx_ar(i) = tri_ar(i,k)
             ENDDO

             CALL fft_x( fftx_ar, 'backward' )

             m = nxl_a
             DO  n = 1, npe_s
                DO  i = nxl_p, nxr_p
                   work2(i,k,j,n) = fftx_ar(m)
                   m = m+1
                ENDDO
             ENDDO

          ENDDO
       ENDDO
#if defined( __KKMP )
!$OMP  END PARALLEL
#endif

       CALL cpu_log( log_point_s(33), 'fft_x + tridia', 'stop' )

#if defined( __parallel )
       CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
       nwords = (nxr_p-nxl_p+1) * nz * (nyn_p-nys_p+1)
       CALL MPI_ALLTOALL( work2(nxl_p,1,nys_p,1), nwords, MPI_REAL, &
                          work1(nxl_p,1,nys_p,1), nwords, MPI_REAL, &
                          comm2d, istat )
       CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
#else
       work1 = work2
#endif

       CALL cpu_log( log_point_s(7), 'fft_y', 'continue' )

!$OMP  PARALLEL PRIVATE ( i, iouter, ii, ir, iei, j, k, m, n, ffty_ar )
!$OMP  DO
       DO  iouter = nxl_p, nxr_p, istride
          iei = MIN( iouter+istride-1, nxr_p )
          DO  k = 1, nz

             m = nys_a
             DO  n = 1, npe_s
                DO  j = nys_p, nyn_p
                   DO  i = iouter, iei
                      ir = i - iouter + 1
                      ffty_ar(m,ir) = work1(i,k,j,n)
                   ENDDO
                   m = m+1
                ENDDO
             ENDDO

             DO  i = iouter, iei
                ii = nxl + i
                ir = i - iouter + 1
                CALL fft_y( ffty_ar(:,ir), 'backward' )

                DO  j = nys_a, nyn_a
                   ar(k,j,ii) = ffty_ar(j,ir)
                ENDDO
             ENDDO

          ENDDO
       ENDDO
!$OMP  END PARALLEL

       CALL cpu_log( log_point_s(7), 'fft_y', 'stop' )
       CALL cpu_log( log_point_s(30), 'poisfft_hybrid_omp', 'stop' )

#if defined( __KKMP )
       DEALLOCATE( tri )
#endif

    END SUBROUTINE poisfft_hybrid_omp
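
!
!-- Variant of poisfft_hybrid_omp for vector machines (chosen when
!-- host(1:3) == 'nec'): fft_y_m and fft_x_m transform many lines at once
!-- instead of line by line. The FFT work arrays carry extra elements
!-- (0:ny+3 and 0:nx+3), presumably to give them a leading dimension that
!-- avoids unfavourable memory strides on vector hardware.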

    SUBROUTINE poisfft_hybrid_omp_vec( ar )

       USE cpulog
       USE interfaces

       IMPLICIT NONE

       INTEGER, PARAMETER ::  istride = 4  ! stride of i loop

       INTEGER ::  i, ii, ir, iei, iouter, istat, j, jj, k, m, n, jthread

       REAL, DIMENSION(0:nx,nz)               ::  tri_ar
       REAL, DIMENSION(1:nz,nys:nyn,nxl:nxr)  ::  ar
       REAL, DIMENSION(0:ny+3,nz,nxl_p:nxr_p) ::  ffty_ar3
       REAL, DIMENSION(0:nx+3,nz,nys_p:nyn_p) ::  fftx_ar3

       REAL, DIMENSION(nxl_p:nxr_p,nz,nys_p:nyn_p,npe_s) ::  work1, work2
#if defined( __KKMP )
       INTEGER ::  omp_get_thread_num
       REAL, DIMENSION(:,:,:,:), ALLOCATABLE ::  tri
       ALLOCATE( tri(5,0:nx,0:nz-1,n_omp_threads) )
#else
       REAL, DIMENSION(5,0:nx,0:nz-1,1) ::  tri
#endif

       CALL cpu_log( log_point_s(30), 'poisfft_hybrid_vec', 'start' )
       CALL cpu_log( log_point_s(7), 'fft_y_m', 'start' )

!$OMP  PARALLEL PRIVATE ( i, j, k, m, n )
!$OMP  DO
!
!--    Store grid points to be transformed on a 1d-array, do the fft
!--    and sample the results on a 4d-array
       DO  i = nxl_p, nxr_p

          DO  j = nys_a, nyn_a
             DO  k = 1, nz
                ffty_ar3(j,k,i) = ar(k,j,i+nxl)
             ENDDO
          ENDDO

          CALL fft_y_m( ffty_ar3(:,:,i), ny+3, 'forward' )

       ENDDO

!$OMP  DO
       DO  k = 1, nz
          m = nys_a
          DO  n = 1, npe_s
             DO  j = nys_p, nyn_p
                DO  i = nxl_p, nxr_p
                   work1(i,k,j,n) = ffty_ar3(m,k,i)
                ENDDO
                m = m+1
             ENDDO
          ENDDO
       ENDDO
!$OMP  END PARALLEL

       CALL cpu_log( log_point_s(7), 'fft_y_m', 'pause' )

#if defined( __parallel )
       CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
       CALL MPI_ALLTOALL( work1(nxl_p,1,nys_p,1), nwords, MPI_REAL, &
                          work2(nxl_p,1,nys_p,1), nwords, MPI_REAL, &
                          comm2d, istat )
       CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
#else
       work2 = work1
#endif

       CALL cpu_log( log_point_s(33), 'fft_x_m + tridia', 'start' )

#if defined( __KKMP )
!$OMP  PARALLEL PRIVATE ( i, j, jj, k, m, n, tri_ar, jthread )
!$OMP  DO
       DO  j = nys_p, nyn_p
          jthread = omp_get_thread_num() + 1
#else
       DO  j = nys_p, nyn_p
          jthread = 1
#endif

          DO  k = 1, nz
             m = nxl_a
             DO  n = 1, npe_s
                DO  i = nxl_p, nxr_p
                   fftx_ar3(m,k,j) = work2(i,k,j,n)
                   m = m+1
                ENDDO
             ENDDO
          ENDDO

          CALL fft_x_m( fftx_ar3(:,:,j), 'forward' )

          DO  k = 1, nz
             DO  i = nxl_a, nxr_a
                tri_ar(i,k) = fftx_ar3(i,k,j)
             ENDDO
          ENDDO

          jj = myid * (nyn_p-nys_p+1) + j
          CALL tridia_hybrid( jj, tri_ar, tri(:,:,:,jthread) )

          DO  k = 1, nz
             DO  i = nxl_a, nxr_a
                fftx_ar3(i,k,j) = tri_ar(i,k)
             ENDDO
          ENDDO

          CALL fft_x_m( fftx_ar3(:,:,j), 'backward' )

          DO  k = 1, nz
             m = nxl_a
             DO  n = 1, npe_s
                DO  i = nxl_p, nxr_p
                   work2(i,k,j,n) = fftx_ar3(m,k,j)
                   m = m+1
                ENDDO
             ENDDO
          ENDDO

       ENDDO
#if defined( __KKMP )
!$OMP  END PARALLEL
#endif

       CALL cpu_log( log_point_s(33), 'fft_x_m + tridia', 'stop' )

#if defined( __parallel )
       CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
       nwords = (nxr_p-nxl_p+1) * nz * (nyn_p-nys_p+1)
       CALL MPI_ALLTOALL( work2(nxl_p,1,nys_p,1), nwords, MPI_REAL, &
                          work1(nxl_p,1,nys_p,1), nwords, MPI_REAL, &
                          comm2d, istat )
       CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
#else
       work1 = work2
#endif

       CALL cpu_log( log_point_s(7), 'fft_y_m', 'continue' )

!$OMP  PARALLEL PRIVATE ( i, iouter, ii, ir, iei, j, k, m, n )
!$OMP  DO
       DO  k = 1, nz
          m = nys_a
          DO  n = 1, npe_s
             DO  j = nys_p, nyn_p
                DO  i = nxl_p, nxr_p
                   ffty_ar3(m,k,i) = work1(i,k,j,n)
                ENDDO
                m = m+1
             ENDDO
          ENDDO
       ENDDO

!$OMP  DO
       DO  i = nxl_p, nxr_p
          CALL fft_y_m( ffty_ar3(:,:,i), ny+3, 'backward' )

          DO  j = nys_a, nyn_a
             DO  k = 1, nz
                ar(k,j,i+nxl) = ffty_ar3(j,k,i)
             ENDDO
          ENDDO
       ENDDO
!$OMP  END PARALLEL

       CALL cpu_log( log_point_s(7), 'fft_y_m', 'stop' )
       CALL cpu_log( log_point_s(30), 'poisfft_hybrid_vec', 'stop' )

#if defined( __KKMP )
       DEALLOCATE( tri )
#endif

    END SUBROUTINE poisfft_hybrid_omp_vec
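
!
!-- Multi node version: the transposition is split into an intra-node
!-- MPI_ALLTOALL over comm_tasks and an inter-node MPI_ALLTOALL over
!-- comm_nodes, the latter done plane by plane inside the j loop so that it
!-- can overlap with the fft/tridia computation of other planes. cascade()
!-- staggers the tasks of a logical node so that no more inter-node
!-- exchanges run concurrently than there are switches per LPAR.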

    SUBROUTINE poisfft_hybrid_nodes( ar )

       USE cpulog
       USE interfaces

       IMPLICIT NONE

       INTEGER, PARAMETER ::  istride = 4  ! stride of i loop

       INTEGER ::  i, iei, ii, iouter, ir, istat, j, jj, k, m, &
                   n, nn, nt, nw1, nw2

       REAL, DIMENSION(1:nz,nys:nyn,nxl:nxr) ::  ar
       REAL, DIMENSION(0:nx)                 ::  fftx_ar
       REAL, DIMENSION(0:ny,istride)         ::  ffty_ar
       REAL, DIMENSION(0:nx,nz)              ::  tri_ar

       REAL, DIMENSION(nxl_p:nxr_p,nz,tasks_per_logical_node, &
                       nodes,nys_p:nyn_p)    ::  work1, work2
       REAL, DIMENSION(5,0:nx,0:nz-1)        ::  tri

       CALL cpu_log( log_point_s(30), 'poisfft_hybrid_nodes', 'start' )
       CALL cpu_log( log_point_s(7), 'fft_y', 'start' )

!
!--    Store grid points to be transformed on a 1d-array, do the fft
!--    and sample the results on a 4d-array
       DO  iouter = nxl_p, nxr_p, istride  ! stride loop, better cache
          iei = MIN( iouter+istride-1, nxr_p )
          DO  k = 1, nz

             DO  i = iouter, iei
                ii = nxl + i
                ir = i - iouter + 1

                DO  j = nys_a, nyn_a
                   ffty_ar(j,ir) = ar(k,j,ii)
                ENDDO

                CALL fft_y( ffty_ar(:,ir), 'forward' )
             ENDDO

             m = nys_a
             DO  nn = 1, nodes
                DO  nt = 1, tasks_per_logical_node
                   DO  j = nys_p, nyn_p
                      DO  i = iouter, iei
                         ir = i - iouter + 1
                         work1(i,k,nt,nn,j) = ffty_ar(m,ir)
                      ENDDO
                      m = m+1
                   ENDDO
                ENDDO
             ENDDO

          ENDDO
       ENDDO

       CALL cpu_log( log_point_s(7), 'fft_y', 'pause' )
       CALL cpu_log( log_point_s(32), 'alltoall_task', 'start' )

       nw1 = SIZE( work1, 1 ) * SIZE( work1, 2 )
       DO  nn = 1, nodes
          DO  j = nys_p, nyn_p
#if defined( __parallel )
             CALL MPI_ALLTOALL( work1(nxl_p,1,1,nn,j), nw1, MPI_REAL, &
                                work2(nxl_p,1,1,nn,j), nw1, MPI_REAL, &
                                comm_tasks, istat )
#endif
          ENDDO
       ENDDO
       CALL cpu_log( log_point_s(32), 'alltoall_task', 'stop' )

       DO  j = nys_p, nyn_p

          CALL cascade( 1, j, nys_p, nyn_p )
          nw2 = nw1 * SIZE( work1, 3 )
          CALL cpu_log( log_point_s(37), 'alltoall_node', 'start' )
#if defined( __parallel )
          CALL MPI_ALLTOALL( work2(nxl_p,1,1,1,j), nw2, MPI_REAL, &
                             work1(nxl_p,1,1,1,j), nw2, MPI_REAL, &
                             comm_nodes, istat )
#endif
          CALL cpu_log( log_point_s(37), 'alltoall_node', 'pause' )
          CALL cascade( 2, j, nys_p, nyn_p )

          CALL cpu_log( log_point_s(33), 'fft_x + tridia', 'start' )
          DO  k = 1, nz

             m = nxl_a
             DO  nn = 1, nodes
                DO  nt = 1, tasks_per_logical_node
                   DO  i = nxl_p, nxr_p
                      fftx_ar(m) = work1(i,k,nt,nn,j)
                      m = m+1
                   ENDDO
                ENDDO
             ENDDO

             CALL fft_x( fftx_ar, 'forward' )

             DO  i = nxl_a, nxr_a
                tri_ar(i,k) = fftx_ar(i)
             ENDDO

          ENDDO

          jj = myid * (nyn_p-nys_p+1) + j
          CALL tridia_hybrid( jj, tri_ar, tri(:,:,:) )

          DO  k = 1, nz

             DO  i = nxl_a, nxr_a
                fftx_ar(i) = tri_ar(i,k)
             ENDDO

             CALL fft_x( fftx_ar, 'backward' )

             m = nxl_a
             DO  nn = 1, nodes
                DO  nt = 1, tasks_per_logical_node
                   DO  i = nxl_p, nxr_p
                      work1(i,k,nt,nn,j) = fftx_ar(m)
                      m = m+1
                   ENDDO
                ENDDO
             ENDDO

          ENDDO
          CALL cpu_log( log_point_s(33), 'fft_x + tridia', 'stop' )

          nw2 = nw1 * SIZE( work1, 3 )
          CALL cpu_log( log_point_s(37), 'alltoall_node', 'continue' )
#if defined( __parallel )
          CALL MPI_ALLTOALL( work1(nxl_p,1,1,1,j), nw2, MPI_REAL, &
                             work2(nxl_p,1,1,1,j), nw2, MPI_REAL, &
                             comm_nodes, istat )
#endif
          CALL cpu_log( log_point_s(37), 'alltoall_node', 'stop' )

       ENDDO

       CALL cpu_log( log_point_s(32), 'alltoall_task', 'start' )
       DO  nn = 1, nodes
          DO  j = nys_p, nyn_p
#if defined( __parallel )
             CALL MPI_ALLTOALL( work2(nxl_p,1,1,nn,j), nw1, MPI_REAL, &
                                work1(nxl_p,1,1,nn,j), nw1, MPI_REAL, &
                                comm_tasks, istat )
#endif
          ENDDO
       ENDDO
       CALL cpu_log( log_point_s(32), 'alltoall_task', 'stop' )

       CALL cpu_log( log_point_s(7), 'fft_y', 'continue' )

       DO  iouter = nxl_p, nxr_p, istride
          iei = MIN( iouter+istride-1, nxr_p )
          DO  k = 1, nz

             m = nys_a
             DO  nn = 1, nodes
                DO  nt = 1, tasks_per_logical_node
                   DO  j = nys_p, nyn_p
                      DO  i = iouter, iei
                         ir = i - iouter + 1
                         ffty_ar(m,ir) = work1(i,k,nt,nn,j)
                      ENDDO
                      m = m+1
                   ENDDO
                ENDDO
             ENDDO

             DO  i = iouter, iei
                ii = nxl + i
                ir = i - iouter + 1
                CALL fft_y( ffty_ar(:,ir), 'backward' )

                DO  j = nys_a, nyn_a
                   ar(k,j,ii) = ffty_ar(j,ir)
                ENDDO
             ENDDO

          ENDDO
       ENDDO

       CALL cpu_log( log_point_s(7), 'fft_y', 'stop' )
       CALL cpu_log( log_point_s(30), 'poisfft_hybrid_nodes', 'stop' )

    END SUBROUTINE poisfft_hybrid_nodes


    SUBROUTINE tridia_hybrid( j, ar, tri )

       USE arrays_3d
       USE control_parameters
       USE grid_variables

       IMPLICIT NONE

       INTEGER ::  i, j, k, nnyh

       REAL, DIMENSION(0:nx,nz)       ::  ar
       REAL, DIMENSION(0:nx,0:nz-1)   ::  ar1
       REAL, DIMENSION(5,0:nx,0:nz-1) ::  tri

       nnyh = (ny+1) / 2

       tri = 0.0
!
!--    Define constant elements of the tridiagonal matrix.
       DO  k = 0, nz-1
          DO  i = 0, nx
             tri(2,i,k) = ddzu(k+1) * ddzw(k+1)
             tri(3,i,k) = ddzu(k+2) * ddzw(k+1)
          ENDDO
       ENDDO

       IF ( j <= nnyh )  THEN
          CALL maketri_hybrid( j )
       ELSE
          CALL maketri_hybrid( ny+1-j )
       ENDIF
       CALL zerleg_hybrid
       CALL substi_hybrid( ar, tri )

    CONTAINS

       SUBROUTINE maketri_hybrid( j )

!----------------------------------------------------------------------!
!                          maketri                                     !
!                                                                      !
!       computes the i- and j-dependent component of the matrix        !
!----------------------------------------------------------------------!

          USE constants

          IMPLICIT NONE

          INTEGER ::  i, j, k, nnxh
          REAL    ::  a, c

          REAL, DIMENSION(0:nx) ::  l

          nnxh = (nx+1) / 2
!
!--       Provide the tridiagonal matrix for solution of the Poisson equation
!--       in Fourier space. The coefficients are computed following the
!--       method of Schmidt et al. (DFVLR-Mitteilung 84-15) --> departs from
!--       Stephan Siano's original version.
          DO  i = 0, nx
             IF ( i >= 0  .AND.  i < nnxh )  THEN
                l(i) = 2.0 * ( 1.0 - COS( ( 2.0 * pi * i ) /                &
                                          FLOAT( nx+1 ) ) ) / ( dx * dx ) + &
                       2.0 * ( 1.0 - COS( ( 2.0 * pi * j ) /                &
                                          FLOAT( ny+1 ) ) ) / ( dy * dy )
             ELSEIF ( i == nnxh )  THEN
                l(i) = 2.0 * ( 1.0 - COS( ( 2.0 * pi * ( nx+1-i ) ) /       &
                                          FLOAT( nx+1 ) ) ) / ( dx * dx ) + &
                       2.0 * ( 1.0 - COS( ( 2.0 * pi * j ) /                &
                                          FLOAT( ny+1 ) ) ) / ( dy * dy )
             ELSE
                l(i) = 2.0 * ( 1.0 - COS( ( 2.0 * pi * ( nx+1-i ) ) /       &
                                          FLOAT( nx+1 ) ) ) / ( dx * dx ) + &
                       2.0 * ( 1.0 - COS( ( 2.0 * pi * j ) /                &
                                          FLOAT( ny+1 ) ) ) / ( dy * dy )
             ENDIF
          ENDDO

          DO  k = 0, nz-1
             DO  i = 0, nx
                a = -1.0 * ddzu(k+2) * ddzw(k+1)
                c = -1.0 * ddzu(k+1) * ddzw(k+1)
                tri(1,i,k) = a + c - l(i)
             ENDDO
          ENDDO

          IF ( ibc_p_b == 1  .OR.  ibc_p_b == 2 )  THEN
             DO  i = 0, nx
                tri(1,i,0) = tri(1,i,0) + tri(2,i,0)
             ENDDO
          ENDIF

          IF ( ibc_p_t == 1 )  THEN
             DO  i = 0, nx
                tri(1,i,nz-1) = tri(1,i,nz-1) + tri(3,i,nz-1)
             ENDDO
          ENDIF

       END SUBROUTINE maketri_hybrid
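
!
!--    For reference: for each (i,k), tri(1) holds the main diagonal,
!--    tri(2) the sub-diagonal (coupling to k-1) and tri(3) the
!--    super-diagonal (coupling to k+1). zerleg_hybrid stores the LU
!--    splitting in rows 4 and 5,
!--       tri(5,i,k) = tri(2,i,k) / tri(4,i,k-1)     (elimination factor)
!--       tri(4,i,k) = tri(1,i,k) - tri(3,i,k-1) * tri(5,i,k)
!--    with tri(4,i,0) = tri(1,i,0), and substi_hybrid then performs the
!--    forward and backward sweeps of the Thomas algorithm.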

       SUBROUTINE zerleg_hybrid

!----------------------------------------------------------------------!
!                          zerleg                                      !
!                                                                      !
!       Splitting of the tridiagonal matrix (Thomas algorithm)         !
!----------------------------------------------------------------------!

          USE indices

          IMPLICIT NONE

          INTEGER ::  i, k

!
!--       Splitting
          DO  i = 0, nx
             tri(4,i,0) = tri(1,i,0)
          ENDDO

          DO  k = 1, nz-1
             DO  i = 0, nx
                tri(5,i,k) = tri(2,i,k) / tri(4,i,k-1)
                tri(4,i,k) = tri(1,i,k) - tri(3,i,k-1) * tri(5,i,k)
             ENDDO
          ENDDO

       END SUBROUTINE zerleg_hybrid

       SUBROUTINE substi_hybrid( ar, tri )

!----------------------------------------------------------------------!
!                          substi                                      !
!                                                                      !
!       Substitution (Forward and Backward) (Thomas algorithm)         !
!----------------------------------------------------------------------!

          IMPLICIT NONE

          INTEGER ::  i, j, k

          REAL, DIMENSION(0:nx,nz)       ::  ar
          REAL, DIMENSION(0:nx,0:nz-1)   ::  ar1
          REAL, DIMENSION(5,0:nx,0:nz-1) ::  tri

!
!--       Forward substitution
          DO  i = 0, nx
             ar1(i,0) = ar(i,1)
          ENDDO
          DO  k = 1, nz-1
             DO  i = 0, nx
                ar1(i,k) = ar(i,k+1) - tri(5,i,k) * ar1(i,k-1)
             ENDDO
          ENDDO

!
!--       Backward substitution
          DO  i = 0, nx
             ar(i,nz) = ar1(i,nz-1) / tri(4,i,nz-1)
          ENDDO
          DO  k = nz-2, 0, -1
             DO  i = 0, nx
                ar(i,k+1) = ( ar1(i,k) - tri(3,i,k) * ar(i,k+2) ) &
                            / tri(4,i,k)
             ENDDO
          ENDDO

       END SUBROUTINE substi_hybrid

    END SUBROUTINE tridia_hybrid


    SUBROUTINE cascade( loca, j, nys_p, nyn_p )

       USE cpulog

       IMPLICIT NONE

       INTEGER ::  ier, j, loca, nyn_p, nys_p, req, reqa(1)
       INTEGER, SAVE ::  tag = 10
#if defined( __parallel )
       INTEGER, DIMENSION(MPI_STATUS_SIZE) ::  stat
#endif

       REAL ::  buf, buf1

       buf  = 1.0
       buf1 = 1.1

       IF ( me_node == 0 )  THEN  ! first node only

          SELECT CASE ( loca )

             CASE ( 1 )  ! before alltoall

                IF ( me_task > 0 )  THEN  ! first task does not wait
#if defined( __parallel )
                   CALL MPI_SENDRECV( buf,  1, MPI_REAL, me_task-1, 0, &
                                      buf1, 1, MPI_REAL, me_task-1, 0, &
                                      comm_tasks, stat, ierr )
#endif
                ELSEIF ( j > nys_p )  THEN
                   req = 0
                   tag = MOD( tag-10, 10 ) + 10
#if defined( __parallel )
                   CALL MPI_IRECV( buf, 1, MPI_REAL, tasks_per_logical_node-1,&
                                   tag, comm_tasks, req, ierr )
                   reqa = req
                   CALL MPI_WAITALL( 1, reqa, stat, ierr )
#endif
                ENDIF

             CASE ( 2 )  ! after alltoall

                IF ( me_task < tasks_per_logical_node-1 )  THEN  ! not the
                                                                 ! last task
#if defined( __parallel )
                   CALL MPI_SENDRECV( buf,  1, MPI_REAL, me_task+1, 0, &
                                      buf1, 1, MPI_REAL, me_task+1, 0, &
                                      comm_tasks, stat, ierr )
#endif
                ELSEIF ( j < nyn_p )  THEN
                   req = 0
                   tag = MOD( tag-10, 10 ) + 10
#if defined( __parallel )
                   CALL MPI_ISEND( buf, 1, MPI_REAL, 0, tag, comm_tasks, &
                                   req, ierr )
#endif
                ENDIF

          END SELECT

       ENDIF

    END SUBROUTINE cascade

 END MODULE poisfft_hybrid_mod
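
!
!-- Typical use (minimal calling sketch; grid indices, communicators and the
!-- contents of ar are provided by the surrounding model code):
!--
!--    CALL poisfft_hybrid_ini     ! once, after grid and MPI setup
!--    CALL poisfft_hybrid( ar )   ! ar(1:nz,nys:nyn,nxl:nxr): right-hand
!--                                ! side on entry, solution on exit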