SUBROUTINE transpose_xy( f_in, work1, f_inv, work2, f_out )

!------------------------------------------------------------------------------!
! Actual revisions:
! -----------------
! 
!
! Former revisions:
! -----------------
! $Log: transpose.f90,v $
! Revision 1.2  2004/04/30 13:12:17  raasch
! Switched from mpi_alltoallv to the simpler mpi_alltoall,
! all former transpose-routine files collected in this file, enlarged
! transposition arrays introduced
!
! Revision 1.1  2004/04/30 13:08:16  raasch
! Initial revision (collection of former routines transpose_xy, transpose_xz,
!                   transpose_yx, transpose_yz, transpose_zx, transpose_zy)
!
! Revision 1.7  2003/03/16 09:49:53  raasch
! Two underscores (_) are placed in front of all define-strings
!
! Revision 1.6  2001/03/30 07:55:50  raasch
! Array bound for work2 is explicitely calculated, all comments and identifiers
! translated into English
!
! Revision 1.5  2001/01/22 08:19:14  raasch
! Module test_variables removed
!
! Revision 1.4  1998/07/06 12:34:22  raasch
! + USE test_variables
!
! Revision 1.3  1997/08/13 13:52:42  raasch
! Zeitmessung von MPI_ALLTOALLV
!
! Revision 1.2  1997/08/11 06:27:31  raasch
! Keine Aenderungen
!
! Revision 1.1  1997/07/24 11:25:18  raasch
! Initial revision
!
!
! Description:
! ------------
! Transposition of input array (f_in) from x to y. For the input array, all
! elements along x reside on the same PE, while after transposition, all
! elements along y reside on the same PE.
!------------------------------------------------------------------------------!

    USE cpulog
    USE indices
    USE interfaces
    USE pegrid
    USE transpose_indices

    IMPLICIT NONE

    INTEGER ::  i, j, k, l, m, ys
    
    REAL ::  f_in(0:nxa,nys_x:nyn_xa,nzb_x:nzt_xa),                     &
             f_inv(nys_x:nyn_xa,nzb_x:nzt_xa,0:nxa),                    &
             f_out(0:nya,nxl_y:nxr_ya,nzb_y:nzt_ya),                    &
             work1(nys_x:nyn_xa,nzb_x:nzt_xa,0:nxa), work2(nnx*nny*nnz)

#if defined( __parallel )

!
!-- Rearrange indices of input array in order to make data to be send
!-- by MPI contiguous
    DO  k = nzb_x, nzt_xa
       DO  j = nys_x, nyn_xa
          DO  i = 0, nxa
             work1(j,k,i) = f_in(i,j,k)
          ENDDO
       ENDDO
    ENDDO

!
!-- Move data to different array, because memory location of work1 is
!-- needed further below (work1 = work2)
    DO  i = 0, nxa
       DO  k = nzb_x, nzt_xa
          DO  j = nys_x, nyn_xa
             f_inv(j,k,i) = work1(j,k,i)
          ENDDO
       ENDDO
    ENDDO

!
!-- Transpose array
    CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
    CALL MPI_ALLTOALL( f_inv(nys_x,nzb_x,0), sendrecvcount_xy, MPI_REAL, &
                       work2(1),             sendrecvcount_xy, MPI_REAL, &
                       comm1dy, ierr )
    CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )

!
!-- Reorder transposed array
    m = 0
    DO  l = 0, pdims(2) - 1
       ys = 0 + l * ( nyn_xa - nys_x + 1 )
       DO  i = nxl_y, nxr_ya
          DO  k = nzb_y, nzt_ya
             DO  j = ys, ys + nyn_xa - nys_x
                m = m + 1
                f_out(j,i,k) = work2(m)
             ENDDO
          ENDDO
       ENDDO
    ENDDO

#endif

 END SUBROUTINE transpose_xy


 SUBROUTINE transpose_xz( f_in, work1, f_inv, work2, f_out )

!------------------------------------------------------------------------------!
! Description:
! ------------
! Transposition of input array (f_in) from x to z. For the input array, all
! elements along x reside on the same PE, while after transposition, all
! elements along z reside on the same PE.
!------------------------------------------------------------------------------!

    USE cpulog
    USE indices
    USE interfaces
    USE pegrid
    USE transpose_indices

    IMPLICIT NONE

    INTEGER ::  i, j, k, l, m, xs
    
    REAL ::  f_in(0:nxa,nys_x:nyn_xa,nzb_x:nzt_xa),             &
             f_inv(nxl:nxra,nys:nyna,1:nza),                    &
             f_out(1:nza,nys:nyna,nxl:nxra),                    &
             work1(1:nza,nys:nyna,nxl:nxra), work2(nnx*nny*nnz)

#if defined( __parallel )

!
!-- If the PE grid is one-dimensional along y, the array has only to be
!-- reordered locally and therefore no transposition has to be done.
    IF ( pdims(1) /= 1 )  THEN
!
!--    Reorder input array for transposition
       m = 0
       DO  l = 0, pdims(1) - 1
          xs = 0 + l * nnx
          DO  k = nzb_x, nzt_xa
             DO  j = nys_x, nyn_xa
                DO  i = xs, xs + nnx - 1
                   m = m + 1
                   work2(m) = f_in(i,j,k)
                ENDDO
             ENDDO
          ENDDO
       ENDDO

!
!--    Transpose array
       CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
       CALL MPI_ALLTOALL( work2(1),         sendrecvcount_zx, MPI_REAL, &
                          f_inv(nxl,nys,1), sendrecvcount_zx, MPI_REAL, &
                          comm1dx, ierr )
       CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )

!
!--    Reorder transposed array in a way that the z index is in first position
       DO  i = nxl, nxra
          DO  j = nys, nyna
             DO  k = 1, nza
                work1(k,j,i) = f_inv(i,j,k)
             ENDDO
          ENDDO
       ENDDO
    ELSE
!
!--    Reorder the array in a way that the z index is in first position
       DO  i = nxl, nxra
          DO  j = nys, nyna
             DO  k = 1, nza
                work1(k,j,i) = f_in(i,j,k)
             ENDDO
          ENDDO
       ENDDO
    ENDIF

!
!-- Move data to output array
    DO  i = nxl, nxra
       DO  j = nys, nyna
          DO  k = 1, nza
             f_out(k,j,i) = work1(k,j,i)
          ENDDO
       ENDDO
    ENDDO

#endif

 END SUBROUTINE transpose_xz


 SUBROUTINE transpose_yx( f_in, work1, f_inv, work2, f_out )

!------------------------------------------------------------------------------!
! Description:
! ------------
! Transposition of input array (f_in) from y to x. For the input array, all
! elements along y reside on the same PE, while after transposition, all
! elements along x reside on the same PE.
!------------------------------------------------------------------------------!

    USE cpulog
    USE indices
    USE interfaces
    USE pegrid
    USE transpose_indices

    IMPLICIT NONE

    INTEGER ::  i, j, k, l, m, ys
    
    REAL ::  f_in(0:nya,nxl_y:nxr_ya,nzb_y:nzt_ya),                     &
             f_inv(nys_x:nyn_xa,nzb_x:nzt_xa,0:nxa),                    &
             f_out(0:nxa,nys_x:nyn_xa,nzb_x:nzt_xa),                    &
             work1(0:nxa,nys_x:nyn_xa,nzb_x:nzt_xa), work2(nnx*nny*nnz)

#if defined( __parallel )

!
!-- Reorder input array for transposition
    m = 0
    DO  l = 0, pdims(2) - 1
       ys = 0 + l * ( nyn_xa - nys_x + 1 )
       DO  i = nxl_y, nxr_ya
          DO  k = nzb_y, nzt_ya
             DO  j = ys, ys + nyn_xa - nys_x
                m = m + 1
                work2(m) = f_in(j,i,k)
             ENDDO
          ENDDO
       ENDDO
    ENDDO

!
!-- Transpose array
    CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
    CALL MPI_ALLTOALL( work2(1),             sendrecvcount_xy, MPI_REAL, &
                       f_inv(nys_x,nzb_x,0), sendrecvcount_xy, MPI_REAL, &
                       comm1dy, ierr )
    CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )

!
!-- Reorder transposed array in a way that the x index is in first position
    DO  i = 0, nxa
       DO  k = nzb_x, nzt_xa
          DO  j = nys_x, nyn_xa
             work1(i,j,k) = f_inv(j,k,i)
          ENDDO
       ENDDO
    ENDDO

!
!-- Move data to output array
    DO  k = nzb_x, nzt_xa
       DO  j = nys_x, nyn_xa
          DO  i = 0, nxa
             f_out(i,j,k) = work1(i,j,k)
          ENDDO
       ENDDO
    ENDDO

#endif

 END SUBROUTINE transpose_yx


 SUBROUTINE transpose_yxd( f_in, work1, f_inv, work2, f_out )

!------------------------------------------------------------------------------!
! Description:
! ------------
! Transposition of input array (f_in) from y to x. For the input array, all
! elements along y reside on the same PE, while after transposition, all
! elements along x reside on the same PE.
! This is a direct transposition for arrays with indices in regular order
! (k,j,i) (cf. transpose_yx).
!------------------------------------------------------------------------------!

    USE cpulog
    USE indices
    USE interfaces
    USE pegrid
    USE transpose_indices

    IMPLICIT NONE

    INTEGER ::  i, j, k, l, m, recvcount_yx, sendcount_yx, xs

    REAL ::  f_in(1:nza,nys:nyna,nxl:nxra), f_inv(nxl:nxra,1:nza,nys:nyna), &
             f_out(0:nxa,nys_x:nyn_xa,nzb_x:nzt_xa),                        &
             work1(nxl:nxra,1:nza,nys:nyna), work2(nnx*nny*nnz)

#if defined( __parallel )

!
!-- Rearrange indices of input array in order to make data to be send
!-- by MPI contiguous
    DO  k = 1, nza
       DO  j = nys, nyna
          DO  i = nxl, nxra
             work1(i,k,j) = f_in(k,j,i)
          ENDDO
       ENDDO
    ENDDO

!
!-- Move data to different array, because memory location of work1 is
!-- needed further below (work1 = work2)
    DO  j = nys, nyna
       DO  k = 1, nza
          DO  i = nxl, nxra
             f_inv(i,k,j) = work1(i,k,j)
          ENDDO
       ENDDO
    ENDDO

!
!-- Transpose array
    CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
    CALL MPI_ALLTOALL( f_inv(nxl,1,nys), sendrecvcount_xy, MPI_REAL, &
                       work2(1),         sendrecvcount_xy, MPI_REAL, &
                       comm1dx, ierr )
    CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )

!
!-- Reorder transposed array
    m = 0
    DO  l = 0, pdims(1) - 1
       xs = 0 + l * nnx
       DO  j = nys_x, nyn_xa
          DO  k = 1, nza
             DO  i = xs, xs + nnx - 1
                m = m + 1
                f_out(i,j,k) = work2(m)
             ENDDO
          ENDDO
       ENDDO
    ENDDO

#endif

 END SUBROUTINE transpose_yxd


 SUBROUTINE transpose_yz( f_in, work1, f_inv, work2, f_out )

!------------------------------------------------------------------------------!
! Description:
! ------------
! Transposition of input array (f_in) from y to z. For the input array, all
! elements along y reside on the same PE, while after transposition, all
! elements along z reside on the same PE.
!------------------------------------------------------------------------------!

    USE cpulog
    USE indices
    USE interfaces
    USE pegrid
    USE transpose_indices

    IMPLICIT NONE

    INTEGER ::  i, j, k, l, m, zs
    
    REAL ::  f_in(0:nya,nxl_y:nxr_ya,nzb_y:nzt_ya),                     &
             f_inv(nxl_y:nxr_ya,nzb_y:nzt_ya,0:nya),                    &
             f_out(nxl_z:nxr_za,nys_z:nyn_za,1:nza),                    &
             work1(nxl_y:nxr_ya,nzb_y:nzt_ya,0:nya), work2(nnx*nny*nnz)

#if defined( __parallel )

!
!-- Rearrange indices of input array in order to make data to be send
!-- by MPI contiguous
    DO  k = nzb_y, nzt_ya
       DO  i = nxl_y, nxr_ya
          DO  j = 0, nya
             work1(i,k,j) = f_in(j,i,k)
          ENDDO
       ENDDO
    ENDDO

!
!-- Move data to different array, because memory location of work1 is
!-- needed further below (work1 = work2).
!-- If the PE grid is one-dimensional along y, only local reordering
!-- of the data is necessary and no transposition has to be done.
    IF ( pdims(1) == 1 )  THEN
       DO  j = 0, nya
          DO  k = nzb_y, nzt_ya
             DO  i = nxl_y, nxr_ya
                f_out(i,j,k) = work1(i,k,j)
             ENDDO
          ENDDO
       ENDDO
       RETURN
    ELSE
       DO  j = 0, nya
          DO  k = nzb_y, nzt_ya
             DO  i = nxl_y, nxr_ya
                f_inv(i,k,j) = work1(i,k,j)
             ENDDO
          ENDDO
       ENDDO
    ENDIF

!
!-- Transpose array
    CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
    CALL MPI_ALLTOALL( f_inv(nxl_y,nzb_y,0), sendrecvcount_yz, MPI_REAL, &
                       work2(1),             sendrecvcount_yz, MPI_REAL, &
                       comm1dx, ierr )
    CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )

!
!-- Reorder transposed array
    m = 0
    DO  l = 0, pdims(1) - 1
       zs = 1 + l * ( nzt_ya - nzb_y + 1 )
       DO  j = nys_z, nyn_za
          DO  k = zs, zs + nzt_ya - nzb_y
             DO  i = nxl_z, nxr_za
                m = m + 1
                f_out(i,j,k) = work2(m)
             ENDDO
          ENDDO
       ENDDO
    ENDDO

#endif

 END SUBROUTINE transpose_yz


 SUBROUTINE transpose_zx( f_in, work1, f_inv, work2, f_out )

!------------------------------------------------------------------------------!
! Description:
! ------------
! Transposition of input array (f_in) from z to x. For the input array, all
! elements along z reside on the same PE, while after transposition, all
! elements along x reside on the same PE.
!------------------------------------------------------------------------------!

    USE cpulog
    USE indices
    USE interfaces
    USE pegrid
    USE transpose_indices

    IMPLICIT NONE

    INTEGER ::  i, j, k, l, m, xs
    
    REAL ::  f_in(1:nza,nys:nyna,nxl:nxra), f_inv(nxl:nxra,nys:nyna,1:nza), &
             f_out(0:nxa,nys_x:nyn_xa,nzb_x:nzt_xa),                        &
             work1(nxl:nxra,nys:nyna,1:nza), work2(nnx*nny*nnz)

#if defined( __parallel )

!
!-- Rearrange indices of input array in order to make data to be send
!-- by MPI contiguous
    DO  i = nxl, nxra
       DO  j = nys, nyna
          DO  k = 1,nza
             work1(i,j,k) = f_in(k,j,i)
          ENDDO
       ENDDO
    ENDDO

!
!-- Move data to different array, because memory location of work1 is
!-- needed further below (work1 = work2).
!-- If the PE grid is one-dimensional along y, only local reordering
!-- of the data is necessary and no transposition has to be done.
    IF ( pdims(1) == 1 )  THEN
       DO  k = 1, nza
          DO  j = nys, nyna
             DO  i = nxl, nxra
                f_out(i,j,k) = work1(i,j,k)
             ENDDO
          ENDDO
       ENDDO
       RETURN
    ELSE
       DO  k = 1, nza
          DO  j = nys, nyna
             DO  i = nxl, nxra
                f_inv(i,j,k) = work1(i,j,k)
             ENDDO
          ENDDO
       ENDDO
    ENDIF

!
!-- Transpose array
    CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
    CALL MPI_ALLTOALL( f_inv(nxl,nys,1), sendrecvcount_zx, MPI_REAL, &
                       work2(1),         sendrecvcount_zx, MPI_REAL, &
                       comm1dx, ierr )
    CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )

!
!-- Reorder transposed array
    m = 0
    DO  l = 0, pdims(1) - 1
       xs = 0 + l * nnx
       DO  k = nzb_x, nzt_xa
          DO  j = nys_x, nyn_xa
             DO  i = xs, xs + nnx - 1
                m = m + 1
                f_out(i,j,k) = work2(m)
             ENDDO
          ENDDO
       ENDDO
    ENDDO

#endif

 END SUBROUTINE transpose_zx


 SUBROUTINE transpose_zy( f_in, work1, f_inv, work2, f_out )

!------------------------------------------------------------------------------!
! Description:
! ------------
! Transposition of input array (f_in) from z to y. For the input array, all
! elements along z reside on the same PE, while after transposition, all
! elements along y reside on the same PE.
!------------------------------------------------------------------------------!

    USE cpulog
    USE indices
    USE interfaces
    USE pegrid
    USE transpose_indices

    IMPLICIT NONE

    INTEGER ::  i, j, k, l, m, zs
    
    REAL ::  f_in(nxl_z:nxr_za,nys_z:nyn_za,1:nza),                     &
             f_inv(nxl_y:nxr_ya,nzb_y:nzt_ya,0:nya),                    &
             f_out(0:nya,nxl_y:nxr_ya,nzb_y:nzt_ya),                    &
             work1(0:nya,nxl_y:nxr_ya,nzb_y:nzt_ya), work2(nnx*nny*nnz)

#if defined( __parallel )

!
!-- If the PE grid is one-dimensional along y, the array has only to be
!-- reordered locally and therefore no transposition has to be done.
    IF ( pdims(1) /= 1 )  THEN
!
!--    Reorder input array for transposition
       m = 0
       DO  l = 0, pdims(1) - 1
          zs = 1 + l * ( nzt_ya - nzb_y + 1 )
          DO  j = nys_z, nyn_za
             DO  k = zs, zs + nzt_ya - nzb_y
                DO  i = nxl_z, nxr_za
                   m = m + 1
                   work2(m) = f_in(i,j,k)
                ENDDO
             ENDDO
          ENDDO
       ENDDO

!
!--    Transpose array
       CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
       CALL MPI_ALLTOALL( work2(1),             sendrecvcount_yz, MPI_REAL, &
                          f_inv(nxl_y,nzb_y,0), sendrecvcount_yz, MPI_REAL, &
                          comm1dx, ierr )
       CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )

!
!--    Reorder transposed array in a way that the y index is in first position
       DO  k = nzb_y, nzt_ya
          DO  i = nxl_y, nxr_ya
             DO  j = 0, nya
                work1(j,i,k) = f_inv(i,k,j)
             ENDDO
          ENDDO
       ENDDO
    ELSE
!
!--    Reorder the array in a way that the y index is in first position
       DO  k = nzb_y, nzt_ya
          DO  i = nxl_y, nxr_ya
             DO  j = 0, nya
                work1(j,i,k) = f_in(i,j,k)
             ENDDO
          ENDDO
       ENDDO
    ENDIF

!
!-- Move data to output array
    DO  k = nzb_y, nzt_ya
       DO  i = nxl_y, nxr_ya
          DO  j = 0, nya
             f_out(j,i,k) = work1(j,i,k)
          ENDDO
       ENDDO
    ENDDO

#endif

 END SUBROUTINE transpose_zy


 SUBROUTINE transpose_zyd( f_in, work1, f_inv, work2, f_out )

!------------------------------------------------------------------------------!
! Description:
! ------------
! Transposition of input array (f_in) from z to y. For the input array, all
! elements along z reside on the same PE, while after transposition, all
! elements along y reside on the same PE.
! This is a direct transposition for arrays with indices in regular order
! (k,j,i) (cf. transpose_zy).
!------------------------------------------------------------------------------!

    USE cpulog
    USE indices
    USE interfaces
    USE pegrid
    USE transpose_indices

    IMPLICIT NONE

    INTEGER ::  i, j, k, l, m, ys
    
    REAL ::  f_in(1:nza,nys:nyna,nxl:nxra), f_inv(nys:nyna,nxl:nxra,1:nza), &
             f_out(0:nya,nxl_yd:nxr_yda,nzb_yd:nzt_yda),                    &
             work1(nys:nyna,nxl:nxra,1:nza), work2(nnx*nny*nnz)

#if defined( __parallel )

!
!-- Rearrange indices of input array in order to make data to be send
!-- by MPI contiguous
    DO  i = nxl, nxra
       DO  j = nys, nyna
          DO  k = 1, nza
             work1(j,i,k) = f_in(k,j,i)
          ENDDO
       ENDDO
    ENDDO

!
!-- Move data to different array, because memory location of work1 is
!-- needed further below (work1 = work2).
!-- If the PE grid is one-dimensional along x, only local reordering
!-- of the data is necessary and no transposition has to be done.
    IF ( pdims(2) == 1 )  THEN
       DO  k = 1, nza
          DO  i = nxl, nxra
             DO  j = nys, nyna
                f_out(j,i,k) = work1(j,i,k)
             ENDDO
          ENDDO
       ENDDO
       RETURN
    ELSE
       DO  k = 1, nza
          DO  i = nxl, nxra
             DO  j = nys, nyna
                f_inv(j,i,k) = work1(j,i,k)
             ENDDO
          ENDDO
       ENDDO
    ENDIF

!
!-- Transpose array
    CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
    CALL MPI_ALLTOALL( f_inv(nys,nxl,1), sendrecvcount_zyd, MPI_REAL, &
                       work2(1),         sendrecvcount_zyd, MPI_REAL, &
                       comm1dy, ierr )
    CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )

!
!-- Reorder transposed array
    m = 0
    DO  l = 0, pdims(2) - 1
       ys = 0 + l * nny
       DO  k = nzb_yd, nzt_yda
          DO  i = nxl_yd, nxr_yda
             DO  j = ys, ys + nny - 1
                m = m + 1
                f_out(j,i,k) = work2(m)
             ENDDO
          ENDDO
       ENDDO
    ENDDO

#endif

 END SUBROUTINE transpose_zyd