SUBROUTINE resort_for_xy( f_in, f_inv ) !--------------------------------------------------------------------------------! ! This file is part of PALM. ! ! PALM is free software: you can redistribute it and/or modify it under the terms ! of the GNU General Public License as published by the Free Software Foundation, ! either version 3 of the License, or (at your option) any later version. ! ! PALM is distributed in the hope that it will be useful, but WITHOUT ANY ! WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR ! A PARTICULAR PURPOSE. See the GNU General Public License for more details. ! ! You should have received a copy of the GNU General Public License along with ! PALM. If not, see . ! ! Copyright 1997-2012 Leibniz University Hannover !--------------------------------------------------------------------------------! ! ! Current revisions: ! ----------------- ! ! ! Former revisions: ! ----------------- ! $Id: transpose.f90 1217 2013-08-26 11:30:05Z raasch $ ! ! 1216 2013-08-26 09:31:42Z raasch ! re-sorting of the transposed / to be transposed arrays moved to separate ! routines resort_for_... ! ! 1111 2013-03-08 23:54:10Z raasch ! openACC directives added, ! resorting data from/to work changed, work got 4 dimensions instead of 1 ! ! 1106 2013-03-04 05:31:38Z raasch ! preprocessor lines rearranged so that routines can also be used in serial ! (non-parallel) mode ! ! 1092 2013-02-02 11:24:22Z raasch ! unused variables removed ! ! 1036 2012-10-22 13:43:42Z raasch ! code put under GPL (PALM 3.9) ! ! 1003 2012-09-14 14:35:53Z raasch ! indices nxa, nya, etc. replaced by nx, ny, etc. ! ! 683 2011-02-09 14:25:15Z raasch ! openMP parallelization of transpositions for 2d-domain-decomposition ! ! 622 2010-12-10 08:08:13Z raasch ! optional barriers included in order to speed up collective operations ! ! 164 2008-05-15 08:46:15Z raasch ! f_inv changed from subroutine argument to automatic array in order to do ! re-ordering from f_in to f_inv in one step, one array work is needed instead ! of work1 and work2 ! ! February 2007 ! RCS Log replace by Id keyword, revision history cleaned up ! ! Revision 1.2 2004/04/30 13:12:17 raasch ! Switched from mpi_alltoallv to the simpler mpi_alltoall, ! all former transpose-routine files collected in this file, enlarged ! transposition arrays introduced ! ! Revision 1.1 2004/04/30 13:08:16 raasch ! Initial revision (collection of former routines transpose_xy, transpose_xz, ! transpose_yx, transpose_yz, transpose_zx, transpose_zy) ! ! Revision 1.1 1997/07/24 11:25:18 raasch ! Initial revision ! !------------------------------------------------------------------------------! ! Description: ! ------------ ! Resorting data for the transposition from x to y. The transposition itself ! is carried out in transpose_xy !------------------------------------------------------------------------------! USE indices USE transpose_indices IMPLICIT NONE REAL :: f_in(0:nx,nys_x:nyn_x,nzb_x:nzt_x) REAL :: f_inv(nys_x:nyn_x,nzb_x:nzt_x,0:nx) INTEGER :: i, j, k ! !-- Rearrange indices of input array in order to make data to be send !-- by MPI contiguous !$OMP PARALLEL PRIVATE ( i, j, k ) !$OMP DO !$acc kernels present( f_in, f_inv ) !$acc loop DO i = 0, nx DO k = nzb_x, nzt_x !$acc loop vector( 32 ) DO j = nys_x, nyn_x f_inv(j,k,i) = f_in(i,j,k) ENDDO ENDDO ENDDO !$acc end kernels !$OMP END PARALLEL END SUBROUTINE resort_for_xy SUBROUTINE transpose_xy( f_inv, f_out ) !------------------------------------------------------------------------------! ! Description: ! ------------ ! Transposition of input array (f_in) from x to y. For the input array, all ! elements along x reside on the same PE, while after transposition, all ! elements along y reside on the same PE. !------------------------------------------------------------------------------! USE cpulog USE indices USE interfaces USE pegrid USE transpose_indices IMPLICIT NONE INTEGER :: i, j, k, l, ys REAL :: f_inv(nys_x:nyn_x,nzb_x:nzt_x,0:nx), f_out(0:ny,nxl_y:nxr_y,nzb_y:nzt_y) REAL, DIMENSION(nyn_x-nys_x+1,nzb_y:nzt_y,nxl_y:nxr_y,0:pdims(2)-1) :: work IF ( numprocs /= 1 ) THEN #if defined( __parallel ) ! !-- Transpose array CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' ) IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr ) !$acc update host( f_inv ) CALL MPI_ALLTOALL( f_inv(nys_x,nzb_x,0), sendrecvcount_xy, MPI_REAL, & work(1,nzb_y,nxl_y,0), sendrecvcount_xy, MPI_REAL, & comm1dy, ierr ) CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' ) ! !-- Reorder transposed array !$OMP PARALLEL PRIVATE ( i, j, k, l, ys ) !$OMP DO !$acc data copyin( work ) DO l = 0, pdims(2) - 1 ys = 0 + l * ( nyn_x - nys_x + 1 ) !$acc kernels present( f_out, work ) !$acc loop DO i = nxl_y, nxr_y DO k = nzb_y, nzt_y !$acc loop vector( 32 ) DO j = ys, ys + nyn_x - nys_x f_out(j,i,k) = work(j-ys+1,k,i,l) ENDDO ENDDO ENDDO !$acc end kernels ENDDO !$acc end data !$OMP END PARALLEL #endif ELSE ! !-- Reorder transposed array !$OMP PARALLEL PRIVATE ( i, j, k ) !$OMP DO !$acc kernels present( f_inv, f_out ) !$acc loop DO k = nzb_y, nzt_y DO i = nxl_y, nxr_y !$acc loop vector( 32 ) DO j = 0, ny f_out(j,i,k) = f_inv(j,k,i) ENDDO ENDDO ENDDO !$acc end kernels !$OMP END PARALLEL ENDIF END SUBROUTINE transpose_xy SUBROUTINE resort_for_xz( f_inv, f_out ) !------------------------------------------------------------------------------! ! Description: ! ------------ ! Resorting data after the transposition from x to z. The transposition itself ! is carried out in transpose_xz !------------------------------------------------------------------------------! USE indices USE transpose_indices IMPLICIT NONE REAL :: f_inv(nys:nyn,nxl:nxr,1:nz) REAL :: f_out(1:nz,nys:nyn,nxl:nxr) INTEGER :: i, j, k ! !-- Rearrange indices of input array in order to make data to be send !-- by MPI contiguous. !-- In case of parallel fft/transposition, scattered store is faster in !-- backward direction!!! !$OMP PARALLEL PRIVATE ( i, j, k ) !$OMP DO !$acc kernels present( f_inv, f_out ) !$acc loop DO k = 1, nz DO i = nxl, nxr !$acc loop vector( 32 ) DO j = nys, nyn f_out(k,j,i) = f_inv(j,i,k) ENDDO ENDDO ENDDO !$acc end kernels !$OMP END PARALLEL END SUBROUTINE resort_for_xz SUBROUTINE transpose_xz( f_in, f_inv ) !------------------------------------------------------------------------------! ! Description: ! ------------ ! Transposition of input array (f_in) from x to z. For the input array, all ! elements along x reside on the same PE, while after transposition, all ! elements along z reside on the same PE. !------------------------------------------------------------------------------! USE cpulog USE indices USE interfaces USE pegrid USE transpose_indices IMPLICIT NONE INTEGER :: i, j, k, l, xs REAL :: f_in(0:nx,nys_x:nyn_x,nzb_x:nzt_x), f_inv(nys:nyn,nxl:nxr,1:nz) REAL, DIMENSION(nys_x:nyn_x,nnx,nzb_x:nzt_x,0:pdims(1)-1) :: work ! !-- If the PE grid is one-dimensional along y, the array has only to be !-- reordered locally and therefore no transposition has to be done. IF ( pdims(1) /= 1 ) THEN #if defined( __parallel ) ! !-- Reorder input array for transposition !$OMP PARALLEL PRIVATE ( i, j, k, l, xs ) !$OMP DO !$acc data copyout( work ) DO l = 0, pdims(1) - 1 xs = 0 + l * nnx !$acc kernels present( f_in, work ) !$acc loop DO k = nzb_x, nzt_x DO i = xs, xs + nnx - 1 !$acc loop vector( 32 ) DO j = nys_x, nyn_x work(j,i-xs+1,k,l) = f_in(i,j,k) ENDDO ENDDO ENDDO !$acc end kernels ENDDO !$acc end data !$OMP END PARALLEL ! !-- Transpose array CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' ) IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr ) CALL MPI_ALLTOALL( work(nys_x,1,nzb_x,0), sendrecvcount_zx, MPI_REAL, & f_inv(nys,nxl,1), sendrecvcount_zx, MPI_REAL, & comm1dx, ierr ) !$acc update device( f_inv ) CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' ) #endif ELSE ! !-- Reorder the array in a way that the z index is in first position !$OMP PARALLEL PRIVATE ( i, j, k ) !$OMP DO !$acc kernels present( f_in, f_inv ) !$acc loop DO i = nxl, nxr DO j = nys, nyn !$acc loop vector( 32 ) DO k = 1, nz f_inv(j,i,k) = f_in(i,j,k) ENDDO ENDDO ENDDO !$acc end kernels !$OMP END PARALLEL ENDIF END SUBROUTINE transpose_xz SUBROUTINE resort_for_yx( f_inv, f_out ) !------------------------------------------------------------------------------! ! Description: ! ------------ ! Resorting data after the transposition from y to x. The transposition itself ! is carried out in transpose_yx !------------------------------------------------------------------------------! USE indices USE transpose_indices IMPLICIT NONE REAL :: f_inv(nys_x:nyn_x,nzb_x:nzt_x,0:nx) REAL :: f_out(0:nx,nys_x:nyn_x,nzb_x:nzt_x) INTEGER :: i, j, k ! !-- Rearrange indices of input array in order to make data to be send !-- by MPI contiguous !$OMP PARALLEL PRIVATE ( i, j, k ) !$OMP DO !$acc kernels present( f_inv, f_out ) !$acc loop DO i = 0, nx DO k = nzb_x, nzt_x !$acc loop vector( 32 ) DO j = nys_x, nyn_x f_out(i,j,k) = f_inv(j,k,i) ENDDO ENDDO ENDDO !$acc end kernels !$OMP END PARALLEL END SUBROUTINE resort_for_yx SUBROUTINE transpose_yx( f_in, f_inv ) !------------------------------------------------------------------------------! ! Description: ! ------------ ! Transposition of input array (f_in) from y to x. For the input array, all ! elements along y reside on the same PE, while after transposition, all ! elements along x reside on the same PE. !------------------------------------------------------------------------------! USE cpulog USE indices USE interfaces USE pegrid USE transpose_indices IMPLICIT NONE INTEGER :: i, j, k, l, ys REAL :: f_in(0:ny,nxl_y:nxr_y,nzb_y:nzt_y), f_inv(nys_x:nyn_x,nzb_x:nzt_x,0:nx) REAL, DIMENSION(nyn_x-nys_x+1,nzb_y:nzt_y,nxl_y:nxr_y,0:pdims(2)-1) :: work IF ( numprocs /= 1 ) THEN #if defined( __parallel ) ! !-- Reorder input array for transposition !$OMP PARALLEL PRIVATE ( i, j, k, l, ys ) !$OMP DO !$acc data copyout( work ) DO l = 0, pdims(2) - 1 ys = 0 + l * ( nyn_x - nys_x + 1 ) !$acc kernels present( f_in, work ) !$acc loop DO i = nxl_y, nxr_y DO k = nzb_y, nzt_y !$acc loop vector( 32 ) DO j = ys, ys + nyn_x - nys_x work(j-ys+1,k,i,l) = f_in(j,i,k) ENDDO ENDDO ENDDO !$acc end kernels ENDDO !$acc end data !$OMP END PARALLEL ! !-- Transpose array CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' ) IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr ) CALL MPI_ALLTOALL( work(1,nzb_y,nxl_y,0), sendrecvcount_xy, MPI_REAL, & f_inv(nys_x,nzb_x,0), sendrecvcount_xy, MPI_REAL, & comm1dy, ierr ) !$acc update device( f_inv ) CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' ) #endif ELSE ! !-- Reorder array f_in the same way as ALLTOALL did it !$OMP PARALLEL PRIVATE ( i, j, k ) !$OMP DO !$acc kernels present( f_in, f_inv ) !$acc loop DO i = nxl_y, nxr_y DO k = nzb_y, nzt_y !$acc loop vector( 32 ) DO j = 0, ny f_inv(j,k,i) = f_in(j,i,k) ENDDO ENDDO ENDDO !$acc end kernels !$OMP END PARALLEL ENDIF END SUBROUTINE transpose_yx SUBROUTINE transpose_yxd( f_in, f_out ) !------------------------------------------------------------------------------! ! Description: ! ------------ ! Transposition of input array (f_in) from y to x. For the input array, all ! elements along y reside on the same PE, while after transposition, all ! elements along x reside on the same PE. ! This is a direct transposition for arrays with indices in regular order ! (k,j,i) (cf. transpose_yx). !------------------------------------------------------------------------------! USE cpulog USE indices USE interfaces USE pegrid USE transpose_indices IMPLICIT NONE INTEGER :: i, j, k, l, m, xs REAL :: f_in(1:nz,nys:nyn,nxl:nxr), f_inv(nxl:nxr,1:nz,nys:nyn), & f_out(0:nx,nys_x:nyn_x,nzb_x:nzt_x), & work(nnx*nny*nnz) #if defined( __parallel ) ! !-- Rearrange indices of input array in order to make data to be send !-- by MPI contiguous DO k = 1, nz DO j = nys, nyn DO i = nxl, nxr f_inv(i,k,j) = f_in(k,j,i) ENDDO ENDDO ENDDO ! !-- Transpose array CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' ) IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr ) CALL MPI_ALLTOALL( f_inv(nxl,1,nys), sendrecvcount_xy, MPI_REAL, & work(1), sendrecvcount_xy, MPI_REAL, & comm1dx, ierr ) CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' ) ! !-- Reorder transposed array m = 0 DO l = 0, pdims(1) - 1 xs = 0 + l * nnx DO j = nys_x, nyn_x DO k = 1, nz DO i = xs, xs + nnx - 1 m = m + 1 f_out(i,j,k) = work(m) ENDDO ENDDO ENDDO ENDDO #endif END SUBROUTINE transpose_yxd SUBROUTINE resort_for_yz( f_in, f_inv ) !------------------------------------------------------------------------------! ! Description: ! ------------ ! Resorting data for the transposition from y to z. The transposition itself ! is carried out in transpose_yz !------------------------------------------------------------------------------! USE indices USE transpose_indices IMPLICIT NONE REAL :: f_in(0:ny,nxl_y:nxr_y,nzb_y:nzt_y) REAL :: f_inv(nxl_y:nxr_y,nzb_y:nzt_y,0:ny) INTEGER :: i, j, k ! !-- Rearrange indices of input array in order to make data to be send !-- by MPI contiguous !$OMP PARALLEL PRIVATE ( i, j, k ) !$OMP DO !$acc kernels present( f_in, f_inv ) !$acc loop DO j = 0, ny DO k = nzb_y, nzt_y !$acc loop vector( 32 ) DO i = nxl_y, nxr_y f_inv(i,k,j) = f_in(j,i,k) ENDDO ENDDO ENDDO !$acc end kernels !$OMP END PARALLEL END SUBROUTINE resort_for_yz SUBROUTINE transpose_yz( f_inv, f_out ) !------------------------------------------------------------------------------! ! Description: ! ------------ ! Transposition of input array (f_in) from y to z. For the input array, all ! elements along y reside on the same PE, while after transposition, all ! elements along z reside on the same PE. !------------------------------------------------------------------------------! USE cpulog USE indices USE interfaces USE pegrid USE transpose_indices IMPLICIT NONE INTEGER :: i, j, k, l, zs REAL :: f_inv(nxl_y:nxr_y,nzb_y:nzt_y,0:ny), f_out(nxl_z:nxr_z,nys_z:nyn_z,1:nz) REAL, DIMENSION(nxl_z:nxr_z,nzt_y-nzb_y+1,nys_z:nyn_z,0:pdims(1)-1) :: work ! !-- If the PE grid is one-dimensional along y, only local reordering !-- of the data is necessary and no transposition has to be done. IF ( pdims(1) == 1 ) THEN !$OMP PARALLEL PRIVATE ( i, j, k ) !$OMP DO !$acc kernels present( f_inv, f_out ) !$acc loop DO j = 0, ny DO k = nzb_y, nzt_y !$acc loop vector( 32 ) DO i = nxl_y, nxr_y f_out(i,j,k) = f_inv(i,k,j) ENDDO ENDDO ENDDO !$acc end kernels !$OMP END PARALLEL ELSE #if defined( __parallel ) ! !-- Transpose array CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' ) IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr ) !$acc update host( f_inv ) CALL MPI_ALLTOALL( f_inv(nxl_y,nzb_y,0), sendrecvcount_yz, MPI_REAL, & work(nxl_z,1,nys_z,0), sendrecvcount_yz, MPI_REAL, & comm1dx, ierr ) CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' ) ! !-- Reorder transposed array !$OMP PARALLEL PRIVATE ( i, j, k, l, zs ) !$OMP DO !$acc data copyin( work ) DO l = 0, pdims(1) - 1 zs = 1 + l * ( nzt_y - nzb_y + 1 ) !$acc kernels present( f_out ) !$acc loop DO j = nys_z, nyn_z DO k = zs, zs + nzt_y - nzb_y !$acc loop vector( 32 ) DO i = nxl_z, nxr_z f_out(i,j,k) = work(i,k-zs+1,j,l) ENDDO ENDDO ENDDO !$acc end kernels ENDDO !$acc end data !$OMP END PARALLEL #endif ENDIF END SUBROUTINE transpose_yz SUBROUTINE resort_for_zx( f_in, f_inv ) !------------------------------------------------------------------------------! ! Description: ! ------------ ! Resorting data for the transposition from z to x. The transposition itself ! is carried out in transpose_zx !------------------------------------------------------------------------------! USE indices USE transpose_indices IMPLICIT NONE REAL :: f_in(1:nz,nys:nyn,nxl:nxr) REAL :: f_inv(nys:nyn,nxl:nxr,1:nz) INTEGER :: i, j, k ! !-- Rearrange indices of input array in order to make data to be send !-- by MPI contiguous !$OMP PARALLEL PRIVATE ( i, j, k ) !$OMP DO !$acc kernels present( f_in, f_inv ) !$acc loop DO k = 1,nz DO i = nxl, nxr !$acc loop vector( 32 ) DO j = nys, nyn f_inv(j,i,k) = f_in(k,j,i) ENDDO ENDDO ENDDO !$acc end kernels !$OMP END PARALLEL END SUBROUTINE resort_for_zx SUBROUTINE transpose_zx( f_inv, f_out ) !------------------------------------------------------------------------------! ! Description: ! ------------ ! Transposition of input array (f_in) from z to x. For the input array, all ! elements along z reside on the same PE, while after transposition, all ! elements along x reside on the same PE. !------------------------------------------------------------------------------! USE cpulog USE indices USE interfaces USE pegrid USE transpose_indices IMPLICIT NONE INTEGER :: i, j, k, l, xs REAL :: f_inv(nys:nyn,nxl:nxr,1:nz), f_out(0:nx,nys_x:nyn_x,nzb_x:nzt_x) REAL, DIMENSION(nys_x:nyn_x,nnx,nzb_x:nzt_x,0:pdims(1)-1) :: work ! !-- If the PE grid is one-dimensional along y, only local reordering !-- of the data is necessary and no transposition has to be done. IF ( pdims(1) == 1 ) THEN !$OMP PARALLEL PRIVATE ( i, j, k ) !$OMP DO !$acc kernels present( f_inv, f_out ) !$acc loop DO k = 1, nz DO i = nxl, nxr !$acc loop vector( 32 ) DO j = nys, nyn f_out(i,j,k) = f_inv(j,i,k) ENDDO ENDDO ENDDO !$acc end kernels !$OMP END PARALLEL ELSE #if defined( __parallel ) ! !-- Transpose array CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' ) IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr ) !$acc update host( f_inv ) CALL MPI_ALLTOALL( f_inv(nys,nxl,1), sendrecvcount_zx, MPI_REAL, & work(nys_x,1,nzb_x,0), sendrecvcount_zx, MPI_REAL, & comm1dx, ierr ) CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' ) ! !-- Reorder transposed array !$OMP PARALLEL PRIVATE ( i, j, k, l, xs ) !$OMP DO !$acc data copyin( work ) DO l = 0, pdims(1) - 1 xs = 0 + l * nnx !$acc kernels present( f_out ) !$acc loop DO k = nzb_x, nzt_x DO i = xs, xs + nnx - 1 !$acc loop vector( 32 ) DO j = nys_x, nyn_x f_out(i,j,k) = work(j,i-xs+1,k,l) ENDDO ENDDO ENDDO !$acc end kernels ENDDO !$acc end data !$OMP END PARALLEL #endif ENDIF END SUBROUTINE transpose_zx SUBROUTINE resort_for_zy( f_inv, f_out ) !------------------------------------------------------------------------------! ! Description: ! ------------ ! Resorting data after the transposition from z to y. The transposition itself ! is carried out in transpose_zy !------------------------------------------------------------------------------! USE indices USE transpose_indices IMPLICIT NONE REAL :: f_inv(nxl_y:nxr_y,nzb_y:nzt_y,0:ny) REAL :: f_out(0:ny,nxl_y:nxr_y,nzb_y:nzt_y) INTEGER :: i, j, k ! !-- Rearrange indices of input array in order to make data to be send !-- by MPI contiguous !$OMP PARALLEL PRIVATE ( i, j, k ) !$OMP DO !$acc kernels present( f_inv, f_out ) !$acc loop DO k = nzb_y, nzt_y DO j = 0, ny !$acc loop vector( 32 ) DO i = nxl_y, nxr_y f_out(j,i,k) = f_inv(i,k,j) ENDDO ENDDO ENDDO !$acc end kernels !$OMP END PARALLEL END SUBROUTINE resort_for_zy SUBROUTINE transpose_zy( f_in, f_inv ) !------------------------------------------------------------------------------! ! Description: ! ------------ ! Transposition of input array (f_in) from z to y. For the input array, all ! elements along z reside on the same PE, while after transposition, all ! elements along y reside on the same PE. !------------------------------------------------------------------------------! USE cpulog USE indices USE interfaces USE pegrid USE transpose_indices IMPLICIT NONE INTEGER :: i, j, k, l, zs REAL :: f_in(nxl_z:nxr_z,nys_z:nyn_z,1:nz), f_inv(nxl_y:nxr_y,nzb_y:nzt_y,0:ny) REAL, DIMENSION(nxl_z:nxr_z,nzt_y-nzb_y+1,nys_z:nyn_z,0:pdims(1)-1) :: work ! !-- If the PE grid is one-dimensional along y, the array has only to be !-- reordered locally and therefore no transposition has to be done. IF ( pdims(1) /= 1 ) THEN #if defined( __parallel ) ! !-- Reorder input array for transposition !$OMP PARALLEL PRIVATE ( i, j, k, l, zs ) !$OMP DO !$acc data copyout( work ) DO l = 0, pdims(1) - 1 zs = 1 + l * ( nzt_y - nzb_y + 1 ) !$acc kernels present( f_in, work ) !$acc loop DO j = nys_z, nyn_z DO k = zs, zs + nzt_y - nzb_y !$acc loop vector( 32 ) DO i = nxl_z, nxr_z work(i,k-zs+1,j,l) = f_in(i,j,k) ENDDO ENDDO ENDDO !$acc end kernels ENDDO !$acc end data !$OMP END PARALLEL ! !-- Transpose array CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' ) IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr ) CALL MPI_ALLTOALL( work(nxl_z,1,nys_z,0), sendrecvcount_yz, MPI_REAL, & f_inv(nxl_y,nzb_y,0), sendrecvcount_yz, MPI_REAL, & comm1dx, ierr ) !$acc update device( f_inv ) CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' ) #endif ELSE ! !-- Reorder the array in the same way like ALLTOALL did it !$OMP PARALLEL PRIVATE ( i, j, k ) !$OMP DO !$acc kernels present( f_in, f_inv ) !$acc loop DO k = nzb_y, nzt_y DO j = 0, ny !$acc loop vector( 32 ) DO i = nxl_y, nxr_y f_inv(i,k,j) = f_in(i,j,k) ENDDO ENDDO ENDDO !$acc end kernels !$OMP END PARALLEL ENDIF END SUBROUTINE transpose_zy SUBROUTINE transpose_zyd( f_in, f_out ) !------------------------------------------------------------------------------! ! Description: ! ------------ ! Transposition of input array (f_in) from z to y. For the input array, all ! elements along z reside on the same PE, while after transposition, all ! elements along y reside on the same PE. ! This is a direct transposition for arrays with indices in regular order ! (k,j,i) (cf. transpose_zy). !------------------------------------------------------------------------------! USE cpulog USE indices USE interfaces USE pegrid USE transpose_indices IMPLICIT NONE INTEGER :: i, j, k, l, m, ys REAL :: f_in(1:nz,nys:nyn,nxl:nxr), f_inv(nys:nyn,nxl:nxr,1:nz), & f_out(0:ny,nxl_yd:nxr_yd,nzb_yd:nzt_yd), & work(nnx*nny*nnz) #if defined( __parallel ) ! !-- Rearrange indices of input array in order to make data to be send !-- by MPI contiguous DO i = nxl, nxr DO j = nys, nyn DO k = 1, nz f_inv(j,i,k) = f_in(k,j,i) ENDDO ENDDO ENDDO ! !-- Move data to different array, because memory location of work1 is !-- needed further below (work1 = work2). !-- If the PE grid is one-dimensional along x, only local reordering !-- of the data is necessary and no transposition has to be done. IF ( pdims(2) == 1 ) THEN DO k = 1, nz DO i = nxl, nxr DO j = nys, nyn f_out(j,i,k) = f_inv(j,i,k) ENDDO ENDDO ENDDO RETURN ENDIF ! !-- Transpose array CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' ) IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr ) CALL MPI_ALLTOALL( f_inv(nys,nxl,1), sendrecvcount_zyd, MPI_REAL, & work(1), sendrecvcount_zyd, MPI_REAL, & comm1dy, ierr ) CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' ) ! !-- Reorder transposed array m = 0 DO l = 0, pdims(2) - 1 ys = 0 + l * nny DO k = nzb_yd, nzt_yd DO i = nxl_yd, nxr_yd DO j = ys, ys + nny - 1 m = m + 1 f_out(j,i,k) = work(m) ENDDO ENDDO ENDDO ENDDO #endif END SUBROUTINE transpose_zyd