!> @file transpose.f90 !------------------------------------------------------------------------------! ! This file is part of the PALM model system. ! ! PALM is free software: you can redistribute it and/or modify it under the ! terms of the GNU General Public License as published by the Free Software ! Foundation, either version 3 of the License, or (at your option) any later ! version. ! ! PALM is distributed in the hope that it will be useful, but WITHOUT ANY ! WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR ! A PARTICULAR PURPOSE. See the GNU General Public License for more details. ! ! You should have received a copy of the GNU General Public License along with ! PALM. If not, see . ! ! Copyright 1997-2020 Leibniz Universitaet Hannover !------------------------------------------------------------------------------! ! ! Current revisions: ! ----------------- ! ! ! Former revisions: ! ----------------- ! $Id: transpose.f90 4429 2020-02-27 15:24:30Z eckhard $ ! bugfix: cpp-directives added for serial mode ! ! 4415 2020-02-20 10:30:33Z raasch ! bugfix for misplaced preprocessor directive ! ! 4370 2020-01-10 14:00:44Z raasch ! vector array renamed ! ! 4366 2020-01-09 08:12:43Z raasch ! modifications for NEC vectorization ! ! 4360 2020-01-07 11:25:50Z suehring ! Added missing OpenMP directives ! ! 4182 2019-08-22 15:20:23Z scharf ! Corrected "Former revisions" section ! ! 4171 2019-08-19 17:44:09Z gronemeier ! loop reordering for performance optimization ! ! 3832 2019-03-28 13:16:58Z raasch ! loop reordering for performance optimization ! ! 3694 2019-01-23 17:01:49Z knoop ! OpenACC port for SPEC ! ! Revision 1.1 1997/07/24 11:25:18 raasch ! Initial revision ! ! ! Description: ! ------------ !> Resorting data for the transposition from x to y. The transposition itself !> is carried out in transpose_xy !------------------------------------------------------------------------------! #define __acc_fft_device ( defined( _OPENACC ) && ( defined ( __cuda_fft ) ) ) SUBROUTINE resort_for_xy( f_in, f_inv ) USE indices, & ONLY: nx USE kinds USE transpose_indices, & ONLY: nyn_x, nys_x, nzb_x, nzt_x IMPLICIT NONE REAL(wp) :: f_in(0:nx,nys_x:nyn_x,nzb_x:nzt_x) !< REAL(wp) :: f_inv(nys_x:nyn_x,nzb_x:nzt_x,0:nx) !< INTEGER(iwp) :: i !< INTEGER(iwp) :: j !< INTEGER(iwp) :: k !< ! !-- Rearrange indices of input array in order to make data to be send !-- by MPI contiguous !$OMP PARALLEL PRIVATE ( i, j, k ) !$OMP DO #if __acc_fft_device !$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) & !$ACC PRESENT(f_inv, f_in) #endif DO k = nzb_x, nzt_x DO j = nys_x, nyn_x DO i = 0, nx f_inv(j,k,i) = f_in(i,j,k) ENDDO ENDDO ENDDO !$OMP END PARALLEL END SUBROUTINE resort_for_xy !------------------------------------------------------------------------------! ! Description: ! ------------ !> Transposition of input array (f_in) from x to y. For the input array, all !> elements along x reside on the same PE, while after transposition, all !> elements along y reside on the same PE. !------------------------------------------------------------------------------! SUBROUTINE transpose_xy( f_inv, f_out ) #if defined( __parallel ) USE cpulog, & ONLY: cpu_log, cpu_log_nowait, log_point_s #endif USE indices, & ONLY: nx, ny USE kinds USE pegrid USE transpose_indices, & ONLY: nxl_y, nxr_y, nyn_x, nys_x, nzb_x, nzb_y, nzt_x, nzt_y IMPLICIT NONE INTEGER(iwp) :: i !< INTEGER(iwp) :: j !< INTEGER(iwp) :: k !< #if defined( __parallel ) INTEGER(iwp) :: l !< INTEGER(iwp) :: ys !< #endif REAL(wp) :: f_inv(nys_x:nyn_x,nzb_x:nzt_x,0:nx) !< REAL(wp) :: f_out(0:ny,nxl_y:nxr_y,nzb_y:nzt_y) !< #if defined( __parallel ) REAL(wp), DIMENSION(nyn_x-nys_x+1,nzb_y:nzt_y,nxl_y:nxr_y,0:pdims(2)-1) :: work !< #if __acc_fft_device !$ACC DECLARE CREATE(work) #endif #endif IF ( numprocs /= 1 ) THEN #if defined( __parallel ) ! !-- Transpose array CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start', cpu_log_nowait ) #if __acc_fft_device #ifndef __cuda_aware_mpi !$ACC UPDATE HOST(f_inv) #else !$ACC HOST_DATA USE_DEVICE(work, f_inv) #endif #endif IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr ) CALL MPI_ALLTOALL( f_inv(nys_x,nzb_x,0), sendrecvcount_xy, MPI_REAL, & work(1,nzb_y,nxl_y,0), sendrecvcount_xy, MPI_REAL, & comm1dy, ierr ) #if __acc_fft_device #ifndef __cuda_aware_mpi !$ACC UPDATE DEVICE(work) #else !$ACC END HOST_DATA #endif #endif CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' ) ! !-- Reorder transposed array !$OMP PARALLEL PRIVATE ( i, j, k, l, ys ) DO l = 0, pdims(2) - 1 ys = 0 + l * ( nyn_x - nys_x + 1 ) #if __acc_fft_device !$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) & !$ACC PRESENT(f_out, work) #endif !$OMP DO DO i = nxl_y, nxr_y DO k = nzb_y, nzt_y DO j = ys, ys + nyn_x - nys_x f_out(j,i,k) = work(j-ys+1,k,i,l) ENDDO ENDDO ENDDO !$OMP END DO NOWAIT ENDDO !$OMP END PARALLEL #endif ELSE ! !-- Reorder transposed array !$OMP PARALLEL PRIVATE ( i, j, k ) !$OMP DO #if __acc_fft_device !$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) & !$ACC PRESENT(f_out, f_inv) #endif DO k = nzb_y, nzt_y DO i = nxl_y, nxr_y DO j = 0, ny f_out(j,i,k) = f_inv(j,k,i) ENDDO ENDDO ENDDO !$OMP END PARALLEL ENDIF END SUBROUTINE transpose_xy !------------------------------------------------------------------------------! ! Description: ! ------------ !> Resorting data after the transposition from x to z. The transposition itself !> is carried out in transpose_xz !------------------------------------------------------------------------------! SUBROUTINE resort_for_xz( f_inv, f_out ) USE indices, & ONLY: nxl, nxr, nyn, nys, nz USE kinds IMPLICIT NONE REAL(wp) :: f_inv(nys:nyn,nxl:nxr,1:nz) !< REAL(wp) :: f_out(1:nz,nys:nyn,nxl:nxr) !< INTEGER(iwp) :: i !< INTEGER(iwp) :: j !< INTEGER(iwp) :: k !< ! !-- Rearrange indices of input array in order to make data to be send !-- by MPI contiguous. !-- In case of parallel fft/transposition, scattered store is faster in !-- backward direction!!! !$OMP PARALLEL PRIVATE ( i, j, k ) !$OMP DO #if __acc_fft_device !$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) & !$ACC PRESENT(f_out, f_inv) #endif DO i = nxl, nxr DO j = nys, nyn DO k = 1, nz f_out(k,j,i) = f_inv(j,i,k) ENDDO ENDDO ENDDO !$OMP END PARALLEL END SUBROUTINE resort_for_xz !------------------------------------------------------------------------------! ! Description: ! ------------ !> Transposition of input array (f_in) from x to z. For the input array, all !> elements along x reside on the same PE, while after transposition, all !> elements along z reside on the same PE. !------------------------------------------------------------------------------! SUBROUTINE transpose_xz( f_in, f_inv ) #if defined( __parallel ) USE cpulog, & ONLY: cpu_log, cpu_log_nowait, log_point_s USE fft_xy, & ONLY: f_vec_x, temperton_fft_vec #endif USE indices, & ONLY: nx, nxl, nxr, nyn, nys, nz #if defined( __parallel ) USE indices, & ONLY: nnx #endif USE kinds USE pegrid USE transpose_indices, & ONLY: nyn_x, nys_x, nzb_x, nzt_x IMPLICIT NONE INTEGER(iwp) :: i !< INTEGER(iwp) :: j !< INTEGER(iwp) :: k !< #if defined( __parallel ) INTEGER(iwp) :: l !< INTEGER(iwp) :: mm !< INTEGER(iwp) :: xs !< #endif REAL(wp) :: f_in(0:nx,nys_x:nyn_x,nzb_x:nzt_x) !< REAL(wp) :: f_inv(nys:nyn,nxl:nxr,1:nz) !< #if defined( __parallel ) REAL(wp), DIMENSION(nys_x:nyn_x,nnx,nzb_x:nzt_x,0:pdims(1)-1) :: work !< #if __acc_fft_device !$ACC DECLARE CREATE(work) #endif #endif ! !-- If the PE grid is one-dimensional along y, the array has only to be !-- reordered locally and therefore no transposition has to be done. IF ( pdims(1) /= 1 ) THEN #if defined( __parallel ) ! !-- Reorder input array for transposition. Data from the vectorized Temperton-fft is stored in !-- different array format (f_vec_x). IF ( temperton_fft_vec ) THEN DO l = 0, pdims(1) - 1 xs = 0 + l * nnx DO k = nzb_x, nzt_x DO i = xs, xs + nnx - 1 DO j = nys_x, nyn_x mm = j-nys_x+1+(k-nzb_x)*(nyn_x-nys_x+1) work(j,i-xs+1,k,l) = f_vec_x(mm,i) ENDDO ENDDO ENDDO ENDDO ELSE !$OMP PARALLEL PRIVATE ( i, j, k, l, xs ) DO l = 0, pdims(1) - 1 xs = 0 + l * nnx #if __acc_fft_device !$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) & !$ACC PRESENT(work, f_in) #endif !$OMP DO DO k = nzb_x, nzt_x DO i = xs, xs + nnx - 1 DO j = nys_x, nyn_x work(j,i-xs+1,k,l) = f_in(i,j,k) ENDDO ENDDO ENDDO !$OMP END DO NOWAIT ENDDO !$OMP END PARALLEL ENDIF ! !-- Transpose array CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start', cpu_log_nowait ) #if __acc_fft_device #ifndef __cuda_aware_mpi !$ACC UPDATE HOST(work) #else !$ACC HOST_DATA USE_DEVICE(work, f_inv) #endif #endif IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr ) CALL MPI_ALLTOALL( work(nys_x,1,nzb_x,0), sendrecvcount_zx, MPI_REAL, & f_inv(nys,nxl,1), sendrecvcount_zx, MPI_REAL, & comm1dx, ierr ) #if __acc_fft_device #ifndef __cuda_aware_mpi !$ACC UPDATE DEVICE(f_inv) #else !$ACC END HOST_DATA #endif #endif CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' ) #endif ELSE ! !-- Reorder the array in a way that the z index is in first position !$OMP PARALLEL PRIVATE ( i, j, k ) !$OMP DO #if __acc_fft_device !$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) & !$ACC PRESENT(f_inv, f_in) #endif DO i = nxl, nxr DO j = nys, nyn DO k = 1, nz f_inv(j,i,k) = f_in(i,j,k) ENDDO ENDDO ENDDO !$OMP END PARALLEL ENDIF END SUBROUTINE transpose_xz !------------------------------------------------------------------------------! ! Description: ! ------------ !> Resorting data after the transposition from y to x. The transposition itself !> is carried out in transpose_yx !------------------------------------------------------------------------------! SUBROUTINE resort_for_yx( f_inv, f_out ) USE indices, & ONLY: nx USE kinds USE transpose_indices, & ONLY: nyn_x, nys_x, nzb_x, nzt_x IMPLICIT NONE REAL(wp) :: f_inv(nys_x:nyn_x,nzb_x:nzt_x,0:nx) !< REAL(wp) :: f_out(0:nx,nys_x:nyn_x,nzb_x:nzt_x) !< INTEGER(iwp) :: i !< INTEGER(iwp) :: j !< INTEGER(iwp) :: k !< ! !-- Rearrange indices of input array in order to make data to be send !-- by MPI contiguous !$OMP PARALLEL PRIVATE ( i, j, k ) !$OMP DO #if __acc_fft_device !$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) & !$ACC PRESENT(f_out, f_inv) #endif DO k = nzb_x, nzt_x DO j = nys_x, nyn_x DO i = 0, nx f_out(i,j,k) = f_inv(j,k,i) ENDDO ENDDO ENDDO !$OMP END PARALLEL END SUBROUTINE resort_for_yx !------------------------------------------------------------------------------! ! Description: ! ------------ !> Transposition of input array (f_in) from y to x. For the input array, all !> elements along y reside on the same PE, while after transposition, all !> elements along x reside on the same PE. !------------------------------------------------------------------------------! SUBROUTINE transpose_yx( f_in, f_inv ) #if defined( __parallel ) USE cpulog, & ONLY: cpu_log, cpu_log_nowait, log_point_s #endif USE indices, & ONLY: nx, ny USE kinds USE pegrid USE transpose_indices, & ONLY: nxl_y, nxr_y, nyn_x, nys_x, nzb_x, nzb_y, nzt_x, nzt_y IMPLICIT NONE INTEGER(iwp) :: i !< INTEGER(iwp) :: j !< INTEGER(iwp) :: k !< #if defined( __parallel ) INTEGER(iwp) :: l !< INTEGER(iwp) :: ys !< #endif REAL(wp) :: f_in(0:ny,nxl_y:nxr_y,nzb_y:nzt_y) !< REAL(wp) :: f_inv(nys_x:nyn_x,nzb_x:nzt_x,0:nx) !< #if defined( __parallel ) REAL(wp), DIMENSION(nyn_x-nys_x+1,nzb_y:nzt_y,nxl_y:nxr_y,0:pdims(2)-1) :: work !< #if __acc_fft_device !$ACC DECLARE CREATE(work) #endif #endif IF ( numprocs /= 1 ) THEN #if defined( __parallel ) ! !-- Reorder input array for transposition !$OMP PARALLEL PRIVATE ( i, j, k, l, ys ) DO l = 0, pdims(2) - 1 ys = 0 + l * ( nyn_x - nys_x + 1 ) #if __acc_fft_device !$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) & !$ACC PRESENT(work, f_in) #endif !$OMP DO DO i = nxl_y, nxr_y DO k = nzb_y, nzt_y DO j = ys, ys + nyn_x - nys_x work(j-ys+1,k,i,l) = f_in(j,i,k) ENDDO ENDDO ENDDO !$OMP END DO NOWAIT ENDDO !$OMP END PARALLEL ! !-- Transpose array CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start', cpu_log_nowait ) #if __acc_fft_device #ifndef __cuda_aware_mpi !$ACC UPDATE HOST(work) #else !$ACC HOST_DATA USE_DEVICE(work, f_inv) #endif #endif IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr ) CALL MPI_ALLTOALL( work(1,nzb_y,nxl_y,0), sendrecvcount_xy, MPI_REAL, & f_inv(nys_x,nzb_x,0), sendrecvcount_xy, MPI_REAL, & comm1dy, ierr ) #if __acc_fft_device #ifndef __cuda_aware_mpi !$ACC UPDATE DEVICE(f_inv) #else !$ACC END HOST_DATA #endif #endif CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' ) #endif ELSE ! !-- Reorder array f_in the same way as ALLTOALL did it !$OMP PARALLEL PRIVATE ( i, j, k ) !$OMP DO #if __acc_fft_device !$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) & !$ACC PRESENT(f_inv, f_in) #endif DO i = nxl_y, nxr_y DO k = nzb_y, nzt_y DO j = 0, ny f_inv(j,k,i) = f_in(j,i,k) ENDDO ENDDO ENDDO !$OMP END PARALLEL ENDIF END SUBROUTINE transpose_yx !------------------------------------------------------------------------------! ! Description: ! ------------ !> Transposition of input array (f_in) from y to x. For the input array, all !> elements along y reside on the same PE, while after transposition, all !> elements along x reside on the same PE. !> This is a direct transposition for arrays with indices in regular order !> (k,j,i) (cf. transpose_yx). !------------------------------------------------------------------------------! #if defined( __parallel ) SUBROUTINE transpose_yxd( f_in, f_out ) USE cpulog, & ONLY: cpu_log, log_point_s USE indices, & ONLY: nnx, nny, nnz, nx, nxl, nxr, nyn, nys, nz USE kinds USE pegrid USE transpose_indices, & ONLY: nyn_x, nys_x, nzb_x, nzt_x IMPLICIT NONE INTEGER(iwp) :: i !< INTEGER(iwp) :: j !< INTEGER(iwp) :: k !< INTEGER(iwp) :: l !< INTEGER(iwp) :: m !< INTEGER(iwp) :: xs !< REAL(wp) :: f_in(1:nz,nys:nyn,nxl:nxr) !< REAL(wp) :: f_inv(nxl:nxr,1:nz,nys:nyn) !< REAL(wp) :: f_out(0:nx,nys_x:nyn_x,nzb_x:nzt_x) !< REAL(wp) :: work(nnx*nny*nnz) !< ! !-- Rearrange indices of input array in order to make data to be send !-- by MPI contiguous DO k = 1, nz DO j = nys, nyn DO i = nxl, nxr f_inv(i,k,j) = f_in(k,j,i) ENDDO ENDDO ENDDO ! !-- Transpose array CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' ) IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr ) CALL MPI_ALLTOALL( f_inv(nxl,1,nys), sendrecvcount_xy, MPI_REAL, & work(1), sendrecvcount_xy, MPI_REAL, & comm1dx, ierr ) CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' ) ! !-- Reorder transposed array m = 0 DO l = 0, pdims(1) - 1 xs = 0 + l * nnx DO j = nys_x, nyn_x DO k = 1, nz DO i = xs, xs + nnx - 1 m = m + 1 f_out(i,j,k) = work(m) ENDDO ENDDO ENDDO ENDDO END SUBROUTINE transpose_yxd #endif !------------------------------------------------------------------------------! ! Description: ! ------------ !> Resorting data for the transposition from y to z. The transposition itself !> is carried out in transpose_yz !------------------------------------------------------------------------------! SUBROUTINE resort_for_yz( f_in, f_inv ) USE indices, & ONLY: ny USE kinds USE transpose_indices, & ONLY: nxl_y, nxr_y, nzb_y, nzt_y IMPLICIT NONE REAL(wp) :: f_in(0:ny,nxl_y:nxr_y,nzb_y:nzt_y) !< REAL(wp) :: f_inv(nxl_y:nxr_y,nzb_y:nzt_y,0:ny) !< INTEGER(iwp) :: i !< INTEGER(iwp) :: j !< INTEGER(iwp) :: k !< ! !-- Rearrange indices of input array in order to make data to be send !-- by MPI contiguous !$OMP PARALLEL PRIVATE ( i, j, k ) !$OMP DO #if __acc_fft_device !$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) & !$ACC PRESENT(f_inv, f_in) #endif DO k = nzb_y, nzt_y DO i = nxl_y, nxr_y DO j = 0, ny f_inv(i,k,j) = f_in(j,i,k) ENDDO ENDDO ENDDO !$OMP END PARALLEL END SUBROUTINE resort_for_yz !------------------------------------------------------------------------------! ! Description: ! ------------ !> Transposition of input array (f_in) from y to z. For the input array, all !> elements along y reside on the same PE, while after transposition, all !> elements along z reside on the same PE. !------------------------------------------------------------------------------! SUBROUTINE transpose_yz( f_inv, f_out ) #if defined( __parallel ) USE cpulog, & ONLY: cpu_log, cpu_log_nowait, log_point_s #endif USE indices, & ONLY: ny, nz USE kinds USE pegrid USE transpose_indices, & ONLY: nxl_y, nxl_z, nxr_y, nxr_z, nyn_z, nys_z, nzb_y, nzt_y IMPLICIT NONE INTEGER(iwp) :: i !< INTEGER(iwp) :: j !< INTEGER(iwp) :: k !< #if defined( __parallel ) INTEGER(iwp) :: l !< INTEGER(iwp) :: zs !< #endif REAL(wp) :: f_inv(nxl_y:nxr_y,nzb_y:nzt_y,0:ny) !< REAL(wp) :: f_out(nxl_z:nxr_z,nys_z:nyn_z,1:nz) !< #if defined( __parallel ) REAL(wp), DIMENSION(nxl_z:nxr_z,nzt_y-nzb_y+1,nys_z:nyn_z,0:pdims(1)-1) :: work !< #if __acc_fft_device !$ACC DECLARE CREATE(work) #endif #endif ! !-- If the PE grid is one-dimensional along y, only local reordering !-- of the data is necessary and no transposition has to be done. IF ( pdims(1) == 1 ) THEN !$OMP PARALLEL PRIVATE ( i, j, k ) !$OMP DO #if __acc_fft_device !$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) & !$ACC PRESENT(f_out, f_inv) #endif DO j = 0, ny DO k = nzb_y, nzt_y DO i = nxl_y, nxr_y f_out(i,j,k) = f_inv(i,k,j) ENDDO ENDDO ENDDO !$OMP END PARALLEL ELSE #if defined( __parallel ) ! !-- Transpose array CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start', cpu_log_nowait ) #if __acc_fft_device #ifndef __cuda_aware_mpi !$ACC UPDATE HOST(f_inv) #else !$ACC HOST_DATA USE_DEVICE(work, f_inv) #endif #endif IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr ) CALL MPI_ALLTOALL( f_inv(nxl_y,nzb_y,0), sendrecvcount_yz, MPI_REAL, & work(nxl_z,1,nys_z,0), sendrecvcount_yz, MPI_REAL, & comm1dx, ierr ) #if __acc_fft_device #ifndef __cuda_aware_mpi !$ACC UPDATE DEVICE(work) #else !$ACC END HOST_DATA #endif #endif CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' ) ! !-- Reorder transposed array !$OMP PARALLEL PRIVATE ( i, j, k, l, zs ) DO l = 0, pdims(1) - 1 zs = 1 + l * ( nzt_y - nzb_y + 1 ) #if __acc_fft_device !$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) & !$ACC PRESENT(f_out, work) #endif !$OMP DO DO j = nys_z, nyn_z DO k = zs, zs + nzt_y - nzb_y DO i = nxl_z, nxr_z f_out(i,j,k) = work(i,k-zs+1,j,l) ENDDO ENDDO ENDDO !$OMP END DO NOWAIT ENDDO !$OMP END PARALLEL #endif ENDIF END SUBROUTINE transpose_yz !------------------------------------------------------------------------------! ! Description: ! ------------ !> Resorting data for the transposition from z to x. The transposition itself !> is carried out in transpose_zx !------------------------------------------------------------------------------! SUBROUTINE resort_for_zx( f_in, f_inv ) USE indices, & ONLY: nxl, nxr, nyn, nys, nz USE kinds IMPLICIT NONE REAL(wp) :: f_in(1:nz,nys:nyn,nxl:nxr) !< REAL(wp) :: f_inv(nys:nyn,nxl:nxr,1:nz) !< INTEGER(iwp) :: i !< INTEGER(iwp) :: j !< INTEGER(iwp) :: k !< ! !-- Rearrange indices of input array in order to make data to be send !-- by MPI contiguous !$OMP PARALLEL PRIVATE ( i, j, k ) !$OMP DO #if __acc_fft_device !$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) & !$ACC PRESENT(f_in, f_inv) #endif DO i = nxl, nxr DO j = nys, nyn DO k = 1,nz f_inv(j,i,k) = f_in(k,j,i) ENDDO ENDDO ENDDO !$OMP END PARALLEL END SUBROUTINE resort_for_zx !------------------------------------------------------------------------------! ! Description: ! ------------ !> Transposition of input array (f_in) from z to x. For the input array, all !> elements along z reside on the same PE, while after transposition, all !> elements along x reside on the same PE. !------------------------------------------------------------------------------! SUBROUTINE transpose_zx( f_inv, f_out ) #if defined( __parallel ) USE cpulog, & ONLY: cpu_log, cpu_log_nowait, log_point_s USE fft_xy, & ONLY: f_vec_x, temperton_fft_vec #endif USE indices, & ONLY: nx, nxl, nxr, nyn, nys, nz #if defined( __parallel ) USE indices, & ONLY: nnx #endif USE kinds USE pegrid USE transpose_indices, & ONLY: nyn_x, nys_x, nzb_x, nzt_x IMPLICIT NONE INTEGER(iwp) :: i !< INTEGER(iwp) :: j !< INTEGER(iwp) :: k !< #if defined( __parallel ) INTEGER(iwp) :: l !< INTEGER(iwp) :: mm !< INTEGER(iwp) :: xs !< #endif REAL(wp) :: f_inv(nys:nyn,nxl:nxr,1:nz) !< REAL(wp) :: f_out(0:nx,nys_x:nyn_x,nzb_x:nzt_x) !< #if defined( __parallel ) REAL(wp), DIMENSION(nys_x:nyn_x,nnx,nzb_x:nzt_x,0:pdims(1)-1) :: work !< #if __acc_fft_device !$ACC DECLARE CREATE(work) #endif #endif ! !-- If the PE grid is one-dimensional along y, only local reordering !-- of the data is necessary and no transposition has to be done. IF ( pdims(1) == 1 ) THEN !$OMP PARALLEL PRIVATE ( i, j, k ) !$OMP DO #if __acc_fft_device !$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) & !$ACC PRESENT(f_out, f_inv) #endif DO k = 1, nz DO i = nxl, nxr DO j = nys, nyn f_out(i,j,k) = f_inv(j,i,k) ENDDO ENDDO ENDDO !$OMP END PARALLEL ELSE #if defined( __parallel ) ! !-- Transpose array CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start', cpu_log_nowait ) #if __acc_fft_device #ifndef __cuda_aware_mpi !$ACC UPDATE HOST(f_inv) #else !$ACC HOST_DATA USE_DEVICE(work, f_inv) #endif #endif IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr ) CALL MPI_ALLTOALL( f_inv(nys,nxl,1), sendrecvcount_zx, MPI_REAL, & work(nys_x,1,nzb_x,0), sendrecvcount_zx, MPI_REAL, & comm1dx, ierr ) #if __acc_fft_device #ifndef __cuda_aware_mpi !$ACC UPDATE DEVICE(work) #else !$ACC END HOST_DATA #endif #endif CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' ) ! !-- Reorder transposed array. !-- Data for the vectorized Temperton-fft is stored in different array format (f_vec_x) which !-- saves additional data copy in fft_x. IF ( temperton_fft_vec ) THEN DO l = 0, pdims(1) - 1 xs = 0 + l * nnx DO k = nzb_x, nzt_x DO i = xs, xs + nnx - 1 DO j = nys_x, nyn_x mm = j-nys_x+1+(k-nzb_x)*(nyn_x-nys_x+1) f_vec_x(mm,i) = work(j,i-xs+1,k,l) ENDDO ENDDO ENDDO ENDDO ELSE !$OMP PARALLEL PRIVATE ( i, j, k, l, xs ) DO l = 0, pdims(1) - 1 xs = 0 + l * nnx #if __acc_fft_device !$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) & !$ACC PRESENT(f_out, work) #endif !$OMP DO DO k = nzb_x, nzt_x DO i = xs, xs + nnx - 1 DO j = nys_x, nyn_x f_out(i,j,k) = work(j,i-xs+1,k,l) ENDDO ENDDO ENDDO !$OMP END DO NOWAIT ENDDO !$OMP END PARALLEL ENDIF #endif ENDIF END SUBROUTINE transpose_zx !------------------------------------------------------------------------------! ! Description: ! ------------ !> Resorting data after the transposition from z to y. The transposition itself !> is carried out in transpose_zy !------------------------------------------------------------------------------! SUBROUTINE resort_for_zy( f_inv, f_out ) USE indices, & ONLY: ny USE kinds USE transpose_indices, & ONLY: nxl_y, nxr_y, nzb_y, nzt_y IMPLICIT NONE REAL(wp) :: f_inv(nxl_y:nxr_y,nzb_y:nzt_y,0:ny) !< REAL(wp) :: f_out(0:ny,nxl_y:nxr_y,nzb_y:nzt_y) !< INTEGER(iwp) :: i !< INTEGER(iwp) :: j !< INTEGER(iwp) :: k !< ! !-- Rearrange indices of input array in order to make data to be send !-- by MPI contiguous !$OMP PARALLEL PRIVATE ( i, j, k ) !$OMP DO #if __acc_fft_device !$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) & !$ACC PRESENT(f_out, f_inv) #endif DO k = nzb_y, nzt_y DO i = nxl_y, nxr_y DO j = 0, ny f_out(j,i,k) = f_inv(i,k,j) ENDDO ENDDO ENDDO !$OMP END PARALLEL END SUBROUTINE resort_for_zy !------------------------------------------------------------------------------! ! Description:cpu_log_nowait ! ------------ !> Transposition of input array (f_in) from z to y. For the input array, all !> elements along z reside on the same PE, while after transposition, all !> elements along y reside on the same PE. !------------------------------------------------------------------------------! SUBROUTINE transpose_zy( f_in, f_inv ) #if defined( __parallel ) USE cpulog, & ONLY: cpu_log, cpu_log_nowait, log_point_s #endif USE indices, & ONLY: ny, nz USE kinds USE pegrid USE transpose_indices, & ONLY: nxl_y, nxl_z, nxr_y, nxr_z, nyn_z, nys_z, nzb_y, nzt_y IMPLICIT NONE INTEGER(iwp) :: i !< INTEGER(iwp) :: j !< INTEGER(iwp) :: k !< #if defined( __parallel ) INTEGER(iwp) :: l !< INTEGER(iwp) :: zs !< #endif REAL(wp) :: f_in(nxl_z:nxr_z,nys_z:nyn_z,1:nz) !< REAL(wp) :: f_inv(nxl_y:nxr_y,nzb_y:nzt_y,0:ny) !< #if defined( __parallel ) REAL(wp), DIMENSION(nxl_z:nxr_z,nzt_y-nzb_y+1,nys_z:nyn_z,0:pdims(1)-1) :: work !< #if __acc_fft_device !$ACC DECLARE CREATE(work) #endif #endif ! !-- If the PE grid is one-dimensional along y, the array has only to be !-- reordered locally and therefore no transposition has to be done. IF ( pdims(1) /= 1 ) THEN #if defined( __parallel ) ! !-- Reorder input array for transposition !$OMP PARALLEL PRIVATE ( i, j, k, l, zs ) DO l = 0, pdims(1) - 1 zs = 1 + l * ( nzt_y - nzb_y + 1 ) #if __acc_fft_device !$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) & !$ACC PRESENT(work, f_in) #endif !$OMP DO DO j = nys_z, nyn_z DO k = zs, zs + nzt_y - nzb_y DO i = nxl_z, nxr_z work(i,k-zs+1,j,l) = f_in(i,j,k) ENDDO ENDDO ENDDO !$OMP END DO NOWAIT ENDDO !$OMP END PARALLEL ! !-- Transpose array CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start', cpu_log_nowait ) #if __acc_fft_device #ifndef __cuda_aware_mpi !$ACC UPDATE HOST(work) #else !$ACC HOST_DATA USE_DEVICE(work, f_inv) #endif #endif IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr ) CALL MPI_ALLTOALL( work(nxl_z,1,nys_z,0), sendrecvcount_yz, MPI_REAL, & f_inv(nxl_y,nzb_y,0), sendrecvcount_yz, MPI_REAL, & comm1dx, ierr ) #if __acc_fft_device #ifndef __cuda_aware_mpi !$ACC UPDATE DEVICE(f_inv) #else !$ACC END HOST_DATA #endif #endif CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' ) #endif ELSE ! !-- Reorder the array in the same way like ALLTOALL did it !$OMP PARALLEL PRIVATE ( i, j, k ) !$OMP DO #if __acc_fft_device !$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) & !$ACC PRESENT(f_inv, f_in) #endif DO k = nzb_y, nzt_y DO j = 0, ny DO i = nxl_y, nxr_y f_inv(i,k,j) = f_in(i,j,k) ENDDO ENDDO ENDDO !$OMP END PARALLEL ENDIF END SUBROUTINE transpose_zy !------------------------------------------------------------------------------! ! Description: ! ------------ !> Transposition of input array (f_in) from z to y. For the input array, all !> elements along z reside on the same PE, while after transposition, all !> elements along y reside on the same PE. !> This is a direct transposition for arrays with indices in regular order !> (k,j,i) (cf. transpose_zy). !------------------------------------------------------------------------------! #if defined( __parallel ) SUBROUTINE transpose_zyd( f_in, f_out ) USE cpulog, & ONLY: cpu_log, log_point_s USE indices, & ONLY: nnx, nny, nnz, nxl, nxr, nyn, nys, ny, nz USE kinds USE pegrid USE transpose_indices, & ONLY: nxl_yd, nxr_yd, nzb_yd, nzt_yd IMPLICIT NONE INTEGER(iwp) :: i !< INTEGER(iwp) :: j !< INTEGER(iwp) :: k !< INTEGER(iwp) :: l !< INTEGER(iwp) :: m !< INTEGER(iwp) :: ys !< REAL(wp) :: f_in(1:nz,nys:nyn,nxl:nxr) !< REAL(wp) :: f_inv(nys:nyn,nxl:nxr,1:nz) !< REAL(wp) :: f_out(0:ny,nxl_yd:nxr_yd,nzb_yd:nzt_yd) !< REAL(wp) :: work(nnx*nny*nnz) !< ! !-- Rearrange indices of input array in order to make data to be send !-- by MPI contiguous DO i = nxl, nxr DO j = nys, nyn DO k = 1, nz f_inv(j,i,k) = f_in(k,j,i) ENDDO ENDDO ENDDO ! !-- Move data to different array, because memory location of work1 is !-- needed further below (work1 = work2). !-- If the PE grid is one-dimensional along x, only local reordering !-- of the data is necessary and no transposition has to be done. IF ( pdims(2) == 1 ) THEN DO k = 1, nz DO i = nxl, nxr DO j = nys, nyn f_out(j,i,k) = f_inv(j,i,k) ENDDO ENDDO ENDDO RETURN ENDIF ! !-- Transpose array CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' ) IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr ) CALL MPI_ALLTOALL( f_inv(nys,nxl,1), sendrecvcount_zyd, MPI_REAL, & work(1), sendrecvcount_zyd, MPI_REAL, & comm1dy, ierr ) CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' ) ! !-- Reorder transposed array m = 0 DO l = 0, pdims(2) - 1 ys = 0 + l * nny DO k = nzb_yd, nzt_yd DO i = nxl_yd, nxr_yd DO j = ys, ys + nny - 1 m = m + 1 f_out(j,i,k) = work(m) ENDDO ENDDO ENDDO ENDDO END SUBROUTINE transpose_zyd #endif