Home

Context Navigation

← Previous Change
Next Change →

transpose.f90

Timestamp:

Mar 4, 2013 5:31:38 AM (11 years ago)

Author:

raasch

Message:

New:
---

Porting of the FFT solver for serial runs to GPU using CUDA FFT;
preprocessor lines in the transpose routines rearranged so that the routines can also
be used in serial (non-parallel) mode;
transpositions are also carried out in serial mode; routines fftx and fftxp replaced
by calls to fft_x; fft_x replaced by fft_x_1d in the 1D-decomposition routines
(Makefile, Makefile_check, fft_xy, poisfft, poisfft_hybrid, transpose, new: cuda_fft_interfaces)

--stdin argument for mpiexec on lckyuh, -y and -Y settings output to header (mrun)

Changed:

Module array_kind renamed precision_kind
(check_open, data_output_3d, fft_xy, modules, user_data_output_3d)

some format changes for coupled atmosphere-ocean runs (header)
small changes in code formatting (microphysics, prognostic_equations)

Errors:

bugfix: default value (0) assigned to coupling_start_time (modules)
bugfix: initial time for preruns of coupled runs is output as -coupling_start_time (data_output_profiles)

File:

: 1 edited

palm/trunk/SOURCE/transpose.f90 (modified) (19 diffs)

Legend:

: Unmodified
: Added
: Removed

palm/trunk/SOURCE/transpose.f90

-                      r1093
+                      r1106
 ! Current revisions:
 ! -----------------
+!
+! preprocessor lines rearranged so that routines can also be used in serial
+! (non-parallel) mode
+!
 ! Former revisions:
 …
              work(nnx*nny*nnz)
-#if defined( __parallel )
+!
 !-- Rearrange indices of input array in order to make data to be sent
 …
 !$OMP  END PARALLEL
+!
+!-- Transpose array
+    CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
+    IF ( collective_wait )  CALL MPI_BARRIER( comm2d, ierr )
+    CALL MPI_ALLTOALL( f_inv(nys_x,nzb_x,0), sendrecvcount_xy, MPI_REAL, &
+                       work(1),              sendrecvcount_xy, MPI_REAL, &
+                       comm1dy, ierr )
+    CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
+!
+!-- Reorder transposed array
+    IF ( numprocs /= 1 )  THEN
+#if defined( __parallel )
+!
+!--    Transpose array
+       CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
+       IF ( collective_wait )  CALL MPI_BARRIER( comm2d, ierr )
+       CALL MPI_ALLTOALL( f_inv(nys_x,nzb_x,0), sendrecvcount_xy, MPI_REAL, &
+                          work(1),              sendrecvcount_xy, MPI_REAL, &
+                          comm1dy, ierr )
+       CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
+!
+!--    Reorder transposed array
 !$OMP  PARALLEL PRIVATE ( i, j, k, l, m, ys )
 !$OMP  DO
+    DO  l = 0, pdims(2) - 1
+       m  = l * ( nxr_y - nxl_y + 1 ) * ( nzt_y - nzb_y + 1 ) * &
+                ( nyn_x - nys_x + 1 )
+       ys = 0 + l * ( nyn_x - nys_x + 1 )
+       DO  i = nxl_y, nxr_y
+          DO  k = nzb_y, nzt_y
+             DO  j = ys, ys + nyn_x - nys_x
+                m = m + 1
+                f_out(j,i,k) = work(m)
+             ENDDO
+          ENDDO
+       ENDDO
+    ENDDO
+!$OMP  END PARALLEL
+       DO  l = 0, pdims(2) - 1
+          m  = l * ( nxr_y - nxl_y + 1 ) * ( nzt_y - nzb_y + 1 ) * &
+                   ( nyn_x - nys_x + 1 )
+          ys = 0 + l * ( nyn_x - nys_x + 1 )
+          DO  i = nxl_y, nxr_y
+             DO  k = nzb_y, nzt_y
+                DO  j = ys, ys + nyn_x - nys_x
+                   m = m + 1
+                   f_out(j,i,k) = work(m)
+                ENDDO
+             ENDDO
+          ENDDO
+       ENDDO
+!$OMP  END PARALLEL
 #endif
+    ELSE
+!
+!--    Reorder transposed array
+!$OMP  PARALLEL PRIVATE ( i, j, k )
+!$OMP  DO
+       DO  k = nzb_y, nzt_y
+          DO  i = nxl_y, nxr_y
+             DO  j = 0, ny
+                f_out(j,i,k) = f_inv(j,k,i)
+             ENDDO
+          ENDDO
+       ENDDO
+!$OMP  END PARALLEL
+    ENDIF
  END SUBROUTINE transpose_xy
 …
              work(nnx*nny*nnz)
-#if defined( __parallel )
+!
 …
 !-- reordered locally and therefore no transposition has to be done.
     IF ( pdims(1) /= 1 )  THEN
+#if defined( __parallel )
+!
 !--    Reorder input array for transposition
 …
        ENDDO
 !$OMP  END PARALLEL
+#endif
     ELSE
+!
 !--    Reorder the array in a way that the z index is in first position
 …
     ENDIF
-#endif
  END SUBROUTINE transpose_xz
 …
              work(nnx*nny*nnz)
+    IF ( numprocs /= 1 )  THEN
 #if defined( __parallel )
+!
+!-- Reorder input array for transposition
+!
+!--    Reorder input array for transposition
 !$OMP  PARALLEL PRIVATE ( i, j, k, l, m, ys )
 !$OMP  DO
+    DO  l = 0, pdims(2) - 1
+       m  = l * ( nxr_y - nxl_y + 1 ) * ( nzt_y - nzb_y + 1 ) * &
+                ( nyn_x - nys_x + 1 )
+       ys = 0 + l * ( nyn_x - nys_x + 1 )
+       DO  l = 0, pdims(2) - 1
+          m  = l * ( nxr_y - nxl_y + 1 ) * ( nzt_y - nzb_y + 1 ) * &
+                   ( nyn_x - nys_x + 1 )
+          ys = 0 + l * ( nyn_x - nys_x + 1 )
+          DO  i = nxl_y, nxr_y
+             DO  k = nzb_y, nzt_y
+                DO  j = ys, ys + nyn_x - nys_x
+                   m = m + 1
+                   work(m) = f_in(j,i,k)
+                ENDDO
+             ENDDO
+          ENDDO
+       ENDDO
+!$OMP  END PARALLEL
+!
+!--    Transpose array
+       CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
+       IF ( collective_wait )  CALL MPI_BARRIER( comm2d, ierr )
+       CALL MPI_ALLTOALL( work(1),              sendrecvcount_xy, MPI_REAL, &
+                          f_inv(nys_x,nzb_x,0), sendrecvcount_xy, MPI_REAL, &
+                          comm1dy, ierr )
+       CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
+#endif
+    ELSE
+!
+!--    Reorder array f_in the same way as ALLTOALL did it
+!$OMP  PARALLEL PRIVATE ( i, j, k )
+!$OMP  DO
        DO  i = nxl_y, nxr_y
           DO  k = nzb_y, nzt_y
+             DO  j = ys, ys + nyn_x - nys_x
+                m = m + 1
+                work(m) = f_in(j,i,k)
+             ENDDO
+          ENDDO
+       ENDDO
+    ENDDO
+!$OMP  END PARALLEL
+!
+!-- Transpose array
+    CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
+    IF ( collective_wait )  CALL MPI_BARRIER( comm2d, ierr )
+    CALL MPI_ALLTOALL( work(1),              sendrecvcount_xy, MPI_REAL, &
+                       f_inv(nys_x,nzb_x,0), sendrecvcount_xy, MPI_REAL, &
+                       comm1dy, ierr )
+    CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
+             DO  j = 0, ny
+                f_inv(j,k,i) = f_in(j,i,k)
+             ENDDO
+          ENDDO
+       ENDDO
+!$OMP  END PARALLEL
+    ENDIF
+!
 …
     ENDDO
 !$OMP  END PARALLEL
-#endif
  END SUBROUTINE transpose_yx
 …
              work(nnx*nny*nnz)
-#if defined( __parallel )
+!
 !-- Rearrange indices of input array in order to make data to be sent
 …
 !-- of the data is necessary and no transposition has to be done.
     IF ( pdims(1) == 1 )  THEN
 !$OMP  PARALLEL PRIVATE ( i, j, k )
 !$OMP  DO
 …
        ENDDO
 !$OMP  END PARALLEL
+       RETURN
+    ENDIF
+!
+!-- Transpose array
+    CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
+    IF ( collective_wait )  CALL MPI_BARRIER( comm2d, ierr )
+    CALL MPI_ALLTOALL( f_inv(nxl_y,nzb_y,0), sendrecvcount_yz, MPI_REAL, &
+                       work(1),              sendrecvcount_yz, MPI_REAL, &
+                       comm1dx, ierr )
+    CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
+!
+!-- Reorder transposed array
+    ELSE
+#if defined( __parallel )
+!
+!--    Transpose array
+       CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
+       IF ( collective_wait )  CALL MPI_BARRIER( comm2d, ierr )
+       CALL MPI_ALLTOALL( f_inv(nxl_y,nzb_y,0), sendrecvcount_yz, MPI_REAL, &
+                          work(1),              sendrecvcount_yz, MPI_REAL, &
+                          comm1dx, ierr )
+       CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
+!
+!--    Reorder transposed array
 !$OMP  PARALLEL PRIVATE ( i, j, k, l, m, zs )
 !$OMP  DO
+    DO  l = 0, pdims(1) - 1
+       m  = l * ( nyn_z - nys_z + 1 ) * ( nzt_y - nzb_y + 1 ) * &
+                ( nxr_z - nxl_z + 1 )
+       zs = 1 + l * ( nzt_y - nzb_y + 1 )
+       DO  j = nys_z, nyn_z
+          DO  k = zs, zs + nzt_y - nzb_y
+             DO  i = nxl_z, nxr_z
+                m = m + 1
+                f_out(i,j,k) = work(m)
+             ENDDO
+          ENDDO
+       ENDDO
+    ENDDO
+!$OMP  END PARALLEL
+       DO  l = 0, pdims(1) - 1
+          m  = l * ( nyn_z - nys_z + 1 ) * ( nzt_y - nzb_y + 1 ) * &
+                   ( nxr_z - nxl_z + 1 )
+          zs = 1 + l * ( nzt_y - nzb_y + 1 )
+          DO  j = nys_z, nyn_z
+             DO  k = zs, zs + nzt_y - nzb_y
+                DO  i = nxl_z, nxr_z
+                   m = m + 1
+                   f_out(i,j,k) = work(m)
+                ENDDO
+             ENDDO
+          ENDDO
+       ENDDO
+!$OMP  END PARALLEL
 #endif
+   ENDIF
  END SUBROUTINE transpose_yz
 …
     INTEGER ::  i, j, k, l, m, xs
+    REAL ::  f_in(1:nz,nys:nyn,nxl:nxr), f_inv(nys:nyn,nxl:nxr,1:nz), &
+             f_out(0:nx,nys_x:nyn_x,nzb_x:nzt_x),                     &
+    REAL ::  f_in(1:nz,nys:nyn,nxl:nxr), f_out(0:nx,nys_x:nyn_x,nzb_x:nzt_x), &
              work(nnx*nny*nnz)
+#if defined( __parallel )
+    !$acc declare create ( f_inv )
+    REAL ::  f_inv(nys:nyn,nxl:nxr,1:nz)
+!
 …
 !$OMP  PARALLEL PRIVATE ( i, j, k )
 !$OMP  DO
+    !$acc kernels present( f_in )
+    !$acc loop
     DO  k = 1,nz
        DO  i = nxl, nxr
+          !$acc loop vector( 32 )
           DO  j = nys, nyn
              f_inv(j,i,k) = f_in(k,j,i)
 …
 !-- of the data is necessary and no transposition has to be done.
     IF ( pdims(1) == 1 )  THEN
+!$OMP  PARALLEL PRIVATE ( i, j, k )
+!$OMP  DO
+!$OMP  PARALLEL PRIVATE ( i, j, k )
+!$OMP  DO
+       !$acc kernels present( f_out )
+       !$acc loop
        DO  k = 1, nz
           DO  i = nxl, nxr
+             !$acc loop vector( 32 )
              DO  j = nys, nyn
                 f_out(i,j,k) = f_inv(j,i,k)
 …
        ENDDO
 !$OMP  END PARALLEL
+       RETURN
+    ELSE
+#if defined( __parallel )
+!
+!--    Transpose array
+       CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
+       IF ( collective_wait )  CALL MPI_BARRIER( comm2d, ierr )
+       CALL MPI_ALLTOALL( f_inv(nys,nxl,1), sendrecvcount_zx, MPI_REAL, &
+                          work(1),          sendrecvcount_zx, MPI_REAL, &
+                          comm1dx, ierr )
+       CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
+!
+!--    Reorder transposed array
+!$OMP  PARALLEL PRIVATE ( i, j, k, l, m, xs )
+!$OMP  DO
+       DO  l = 0, pdims(1) - 1
+          m  = l * ( nzt_x - nzb_x + 1 ) * nnx * ( nyn_x - nys_x + 1 )
+          xs = 0 + l * nnx
+          DO  k = nzb_x, nzt_x
+             DO  i = xs, xs + nnx - 1
+                DO  j = nys_x, nyn_x
+                   m = m + 1
+                   f_out(i,j,k) = work(m)
+                ENDDO
+             ENDDO
+          ENDDO
+       ENDDO
+!$OMP  END PARALLEL
+#endif
     ENDIF
+!
-!-- Transpose array
-    CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
-    IF ( collective_wait )  CALL MPI_BARRIER( comm2d, ierr )
-    CALL MPI_ALLTOALL( f_inv(nys,nxl,1), sendrecvcount_zx, MPI_REAL, &
-                       work(1),          sendrecvcount_zx, MPI_REAL, &
-                       comm1dx, ierr )
-    CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
+!
-!-- Reorder transposed array
-!$OMP  PARALLEL PRIVATE ( i, j, k, l, m, xs )
-!$OMP  DO
-    DO  l = 0, pdims(1) - 1
-       m  = l * ( nzt_x - nzb_x + 1 ) * nnx * ( nyn_x - nys_x + 1 )
-       xs = 0 + l * nnx
-       DO  k = nzb_x, nzt_x
-          DO  i = xs, xs + nnx - 1
-             DO  j = nys_x, nyn_x
-                m = m + 1
-                f_out(i,j,k) = work(m)
-             ENDDO
-          ENDDO
-       ENDDO
-    ENDDO
-!$OMP  END PARALLEL
-#endif
  END SUBROUTINE transpose_zx
 …
              work(nnx*nny*nnz)
-#if defined( __parallel )
+!
 !-- If the PE grid is one-dimensional along y, the array has only to be
 !-- reordered locally and therefore no transposition has to be done.
     IF ( pdims(1) /= 1 )  THEN
+#if defined( __parallel )
+!
 !--    Reorder input array for transposition
 …
                           comm1dx, ierr )
        CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
+!
+!--    Reorder transposed array in a way that the y index is in first position
+!$OMP  PARALLEL PRIVATE ( i, j, k )
+!$OMP  DO
+       DO  j = 0, ny
+          DO  k = nzb_y, nzt_y
+             DO  i = nxl_y, nxr_y
+                f_out(j,i,k) = f_inv(i,k,j)
+             ENDDO
+          ENDDO
+       ENDDO
+!$OMP  END PARALLEL
+#endif
     ELSE
+!
 !--    Reorder the array in a way that the y index is in first position
+!--    Reorder the array in the same way as ALLTOALL did it
 !$OMP  PARALLEL PRIVATE ( i, j, k )
 !$OMP  DO
 …
        ENDDO
 !$OMP  END PARALLEL
+!
-!--    Move data to output array
-!$OMP  PARALLEL PRIVATE ( i, j, k )
-!$OMP  DO
-       DO  k = nzb_y, nzt_y
-          DO  i = nxl_y, nxr_y
-             DO  j = 0, ny
-                f_out(j,i,k) = f_inv(i,k,j)
-             ENDDO
-          ENDDO
-       ENDDO
-!$OMP  END PARALLEL
     ENDIF
+#endif
+!
+!-- Reorder transposed array in a way that the y index is in first position
+!$OMP  PARALLEL PRIVATE ( i, j, k )
+!$OMP  DO
+    DO  k = nzb_y, nzt_y
+       DO  i = nxl_y, nxr_y
+          DO  j = 0, ny
+             f_out(j,i,k) = f_inv(i,k,j)
+          ENDDO
+       ENDDO
+    ENDDO
+!$OMP  END PARALLEL
  END SUBROUTINE transpose_zy

Note: See TracChangeset for help on using the changeset viewer.

Context Navigation

Changeset 1106 for palm/trunk/SOURCE/transpose.f90

Legend:

palm/trunk/SOURCE/transpose.f90

Download in other formats: