Home

Context Navigation

← Previous Changeset
Next Changeset →

Changeset 1216

Timestamp:

Aug 26, 2013 9:31:42 AM (11 years ago)

Author:

raasch

Message:

overlapping execution of fft and transpositions (MPI_ALLTOALL), but real overlapping is not activated so far,
fftw implemented for 1D-decomposition
resorting of arrays moved to separate routines resort_for_...
bugfix in mbuild concerning Makefile_check

Location:

palm/trunk

Files:

: 11 edited

SCRIPTS/mbuild (modified) (2 diffs)
SOURCE/Makefile_check (modified) (4 diffs)
SOURCE/calc_spectra.f90 (modified) (3 diffs)
SOURCE/check_parameters.f90 (modified) (2 diffs)
SOURCE/fft_xy.f90 (modified) (27 diffs)
SOURCE/header.f90 (modified) (3 diffs)
SOURCE/modules.f90 (modified) (4 diffs)
SOURCE/parin.f90 (modified) (2 diffs)
SOURCE/poisfft.f90 (modified) (19 diffs)
SOURCE/transpose.f90 (modified) (42 diffs)
SOURCE/tridia_solver.f90 (modified) (3 diffs)

Legend:

: Unmodified
: Added
: Removed

palm/trunk/SCRIPTS/mbuild

-                      r1211
+                      r1216
 # Current revisions:
 # ------------------
+#
+# RCS renamed SOURCES
+#
 # Former revisions:
 …
     cat Makefile_check|while read line
     do
        line=$(echo $line|grep RCS)
        if [[ $line == *"RCS"* ]]
        then
           line=$(echo $line|sed 's/RCS = //g')
+       line=$(echo $line|grep SOURCES)
+       if [[ $line == *"SOURCES"* ]]
+       then
+          line=$(echo $line|sed 's/SOURCES = //g')
           break
        fi

palm/trunk/SOURCE/Makefile_check

-                      r1213
+                      r1216
 # Current revisions:
 # ------------------
+#
+# +tridia_solver
+#
 # Former revisions:
 …
       local_system.f90 message.f90 modules.f90 package_parin.f90 parin.f90 \
       poisfft.f90 random_function.f90 singleton.f90 \
       subsidence.f90 temperton_fft.f90 \
+      subsidence.f90 temperton_fft.f90 tridia_solver.f90 \
       user_3d_data_averaging.f90 user_actions.f90 \
       user_additional_routines.f90 user_check_data_output.f90 \
 …
       user_lpm_init.f90 user_lpm_set_attributes.f90 user_module.f90 \
       user_parin.f90 user_read_restart_data.f90 user_spectra.f90 \
+      user_statistics.f90 \
+      user_statistics.f90
 OBJS=$(SOURCES:.f90=.o)
 …
 package_parin.o: modules.o
 parin.o: modules.o
 poisfft.o: modules.o fft_xy.o
+poisfft.o: modules.o fft_xy.o tridia_solver.o
 random_function.o: modules.o
 singleton.o: singleton.f90
 subsidence.o: modules.o
 temperton_fft.o: modules.o
+tridia_solver.o: modules.o
 user_3d_data_averaging.o: modules.o user_module.o
 user_actions.o: modules.o user_module.o

palm/trunk/SOURCE/calc_spectra.f90

-                      r1121
+                      r1216
 ! Current revisions:
 ! -----------------
+!
+! resorting of array moved to separate routine resort_for_zx,
+! one argument removed from the transpose_..d routines
+!
 ! Former revisions:
 …
 #if defined( __parallel )
           IF ( pdims(2) /= 1 )  THEN
+             CALL transpose_zx( d, tend, d )
+             CALL resort_for_zx( d, tend )
+             CALL transpose_zx( tend, d )
           ELSE
              CALL transpose_yxd( d, tend, d )
+             CALL transpose_yxd( d, d )
           ENDIF
           CALL calc_spectra_x( d, pr, m )
 …
 #if defined( __parallel )
           CALL transpose_zyd( d, tend, d )
+          CALL transpose_zyd( d, d )
           CALL calc_spectra_y( d, pr, m )
 #else

palm/trunk/SOURCE/check_parameters.f90

-                      r1215
+                      r1216
 ! Current revisions:
 ! -----------------
+!
+! check for transpose_compute_overlap (temporary)
+!
 ! Former revisions:
 …
     LOGICAL ::  found, ldum
     REAL    ::  gradient, remote = 0.0, simulation_time_since_reference
+!
+!-- Check for overlap combinations, which are not realized yet
+    IF ( transpose_compute_overlap )  THEN
+       IF ( numprocs == 1 )  STOP '+++ transpose-compute-overlap not implemented for single PE runs'
+#if defined( __openacc )
+       STOP '+++ transpose-compute-overlap not implemented for GPU usage'
+#endif
+    ENDIF
+!

palm/trunk/SOURCE/fft_xy.f90

-                      r1211
+                      r1216
     SUBROUTINE fft_x( ar, direction )
+    SUBROUTINE fft_x( ar, direction, ar_2d )
 !----------------------------------------------------------------------!
 …
        COMPLEX(dpk), DIMENSION(0:(nx+1)/2,nys_x:nyn_x,nzb_x:nzt_x) ::  ar_tmp
 #endif
+       REAL, DIMENSION(0:nx,nys_x:nyn_x), OPTIONAL   ::  ar_2d
        REAL, DIMENSION(0:nx,nys_x:nyn_x,nzb_x:nzt_x) ::  ar
 …
                    CALL FFTW_EXECUTE_DFT_R2C( plan_xf, x_in, x_out )
+                   DO  i = 0, (nx+1)/2
+                      ar(i,j,k) = REAL( x_out(i) ) / ( nx+1 )
+                   ENDDO
+                   DO  i = 1, (nx+1)/2 - 1
+                      ar(nx+1-i,j,k) = AIMAG( x_out(i) ) / ( nx+1 )
+                   ENDDO
+                   IF ( PRESENT( ar_2d ) )  THEN
+                      DO  i = 0, (nx+1)/2
+                         ar_2d(i,j) = REAL( x_out(i) ) / ( nx+1 )
+                      ENDDO
+                      DO  i = 1, (nx+1)/2 - 1
+                         ar_2d(nx+1-i,j) = AIMAG( x_out(i) ) / ( nx+1 )
+                      ENDDO
+                   ELSE
+                      DO  i = 0, (nx+1)/2
+                         ar(i,j,k) = REAL( x_out(i) ) / ( nx+1 )
+                      ENDDO
+                      DO  i = 1, (nx+1)/2 - 1
+                         ar(nx+1-i,j,k) = AIMAG( x_out(i) ) / ( nx+1 )
+                      ENDDO
+                   ENDIF
                 ENDDO
 …
              !$OMP END PARALLEL
          ELSE
+          ELSE
              !$OMP PARALLEL PRIVATE ( work, i, j, k )
              !$OMP DO
 …
                 DO  j = nys_x, nyn_x
+                   x_out(0) = CMPLX( ar(0,j,k), 0.0 )
+                   DO  i = 1, (nx+1)/2 - 1
+                      x_out(i)   = CMPLX( ar(i,j,k), ar(nx+1-i,j,k) )
+                   ENDDO
+                   x_out((nx+1)/2) = CMPLX( ar((nx+1)/2,j,k), 0.0 )
+                   IF ( PRESENT( ar_2d ) )  THEN
+                      x_out(0) = CMPLX( ar_2d(0,j), 0.0 )
+                      DO  i = 1, (nx+1)/2 - 1
+                         x_out(i) = CMPLX( ar_2d(i,j), ar_2d(nx+1-i,j) )
+                      ENDDO
+                      x_out((nx+1)/2) = CMPLX( ar_2d((nx+1)/2,j), 0.0 )
+                   ELSE
+                      x_out(0) = CMPLX( ar(0,j,k), 0.0 )
+                      DO  i = 1, (nx+1)/2 - 1
+                         x_out(i) = CMPLX( ar(i,j,k), ar(nx+1-i,j,k) )
+                      ENDDO
+                      x_out((nx+1)/2) = CMPLX( ar((nx+1)/2,j,k), 0.0 )
+                   ENDIF
                    CALL FFTW_EXECUTE_DFT_C2R( plan_xi, x_out, x_in)
 …
              !$OMP END PARALLEL
          ENDIF
+          ENDIF
 #endif
 …
           ENDIF
+       ELSEIF ( fft_method == 'fftw' )  THEN
+#if defined( __fftw )
+          IF ( forward_fft )  THEN
+             x_in(0:nx) = ar(0:nx)
+             CALL FFTW_EXECUTE_DFT_R2C( plan_xf, x_in, x_out )
+             DO  i = 0, (nx+1)/2
+                ar(i) = REAL( x_out(i) ) / ( nx+1 )
+             ENDDO
+             DO  i = 1, (nx+1)/2 - 1
+                ar(nx+1-i) = AIMAG( x_out(i) ) / ( nx+1 )
+             ENDDO
+         ELSE
+             x_out(0) = CMPLX( ar(0), 0.0 )
+             DO  i = 1, (nx+1)/2 - 1
+                x_out(i) = CMPLX( ar(i), ar(nx+1-i) )
+             ENDDO
+             x_out((nx+1)/2) = CMPLX( ar((nx+1)/2), 0.0 )
+             CALL FFTW_EXECUTE_DFT_C2R( plan_xi, x_out, x_in)
+             ar(0:nx) = x_in(0:nx)
+         ENDIF
+#endif
        ELSEIF ( fft_method == 'system-specific' )  THEN
 …
     END SUBROUTINE fft_x_1d
+    SUBROUTINE fft_y( ar, direction )
+    SUBROUTINE fft_y( ar, direction, ar_tr, nxl_y_bound, nxr_y_bound, nxl_y_l, &
+                      nxr_y_l )
 !----------------------------------------------------------------------!
 …
 !      fft_y uses internal algorithms (Singleton or Temperton) or      !
 !           system-specific routines, if they are available            !
+!                                                                      !
+! direction:  'forward' or 'backward'                                  !
+! ar, ar_tr:  3D data arrays                                           !
+!             forward:   ar: before  ar_tr: after transformation       !
+!             backward:  ar_tr: before  ar: after transformation       !
+!                                                                      !
+! In case of non-overlapping transposition/transformation:             !
+! nxl_y_bound = nxl_y_l = nxl_y                                        !
+! nxr_y_bound = nxr_y_l = nxr_y                                        !
+!                                                                      !
+! In case of overlapping transposition/transformation                  !
+! - nxl_y_bound  and  nxr_y_bound have the original values of          !
+!   nxl_y, nxr_y.  ar_tr is dimensioned using these values.            !
+! - nxl_y_l = nxr_y_l.  ar is dimensioned with these values, so that   !
+!   transformation is carried out for a 2D-plane only.                 !
 !----------------------------------------------------------------------!
 …
        CHARACTER (LEN=*) ::  direction
        INTEGER ::  i, j, jshape(1), k
+       INTEGER ::  nxl_y_bound, nxl_y_l, nxr_y_bound, nxr_y_l
        LOGICAL ::  forward_fft
 …
        COMPLEX(dpk), DIMENSION(0:(ny+1)/2,nxl_y:nxr_y,nzb_y:nzt_y) ::  ar_tmp
 #endif
+       REAL, DIMENSION(0:ny,nxl_y:nxr_y,nzb_y:nzt_y) ::  ar
+       REAL, DIMENSION(0:ny,nxl_y_l:nxr_y_l,nzb_y:nzt_y) ::  ar
+       REAL, DIMENSION(0:ny,nxl_y_bound:nxr_y_bound,nzb_y:nzt_y) ::  ar_tr
        IF ( direction == 'forward' )  THEN
 …
              !$OMP DO
              DO  k = nzb_y, nzt_y
                 DO  i = nxl_y, nxr_y
+                DO  i = nxl_y_l, nxr_y_l
                    DO  j = 0, ny
 …
                    DO  j = 0, (ny+1)/2
                       ar(j,i,k) = REAL( cwork(j) )
+                      ar_tr(j,i,k) = REAL( cwork(j) )
                    ENDDO
                    DO  j = 1, (ny+1)/2 - 1
                       ar(ny+1-j,i,k) = -AIMAG( cwork(j) )
+                      ar_tr(ny+1-j,i,k) = -AIMAG( cwork(j) )
                    ENDDO
 …
              !$OMP DO
              DO  k = nzb_y, nzt_y
                 DO  i = nxl_y, nxr_y
                    cwork(0) = CMPLX( ar(0,i,k), 0.0 )
+                DO  i = nxl_y_l, nxr_y_l
+                   cwork(0) = CMPLX( ar_tr(0,i,k), 0.0 )
                    DO  j = 1, (ny+1)/2 - 1
                       cwork(j)      = CMPLX( ar(j,i,k), -ar(ny+1-j,i,k) )
                       cwork(ny+1-j) = CMPLX( ar(j,i,k),  ar(ny+1-j,i,k) )
                    ENDDO
                    cwork((ny+1)/2) = CMPLX( ar((ny+1)/2,i,k), 0.0 )
+                      cwork(j)      = CMPLX( ar_tr(j,i,k), -ar_tr(ny+1-j,i,k) )
+                      cwork(ny+1-j) = CMPLX( ar_tr(j,i,k),  ar_tr(ny+1-j,i,k) )
+                   ENDDO
+                   cwork((ny+1)/2) = CMPLX( ar_tr((ny+1)/2,i,k), 0.0 )
                    jshape = SHAPE( cwork )
 …
              !$OMP DO
              DO  k = nzb_y, nzt_y
                 DO  i = nxl_y, nxr_y
+                DO  i = nxl_y_l, nxr_y_l
                    work(0:ny) = ar(0:ny,i,k)
 …
                    DO  j = 0, (ny+1)/2
                       ar(j,i,k) = work(2*j)
+                      ar_tr(j,i,k) = work(2*j)
                    ENDDO
                    DO  j = 1, (ny+1)/2 - 1
                       ar(ny+1-j,i,k) = work(2*j+1)
+                      ar_tr(ny+1-j,i,k) = work(2*j+1)
                    ENDDO
 …
              !$OMP DO
              DO  k = nzb_y, nzt_y
                 DO  i = nxl_y, nxr_y
+                DO  i = nxl_y_l, nxr_y_l
                    DO  j = 0, (ny+1)/2
                       work(2*j) = ar(j,i,k)
+                      work(2*j) = ar_tr(j,i,k)
                    ENDDO
                    DO  j = 1, (ny+1)/2 - 1
                       work(2*j+1) = ar(ny+1-j,i,k)
+                      work(2*j+1) = ar_tr(ny+1-j,i,k)
                    ENDDO
                    work(1)    = 0.0
 …
              !$OMP DO
              DO  k = nzb_y, nzt_y
                 DO  i = nxl_y, nxr_y
+                DO  i = nxl_y_l, nxr_y_l
                    y_in(0:ny) = ar(0:ny,i,k)
 …
                    DO  j = 0, (ny+1)/2
                       ar(j,i,k) = REAL( y_out(j) )  /(ny+1)
+                      ar_tr(j,i,k) = REAL( y_out(j) ) / (ny+1)
                    ENDDO
                    DO  j = 1, (ny+1)/2 - 1
                       ar(ny+1-j,i,k) = AIMAG( y_out(j) )  /(ny+1)
+                      ar_tr(ny+1-j,i,k) = AIMAG( y_out(j) ) / (ny+1)
                    ENDDO
 …
              !$OMP DO
              DO  k = nzb_y, nzt_y
                 DO  i = nxl_y, nxr_y
                    y_out(0) = CMPLX( ar(0,i,k), 0.0 )
+                DO  i = nxl_y_l, nxr_y_l
+                   y_out(0) = CMPLX( ar_tr(0,i,k), 0.0 )
                    DO  j = 1, (ny+1)/2 - 1
                       y_out(j) = CMPLX( ar(j,i,k), ar(ny+1-j,i,k) )
                    ENDDO
                    y_out((ny+1)/2) = CMPLX( ar((ny+1)/2,i,k), 0.0 )
+                      y_out(j) = CMPLX( ar_tr(j,i,k), ar_tr(ny+1-j,i,k) )
+                   ENDDO
+                   y_out((ny+1)/2) = CMPLX( ar_tr((ny+1)/2,i,k), 0.0 )
                    CALL FFTW_EXECUTE_DFT_C2R( plan_yi, y_out, y_in )
 …
              !$OMP DO
              DO  k = nzb_y, nzt_y
                 DO  i = nxl_y, nxr_y
+                DO  i = nxl_y_l, nxr_y_l
                    CALL DRCFT( 0, ar, 1, work, 1, ny+1, 1, 1, sqr_dny, auy1, nau1, &
 …
                    DO  j = 0, (ny+1)/2
                       ar(j,i,k) = work(2*j)
+                      ar_tr(j,i,k) = work(2*j)
                    ENDDO
                    DO  j = 1, (ny+1)/2 - 1
                       ar(ny+1-j,i,k) = work(2*j+1)
+                      ar_tr(ny+1-j,i,k) = work(2*j+1)
                    ENDDO
 …
              !$OMP DO
              DO  k = nzb_y, nzt_y
                 DO  i = nxl_y, nxr_y
+                DO  i = nxl_y_l, nxr_y_l
                    DO  j = 0, (ny+1)/2
                       work(2*j) = ar(j,i,k)
+                      work(2*j) = ar_tr(j,i,k)
                    ENDDO
                    DO  j = 1, (ny+1)/2 - 1
                       work(2*j+1) = ar(ny+1-j,i,k)
+                      work(2*j+1) = ar_tr(ny+1-j,i,k)
                    ENDDO
                    work(1)    = 0.0
 …
              !$OMP DO
              DO  k = nzb_y, nzt_y
                 DO  i = nxl_y, nxr_y
+                DO  i = nxl_y_l, nxr_y_l
                    work(0:ny) = ar(0:ny,i,k)
 …
                    DO  j = 0, (ny+1)/2
                       ar(j,i,k) = work(2*j)
+                      ar_tr(j,i,k) = work(2*j)
                    ENDDO
                    DO  j = 1, (ny+1)/2 - 1
                       ar(ny+1-j,i,k) = work(2*j+1)
+                      ar_tr(ny+1-j,i,k) = work(2*j+1)
                    ENDDO
 …
              !$OMP DO
              DO  k = nzb_y, nzt_y
                 DO  i = nxl_y, nxr_y
+                DO  i = nxl_y_l, nxr_y_l
                    DO  j = 0, (ny+1)/2
                       work(2*j) = ar(j,i,k)
+                      work(2*j) = ar_tr(j,i,k)
                    ENDDO
                    DO  j = 1, (ny+1)/2 - 1
                       work(2*j+1) = ar(ny+1-j,i,k)
+                      work(2*j+1) = ar_tr(ny+1-j,i,k)
                    ENDDO
                    work(1) = 0.0
 …
           ENDIF
+       ELSEIF ( fft_method == 'fftw' )  THEN
+#if defined( __fftw )
+          IF ( forward_fft )  THEN
+             y_in(0:ny) = ar(0:ny)
+             CALL FFTW_EXECUTE_DFT_R2C( plan_yf, y_in, y_out )
+             DO  j = 0, (ny+1)/2
+                ar(j) = REAL( y_out(j) ) / (ny+1)
+             ENDDO
+             DO  j = 1, (ny+1)/2 - 1
+                ar(ny+1-j) = AIMAG( y_out(j) ) / (ny+1)
+             ENDDO
+          ELSE
+             y_out(0) = CMPLX( ar(0), 0.0 )
+             DO  j = 1, (ny+1)/2 - 1
+                y_out(j) = CMPLX( ar(j), ar(ny+1-j) )
+             ENDDO
+             y_out((ny+1)/2) = CMPLX( ar((ny+1)/2), 0.0 )
+             CALL FFTW_EXECUTE_DFT_C2R( plan_yi, y_out, y_in )
+             ar(0:ny) = y_in(0:ny)
+          ENDIF
+#endif
        ELSEIF ( fft_method == 'system-specific' )  THEN

palm/trunk/SOURCE/header.f90

-                      r1213
+                      r1216
 ! Current revisions:
 ! -----------------
+!
+! output for transpose_compute_overlap
+!
 ! Former revisions:
 …
     IF ( psolver(1:7) == 'poisfft' )  THEN
        WRITE ( io, 111 )  TRIM( fft_method )
+       IF ( transpose_compute_overlap )  WRITE( io, 115 )
     ELSEIF ( psolver == 'sor' )  THEN
        WRITE ( io, 112 )  nsor_ini, nsor, omega_sor
 …
 FORMAT (' --> Momentum advection via Piascek-Williams-Scheme (Form C3)', &
                   ' or Upstream')
+FORMAT ('     FFT and transpositions are overlapping')
 FORMAT (' --> Scalar advection via Piascek-Williams-Scheme (Form C3)', &
                   ' or Upstream')

palm/trunk/SOURCE/modules.f90

-                      r1213
+                      r1216
 ! Current revisions:
 ! ------------------
+!
+! +transpose_compute_overlap,
+! several variables are now defined in the serial (non-parallel) case also
+!
 ! Former revisions:
 …
                 scalar_rayleigh_damping = .TRUE., sloping_surface = .FALSE., &
                 stop_dt = .FALSE., synchronous_exchange = .FALSE., &
                 terminate_run = .FALSE., turbulence = .FALSE., &
                 turbulent_inflow = .FALSE., use_cmax = .TRUE., &
                 use_initial_profile_as_reference = .FALSE., &
+                terminate_run = .FALSE., transpose_compute_overlap = .FALSE., &
+                turbulence = .FALSE., turbulent_inflow = .FALSE., &
+                use_cmax = .TRUE., use_initial_profile_as_reference = .FALSE., &
                 use_prescribed_profile_data = .FALSE., &
                 use_single_reference_value = .FALSE., &
 …
     CHARACTER(LEN=2) ::  send_receive = 'al'
     CHARACTER(LEN=5) ::  myid_char = ''
+    INTEGER          ::  acc_rank, id_inflow = 0, id_recycling = 0,      &
+                         myid = 0, num_acc_per_node = 0, req_count = 0,  &
+                         target_id, npex = -1, npey = -1, numprocs = 1,  &
+                         numprocs_previous_run = -1,                     &
+                         tasks_per_node = -9999, threads_per_task = 1
+    INTEGER          ::  acc_rank, comm1dx, comm1dy, comm2d, comm_inter,       &
+                         comm_palm, id_inflow = 0, id_recycling = 0, ierr,     &
+                         myid = 0, myidx = 0, myidy = 0, ndim = 2, ngp_a,      &
+                         ngp_o, ngp_xy, ngp_y, npex = -1, npey = -1,           &
+                         numprocs = 1, numprocs_previous_run = -1,             &
+                         num_acc_per_node = 0, pleft, pnorth, pright, psouth,  &
+                         req_count = 0, sendrecvcount_xy, sendrecvcount_yz,    &
+                         sendrecvcount_zx, sendrecvcount_zyd,                  &
+                         sendrecvcount_yxd, target_id, tasks_per_node = -9999, &
+                         threads_per_task = 1, type_x, type_x_int, type_xy,    &
+                         type_y, type_y_int
     INTEGER          ::  pdims(2) = 1, req(100)
 …
     CHARACTER (LEN=MPI_MAX_PORT_NAME) ::  port_name
 #endif
-    INTEGER ::  comm1dx, comm1dy, comm2d, comm_inter, comm_palm, ierr, myidx,  &
-                myidy, ndim = 2, ngp_a, ngp_o, ngp_xy, ngp_y, pleft, pnorth,   &
-                pright, psouth, sendrecvcount_xy, sendrecvcount_yz,            &
-                sendrecvcount_zx, sendrecvcount_zyd, sendrecvcount_yxd,        &
-                type_x, type_x_int, type_xy, type_y, type_y_int
     INTEGER ::  ibuf(12), pcoord(2)

palm/trunk/SOURCE/parin.f90

-                      r1196
+                      r1216
 ! Current revisions:
 ! -----------------
+!
+! +transpose_compute_overlap in inipar
+!
 ! Former revisions:
 …
              subs_vertical_gradient_level, surface_heatflux, surface_pressure, &
              surface_scalarflux, surface_waterflux, &
+             s_surface, &
+             s_surface_initial_change, s_vertical_gradient, &
+             s_surface, s_surface_initial_change, s_vertical_gradient, &
              s_vertical_gradient_level, timestep_scheme, &
              topography, topography_grid_convention, top_heatflux, &
              top_momentumflux_u, top_momentumflux_v, top_salinityflux, &
+             turbulence, turbulent_inflow, ug_surface, ug_vertical_gradient, &
+             transpose_compute_overlap, turbulence, turbulent_inflow, &
+             ug_surface, ug_vertical_gradient, &
              ug_vertical_gradient_level, use_surface_fluxes, use_cmax, &
              use_top_fluxes, use_ug_for_galilei_tr, use_upstream_for_tke, &

palm/trunk/SOURCE/poisfft.f90

-                      r1213
+                      r1216
 ! Current revisions:
 ! -----------------
+!
+! resorting of arrays moved to separate routines resort_for_...,
+! one argument, used as temporary work array, removed from all transpose
+! routines
+! overlapping fft / transposition implemented
+!
 ! Former revisions:
 …
 #if ! defined ( __check )
     SUBROUTINE poisfft( ar, work )
        USE control_parameters,  ONLY : fft_method
+    SUBROUTINE poisfft( ar, ar_inv_test )
+       USE control_parameters,  ONLY : fft_method, transpose_compute_overlap
        USE cpulog
        USE interfaces
 …
        IMPLICIT NONE
+       REAL, DIMENSION(1:nz,nys:nyn,nxl:nxr) ::  ar, work
+       INTEGER ::  ind_even, ind_odd, ind_third, ii, iind, inew, jj, jind,  &
+                   jnew, ki, kk, knew, n, nblk, nnx_y, nny_z, nnz_t, nnz_x, &
+                   nxl_y_bound, nxr_y_bound
+       INTEGER, DIMENSION(4) ::  isave
+       REAL, DIMENSION(1:nz,nys:nyn,nxl:nxr) ::  ar
+       REAL, DIMENSION(nys:nyn,nxl:nxr,1:nz) ::  ar_inv_test ! work array tend from pres
+       !$acc declare create( ar_inv )
+       REAL, DIMENSION(nys:nyn,nxl:nxr,1:nz) ::  ar_inv
+       REAL, DIMENSION(:,:,:),   ALLOCATABLE ::  f_in, f_inv, f_out_y, f_out_z
+       REAL, DIMENSION(:,:,:,:), ALLOCATABLE ::  ar1
 …
 !--       1d-domain-decomposition along x:
 !--       FFT along y and transposition y --> x
           CALL ffty_tr_yx( ar, work, ar )
+          CALL ffty_tr_yx( ar, ar )
+!
 …
+!
 !--       Transposition x --> y and backward FFT along y
           CALL tr_xy_ffty( ar, work, ar )
+          CALL tr_xy_ffty( ar, ar )
        ELSEIF ( pdims(1) == 1  .AND.  pdims(2) > 1 )  THEN
 …
 !--       1d-domain-decomposition along y:
 !--       FFT along x and transposition x --> y
           CALL fftx_tr_xy( ar, work, ar )
+          CALL fftx_tr_xy( ar, ar )
+!
 …
+!
 !--       Transposition y --> x and backward FFT along x
           CALL tr_yx_fftx( ar, work, ar )
        ELSE
+          CALL tr_yx_fftx( ar, ar )
+       ELSEIF ( .NOT. transpose_compute_overlap )  THEN
+!
 …
 !--       Transposition z --> x
           CALL cpu_log( log_point_s(5), 'transpo forward', 'start' )
+          CALL transpose_zx( ar, work, ar )
+          CALL resort_for_zx( ar, ar_inv )
+          CALL transpose_zx( ar_inv, ar )
           CALL cpu_log( log_point_s(5), 'transpo forward', 'pause' )
 …
 !--       Transposition x --> y
           CALL cpu_log( log_point_s(5), 'transpo forward', 'continue' )
+          CALL transpose_xy( ar, work, ar )
+          CALL resort_for_xy( ar, ar_inv )
+          CALL transpose_xy( ar_inv, ar )
           CALL cpu_log( log_point_s(5), 'transpo forward', 'pause' )
 …
              !$acc update host( ar )
           ENDIF
+          CALL fft_y( ar, 'forward' )
+          CALL fft_y( ar, 'forward', ar_tr = ar,                &
+                      nxl_y_bound = nxl_y, nxr_y_bound = nxr_y, &
+                      nxl_y_l = nxl_y, nxr_y_l = nxr_y )
           IF ( fft_method /= 'system-specific' )  THEN
              !$acc update device( ar )
 …
 !--       Transposition y --> z
           CALL cpu_log( log_point_s(5), 'transpo forward', 'continue' )
+          CALL transpose_yz( ar, work, ar )
+          CALL resort_for_yz( ar, ar_inv )
+          CALL transpose_yz( ar_inv, ar )
           CALL cpu_log( log_point_s(5), 'transpo forward', 'stop' )
 …
 !--       Transposition z --> y
           CALL cpu_log( log_point_s(8), 'transpo invers', 'start' )
+          CALL transpose_zy( ar, work, ar )
+          CALL transpose_zy( ar, ar_inv )
+          CALL resort_for_zy( ar_inv, ar )
           CALL cpu_log( log_point_s(8), 'transpo invers', 'pause' )
 …
              !$acc update host( ar )
           ENDIF
+          CALL fft_y( ar, 'backward' )
+          CALL fft_y( ar, 'backward', ar_tr = ar,               &
+                      nxl_y_bound = nxl_y, nxr_y_bound = nxr_y, &
+                      nxl_y_l = nxl_y, nxr_y_l = nxr_y )
           IF ( fft_method /= 'system-specific' )  THEN
              !$acc update device( ar )
 …
 !--       Transposition y --> x
           CALL cpu_log( log_point_s(8), 'transpo invers', 'continue' )
+          CALL transpose_yx( ar, work, ar )
+          CALL transpose_yx( ar, ar_inv )
+          CALL resort_for_yx( ar_inv, ar )
           CALL cpu_log( log_point_s(8), 'transpo invers', 'pause' )
 …
 !--       Transposition x --> z
           CALL cpu_log( log_point_s(8), 'transpo invers', 'continue' )
+          CALL transpose_xz( ar, work, ar )
+          CALL transpose_xz( ar, ar_inv )
+          CALL resort_for_xz( ar_inv, ar )
           CALL cpu_log( log_point_s(8), 'transpo invers', 'stop' )
+       ELSE
+!
+!--       2d-domain-decomposition or no decomposition (1 PE run) with
+!--       overlapping transposition / fft
+          ALLOCATE( f_out_y(0:ny,nxl_y:nxr_y,nzb_y:nzt_y), &
+                    f_out_z(0:nx,nys_x:nyn_x,nzb_x:nzt_x) )
+!
+!--       Transposition z --> x + subsequent fft along x
+          ALLOCATE( f_inv(nys:nyn,nxl:nxr,1:nz) )
+          CALL resort_for_zx( ar, f_inv )
+!
+!--       Save original indices and gridpoint counter
+          isave(1) = nz
+          isave(2) = nzb_x
+          isave(3) = nzt_x
+          isave(4) = sendrecvcount_zx
+!
+!--       Set new indices for transformation
+          nblk  = nz / pdims(1)
+          nz    = pdims(1)
+          nnz_x = 1
+          nzb_x = 1 + myidx * nnz_x
+          nzt_x = ( myidx + 1 ) * nnz_x
+          sendrecvcount_zx = nnx * nny * nnz_x
+          ALLOCATE( ar1(0:nx,nys_x:nyn_x,nzb_x:nzt_x,2) )
+          ALLOCATE( f_in(nys:nyn,nxl:nxr,1:nz) )
+          DO  kk = 1, nblk+1
+             ind_odd  = MOD( kk,   2 ) + 1
+             ind_even = MOD( kk+1, 2 ) + 1
+!$OMP sections private(ki,knew,n)
+!$OMP section
+             IF ( kk <= nblk )  THEN
+                IF ( kk == 1 )  THEN
+                   CALL cpu_log( log_point_s(5), 'transpo forward', 'start' )
+                ELSE
+                   CALL cpu_log( log_point_s(5), 'transpo forward', 'continue' )
+                ENDIF
+                DO  knew = 1, nz
+                   ki = kk + nblk * ( knew - 1 )
+                   f_in(:,:,knew) = f_inv(:,:,ki)
+                ENDDO
+                CALL transpose_zx( f_in, ar1(:,:,:,ind_odd))
+                CALL cpu_log( log_point_s(5), 'transpo forward', 'pause' )
+             ENDIF
+!$OMP section
+             IF ( kk >= 2 )  THEN
+                IF ( kk == 2 )  THEN
+                   CALL cpu_log( log_point_s(4), 'fft_x', 'start' )
+                ELSE
+                   CALL cpu_log( log_point_s(4), 'fft_x', 'continue' )
+                ENDIF
+                n = isave(2) + kk - 2
+                CALL fft_x( ar1(:,:,:,ind_even), 'forward',  ar_2d = f_out_z(:,:,n))
+                CALL cpu_log( log_point_s(4), 'fft_x', 'pause' )
+             ENDIF
+!$OMP end sections
+          ENDDO
+!
+!--       Restore original indices/counters
+          nz               = isave(1)
+          nzb_x            = isave(2)
+          nzt_x            = isave(3)
+          sendrecvcount_zx = isave(4)
+          DEALLOCATE( ar1, f_in, f_inv )
+!
+!--       Transposition x --> y + subsequent fft along y
+          ALLOCATE( f_inv(nys_x:nyn_x,nzb_x:nzt_x,0:nx) )
+          CALL resort_for_xy( f_out_z, f_inv )
+!
+!--       Save original indices and gridpoint counter
+          isave(1) = nx
+          isave(2) = nxl_y
+          isave(3) = nxr_y
+          isave(4) = sendrecvcount_xy
+!
+!--       Set new indices for transformation
+          nblk  = ( ( nx+1 ) / pdims(2) ) - 1
+          nx    = pdims(2)
+          nnx_y = 1
+          nxl_y = myidy * nnx_y
+          nxr_y = ( myidy + 1 ) * nnx_y - 1
+          sendrecvcount_xy = nnx_y * ( nyn_x-nys_x+1 ) * ( nzt_x-nzb_x+1 )
+          ALLOCATE( ar1(0:ny,nxl_y:nxr_y,nzb_y:nzt_y,2) )
+          ALLOCATE( f_in(nys_x:nyn_x,nzb_x:nzt_x,0:nx) )
+          DO  ii = 0, nblk+1
+             ind_odd  = MOD( ii+1, 2 ) + 1
+             ind_even = MOD( ii+2, 2 ) + 1
+!$OMP sections private(ki,knew,n)
+!$OMP section
+             IF ( ii <= nblk )  THEN
+                CALL cpu_log( log_point_s(5), 'transpo forward', 'continue' )
+                DO  inew = 0, nx-1
+                   iind = ii + ( nblk + 1 ) * inew
+                   f_in(:,:,inew) = f_inv(:,:,iind)
+                ENDDO
+                CALL transpose_xy( f_in, ar1(:,:,:,ind_odd) )
+                CALL cpu_log( log_point_s(5), 'transpo forward', 'pause' )
+             ENDIF
+!$OMP section
+             IF ( ii >= 1 )  THEN
+                IF ( ii == 1 )  THEN
+                   CALL cpu_log( log_point_s(7), 'fft_y', 'start' )
+                ELSE
+                   CALL cpu_log( log_point_s(7), 'fft_y', 'continue' )
+                ENDIF
+                nxl_y_bound = isave(2)
+                nxr_y_bound = isave(3)
+                n           = isave(2) + ii - 1
+!                CALL fft_y( ar1(:,:,:,ind_even), 'forward', ar_3d = f_out_y, &
+!                            ni = n )
+                CALL fft_y( ar1(:,:,:,ind_even), 'forward', ar_tr = f_out_y, &
+                            nxl_y_bound = nxl_y_bound, nxr_y_bound = nxr_y_bound, &
+                            nxl_y_l = n, nxr_y_l = n )
+                CALL cpu_log( log_point_s(7), 'fft_y', 'pause' )
+             ENDIF
+!$OMP end sections
+          ENDDO
+!
+!--       Restore original indices/counters
+          nx               = isave(1)
+          nxl_y            = isave(2)
+          nxr_y            = isave(3)
+          sendrecvcount_xy = isave(4)
+          DEALLOCATE( ar1, f_in, f_inv )
+!
+!--       Transposition y --> z + subsequent tridia + resort for z --> y
+          ALLOCATE( f_inv(nxl_y:nxr_y,nzb_y:nzt_y,0:ny) )
+          CALL resort_for_yz( f_out_y, f_inv )
+!
+!--       Save original indices and gridpoint counter
+          isave(1) = ny
+          isave(2) = nys_z
+          isave(3) = nyn_z
+          isave(4) = sendrecvcount_yz
+!
+!--       Set new indices for transformation
+          nblk             = ( ( ny+1 ) / pdims(1) ) - 1
+          ny               = pdims(1)
+          nny_z            = 1
+          nys_z            = myidx * nny_z
+          nyn_z            = ( myidx + 1 ) * nny_z - 1
+          sendrecvcount_yz = ( nxr_y-nxl_y+1 ) * nny_z * ( nzt_y-nzb_y+1 )
+          ALLOCATE( ar1(nxl_z:nxr_z,nys_z:nyn_z,1:nz,3) )
+          ALLOCATE( f_in(nxl_y:nxr_y,nzb_y:nzt_y,0:ny) )
+          DO  jj = 0, nblk+2
+             ind_odd   = MOD( jj+3, 3 ) + 1
+             ind_even  = MOD( jj+2, 3 ) + 1
+             ind_third = MOD( jj+1, 3 ) + 1
+!$OMP sections private(ki,knew,n)
+!$OMP section
+             IF ( jj <= nblk )  THEN
+!
+!--             Forward Fourier Transformation
+!--             Transposition y --> z
+                CALL cpu_log( log_point_s(5), 'transpo forward', 'continue' )
+                DO  jnew = 0, ny-1
+                   jind = jj + ( nblk + 1 ) * jnew
+                   f_in(:,:,jnew) =f_inv(:,:,jind)
+                ENDDO
+                CALL transpose_yz( f_in, ar1(:,:,:,ind_odd) )
+                IF ( jj == nblk )  THEN
+                   CALL cpu_log( log_point_s(5), 'transpo forward', 'stop' )
+                ELSE
+                   CALL cpu_log( log_point_s(5), 'transpo forward', 'pause' )
+                ENDIF
+             ENDIF
+             IF ( jj >= 2 )  THEN
+!
+!--             Inverse Fourier Transformation
+!--             Transposition z --> y
+!--             Only one thread should call MPI routines, therefore forward and
+!--             backward transpose are in the same section
+                IF ( jj == 2 )  THEN
+                   CALL cpu_log( log_point_s(8), 'transpo invers', 'start' )
+                ELSE
+                   CALL cpu_log( log_point_s(8), 'transpo invers', 'continue' )
+                ENDIF
+                CALL transpose_zy( ar1(:,:,:,ind_third), f_in )
+                DO  jnew = 0, ny-1
+                   jind = jj-2 + ( nblk + 1 ) * jnew
+                   f_inv(:,:,jind) = f_in(:,:,jnew)
+                ENDDO
+                CALL cpu_log( log_point_s(8), 'transpo invers', 'pause' )
+             ENDIF
+!$OMP section
+             IF ( jj >= 1  .AND.  jj <= nblk+1 )  THEN
+!
+!--             Solve the tridiagonal equation system along z
+                CALL cpu_log( log_point_s(6), 'tridia', 'start' )
+                n = isave(2) + jj - 1
+                CALL tridia_substi_overlap( ar1(:,:,:,ind_even), n )
+                CALL cpu_log( log_point_s(6), 'tridia', 'stop' )
+             ENDIF
+!$OMP end sections
+          ENDDO
+!
+!--       Restore original indices/counters
+          ny               = isave(1)
+          nys_z            = isave(2)
+          nyn_z            = isave(3)
+          sendrecvcount_yz = isave(4)
+          CALL resort_for_zy( f_inv, f_out_y )
+          DEALLOCATE( ar1, f_in, f_inv )
+!
+!--       fft along y backward + subsequent transposition y --> x
+          ALLOCATE( f_inv(nys_x:nyn_x,nzb_x:nzt_x,0:nx) )
+!
+!--       Save original indices and gridpoint counter
+          isave(1) = nx
+          isave(2) = nxl_y
+          isave(3) = nxr_y
+          isave(4) = sendrecvcount_xy
+!
+!--       Set new indices for transformation
+          nblk             = (( nx+1 ) / pdims(2) ) - 1
+          nx               = pdims(2)
+          nnx_y            = 1
+          nxl_y            = myidy * nnx_y
+          nxr_y            = ( myidy + 1 ) * nnx_y - 1
+          sendrecvcount_xy = nnx_y * ( nyn_x-nys_x+1 ) * ( nzt_x-nzb_x+1 )
+          ALLOCATE( ar1(0:ny,nxl_y:nxr_y,nzb_y:nzt_y,2) )
+          ALLOCATE( f_in(nys_x:nyn_x,nzb_x:nzt_x,0:nx) )
+          DO  ii = 0, nblk+1
+             ind_odd  = MOD( ii+1, 2 ) + 1
+             ind_even = MOD( ii+2, 2 ) + 1
+!$OMP sections private(ki,knew,n)
+!$OMP section
+             IF ( ii <= nblk )  THEN
+                CALL cpu_log( log_point_s(7), 'fft_y', 'continue' )
+                n = isave(2) + ii
+                nxl_y_bound = isave(2)
+                nxr_y_bound = isave(3)
+!                CALL fft_y( ar1(:,:,:,ind_even), 'backward', ar_3d = f_out_y, &
+!                            ni = n )
+                CALL fft_y( ar1(:,:,:,ind_even), 'backward', ar_tr = f_out_y, &
+                            nxl_y_bound = nxl_y_bound, nxr_y_bound = nxr_y_bound, &
+                            nxl_y_l = n, nxr_y_l = n )
+                IF ( ii == nblk )  THEN
+                   CALL cpu_log( log_point_s(7), 'fft_y', 'stop' )
+                ELSE
+                   CALL cpu_log( log_point_s(7), 'fft_y', 'pause' )
+                ENDIF
+             ENDIF
+!$OMP section
+             IF ( ii >= 1 )  THEN
+                CALL cpu_log( log_point_s(8), 'transpo invers', 'continue' )
+                CALL transpose_yx( ar1(:,:,:,ind_odd), f_in )
+                DO  inew = 0, nx-1
+                   iind = ii-1 + (nblk+1) * inew
+                   f_inv(:,:,iind) = f_in(:,:,inew)
+                ENDDO
+                CALL cpu_log( log_point_s(8), 'transpo invers', 'pause' )
+             ENDIF
+!$OMP end sections
+          ENDDO
+!
+!--       Restore original indices/counters
+          nx               = isave(1)
+          nxl_y            = isave(2)
+          nxr_y            = isave(3)
+          sendrecvcount_xy = isave(4)
+          CALL resort_for_yx( f_inv, f_out_z )
+          DEALLOCATE( ar1, f_in, f_inv )
+!
+!--       fft along x backward + subsequent final transposition x --> z
+          ALLOCATE( f_inv(nys:nyn,nxl:nxr,1:nz) )
+!
+!--       Save original indices and gridpoint counter
+          isave(1) = nz
+          isave(2) = nzb_x
+          isave(3) = nzt_x
+          isave(4) = sendrecvcount_zx
+!
+!--       Set new indices for transformation
+          nblk             = nz / pdims(1)
+          nz               = pdims(1)
+          nnz_x            = 1
+          nzb_x            = 1 + myidx * nnz_x
+          nzt_x            = ( myidx + 1 ) * nnz_x
+          sendrecvcount_zx = nnx * nny * nnz_x
+          ALLOCATE( ar1(0:nx,nys_x:nyn_x,nzb_x:nzt_x,2) )
+          ALLOCATE( f_in(nys:nyn,nxl:nxr,1:nz) )
+          DO  kk = 1, nblk+1
+             ind_odd  = MOD( kk,   2 ) + 1
+             ind_even = MOD( kk+1, 2 ) + 1
+!$OMP sections private(ki,knew,n)
+!$OMP section
+             IF ( kk <= nblk )  THEN
+                CALL cpu_log( log_point_s(4), 'fft_x', 'continue' )
+                n = isave(2) + kk - 1
+                CALL fft_x( ar1(:,:,:,ind_even), 'backward', f_out_z(:,:,n))
+                IF ( kk == nblk )  THEN
+                   CALL cpu_log( log_point_s(4), 'fft_x', 'stop' )
+                ELSE
+                   CALL cpu_log( log_point_s(4), 'fft_x', 'pause' )
+                ENDIF
+             ENDIF
+!$OMP section
+             IF ( kk >= 2 )  THEN
+                CALL cpu_log( log_point_s(8), 'transpo invers', 'continue' )
+                CALL transpose_xz( ar1(:,:,:,ind_odd), f_in )
+                DO  knew = 1, nz
+                   ki = kk-1 + nblk * (knew-1)
+                   f_inv(:,:,ki) = f_in(:,:,knew)
+                ENDDO
+                IF ( kk == nblk+1 )  THEN
+                   CALL cpu_log( log_point_s(8), 'transpo invers', 'stop' )
+                ELSE
+                   CALL cpu_log( log_point_s(8), 'transpo invers', 'pause' )
+                ENDIF
+             ENDIF
+!$OMP end sections
+          ENDDO
+!
+!--       Restore original indices/counters
+          nz               = isave(1)
+          nzb_x            = isave(2)
+          nzt_x            = isave(3)
+          sendrecvcount_zx = isave(4)
+          CALL resort_for_xz( f_inv, ar )
+          DEALLOCATE( ar1, f_in, f_inv )
        ENDIF
 …
     SUBROUTINE ffty_tr_yx( f_in, work, f_out )
+    SUBROUTINE ffty_tr_yx( f_in, f_out )
 !------------------------------------------------------------------------------!
 …
     SUBROUTINE tr_xy_ffty( f_in, work, f_out )
+    SUBROUTINE tr_xy_ffty( f_in, f_out )
 !------------------------------------------------------------------------------!
 …
     SUBROUTINE fftx_tr_xy( f_in, work, f_out )
+    SUBROUTINE fftx_tr_xy( f_in, f_out )
 !------------------------------------------------------------------------------!
 …
     SUBROUTINE tr_yx_fftx( f_in, work, f_out )
+    SUBROUTINE tr_yx_fftx( f_in, f_out )
 !------------------------------------------------------------------------------!

palm/trunk/SOURCE/transpose.f90

-                      r1112
+                      r1216
  SUBROUTINE transpose_xy( f_in, work, f_out )
+ SUBROUTINE resort_for_xy( f_in, f_inv )
 !--------------------------------------------------------------------------------!
 …
 ! Current revisions:
 ! -----------------
+!
+! re-sorting of the transposed / to be transposed arrays moved to separate
+! routines resort_for_...
+!
 ! Former revisions:
 …
 ! Initial revision
+!
+!
+!------------------------------------------------------------------------------!
+! Description:
+! ------------
+! Resorting data for the transposition from x to y. The transposition itself
+! is carried out in transpose_xy.
+!
+! Arguments:
+! f_in  (in)  - array with index order (i,j,k)
+! f_inv (out) - the same data with index order (j,k,i)
+!------------------------------------------------------------------------------!
+     USE indices
+     USE transpose_indices
+     IMPLICIT NONE
+     REAL, INTENT(IN)  ::  f_in(0:nx,nys_x:nyn_x,nzb_x:nzt_x)
+     REAL, INTENT(OUT) ::  f_inv(nys_x:nyn_x,nzb_x:nzt_x,0:nx)
+     INTEGER ::  i, j, k
+!
+!-- Rearrange indices of input array in order to make the data to be sent
+!-- by MPI contiguous. The innermost loop runs over j, which is the first
+!-- (fastest varying) index of f_inv.
+    !$OMP  PARALLEL PRIVATE ( i, j, k )
+    !$OMP  DO
+    !$acc kernels present( f_in, f_inv )
+    !$acc loop
+     DO  i = 0, nx
+         DO  k = nzb_x, nzt_x
+             !$acc loop vector( 32 )
+             DO  j = nys_x, nyn_x
+                 f_inv(j,k,i) = f_in(i,j,k)
+             ENDDO
+         ENDDO
+     ENDDO
+     !$acc end kernels
+     !$OMP  END PARALLEL
+ END SUBROUTINE resort_for_xy
+ SUBROUTINE transpose_xy( f_inv, f_out )
+!------------------------------------------------------------------------------!
 ! Description:
 ! ------------
 …
     INTEGER ::  i, j, k, l, ys
     REAL ::  f_in(0:nx,nys_x:nyn_x,nzb_x:nzt_x), f_out(0:ny,nxl_y:nxr_y,nzb_y:nzt_y)
+    REAL ::  f_inv(nys_x:nyn_x,nzb_x:nzt_x,0:nx), f_out(0:ny,nxl_y:nxr_y,nzb_y:nzt_y)
     REAL, DIMENSION(nyn_x-nys_x+1,nzb_y:nzt_y,nxl_y:nxr_y,0:pdims(2)-1) ::  work
-    !$acc declare create( f_inv )
-    REAL ::  f_inv(nys_x:nyn_x,nzb_x:nzt_x,0:nx)
+!
-!-- Rearrange indices of input array in order to make data to be send
-!-- by MPI contiguous
-!$OMP  PARALLEL PRIVATE ( i, j, k )
-!$OMP  DO
-    !$acc kernels present( f_in )
-    !$acc loop
-    DO  i = 0, nx
-       DO  k = nzb_x, nzt_x
-          !$acc loop vector( 32 )
-          DO  j = nys_x, nyn_x
-             f_inv(j,k,i) = f_in(i,j,k)
-          ENDDO
-       ENDDO
-    ENDDO
-    !$acc end kernels
-!$OMP  END PARALLEL
     IF ( numprocs /= 1 )  THEN
 …
                           work(1,nzb_y,nxl_y,0), sendrecvcount_xy, MPI_REAL, &
                           comm1dy, ierr )
-       !$acc update device( work )
        CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
 …
 !$OMP  PARALLEL PRIVATE ( i, j, k, l, ys )
 !$OMP  DO
+       !$acc data copyin( work )
        DO  l = 0, pdims(2) - 1
           ys = 0 + l * ( nyn_x - nys_x + 1 )
 …
           !$acc end kernels
        ENDDO
+       !$acc end data
 !$OMP  END PARALLEL
 #endif
 …
 !$OMP  PARALLEL PRIVATE ( i, j, k )
 !$OMP  DO
        !$acc kernels present( f_out )
+       !$acc kernels present( f_inv, f_out )
        !$acc loop
        DO  k = nzb_y, nzt_y
 …
+ SUBROUTINE transpose_xz( f_in, work, f_out )
+ SUBROUTINE resort_for_xz( f_inv, f_out )
+!------------------------------------------------------------------------------!
+! Description:
+! ------------
+! Resorting data after the transposition from x to z. The transposition itself
+! is carried out in transpose_xz.
+!
+! Arguments:
+! f_inv (in)  - array with index order (j,i,k)
+! f_out (out) - the same data with index order (k,j,i)
+!------------------------------------------------------------------------------!
+     USE indices
+     USE transpose_indices
+     IMPLICIT NONE
+     REAL, INTENT(IN)  ::  f_inv(nys:nyn,nxl:nxr,1:nz)
+     REAL, INTENT(OUT) ::  f_out(1:nz,nys:nyn,nxl:nxr)
+     INTEGER ::  i, j, k
+!
+!-- Reorder transposed array in a way that the z index is in first position.
+!-- The innermost loop runs over j, i.e. f_inv is read contiguously and
+!-- f_out is written with a stride.
+!-- In case of parallel fft/transposition, scattered store is faster in
+!-- backward direction!!!
+    !$OMP  PARALLEL PRIVATE ( i, j, k )
+    !$OMP  DO
+    !$acc kernels present( f_inv, f_out )
+    !$acc loop
+     DO  k = 1, nz
+         DO  i = nxl, nxr
+             !$acc loop vector( 32 )
+             DO  j = nys, nyn
+                 f_out(k,j,i) = f_inv(j,i,k)
+             ENDDO
+         ENDDO
+     ENDDO
+     !$acc end kernels
+     !$OMP  END PARALLEL
+ END SUBROUTINE resort_for_xz
+ SUBROUTINE transpose_xz( f_in, f_inv )
 !------------------------------------------------------------------------------!
 …
     INTEGER ::  i, j, k, l, xs
     REAL ::  f_in(0:nx,nys_x:nyn_x,nzb_x:nzt_x), f_out(1:nz,nys:nyn,nxl:nxr)
+    REAL ::  f_in(0:nx,nys_x:nyn_x,nzb_x:nzt_x), f_inv(nys:nyn,nxl:nxr,1:nz)
     REAL, DIMENSION(nys_x:nyn_x,nnx,nzb_x:nzt_x,0:pdims(1)-1) ::  work
-    !$acc declare create( f_inv )
-    REAL ::  f_inv(nys:nyn,nxl:nxr,1:nz)
 …
 !$OMP  PARALLEL PRIVATE ( i, j, k, l, xs )
 !$OMP  DO
+       !$acc data copyout( work )
        DO  l = 0, pdims(1) - 1
           xs = 0 + l * nnx
 …
           !$acc end kernels
        ENDDO
+       !$acc end data
 !$OMP  END PARALLEL
 …
        CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
        IF ( collective_wait )  CALL MPI_BARRIER( comm2d, ierr )
-       !$acc update host( work )
        CALL MPI_ALLTOALL( work(nys_x,1,nzb_x,0), sendrecvcount_zx, MPI_REAL, &
                           f_inv(nys,nxl,1),      sendrecvcount_zx, MPI_REAL, &
 …
        !$acc update device( f_inv )
        CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
+!
-!--    Reorder transposed array in a way that the z index is in first position
-!$OMP  PARALLEL PRIVATE ( i, j, k )
-!$OMP  DO
-       !$acc kernels present( f_out )
-       !$acc loop
-       DO  k = 1, nz
-          DO  i = nxl, nxr
-             !$acc loop vector( 32 )
-             DO  j = nys, nyn
-                f_out(k,j,i) = f_inv(j,i,k)
-             ENDDO
-          ENDDO
-       ENDDO
-       !$acc end kernels
-!$OMP  END PARALLEL
 #endif
 …
 !$OMP  PARALLEL PRIVATE ( i, j, k )
 !$OMP  DO
        !$acc kernels present( f_in )
+       !$acc kernels present( f_in, f_inv )
        !$acc loop
        DO  i = nxl, nxr
 …
 !$OMP  END PARALLEL
-!$OMP  PARALLEL PRIVATE ( i, j, k )
-!$OMP  DO
-       !$acc kernels present( f_out )
-       !$acc loop
-       DO  k = 1, nz
-          DO  i = nxl, nxr
-             !$acc loop vector( 32 )
-             DO  j = nys, nyn
-                f_out(k,j,i) = f_inv(j,i,k)
-             ENDDO
-          ENDDO
-       ENDDO
-       !$acc end kernels
-!$OMP  END PARALLEL
     ENDIF
 …
+ SUBROUTINE transpose_yx( f_in, work, f_out )
+ SUBROUTINE resort_for_yx( f_inv, f_out )
+!------------------------------------------------------------------------------!
+! Description:
+! ------------
+! Resorting data after the transposition from y to x. The transposition itself
+! is carried out in transpose_yx.
+!
+! Arguments:
+! f_inv (in)  - array with index order (j,k,i)
+! f_out (out) - the same data with index order (i,j,k)
+!------------------------------------------------------------------------------!
+     USE indices
+     USE transpose_indices
+     IMPLICIT NONE
+     REAL, INTENT(IN)  ::  f_inv(nys_x:nyn_x,nzb_x:nzt_x,0:nx)
+     REAL, INTENT(OUT) ::  f_out(0:nx,nys_x:nyn_x,nzb_x:nzt_x)
+     INTEGER ::  i, j, k
+!
+!-- Reorder transposed array in a way that the x index is in first position.
+!-- The innermost loop runs over j, i.e. f_inv is read contiguously and
+!-- f_out is written with a stride.
+    !$OMP  PARALLEL PRIVATE ( i, j, k )
+    !$OMP  DO
+    !$acc kernels present( f_inv, f_out )
+    !$acc loop
+     DO  i = 0, nx
+         DO  k = nzb_x, nzt_x
+             !$acc loop vector( 32 )
+             DO  j = nys_x, nyn_x
+                 f_out(i,j,k) = f_inv(j,k,i)
+             ENDDO
+         ENDDO
+     ENDDO
+     !$acc end kernels
+     !$OMP  END PARALLEL
+ END SUBROUTINE resort_for_yx
+ SUBROUTINE transpose_yx( f_in, f_inv )
 !------------------------------------------------------------------------------!
 …
     INTEGER ::  i, j, k, l, ys
     REAL ::  f_in(0:ny,nxl_y:nxr_y,nzb_y:nzt_y), f_out(0:nx,nys_x:nyn_x,nzb_x:nzt_x)
+    REAL ::  f_in(0:ny,nxl_y:nxr_y,nzb_y:nzt_y), f_inv(nys_x:nyn_x,nzb_x:nzt_x,0:nx)
     REAL, DIMENSION(nyn_x-nys_x+1,nzb_y:nzt_y,nxl_y:nxr_y,0:pdims(2)-1) ::  work
-    !$acc declare create( f_inv )
-    REAL ::  f_inv(nys_x:nyn_x,nzb_x:nzt_x,0:nx)
 …
 !$OMP  PARALLEL PRIVATE ( i, j, k, l, ys )
 !$OMP  DO
+       !$acc data copyout( work )
        DO  l = 0, pdims(2) - 1
           ys = 0 + l * ( nyn_x - nys_x + 1 )
 …
           !$acc end kernels
        ENDDO
+       !$acc end data
 !$OMP  END PARALLEL
 …
        CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
        IF ( collective_wait )  CALL MPI_BARRIER( comm2d, ierr )
-       !$acc update host( work )
        CALL MPI_ALLTOALL( work(1,nzb_y,nxl_y,0), sendrecvcount_xy, MPI_REAL, &
                           f_inv(nys_x,nzb_x,0),  sendrecvcount_xy, MPI_REAL, &
 …
 !$OMP  PARALLEL PRIVATE ( i, j, k )
 !$OMP  DO
        !$acc kernels present( f_in )
+       !$acc kernels present( f_in, f_inv )
        !$acc loop
        DO  i = nxl_y, nxr_y
 …
     ENDIF
+!
-!-- Reorder transposed array in a way that the x index is in first position
-!$OMP  PARALLEL PRIVATE ( i, j, k )
-!$OMP  DO
-    !$acc kernels present( f_out )
-    !$acc loop
-    DO  i = 0, nx
-       DO  k = nzb_x, nzt_x
-          !$acc loop vector( 32 )
-          DO  j = nys_x, nyn_x
-             f_out(i,j,k) = f_inv(j,k,i)
-          ENDDO
-       ENDDO
-    ENDDO
-    !$acc end kernels
-!$OMP  END PARALLEL
  END SUBROUTINE transpose_yx
  SUBROUTINE transpose_yxd( f_in, work, f_out )
+ SUBROUTINE transpose_yxd( f_in, f_out )
 !------------------------------------------------------------------------------!
 …
+ SUBROUTINE transpose_yz( f_in, work, f_out )
+ SUBROUTINE resort_for_yz( f_in, f_inv )
+!------------------------------------------------------------------------------!
+! Description:
+! ------------
+! Resorting data for the transposition from y to z. The transposition itself
+! is carried out in transpose_yz.
+!
+! Arguments:
+! f_in  (in)  - array with index order (j,i,k)
+! f_inv (out) - the same data with index order (i,k,j)
+!------------------------------------------------------------------------------!
+     USE indices
+     USE transpose_indices
+     IMPLICIT NONE
+     REAL, INTENT(IN)  ::  f_in(0:ny,nxl_y:nxr_y,nzb_y:nzt_y)
+     REAL, INTENT(OUT) ::  f_inv(nxl_y:nxr_y,nzb_y:nzt_y,0:ny)
+     INTEGER ::  i, j, k
+!
+!-- Rearrange indices of input array in order to make the data to be sent
+!-- by MPI contiguous. The innermost loop runs over i, which is the first
+!-- (fastest varying) index of f_inv.
+    !$OMP  PARALLEL PRIVATE ( i, j, k )
+    !$OMP  DO
+    !$acc kernels present( f_in, f_inv )
+    !$acc loop
+     DO  j = 0, ny
+         DO  k = nzb_y, nzt_y
+             !$acc loop vector( 32 )
+             DO  i = nxl_y, nxr_y
+                 f_inv(i,k,j) = f_in(j,i,k)
+             ENDDO
+         ENDDO
+     ENDDO
+     !$acc end kernels
+     !$OMP  END PARALLEL
+ END SUBROUTINE resort_for_yz
+ SUBROUTINE transpose_yz( f_inv, f_out )
 !------------------------------------------------------------------------------!
 …
     INTEGER ::  i, j, k, l, zs
     REAL ::  f_in(0:ny,nxl_y:nxr_y,nzb_y:nzt_y), f_out(nxl_z:nxr_z,nys_z:nyn_z,1:nz)
+    REAL ::  f_inv(nxl_y:nxr_y,nzb_y:nzt_y,0:ny), f_out(nxl_z:nxr_z,nys_z:nyn_z,1:nz)
     REAL, DIMENSION(nxl_z:nxr_z,nzt_y-nzb_y+1,nys_z:nyn_z,0:pdims(1)-1) ::  work
+    !$acc declare create( f_inv )
+    REAL ::  f_inv(nxl_y:nxr_y,nzb_y:nzt_y,0:ny)
+!
+!-- Rearrange indices of input array in order to make data to be send
+!-- by MPI contiguous
+!$OMP  PARALLEL PRIVATE ( i, j, k )
+!$OMP  DO
+    !$acc kernels present( f_in )
+    !$acc loop
+    DO  j = 0, ny
+       DO  k = nzb_y, nzt_y
+          !$acc loop vector( 32 )
+          DO  i = nxl_y, nxr_y
+             f_inv(i,k,j) = f_in(j,i,k)
+          ENDDO
+       ENDDO
+    ENDDO
+    !$acc end kernels
+!$OMP  END PARALLEL
+!
+!-- Move data to different array, because memory location of work1 is
+!-- needed further below (work1 = work2).
+!
 !-- If the PE grid is one-dimensional along y, only local reordering
 !-- of the data is necessary and no transposition has to be done.
 …
 !$OMP  PARALLEL PRIVATE ( i, j, k )
 !$OMP  DO
        !$acc kernels present( f_out )
+       !$acc kernels present( f_inv, f_out )
        !$acc loop
        DO  j = 0, ny
 …
                           work(nxl_z,1,nys_z,0), sendrecvcount_yz, MPI_REAL, &
                           comm1dx, ierr )
-       !$acc update device( work )
        CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
 …
 !$OMP  PARALLEL PRIVATE ( i, j, k, l, zs )
 !$OMP  DO
+       !$acc data copyin( work )
        DO  l = 0, pdims(1) - 1
           zs = 1 + l * ( nzt_y - nzb_y + 1 )
           !$acc kernels present( f_out, work )
+          !$acc kernels present( f_out )
           !$acc loop
           DO  j = nys_z, nyn_z
 …
           !$acc end kernels
        ENDDO
+       !$acc end data
 !$OMP  END PARALLEL
 #endif
 …
+ SUBROUTINE transpose_zx( f_in, work, f_out )
+ SUBROUTINE resort_for_zx( f_in, f_inv )
+!------------------------------------------------------------------------------!
+! Description:
+! ------------
+! Resorting data for the transposition from z to x. The transposition itself
+! is carried out in transpose_zx.
+!
+! Arguments:
+! f_in  (in)  - array with index order (k,j,i)
+! f_inv (out) - the same data with index order (j,i,k)
+!------------------------------------------------------------------------------!
+     USE indices
+     USE transpose_indices
+     IMPLICIT NONE
+     REAL, INTENT(IN)  ::  f_in(1:nz,nys:nyn,nxl:nxr)
+     REAL, INTENT(OUT) ::  f_inv(nys:nyn,nxl:nxr,1:nz)
+     INTEGER ::  i, j, k
+!
+!-- Rearrange indices of input array in order to make the data to be sent
+!-- by MPI contiguous. The innermost loop runs over j, which is the first
+!-- (fastest varying) index of f_inv.
+    !$OMP  PARALLEL PRIVATE ( i, j, k )
+    !$OMP  DO
+    !$acc kernels present( f_in, f_inv )
+    !$acc loop
+     DO  k = 1,nz
+         DO  i = nxl, nxr
+             !$acc loop vector( 32 )
+             DO  j = nys, nyn
+                 f_inv(j,i,k) = f_in(k,j,i)
+             ENDDO
+         ENDDO
+     ENDDO
+     !$acc end kernels
+     !$OMP  END PARALLEL
+ END SUBROUTINE resort_for_zx
+ SUBROUTINE transpose_zx( f_inv, f_out )
 !------------------------------------------------------------------------------!
 …
     INTEGER ::  i, j, k, l, xs
     REAL ::  f_in(1:nz,nys:nyn,nxl:nxr), f_out(0:nx,nys_x:nyn_x,nzb_x:nzt_x)
+    REAL ::  f_inv(nys:nyn,nxl:nxr,1:nz), f_out(0:nx,nys_x:nyn_x,nzb_x:nzt_x)
     REAL, DIMENSION(nys_x:nyn_x,nnx,nzb_x:nzt_x,0:pdims(1)-1) ::  work
+    !$acc declare create( f_inv )
+    REAL ::  f_inv(nys:nyn,nxl:nxr,1:nz)
+!
+!-- Rearrange indices of input array in order to make data to be send
+!-- by MPI contiguous
+!$OMP  PARALLEL PRIVATE ( i, j, k )
+!$OMP  DO
+    !$acc kernels present( f_in )
+    !$acc loop
+    DO  k = 1,nz
+       DO  i = nxl, nxr
+          !$acc loop vector( 32 )
+          DO  j = nys, nyn
+             f_inv(j,i,k) = f_in(k,j,i)
+          ENDDO
+       ENDDO
+    ENDDO
+    !$acc end kernels
+!$OMP  END PARALLEL
+!
+!-- Move data to different array, because memory location of work1 is
+!-- needed further below (work1 = work2).
+!
 !-- If the PE grid is one-dimensional along y, only local reordering
 !-- of the data is necessary and no transposition has to be done.
 …
 !$OMP  PARALLEL PRIVATE ( i, j, k )
 !$OMP  DO
        !$acc kernels present( f_out )
+       !$acc kernels present( f_inv, f_out )
        !$acc loop
        DO  k = 1, nz
 …
                           work(nys_x,1,nzb_x,0), sendrecvcount_zx, MPI_REAL, &
                           comm1dx, ierr )
-       !$acc update device( work )
        CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
 …
 !$OMP  PARALLEL PRIVATE ( i, j, k, l, xs )
 !$OMP  DO
+       !$acc data copyin( work )
        DO  l = 0, pdims(1) - 1
           xs = 0 + l * nnx
           !$acc kernels present( f_out, work )
+          !$acc kernels present( f_out )
           !$acc loop
           DO  k = nzb_x, nzt_x
 …
           !$acc end kernels
        ENDDO
+       !$acc end data
 !$OMP  END PARALLEL
 #endif
 …
+ SUBROUTINE transpose_zy( f_in, work, f_out )
+ SUBROUTINE resort_for_zy( f_inv, f_out )
+!------------------------------------------------------------------------------!
+! Description:
+! ------------
+! Resorting data after the transposition from z to y. The transposition itself
+! is carried out in transpose_zy.
+!
+! Arguments:
+! f_inv (in)  - array with index order (i,k,j)
+! f_out (out) - the same data with index order (j,i,k)
+!------------------------------------------------------------------------------!
+     USE indices
+     USE transpose_indices
+     IMPLICIT NONE
+     REAL, INTENT(IN)  ::  f_inv(nxl_y:nxr_y,nzb_y:nzt_y,0:ny)
+     REAL, INTENT(OUT) ::  f_out(0:ny,nxl_y:nxr_y,nzb_y:nzt_y)
+     INTEGER ::  i, j, k
+!
+!-- Reorder transposed array in a way that the y index is in first position.
+!-- The innermost loop runs over i, i.e. f_inv is read contiguously and
+!-- f_out is written with a stride.
+    !$OMP  PARALLEL PRIVATE ( i, j, k )
+    !$OMP  DO
+    !$acc kernels present( f_inv, f_out )
+    !$acc loop
+     DO  k = nzb_y, nzt_y
+         DO  j = 0, ny
+             !$acc loop vector( 32 )
+             DO  i = nxl_y, nxr_y
+                 f_out(j,i,k) = f_inv(i,k,j)
+             ENDDO
+         ENDDO
+     ENDDO
+     !$acc end kernels
+     !$OMP  END PARALLEL
+ END SUBROUTINE resort_for_zy
+ SUBROUTINE transpose_zy( f_in, f_inv )
 !------------------------------------------------------------------------------!
 …
     INTEGER ::  i, j, k, l, zs
     REAL ::  f_in(nxl_z:nxr_z,nys_z:nyn_z,1:nz), f_out(0:ny,nxl_y:nxr_y,nzb_y:nzt_y)
+    REAL ::  f_in(nxl_z:nxr_z,nys_z:nyn_z,1:nz), f_inv(nxl_y:nxr_y,nzb_y:nzt_y,0:ny)
     REAL, DIMENSION(nxl_z:nxr_z,nzt_y-nzb_y+1,nys_z:nyn_z,0:pdims(1)-1) ::  work
-    !$acc declare create( f_inv )
-    REAL ::  f_inv(nxl_y:nxr_y,nzb_y:nzt_y,0:ny)
 …
 !$OMP  PARALLEL PRIVATE ( i, j, k, l, zs )
 !$OMP  DO
+       !$acc data copyout( work )
        DO  l = 0, pdims(1) - 1
           zs = 1 + l * ( nzt_y - nzb_y + 1 )
 …
           !$acc end kernels
        ENDDO
+       !$acc end data
 !$OMP  END PARALLEL
 …
        CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
        IF ( collective_wait )  CALL MPI_BARRIER( comm2d, ierr )
-       !$acc update host( work )
        CALL MPI_ALLTOALL( work(nxl_z,1,nys_z,0), sendrecvcount_yz, MPI_REAL, &
                           f_inv(nxl_y,nzb_y,0),  sendrecvcount_yz, MPI_REAL, &
 …
 !$OMP  PARALLEL PRIVATE ( i, j, k )
 !$OMP  DO
        !$acc kernels present( f_in )
+       !$acc kernels present( f_in, f_inv )
        !$acc loop
        DO  k = nzb_y, nzt_y
 …
     ENDIF
+!
-!-- Reorder transposed array in a way that the y index is in first position
-!$OMP  PARALLEL PRIVATE ( i, j, k )
-!$OMP  DO
-    !$acc kernels present( f_out )
-    !$acc loop
-    DO  k = nzb_y, nzt_y
-       DO  i = nxl_y, nxr_y
-          !$acc loop vector( 32 )
-          DO  j = 0, ny
-             f_out(j,i,k) = f_inv(i,k,j)
-          ENDDO
-       ENDDO
-    ENDDO
-    !$acc end kernels
-!$OMP  END PARALLEL
  END SUBROUTINE transpose_zy
  SUBROUTINE transpose_zyd( f_in, work, f_out )
+ SUBROUTINE transpose_zyd( f_in, f_out )
 !------------------------------------------------------------------------------!

palm/trunk/SOURCE/tridia_solver.f90

-                      r1213
+                      r1216
 ! Current revisions:
 ! ------------------
+!
+! +tridia_substi_overlap for handling overlapping fft / transposition
+!
 ! Former revisions:
 …
     END INTERFACE tridia_substi
+    PUBLIC  tridia_substi, tridia_init, tridia_1dd
+    INTERFACE tridia_substi_overlap
+       MODULE PROCEDURE tridia_substi_overlap
+    END INTERFACE tridia_substi_overlap
+    PUBLIC  tridia_substi, tridia_substi_overlap, tridia_init, tridia_1dd
  CONTAINS
 …
+    SUBROUTINE tridia_substi_overlap( ar, jj )
+!------------------------------------------------------------------------------!
+! Substitution (Forward and Backward) (Thomas algorithm)
+!
+! Variant for the overlapping fft / transposition: the coefficient array tri
+! is addressed with the index jj given by the caller instead of the local
+! loop index j.
+!
+! Arguments:
+! ar (inout) - right-hand side on entry, overwritten by the result of the
+!              backward substitution
+! jj (in)    - second (y) index used to address tri for all rows of ar
+!------------------------------------------------------------------------------!
+          USE arrays_3d,  ONLY: tri
+          USE control_parameters
+          IMPLICIT NONE
+          INTEGER, INTENT(IN) ::  jj
+          INTEGER ::  i, j, k
+          REAL, INTENT(INOUT) ::  ar(nxl_z:nxr_z,nys_z:nyn_z,1:nz)
+!
+!--       Work array holding the result of the forward substitution. Its k
+!--       index runs from 0 to nz-1, i.e. it is shifted by one against ar.
+          !$acc declare create( ar1 )
+          REAL, DIMENSION(nxl_z:nxr_z,nys_z:nyn_z,0:nz-1)   ::  ar1
+!
+!--       Forward substitution
+          DO  k = 0, nz - 1
+             !$acc kernels present( ar, tri )
+             !$acc loop
+             DO  j = nys_z, nyn_z
+                DO  i = nxl_z, nxr_z
+                   IF ( k == 0 )  THEN
+                      ar1(i,j,k) = ar(i,j,k+1)
+                   ELSE
+                      ar1(i,j,k) = ar(i,j,k+1) - tri(i,jj,k,2) * ar1(i,j,k-1)
+                   ENDIF
+                ENDDO
+             ENDDO
+             !$acc end kernels
+          ENDDO
+!
+!--       Backward substitution
+!--       Note, the 1.0E-20 in the denominator is there to avoid divisions
+!--       by zero appearing if the pressure bc is set to neumann at the top of
+!--       the model domain.
+          DO  k = nz-1, 0, -1
+             !$acc kernels present( ar, tri )
+             !$acc loop
+             DO  j = nys_z, nyn_z
+                DO  i = nxl_z, nxr_z
+                   IF ( k == nz-1 )  THEN
+                      ar(i,j,k+1) = ar1(i,j,k) / ( tri(i,jj,k,1) + 1.0E-20 )
+                   ELSE
+                      ar(i,j,k+1) = ( ar1(i,j,k) - ddzuw(k,2) * ar(i,j,k+2) ) &
+                              / tri(i,jj,k,1)
+                   ENDIF
+                ENDDO
+             ENDDO
+             !$acc end kernels
+          ENDDO
+!
+!--       Indices i=0, j=0 correspond to horizontally averaged pressure.
+!--       The respective values of ar should be zero at all k-levels if
+!--       acceleration of horizontally averaged vertical velocity is zero.
+!--       The y index of the data handled here is jj (the index used for tri
+!--       above). nys_z must not be used for this test, because the caller of
+!--       the overlap variant temporarily redefines nys_z/nyn_z to a one-row
+!--       slab per PE, so that nys_z == 0 would hold for every jj on that PE.
+          IF ( ibc_p_b == 1  .AND.  ibc_p_t == 1 )  THEN
+             IF ( jj == 0  .AND.  nxl_z == 0 )  THEN
+                !$acc kernels loop present( ar )
+                DO  k = 1, nz
+                   ar(nxl_z,nys_z,k) = 0.0
+                ENDDO
+             ENDIF
+          ENDIF
+    END SUBROUTINE tridia_substi_overlap
     SUBROUTINE split

Note: See TracChangeset for help on using the changeset viewer.

Context Navigation

Changeset 1216

Legend:

Download in other formats: