Changeset 1106

palm/trunk/INSTALL/example_cbl_rc

-                      r1097
+                      r1106
  ***************************         ------------------------------------------
  * PALM 3.9  Rev: 1096     *         atmosphere - 3D - run without 1D - prerun
  ***************************         ------------------------------------------
  Date:              03-02-13         Run:       example_cbl
  Time:              04:25:21         Run-No.:   00
  Run on host:         lcsgih
  Number of PEs:            8         Processor grid (x,y): (  2,  4) calculated
+ ******************************      ------------------------------------------
+ * PALM 3.9  Rev: 1106        *      atmosphere - 3D - run without 1D - prerun
+ ******************************      ------------------------------------------
+ Date:                 04-03-13      Run:       example_cbl
+ Time:                 06:06:34      Run-No.:   00
+ Run on host:            lcsgih
+ Number of PEs:               8      Processor grid (x,y): (  2,  4) calculated
  ------------------------------------------------------------------------------
 …
  ----------------------------------
  Timestep:          variable     maximum value: 20.000 s    CFL-factor: 0.90
  Start time:           0.000 s
  End time:          3600.000 s
+ Timestep:             variable     maximum value: 20.000 s    CFL-factor: 0.90
+ Start time:              0.000 s
+ End time:             3600.000 s

palm/trunk/SCRIPTS/mrun

-                      r1104
+                      r1106
 # Current revisions:
 # ------------------
+#
+# --stdin argument for mpiexec on lckyuh
+# -y and -Y settings output to header
+#
 # Former revisions:
 …
  if [[ -n $numprocs ]]
  then
+    spalte1="number of PEs:"; spalte2=$numprocs
+    if [[ $run_coupled_model = false ]]
+    then
+       spalte1="number of PEs:"; spalte2=$numprocs
+    else
+       spalte1="number of PEs:"; spalte2="$numprocs  (atmosphere: $numprocs_atmos, ocean: $numprocs_ocean)"
+    fi
     printf "| %-25s%-45s | \n" "$spalte1" "$spalte2"
  fi
 …
  spalte1="OUTPUT control list:"; spalte2=$(echo $output_list)
  printf "| %-25s%-45s | \n" "$spalte1" "$spalte2"
+ if [[ "$ocean_file_appendix" = true ]]
+ then
+    printf "| %-35s%-35s | \n" "suffix \"_O\" is added to local files" " "
+ fi
  if [[ $do_batch = true  ||  "$LOADLBATCH" = yes ]]
 …
                 elif [[ $host = lckyu* ]]
                 then
                    mpiexec -n $ii  ./a.out  < runfile_atmos  $ROPTS
+                   mpiexec -n $ii --stdin runfile_atmos  ./a.out  $ROPTS
                 else
                    mpiexec  -machinefile hostfile  -n $ii  a.out  < runfile_atmos  $ROPTS
+                   mpiexec  -machinefile hostfile  -n $ii  a.out  <  runfile_atmos  $ROPTS
                 fi
              else

palm/trunk/SOURCE/Makefile

-                      r1055
+                      r1106
 # Current revisions:
 # ------------------
+# +cuda_fft_interfaces
+#
 # Former revisions:
 …
         calc_radiation.f90 calc_spectra.f90 check_for_restart.f90 \
         check_open.f90 check_parameters.f90 close_file.f90 compute_vpt.f90 \
+        coriolis.f90 cpu_log.f90 cpu_statistics.f90 data_log.f90 \
+        coriolis.f90 cpu_log.f90 cpu_statistics.f90 cuda_fft_interfaces.f90 \
+        data_log.f90 \
         data_output_dvrp.f90 data_output_mask.f90 data_output_profiles.f90 \
         data_output_ptseries.f90 data_output_spectra.f90 \
 …
         lpm_set_attributes.f90 lpm_sort_arrays.f90 \
         lpm_write_exchange_statistics.f90 lpm_write_restart_file.f90 \
         message.f90 modules.f90 netcdf.f90 package_parin.f90 palm.f90 \
         parin.f90 plant_canopy_model.f90 poisfft.f90 \
+        message.f90 microphysics.f90 modules.f90 netcdf.f90 package_parin.f90 \
+        palm.f90 parin.f90 plant_canopy_model.f90 poisfft.f90 \
         poisfft_hybrid.f90 poismg.f90 prandtl_fluxes.f90 pres.f90 print_1d.f90 \
         production_e.f90 prognostic_equations.f90 random_function.f90 \
 …
         user_parin.f90 user_read_restart_data.f90 \
         user_spectra.f90 user_statistics.f90 wall_fluxes.f90 \
+        write_3d_binary.f90 write_compressed.f90 write_var_list.f90 \
+        microphysics.f90
+        write_3d_binary.f90 write_compressed.f90 write_var_list.f90
 OBJS =  advec_s_bc.o advec_s_pw.o advec_s_up.o advec_u_pw.o advec_u_up.o \
 …
         calc_spectra.o check_for_restart.o check_open.o check_parameters.o \
         close_file.o compute_vpt.o coriolis.o cpu_log.o cpu_statistics.o \
         data_log.o data_output_dvrp.o data_output_mask.o \
+        cuda_fft_interfaces.o data_log.o data_output_dvrp.o data_output_mask.o \
         data_output_profiles.o data_output_ptseries.o \
         data_output_spectra.o data_output_tseries.o data_output_2d.o \
 …
         lpm_pack_arrays.o lpm_read_restart_file.o lpm_release_set.o \
         lpm_set_attributes.o lpm_sort_arrays.o lpm_write_exchange_statistics.o \
         lpm_write_restart_file.o message.o modules.o netcdf.o package_parin.o \
         palm.o parin.o plant_canopy_model.o poisfft.o \
+        lpm_write_restart_file.o message.o microphysics.o modules.o netcdf.o \
+        package_parin.o palm.o parin.o plant_canopy_model.o poisfft.o \
         poisfft_hybrid.o poismg.o prandtl_fluxes.o pres.o print_1d.o \
         production_e.o prognostic_equations.o random_function.o random_gauss.o \
 …
         user_lpm_init.o user_lpm_set_attributes.o user_module.o user_parin.o \
         user_read_restart_data.o user_spectra.o user_statistics.o \
+        wall_fluxes.o write_3d_binary.o write_compressed.o write_var_list.o \
+        microphysics.o
+        wall_fluxes.o write_3d_binary.o write_compressed.o write_var_list.o
 CC = cc
 …
 cpu_log.o: modules.o
 cpu_statistics.o: modules.o
+cuda_fft_interfaces.o: cuda_fft_interfaces.f90
 data_log.o: modules.o
 data_output_dvrp.o: modules.o
 …
 exchange_horiz.o: modules.o
 exchange_horiz_2d.o: modules.o
 fft_xy.o: modules.o singleton.o temperton_fft.o
+fft_xy.o: cuda_fft_interfaces.o modules.o singleton.o temperton_fft.o
 flow_statistics.o: modules.o
 global_min_max.o: modules.o
 …
 lpm_write_restart_file.o: modules.o
 message.o: modules.o
+microphysics.o: modules.o
 modules.o: modules.f90
 netcdf.o: modules.o
 …
 write_compressed.o: modules.o
 write_var_list.o: modules.o
-microphysics.o: modules.o

palm/trunk/SOURCE/Makefile_check

-                      r1037
+                      r1106
 # Current revisions:
 # ------------------
+#
+# +cuda_fft_interfaces
+#
 # Former revisions:
 …
 RCS = check_open.f90 check_namelist_files.f90 check_parameters.f90 \
       close_file.f90 cpu_log.f90 exchange_horiz.f90 exchange_horiz_2d.f90 \
       fft_xy.f90 init_grid.f90 init_masks.f90 init_cloud_physics.f90 \
       init_pegrid.f90 local_flush.f90 local_stop.f90 local_system.f90 \
       message.f90 modules.f90 package_parin.f90 parin.f90 poisfft.f90 \
       poisfft_hybrid.f90 random_function.f90 singleton.f90 subsidence.f90 \
       temperton_fft.f90 \
+      close_file.f90 cpu_log.f90 cuda_fft_interfaces.f90 exchange_horiz.f90 \
+      exchange_horiz_2d.f90 fft_xy.f90 init_grid.f90 init_masks.f90 \
+      init_cloud_physics.f90 init_pegrid.f90 local_flush.f90 local_stop.f90 \
+      local_system.f90 message.f90 modules.f90 package_parin.f90 parin.f90 \
+      poisfft.f90 poisfft_hybrid.f90 random_function.f90 singleton.f90 \
+      subsidence.f90 temperton_fft.f90 \
       user_3d_data_averaging.f90 user_actions.f90 \
       user_additional_routines.f90 user_check_data_output.f90 \
 …
 OBJS = check_open.o check_namelist_files.o check_parameters.o close_file.o \
       cpu_log.o exchange_horiz.o exchange_horiz_2d.o fft_xy.o init_grid.o \
       init_masks.o init_pegrid.o init_cloud_physics.o\
       local_flush.o local_stop.o local_system.o message.o \
       modules.o package_parin.o parin.o poisfft.o \
       poisfft_hybrid.o random_function.o singleton.o subsidence.o temperton_fft.o \
       user_3d_data_averaging.o user_actions.o user_additional_routines.o \
       user_check_data_output.o user_check_data_output_pr.o \
       user_check_parameters.o user_data_output_2d.o user_data_output_3d.o \
       user_data_output_mask.o user_data_output_dvrp.o \
       user_define_netcdf_grid.o user_dvrp_coltab.o user_header.o \
       user_init.o user_init_3d_model.o user_init_grid.o \
       user_init_plant_canopy.o user_last_actions.o user_lpm_advec.o \
       user_lpm_init.o user_lpm_set_attributes.o user_module.o user_parin.o \
       user_read_restart_data.o user_spectra.o user_statistics.o \
+       cpu_log.o cuda_fft_interfaces.o exchange_horiz.o exchange_horiz_2d.o \
+       fft_xy.o init_grid.o init_masks.o init_pegrid.o init_cloud_physics.o\
+       local_flush.o local_stop.o local_system.o message.o \
+       modules.o package_parin.o parin.o poisfft.o \
+       poisfft_hybrid.o random_function.o singleton.o subsidence.o temperton_fft.o \
+       user_3d_data_averaging.o user_actions.o user_additional_routines.o \
+       user_check_data_output.o user_check_data_output_pr.o \
+       user_check_parameters.o user_data_output_2d.o user_data_output_3d.o \
+       user_data_output_mask.o user_data_output_dvrp.o \
+       user_define_netcdf_grid.o user_dvrp_coltab.o user_header.o \
+       user_init.o user_init_3d_model.o user_init_grid.o \
+       user_init_plant_canopy.o user_last_actions.o user_lpm_advec.o \
+       user_lpm_init.o user_lpm_set_attributes.o user_module.o user_parin.o \
+       user_read_restart_data.o user_spectra.o user_statistics.o \
 CC = cc
 …
 close_file.o: modules.o
 cpu_log.o: modules.o
+cuda_fft_interfaces.o: cuda_fft_interfaces.f90
 exchange_horiz.o: modules.o
 exchange_horiz_2d.o: modules.o
 fft_xy.o: modules.o singleton.o temperton_fft.o
+fft_xy.o: cuda_fft_interfaces.o modules.o singleton.o temperton_fft.o
 init_cloud_physics.o: modules.o
 init_grid.o: modules.o

palm/trunk/SOURCE/check_open.f90

-                      r1093
+                      r1106
 ! Current revisions:
 ! -----------------
+!
+! array_kind renamed precision_kind
+!
 ! Former revisions:
 …
 !------------------------------------------------------------------------------!
-    USE array_kind
     USE arrays_3d
     USE control_parameters
 …
     USE particle_attributes
     USE pegrid
+    USE precision_kind
     USE profil_parameter
     USE statistics

palm/trunk/SOURCE/data_output_3d.f90

-                      r1077
+                      r1106
+!
 ! Current revisions:
+! -----------------
+! ------------------
+! array_kind renamed precision_kind
+!
 ! Former revisions:
 …
 !------------------------------------------------------------------------------!
-    USE array_kind
     USE arrays_3d
     USE averaging
 …
     USE particle_attributes
     USE pegrid
+    USE precision_kind
     IMPLICIT NONE

palm/trunk/SOURCE/data_output_profiles.f90

-                      r1093
+                      r1106
 ! Current revisions:
 ! -----------------
+!
+! bugfix: initial time for preruns of coupled runs is output as -coupling_start_time
+!
 ! Former revisions:
 …
 #if defined( __netcdf )
+!
+!--             Store initial time (t=0) to time axis, but only if an output
+!--             is required for at least one of the profiles
+!--             Store initial time to time axis, but only if an output
+!--             is required for at least one of the profiles. The initial time
+!--             is either 0, or, in case of a prerun for coupled atmosphere-ocean
+!--             runs, has a negative value
                 DO  i = 1, dopr_n
                 IF ( dopr_initial_index(i) /= 0 )  THEN
                    nc_stat = NF90_PUT_VAR( id_set_pr, id_var_time_pr,  &
                                               (/ 0.0 /), start = (/ 1 /), &
                                               count = (/ 1 /) )
+                                           (/ -coupling_start_time /), &
+                                           start = (/ 1 /), count = (/ 1 /) )
                       CALL handle_netcdf_error( 'data_output_profiles', 329 )
                       output_for_t0 = .TRUE.

palm/trunk/SOURCE/fft_xy.f90

-                      r1093
+                      r1106
 ! Current revisions:
 ! -----------------
+!
+! CUDA fft added
+! array_kind renamed precision_kind, 3D- instead of 1D-loops in fft_x and fft_y
+! old fft_x, fft_y become fft_x_1d, fft_y_1d and are used for 1D-decomposition
+!
 ! Former revisions:
 …
 !------------------------------------------------------------------------------!
-    USE array_kind
     USE control_parameters
     USE indices
+    USE precision_kind
     USE singleton
     USE temperton_fft
+    USE transpose_indices
     IMPLICIT NONE
     PRIVATE
     PUBLIC fft_x, fft_y, fft_init, fft_x_m, fft_y_m
+    PUBLIC fft_x, fft_x_1d, fft_y, fft_y_1d, fft_init, fft_x_m, fft_y_m
     INTEGER, DIMENSION(:), ALLOCATABLE, SAVE ::  ifax_x, ifax_y
 …
     LOGICAL, SAVE                            ::  init_fft = .FALSE.
     REAL, SAVE ::  sqr_nx, sqr_ny
+    REAL, SAVE ::  dnx, dny, sqr_dnx, sqr_dny
     REAL, DIMENSION(:), ALLOCATABLE, SAVE    ::  trigs_x, trigs_y
 …
     REAL, DIMENSION(:), ALLOCATABLE, SAVE ::  trig_xb, trig_xf, trig_yb, &
                                               trig_yf
+#elif defined( __cuda_fft )
+    INTEGER, SAVE ::  plan_xf, plan_xi, plan_yf, plan_yi, total_points_x_transpo, &
+                      total_points_y_transpo
 #endif
 …
     END INTERFACE fft_x
+    INTERFACE fft_x_1d
+       MODULE PROCEDURE fft_x_1d
+    END INTERFACE fft_x_1d
     INTERFACE fft_y
        MODULE PROCEDURE fft_y
     END INTERFACE fft_y
+    INTERFACE fft_y_1d
+       MODULE PROCEDURE fft_y_1d
+    END INTERFACE fft_y_1d
     INTERFACE fft_x_m
        MODULE PROCEDURE fft_x_m
 …
     SUBROUTINE fft_init
+       USE cuda_fft_interfaces
        IMPLICIT NONE
 …
        IF ( fft_method == 'system-specific' )  THEN
+          sqr_nx = SQRT( 1.0 / ( nx + 1.0 ) )
+          sqr_ny = SQRT( 1.0 / ( ny + 1.0 ) )
+          dnx = 1.0 / ( nx + 1.0 )
+          dny = 1.0 / ( ny + 1.0 )
+          sqr_dnx = SQRT( dnx )
+          sqr_dny = SQRT( dny )
 #if defined( __ibm )  &&  ! defined( __ibmy_special )
+!
 !--       Initialize tables for fft along x
           CALL DRCFT( 1, workx, 1, workx, 1, nx+1, 1,  1, sqr_nx, aux1, nau1, &
+          CALL DRCFT( 1, workx, 1, workx, 1, nx+1, 1,  1, sqr_dnx, aux1, nau1, &
                       aux2, nau2 )
           CALL DCRFT( 1, workx, 1, workx, 1, nx+1, 1, -1, sqr_nx, aux3, nau1, &
+          CALL DCRFT( 1, workx, 1, workx, 1, nx+1, 1, -1, sqr_dnx, aux3, nau1, &
                       aux4, nau2 )
+!
 !--       Initialize tables for fft along y
           CALL DRCFT( 1, worky, 1, worky, 1, ny+1, 1,  1, sqr_ny, auy1, nau1, &
+          CALL DRCFT( 1, worky, 1, worky, 1, ny+1, 1,  1, sqr_dny, auy1, nau1, &
                       auy2, nau2 )
           CALL DCRFT( 1, worky, 1, worky, 1, ny+1, 1, -1, sqr_ny, auy3, nau1, &
+          CALL DCRFT( 1, worky, 1, worky, 1, ny+1, 1, -1, sqr_dny, auy3, nau1, &
                       auy4, nau2 )
 #elif defined( __nec )
 …
+!
 !--       Initialize tables for fft along x (non-vector and vector case (M))
           CALL DZFFT( 0, nx+1, sqr_nx, work_x, work_x, trig_xf, workx, 0 )
           CALL ZDFFT( 0, nx+1, sqr_nx, work_x, work_x, trig_xb, workx, 0 )
           CALL DZFFTM( 0, nx+1, nz1, sqr_nx, work_x, nx+4, work_x, nx+4, &
+          CALL DZFFT( 0, nx+1, sqr_dnx, work_x, work_x, trig_xf, workx, 0 )
+          CALL ZDFFT( 0, nx+1, sqr_dnx, work_x, work_x, trig_xb, workx, 0 )
+          CALL DZFFTM( 0, nx+1, nz1, sqr_dnx, work_x, nx+4, work_x, nx+4, &
                        trig_xf, workx, 0 )
           CALL ZDFFTM( 0, nx+1, nz1, sqr_nx, work_x, nx+4, work_x, nx+4, &
+          CALL ZDFFTM( 0, nx+1, nz1, sqr_dnx, work_x, nx+4, work_x, nx+4, &
                        trig_xb, workx, 0 )
+!
 !--       Initialize tables for fft along y (non-vector and vector case (M))
           CALL DZFFT( 0, ny+1, sqr_ny, work_y, work_y, trig_yf, worky, 0 )
           CALL ZDFFT( 0, ny+1, sqr_ny, work_y, work_y, trig_yb, worky, 0 )
           CALL DZFFTM( 0, ny+1, nz1, sqr_ny, work_y, ny+4, work_y, ny+4, &
+          CALL DZFFT( 0, ny+1, sqr_dny, work_y, work_y, trig_yf, worky, 0 )
+          CALL ZDFFT( 0, ny+1, sqr_dny, work_y, work_y, trig_yb, worky, 0 )
+          CALL DZFFTM( 0, ny+1, nz1, sqr_dny, work_y, ny+4, work_y, ny+4, &
                        trig_yf, worky, 0 )
           CALL ZDFFTM( 0, ny+1, nz1, sqr_ny, work_y, ny+4, work_y, ny+4, &
+          CALL ZDFFTM( 0, ny+1, nz1, sqr_dny, work_y, ny+4, work_y, ny+4, &
                        trig_yb, worky, 0 )
+#elif defined( __cuda_fft )
+          total_points_x_transpo = (nx+1) * (nyn_x-nys_x+1) * (nzt_x-nzb_x+1)
+          total_points_y_transpo = (ny+1) * (nxr_y-nxl_y+1) * (nzt_y-nzb_y+1)
+          CALL CUFFTPLAN1D( plan_xf, nx+1, CUFFT_D2Z, (ny+1)*nz )
+          CALL CUFFTPLAN1D( plan_xi, nx+1, CUFFT_Z2D, (ny+1)*nz )
+          CALL CUFFTPLAN1D( plan_yf, ny+1, CUFFT_D2Z, (nx+1)*nz )
+          CALL CUFFTPLAN1D( plan_yi, ny+1, CUFFT_Z2D, (nx+1)*nz )
 #else
           message_string = 'no system-specific fft-call available'
 …
 !                                                                      !
 !               Fourier-transformation along x-direction               !
+!                     Version for 2D-decomposition                     !
 !                                                                      !
 !      fft_x uses internal algorithms (Singleton or Temperton) or      !
 …
 !----------------------------------------------------------------------!
+       USE cuda_fft_interfaces
+       IMPLICIT NONE
+       CHARACTER (LEN=*) ::  direction
+       INTEGER ::  i, ishape(1), j, k, m
+       LOGICAL ::  forward_fft
+       REAL, DIMENSION(0:nx+2)   ::  work
+       REAL, DIMENSION(nx+2)     ::  work1
+       COMPLEX, DIMENSION(:), ALLOCATABLE ::  cwork
+#if defined( __ibm )
+       REAL, DIMENSION(nau2)     ::  aux2, aux4
+#elif defined( __nec )
+       REAL, DIMENSION(6*(nx+1)) ::  work2
+#elif defined( __cuda_fft )
+       REAL(dpk), DEVICE, DIMENSION(:), ALLOCATABLE    ::  cuda_a_device
+       COMPLEX(dpk), DEVICE, DIMENSION(:), ALLOCATABLE ::  cuda_b_device
+       COMPLEX(dpk), DIMENSION(:), ALLOCATABLE         ::  cuda_host
+#endif
+       REAL, DIMENSION(0:nx,nys_x:nyn_x,nzb_x:nzt_x) ::  ar
+       IF ( direction == 'forward' )  THEN
+          forward_fft = .TRUE.
+       ELSE
+          forward_fft = .FALSE.
+       ENDIF
+       IF ( fft_method == 'singleton-algorithm' )  THEN
+!
+!--       Performing the fft with singleton's software works on every system,
+!--       since it is part of the model
+          ALLOCATE( cwork(0:nx) )
+          IF ( forward_fft )   then
+             !$OMP PARALLEL PRIVATE ( cwork, i, ishape, j, k )
+             !$OMP DO
+             DO  k = nzb_x, nzt_x
+                DO  j = nys_x, nyn_x
+                   DO  i = 0, nx
+                      cwork(i) = CMPLX( ar(i,j,k) )
+                   ENDDO
+                   ishape = SHAPE( cwork )
+                   CALL FFTN( cwork, ishape )
+                   DO  i = 0, (nx+1)/2
+                      ar(i,j,k) = REAL( cwork(i) )
+                   ENDDO
+                   DO  i = 1, (nx+1)/2 - 1
+                      ar(nx+1-i,j,k) = -AIMAG( cwork(i) )
+                   ENDDO
+                ENDDO
+             ENDDO
+             !$OMP END PARALLEL
+          ELSE
+             !$OMP PARALLEL PRIVATE ( cwork, i, ishape, j, k )
+             !$OMP DO
+             DO  k = nzb_x, nzt_x
+                DO  j = nys_x, nyn_x
+                   cwork(0) = CMPLX( ar(0,j,k), 0.0 )
+                   DO  i = 1, (nx+1)/2 - 1
+                      cwork(i)      = CMPLX( ar(i,j,k), -ar(nx+1-i,j,k) )
+                      cwork(nx+1-i) = CMPLX( ar(i,j,k),  ar(nx+1-i,j,k) )
+                   ENDDO
+                   cwork((nx+1)/2) = CMPLX( ar((nx+1)/2,j,k), 0.0 )
+                   ishape = SHAPE( cwork )
+                   CALL FFTN( cwork, ishape, inv = .TRUE. )
+                   DO  i = 0, nx
+                      ar(i,j,k) = REAL( cwork(i) )
+                   ENDDO
+                ENDDO
+             ENDDO
+             !$OMP END PARALLEL
+          ENDIF
+          DEALLOCATE( cwork )
+       ELSEIF ( fft_method == 'temperton-algorithm' )  THEN
+!
+!--       Performing the fft with Temperton's software works on every system,
+!--       since it is part of the model
+          IF ( forward_fft )  THEN
+             !$OMP PARALLEL PRIVATE ( work, i, j, k )
+             !$OMP DO
+             DO  k = nzb_x, nzt_x
+                DO  j = nys_x, nyn_x
+                   work(0:nx) = ar(0:nx,j,k)
+                   CALL fft991cy( work, work1, trigs_x, ifax_x, 1, nx+1, nx+1, 1, -1 )
+                   DO  i = 0, (nx+1)/2
+                      ar(i,j,k) = work(2*i)
+                   ENDDO
+                   DO  i = 1, (nx+1)/2 - 1
+                      ar(nx+1-i,j,k) = work(2*i+1)
+                   ENDDO
+                ENDDO
+             ENDDO
+             !$OMP END PARALLEL
+          ELSE
+             !$OMP PARALLEL PRIVATE ( work, i, j, k )
+             !$OMP DO
+             DO  k = nzb_x, nzt_x
+                DO  j = nys_x, nyn_x
+                   DO  i = 0, (nx+1)/2
+                      work(2*i) = ar(i,j,k)
+                   ENDDO
+                   DO  i = 1, (nx+1)/2 - 1
+                      work(2*i+1) = ar(nx+1-i,j,k)
+                   ENDDO
+                   work(1)    = 0.0
+                   work(nx+2) = 0.0
+                   CALL fft991cy( work, work1, trigs_x, ifax_x, 1, nx+1, nx+1, 1, 1 )
+                   ar(0:nx,j,k) = work(0:nx)
+                ENDDO
+             ENDDO
+             !$OMP END PARALLEL
+          ENDIF
+       ELSEIF ( fft_method == 'system-specific' )  THEN
+#if defined( __ibm )  &&  ! defined( __ibmy_special )
+          IF ( forward_fft )  THEN
+!
+!--          Forward transform using the ESSL routine DRCFT. Every x-row
+!--          (j,k) is transformed into work and then unpacked into ar:
+!--          even elements of work go to ar(0:(nx+1)/2,j,k), odd elements go
+!--          to the upper part of the row in reversed order.
+!--          The start element ar(0,j,k) of the current row is passed, so
+!--          that each iteration transforms its own row and not always the
+!--          first row of the array.
+!--          NOTE(review): aux2 is a local array that is not listed in the
+!--          PRIVATE clause - confirm that this is safe with OpenMP threads
+             !$OMP PARALLEL PRIVATE ( work, i, j, k )
+             !$OMP DO
+             DO  k = nzb_x, nzt_x
+                DO  j = nys_x, nyn_x
+                   CALL DRCFT( 0, ar(0,j,k), 1, work, 1, nx+1, 1, 1, sqr_dnx, aux1, &
+                               nau1, aux2, nau2 )
+                   DO  i = 0, (nx+1)/2
+                      ar(i,j,k) = work(2*i)
+                   ENDDO
+                   DO  i = 1, (nx+1)/2 - 1
+                      ar(nx+1-i,j,k) = work(2*i+1)
+                   ENDDO
+                ENDDO
+             ENDDO
+             !$OMP END PARALLEL
+          ELSE
+!
+!--          Backward transform using the ESSL routine DCRFT. The row is first
+!--          packed into work (inverse of the unpacking done in the forward
+!--          branch), elements work(1) and work(nx+2) are set to zero, the
+!--          transform is carried out in place, and the result is copied back
+             !$OMP PARALLEL PRIVATE ( work, i, j, k )
+             !$OMP DO
+             DO  k = nzb_x, nzt_x
+                DO  j = nys_x, nyn_x
+                   DO  i = 0, (nx+1)/2
+                      work(2*i) = ar(i,j,k)
+                   ENDDO
+                   DO  i = 1, (nx+1)/2 - 1
+                      work(2*i+1) = ar(nx+1-i,j,k)
+                   ENDDO
+                   work(1) = 0.0
+                   work(nx+2) = 0.0
+                   CALL DCRFT( 0, work, 1, work, 1, nx+1, 1, -1, sqr_dnx, aux3, nau1, &
+                               aux4, nau2 )
+                   DO  i = 0, nx
+                      ar(i,j,k) = work(i)
+                   ENDDO
+                ENDDO
+             ENDDO
+             !$OMP END PARALLEL
+          ENDIF
+#elif defined( __nec )
+          IF ( forward_fft )  THEN
+!
+!--          Forward transform using the NEC routine DZFFT. Every x-row (j,k)
+!--          is copied to work, transformed in place and unpacked into ar:
+!--          even elements of work go to ar(0:(nx+1)/2,j,k), odd elements go
+!--          to the upper part of the row in reversed order
+             !$OMP PARALLEL PRIVATE ( work, i, j, k )
+             !$OMP DO
+             DO  k = nzb_x, nzt_x
+                DO  j = nys_x, nyn_x
+                   work(0:nx) = ar(0:nx,j,k)
+                   CALL DZFFT( 1, nx+1, sqr_dnx, work, work, trig_xf, work2, 0 )
+                   DO  i = 0, (nx+1)/2
+                      ar(i,j,k) = work(2*i)
+                   ENDDO
+                   DO  i = 1, (nx+1)/2 - 1
+                      ar(nx+1-i,j,k) = work(2*i+1)
+                   ENDDO
+                ENDDO
+             ENDDO
+!
+!--          The directive sentinel is "!$OMP"; the parallel region opened
+!--          above has to be closed with "!$OMP END PARALLEL"
+             !$OMP END PARALLEL
+          ELSE
+!
+!--          Backward transform using the NEC routine ZDFFT. The row is first
+!--          packed into work (inverse of the unpacking done in the forward
+!--          branch), elements work(1) and work(nx+2) are set to zero, the
+!--          transform is carried out in place, and the result is copied back
+             !$OMP PARALLEL PRIVATE ( work, i, j, k )
+             !$OMP DO
+             DO  k = nzb_x, nzt_x
+                DO  j = nys_x, nyn_x
+                   DO  i = 0, (nx+1)/2
+                      work(2*i) = ar(i,j,k)
+                   ENDDO
+                   DO  i = 1, (nx+1)/2 - 1
+                      work(2*i+1) = ar(nx+1-i,j,k)
+                   ENDDO
+                   work(1) = 0.0
+                   work(nx+2) = 0.0
+                   CALL ZDFFT( -1, nx+1, sqr_dnx, work, work, trig_xb, work2, 0 )
+                   ar(0:nx,j,k) = work(0:nx)
+                ENDDO
+             ENDDO
+             !$OMP END PARALLEL
+          ENDIF
+#elif defined( __cuda_fft )
+          ALLOCATE( cuda_a_device(0:total_points_x_transpo-1) )
+          ALLOCATE( cuda_b_device(0:((nx+1)/2+1) * (nyn_x-nys_x+1) * (nzt_x-nzb_x+1) - 1) )
+          ALLOCATE( cuda_host(0:((nx+1)/2+1) * (nyn_x-nys_x+1) * (nzt_x-nzb_x+1) - 1) )
+          m = 0
+          IF ( forward_fft )  THEN
+             cuda_a_device = ar(0:total_points_x_transpo-1,nys_x,nzb_x)
+             CALL CUFFTEXECD2Z( plan_xf, cuda_a_device, cuda_b_device )
+             cuda_host = cuda_b_device
+             DO  k = nzb_x, nzt_x
+                DO  j = nys_x, nyn_x
+                   DO  i = 0, (nx+1)/2
+                      ar(i,j,k)      = REAL( cuda_host(m+i) )  * dnx
+                   ENDDO
+                   DO  i = 1, (nx+1)/2 - 1
+                      ar(nx+1-i,j,k) = AIMAG( cuda_host(m+i) ) * dnx
+                   ENDDO
+                   m = m + (nx+1)/2 + 1
+                ENDDO
+             ENDDO
+          ELSE
+             DO  k = nzb_x, nzt_x
+                DO  j = nys_x, nyn_x
+                   cuda_host(m) = CMPLX( ar(0,j,k), 0.0 )
+                   DO  i = 1, (nx+1)/2 - 1
+                      cuda_host(m+i) = CMPLX( ar(i,j,k), ar(nx+1-i,j,k) )
+                   ENDDO
+                   cuda_host(m+(nx+1)/2) = CMPLX( ar((nx+1)/2,j,k), 0.0 )
+                   m = m + (nx+1)/2 + 1
+                ENDDO
+             ENDDO
+             cuda_b_device = cuda_host
+             CALL CUFFTEXECZ2D( plan_xi, cuda_b_device, cuda_a_device )
+             ar(0:total_points_x_transpo-1,nys_x,nzb_x) = cuda_a_device
+          ENDIF
+          DEALLOCATE( cuda_a_device, cuda_b_device, cuda_host )
+#else
+          message_string = 'no system-specific fft-call available'
+          CALL message( 'fft_x', 'PA0188', 1, 2, 0, 6, 0 )
+#endif
+       ELSE
+          message_string = 'fft method "' // TRIM( fft_method) // &
+                           '" not available'
+          CALL message( 'fft_x', 'PA0189', 1, 2, 0, 6, 0 )
+       ENDIF
+    END SUBROUTINE fft_x
+    SUBROUTINE fft_x_1d( ar, direction )
+!----------------------------------------------------------------------!
+!                               fft_x_1d                               !
+!                                                                      !
+!               Fourier-transformation along x-direction               !
+!                     Version for 1D-decomposition                     !
+!                                                                      !
+!    fft_x_1d uses internal algorithms (Singleton or Temperton) or     !
+!           system-specific routines, if they are available            !
+!----------------------------------------------------------------------!
        IMPLICIT NONE
 …
        INTEGER ::  i, ishape(1)
+!kk    REAL, DIMENSION(:)        ::  ar !kk Does NOT work (Bug??)
+       LOGICAL ::  forward_fft
        REAL, DIMENSION(0:nx)     ::  ar
        REAL, DIMENSION(0:nx+2)   ::  work
 …
 #endif
+       IF ( direction == 'forward' )  THEN
+          forward_fft = .TRUE.
+       ELSE
+          forward_fft = .FALSE.
+       ENDIF
        IF ( fft_method == 'singleton-algorithm' )  THEN
 …
           ALLOCATE( cwork(0:nx) )
           IF ( direction == 'forward')   then
+          IF ( forward_fft )   then
              DO  i = 0, nx
 …
              ishape = SHAPE( cwork )
              CALL FFTN( cwork, ishape )
              DO  i = 0, (nx+1)/2
                 ar(i) = REAL( cwork(i) )
 …
 !--       Performing the fft with Temperton's software works on every system,
 !--       since it is part of the model
           IF ( direction == 'forward' )  THEN
+          IF ( forward_fft )  THEN
              work(0:nx) = ar
 …
 #if defined( __ibm )  &&  ! defined( __ibmy_special )
           IF ( direction == 'forward' )  THEN
              CALL DRCFT( 0, ar, 1, work, 1, nx+1, 1, 1, sqr_nx, aux1, nau1, &
+          IF ( forward_fft )  THEN
+             CALL DRCFT( 0, ar, 1, work, 1, nx+1, 1, 1, sqr_dnx, aux1, nau1, &
                          aux2, nau2 )
 …
              work(nx+2) = 0.0
              CALL DCRFT( 0, work, 1, work, 1, nx+1, 1, -1, sqr_nx, aux3, nau1, &
+             CALL DCRFT( 0, work, 1, work, 1, nx+1, 1, -1, sqr_dnx, aux3, nau1, &
                          aux4, nau2 )
 …
           ENDIF
 #elif defined( __nec )
           IF ( direction == 'forward' )  THEN
+          IF ( forward_fft )  THEN
              work(0:nx) = ar(0:nx)
              CALL DZFFT( 1, nx+1, sqr_nx, work, work, trig_xf, work2, 0 )
+             CALL DZFFT( 1, nx+1, sqr_dnx, work, work, trig_xf, work2, 0 )
              DO  i = 0, (nx+1)/2
                 ar(i) = work(2*i)
 …
              work(nx+2) = 0.0
              CALL ZDFFT( -1, nx+1, sqr_nx, work, work, trig_xb, work2, 0 )
+             CALL ZDFFT( -1, nx+1, sqr_dnx, work, work, trig_xb, work2, 0 )
              ar(0:nx) = work(0:nx)
 …
 #else
           message_string = 'no system-specific fft-call available'
           CALL message( 'fft_x', 'PA0188', 1, 2, 0, 6, 0 )
+          CALL message( 'fft_x_1d', 'PA0188', 1, 2, 0, 6, 0 )
 #endif
        ELSE
           message_string = 'fft method "' // TRIM( fft_method) // &
                            '" not available'
           CALL message( 'fft_x', 'PA0189', 1, 2, 0, 6, 0 )
+          CALL message( 'fft_x_1d', 'PA0189', 1, 2, 0, 6, 0 )
        ENDIF
     END SUBROUTINE fft_x
+    END SUBROUTINE fft_x_1d
     SUBROUTINE fft_y( ar, direction )
 …
 !                                                                      !
 !               Fourier-transformation along y-direction               !
+!                     Version for 2D-decomposition                     !
 !                                                                      !
 !      fft_y uses internal algorithms (Singleton or Temperton) or      !
 …
 !----------------------------------------------------------------------!
+       USE cuda_fft_interfaces
+       IMPLICIT NONE
+       CHARACTER (LEN=*) ::  direction
+       INTEGER ::  i, j, jshape(1), k, m
+       LOGICAL ::  forward_fft
+       REAL, DIMENSION(0:ny+2)   ::  work
+       REAL, DIMENSION(ny+2)     ::  work1
+       COMPLEX, DIMENSION(:), ALLOCATABLE ::  cwork
+#if defined( __ibm )
+       REAL, DIMENSION(nau2)     ::  auy2, auy4
+#elif defined( __nec )
+       REAL, DIMENSION(6*(ny+1)) ::  work2
+#elif defined( __cuda_fft )
+       REAL(dpk), DEVICE, DIMENSION(:), ALLOCATABLE    ::  cuda_a_device
+       COMPLEX(dpk), DEVICE, DIMENSION(:), ALLOCATABLE ::  cuda_b_device
+       COMPLEX(dpk), DIMENSION(:), ALLOCATABLE         ::  cuda_host
+#endif
+       REAL, DIMENSION(0:ny,nxl_y:nxr_y,nzb_y:nzt_y) ::  ar
+       IF ( direction == 'forward' )  THEN
+          forward_fft = .TRUE.
+       ELSE
+          forward_fft = .FALSE.
+       ENDIF
+       IF ( fft_method == 'singleton-algorithm' )  THEN
+!
+!--       Performing the fft with singleton's software works on every system,
+!--       since it is part of the model
+          ALLOCATE( cwork(0:ny) )
+          IF ( forward_fft )   then
+             !$OMP PARALLEL PRIVATE ( cwork, i, jshape, j, k )
+             !$OMP DO
+             DO  k = nzb_y, nzt_y
+                DO  i = nxl_y, nxr_y
+                   DO  j = 0, ny
+                      cwork(j) = CMPLX( ar(j,i,k) )
+                   ENDDO
+                   jshape = SHAPE( cwork )
+                   CALL FFTN( cwork, jshape )
+                   DO  j = 0, (ny+1)/2
+                      ar(j,i,k) = REAL( cwork(j) )
+                   ENDDO
+                   DO  j = 1, (ny+1)/2 - 1
+                      ar(ny+1-j,i,k) = -AIMAG( cwork(j) )
+                   ENDDO
+                ENDDO
+             ENDDO
+             !$OMP END PARALLEL
+          ELSE
+             !$OMP PARALLEL PRIVATE ( cwork, i, jshape, j, k )
+             !$OMP DO
+             DO  k = nzb_y, nzt_y
+                DO  i = nxl_y, nxr_y
+                   cwork(0) = CMPLX( ar(0,i,k), 0.0 )
+                   DO  j = 1, (ny+1)/2 - 1
+                      cwork(j)      = CMPLX( ar(j,i,k), -ar(ny+1-j,i,k) )
+                      cwork(ny+1-j) = CMPLX( ar(j,i,k),  ar(ny+1-j,i,k) )
+                   ENDDO
+                   cwork((ny+1)/2) = CMPLX( ar((ny+1)/2,i,k), 0.0 )
+                   jshape = SHAPE( cwork )
+                   CALL FFTN( cwork, jshape, inv = .TRUE. )
+                   DO  j = 0, ny
+                      ar(j,i,k) = REAL( cwork(j) )
+                   ENDDO
+                ENDDO
+             ENDDO
+             !$OMP END PARALLEL
+          ENDIF
+          DEALLOCATE( cwork )
+       ELSEIF ( fft_method == 'temperton-algorithm' )  THEN
+!
+!--       Performing the fft with Temperton's software works on every system,
+!--       since it is part of the model
+          IF ( forward_fft )  THEN
+             !$OMP PARALLEL PRIVATE ( work, i, j, k )
+             !$OMP DO
+             DO  k = nzb_y, nzt_y
+                DO  i = nxl_y, nxr_y
+                   work(0:ny) = ar(0:ny,i,k)
+                   CALL fft991cy( work, work1, trigs_y, ifax_y, 1, ny+1, ny+1, 1, -1 )
+                   DO  j = 0, (ny+1)/2
+                      ar(j,i,k) = work(2*j)
+                   ENDDO
+                   DO  j = 1, (ny+1)/2 - 1
+                      ar(ny+1-j,i,k) = work(2*j+1)
+                   ENDDO
+                ENDDO
+             ENDDO
+             !$OMP END PARALLEL
+          ELSE
+             !$OMP PARALLEL PRIVATE ( work, i, j, k )
+             !$OMP DO
+             DO  k = nzb_y, nzt_y
+                DO  i = nxl_y, nxr_y
+                   DO  j = 0, (ny+1)/2
+                      work(2*j) = ar(j,i,k)
+                   ENDDO
+                   DO  j = 1, (ny+1)/2 - 1
+                      work(2*j+1) = ar(ny+1-j,i,k)
+                   ENDDO
+                   work(1)    = 0.0
+                   work(ny+2) = 0.0
+                   CALL fft991cy( work, work1, trigs_y, ifax_y, 1, ny+1, ny+1, 1, 1 )
+                   ar(0:ny,i,k) = work(0:ny)
+                ENDDO
+             ENDDO
+             !$OMP END PARALLEL
+          ENDIF
+       ELSEIF ( fft_method == 'system-specific' )  THEN
+#if defined( __ibm )  &&  ! defined( __ibmy_special )
+          IF ( forward_fft)  THEN
+             !$OMP PARALLEL PRIVATE ( work, i, j, k )
+             !$OMP DO
+             DO  k = nzb_y, nzt_y
+                DO  i = nxl_y, nxr_y
+                   CALL DRCFT( 0, ar, 1, work, 1, ny+1, 1, 1, sqr_dny, auy1, nau1, &
+                               auy2, nau2 )
+                   DO  j = 0, (ny+1)/2
+                      ar(j,i,k) = work(2*j)
+                   ENDDO
+                   DO  j = 1, (ny+1)/2 - 1
+                      ar(ny+1-j,i,k) = work(2*j+1)
+                   ENDDO
+                ENDDO
+             ENDDO
+             !$OMP END PARALLEL
+          ELSE
+             !$OMP PARALLEL PRIVATE ( work, i, j, k )
+             !$OMP DO
+             DO  k = nzb_y, nzt_y
+                DO  i = nxl_y, nxr_y
+                   DO  j = 0, (ny+1)/2
+                      work(2*j) = ar(j,i,k)
+                   ENDDO
+                   DO  j = 1, (ny+1)/2 - 1
+                      work(2*j+1) = ar(ny+1-j,i,k)
+                   ENDDO
+                   work(1)    = 0.0
+                   work(ny+2) = 0.0
+                   CALL DCRFT( 0, work, 1, work, 1, ny+1, 1, -1, sqr_dny, auy3, nau1, &
+                               auy4, nau2 )
+                   DO  j = 0, ny
+                      ar(j,i,k) = work(j)
+                   ENDDO
+                ENDDO
+             ENDDO
+             !$OMP END PARALLEL
+          ENDIF
+#elif defined( __nec )
+          IF ( forward_fft )  THEN
+             !$OMP PARALLEL PRIVATE ( work, i, j, k )
+             !$OMP DO
+             DO  k = nzb_y, nzt_y
+                DO  i = nxl_y, nxr_y
+                   work(0:ny) = ar(0:ny,i,k)
+                   CALL DZFFT( 1, ny+1, sqr_dny, work, work, trig_yf, work2, 0 )
+                   DO  j = 0, (ny+1)/2
+                      ar(j,i,k) = work(2*j)
+                   ENDDO
+                   DO  j = 1, (ny+1)/2 - 1
+                      ar(ny+1-j,i,k) = work(2*j+1)
+                   ENDDO
+                ENDDO
+             ENDDO
+             !$END OMP PARALLEL
+          ELSE
+             !$OMP PARALLEL PRIVATE ( work, i, j, k )
+             !$OMP DO
+             DO  k = nzb_y, nzt_y
+                DO  i = nxl_y, nxr_y
+                   DO  j = 0, (ny+1)/2
+                      work(2*j) = ar(j,i,k)
+                   ENDDO
+                   DO  j = 1, (ny+1)/2 - 1
+                      work(2*j+1) = ar(ny+1-j,i,k)
+                   ENDDO
+                   work(1) = 0.0
+                   work(ny+2) = 0.0
+                   CALL ZDFFT( -1, ny+1, sqr_dny, work, work, trig_yb, work2, 0 )
+                   ar(0:ny,i,k) = work(0:ny)
+                ENDDO
+             ENDDO
+             !$OMP END PARALLEL
+          ENDIF
+#elif defined( __cuda_fft )
+          ALLOCATE( cuda_a_device(0:total_points_y_transpo-1) )
+          ALLOCATE( cuda_b_device(0:((ny+1)/2+1) * (nxr_y-nxl_y+1) * (nzt_y-nzb_y+1) - 1) )
+          ALLOCATE( cuda_host(0:((ny+1)/2+1) * (nxr_y-nxl_y+1) * (nzt_y-nzb_y+1) - 1) )
+          m = 0
+          IF ( forward_fft )  THEN
+             cuda_a_device = ar(0:total_points_y_transpo-1,nxl_y,nzb_y)
+             CALL CUFFTEXECD2Z( plan_yf, cuda_a_device, cuda_b_device )
+             cuda_host = cuda_b_device
+             DO  k = nzb_y, nzt_y
+                DO  i = nxl_y, nxr_y
+                   DO  j = 0, (ny+1)/2
+                      ar(j,i,k)      = REAL( cuda_host(m+j) )  * dny
+                   ENDDO
+                   DO  j = 1, (ny+1)/2 - 1
+                      ar(ny+1-j,i,k) = AIMAG( cuda_host(m+j) ) * dny
+                   ENDDO
+                   m = m + (ny+1)/2 + 1
+                ENDDO
+             ENDDO
+          ELSE
+             DO  k = nzb_y, nzt_y
+                DO  i = nxl_y, nxr_y
+                   cuda_host(m) = CMPLX( ar(0,i,k), 0.0 )
+                   DO  j = 1, (ny+1)/2 - 1
+                      cuda_host(m+j) = CMPLX( ar(j,i,k), ar(ny+1-j,i,k) )
+                   ENDDO
+                   cuda_host(m+(ny+1)/2) = CMPLX( ar((ny+1)/2,i,k), 0.0 )
+                   m = m + (ny+1)/2 + 1
+                ENDDO
+             ENDDO
+             cuda_b_device = cuda_host
+             CALL CUFFTEXECZ2D( plan_yi, cuda_b_device, cuda_a_device )
+             ar(0:total_points_y_transpo-1,nxl_y,nzb_y) = cuda_a_device
+          ENDIF
+          DEALLOCATE( cuda_a_device, cuda_b_device, cuda_host )
+#else
+          message_string = 'no system-specific fft-call available'
+          CALL message( 'fft_y', 'PA0188', 1, 2, 0, 6, 0 )
+#endif
+       ELSE
+          message_string = 'fft method "' // TRIM( fft_method) // &
+                           '" not available'
+          CALL message( 'fft_y', 'PA0189', 1, 2, 0, 6, 0 )
+       ENDIF
+    END SUBROUTINE fft_y
+    SUBROUTINE fft_y_1d( ar, direction )
+!----------------------------------------------------------------------!
+!                               fft_y_1d                               !
+!                                                                      !
+!               Fourier-transformation along y-direction               !
+!                     Version for 1D-decomposition                     !
+!                                                                      !
+!     fft_y_1d uses internal algorithms (Singleton or Temperton) or    !
+!           system-specific routines, if they are available            !
+!----------------------------------------------------------------------!
        IMPLICIT NONE
 …
        INTEGER ::  j, jshape(1)
+!kk    REAL, DIMENSION(:)        ::  ar !kk Does NOT work (Bug??)
+       LOGICAL ::  forward_fft
        REAL, DIMENSION(0:ny)     ::  ar
        REAL, DIMENSION(0:ny+2)   ::  work
 …
 #endif
+       IF ( direction == 'forward' )  THEN
+          forward_fft = .TRUE.
+       ELSE
+          forward_fft = .FALSE.
+       ENDIF
        IF ( fft_method == 'singleton-algorithm' )  THEN
 …
           ALLOCATE( cwork(0:ny) )
           IF ( direction == 'forward')  THEN
+          IF ( forward_fft )  THEN
              DO  j = 0, ny
 …
 !--       Performing the fft with Temperton's software works on every system,
 !--       since it is part of the model
           IF ( direction == 'forward' )  THEN
+          IF ( forward_fft )  THEN
              work(0:ny) = ar
 …
 #if defined( __ibm )  &&  ! defined( __ibmy_special )
           IF ( direction == 'forward')  THEN
              CALL DRCFT( 0, ar, 1, work, 1, ny+1, 1, 1, sqr_ny, auy1, nau1, &
+          IF ( forward_fft )  THEN
+             CALL DRCFT( 0, ar, 1, work, 1, ny+1, 1, 1, sqr_dny, auy1, nau1, &
                          auy2, nau2 )
 …
              work(ny+2) = 0.0
              CALL DCRFT( 0, work, 1, work, 1, ny+1, 1, -1, sqr_ny, auy3, nau1, &
+             CALL DCRFT( 0, work, 1, work, 1, ny+1, 1, -1, sqr_dny, auy3, nau1, &
                          auy4, nau2 )
 …
           ENDIF
 #elif defined( __nec )
           IF ( direction == 'forward' )  THEN
+          IF ( forward_fft )  THEN
              work(0:ny) = ar(0:ny)
              CALL DZFFT( 1, ny+1, sqr_ny, work, work, trig_yf, work2, 0 )
+             CALL DZFFT( 1, ny+1, sqr_dny, work, work, trig_yf, work2, 0 )
              DO  j = 0, (ny+1)/2
 …
              work(ny+2) = 0.0
              CALL ZDFFT( -1, ny+1, sqr_ny, work, work, trig_yb, work2, 0 )
+             CALL ZDFFT( -1, ny+1, sqr_dny, work, work, trig_yb, work2, 0 )
              ar(0:ny) = work(0:ny)
 …
 #else
           message_string = 'no system-specific fft-call available'
           CALL message( 'fft_y', 'PA0188', 1, 2, 0, 6, 0 )
+          CALL message( 'fft_y_1d', 'PA0188', 1, 2, 0, 6, 0 )
 #endif
 …
           message_string = 'fft method "' // TRIM( fft_method) // &
                            '" not available'
           CALL message( 'fft_y', 'PA0189', 1, 2, 0, 6, 0 )
+          CALL message( 'fft_y_1d', 'PA0189', 1, 2, 0, 6, 0 )
        ENDIF
     END SUBROUTINE fft_y
+    END SUBROUTINE fft_y_1d
     SUBROUTINE fft_x_m( ar, direction )
 …
 !--          Tables are initialized once more. This call should not be
 !--          necessary, but otherwise program aborts in asymmetric case
              CALL DZFFTM( 0, nx+1, nz1, sqr_nx, work, nx+4, work, nx+4, &
+             CALL DZFFTM( 0, nx+1, nz1, sqr_dnx, work, nx+4, work, nx+4, &
                           trig_xf, work1, 0 )
 …
              ENDIF
              CALL DZFFTM( 1, nx+1, nz1, sqr_nx, ai, siza, work, sizw, &
+             CALL DZFFTM( 1, nx+1, nz1, sqr_dnx, ai, siza, work, sizw, &
                           trig_xf, work1, 0 )
 …
 !--          Tables are initialized once more. This call should not be
 !--          necessary, but otherwise program aborts in asymmetric case
              CALL ZDFFTM( 0, nx+1, nz1, sqr_nx, work, nx+4, work, nx+4, &
+             CALL ZDFFTM( 0, nx+1, nz1, sqr_dnx, work, nx+4, work, nx+4, &
                           trig_xb, work1, 0 )
 …
              ENDDO
              CALL ZDFFTM( -1, nx+1, nz1, sqr_nx, work, sizw, ai, siza, &
+             CALL ZDFFTM( -1, nx+1, nz1, sqr_dnx, work, sizw, ai, siza, &
                           trig_xb, work1, 0 )
 …
 !--          Tables are initialized once more. This call should not be
 !--          necessary, but otherwise program aborts in asymmetric case
              CALL DZFFTM( 0, ny+1, nz1, sqr_ny, work, ny+4, work, ny+4, &
+             CALL DZFFTM( 0, ny+1, nz1, sqr_dny, work, ny+4, work, ny+4, &
                           trig_yf, work1, 0 )
 …
              ENDIF
              CALL DZFFTM( 1, ny+1, nz1, sqr_ny, ai, siza, work, sizw, &
+             CALL DZFFTM( 1, ny+1, nz1, sqr_dny, ai, siza, work, sizw, &
                           trig_yf, work1, 0 )
 …
 !--          Tables are initialized once more. This call should not be
 !--          necessary, but otherwise program aborts in asymmetric case
              CALL ZDFFTM( 0, ny+1, nz1, sqr_ny, work, ny+4, work, ny+4, &
+             CALL ZDFFTM( 0, ny+1, nz1, sqr_dny, work, ny+4, work, ny+4, &
                           trig_yb, work1, 0 )
 …
              ENDDO
              CALL ZDFFTM( -1, ny+1, nz1, sqr_ny, work, sizw, ai, siza, &
+             CALL ZDFFTM( -1, ny+1, nz1, sqr_dny, work, sizw, ai, siza, &
                           trig_yb, work1, 0 )
 …
     END SUBROUTINE fft_y_m
  END MODULE fft_xy

palm/trunk/SOURCE/header.f90

-                      r1093
+                      r1106
 ! Current revisions:
 ! -----------------
+!
+! some format changes for coupled runs
+!
 ! Former revisions:
 …
     CHARACTER (LEN=10) ::  coor_chr, host_chr
     CHARACTER (LEN=16) ::  begin_chr
     CHARACTER (LEN=23) ::  ver_rev
+    CHARACTER (LEN=26) ::  ver_rev
     CHARACTER (LEN=40) ::  output_format
     CHARACTER (LEN=70) ::  char1, char2, dopr_chr, &
 …
        WRITE ( io, 101 )  mpi_type, coupling_mode
     ENDIF
+    IF ( coupling_start_time /= 0.0 )  THEN
+       IF ( coupling_start_time > simulated_time_at_begin )  THEN
+          WRITE ( io, 109 )
+       ELSE
+          WRITE ( io, 114 )
+       ENDIF
+    ENDIF
     WRITE ( io, 102 )  run_date, run_identifier, run_time, runnr, &
                        ADJUSTR( host_chr )
 …
        IF ( time_restart /= 9999999.9  .AND.  time_restart < end_time )  THEN
           IF ( dt_restart == 9999999.9 )  THEN
              WRITE ( io, 204 )  ' Next restart at:  ',time_restart
+             WRITE ( io, 204 )  ' Next restart at:     ',time_restart
           ELSE
              WRITE ( io, 205 )  ' Next restart at:  ',time_restart, dt_restart
+             WRITE ( io, 205 )  ' Next restart at:     ',time_restart, dt_restart
           ENDIF
        ENDIF
 …
+!
 !-- Start time for coupled runs, if independent precursor runs for atmosphere
 !-- and ocean are used. In this case, coupling_start_time defines the time
 !-- when the coupling is switched on.
+!-- and ocean are used or have been used. In this case, coupling_start_time
+!-- defines the time when the coupling is switched on.
     IF ( coupling_start_time /= 0.0 )  THEN
+       IF ( coupling_start_time >= simulated_time_at_begin )  THEN
+          char1 = 'Precursor run for a coupled atmosphere-ocean run'
+       ELSE
+          char1 = 'Coupled atmosphere-ocean run following independent ' // &
+                  'precursor runs'
+       ENDIF
+       WRITE ( io, 207 )  char1, coupling_start_time
+       WRITE ( io, 207 )  coupling_start_time
     ENDIF
 …
 FORMAT (1X,78('-'))
 FORMAT (/1X,'***************************',9X,42('-')/        &
 X,'* ',A,' *',9X,A/                               &
 X,'***************************',9X,42('-'))
+FORMAT (/1X,'******************************',6X,42('-')/        &
+X,'* ',A,' *',6X,A/                               &
+X,'******************************',6X,42('-'))
 FORMAT (37X,'coupled run using MPI-',I1,': ',A/ &
 X,42('-'))
 FORMAT (/' Date:              ',A8,9X,'Run:       ',A20/      &
             ' Time:              ',A8,9X,'Run-No.:   ',I2.2/     &
             ' Run on host:     ',A10)
+FORMAT (/' Date:                 ',A8,6X,'Run:       ',A20/      &
+            ' Time:                 ',A8,6X,'Run-No.:   ',I2.2/     &
+            ' Run on host:        ',A10)
 #if defined( __parallel )
 FORMAT (' Number of PEs:',8X,I5,9X,'Processor grid (x,y): (',I3,',',I3, &
+FORMAT (' Number of PEs:',10X,I6,6X,'Processor grid (x,y): (',I3,',',I3, &
               ')',1X,A)
 FORMAT (' Number of PEs:',8X,I5,9X,'Tasks:',I4,'   threads per task:',I4/ &
 …
 FORMAT (37X,'A 1d-decomposition along ',A,' is used')
 FORMAT (37X,'Max. # of parallel I/O streams is ',I5)
+FORMAT (37X,'Precursor run for coupled atmos-ocean run'/ &
+X,42('-'))
+FORMAT (37X,'Coupled atmosphere-ocean run following'/ &
+X,'independent precursor runs'/             &
+X,42('-'))
 #endif
 FORMAT (/' Numerical Schemes:'/ &
 …
                   ' or Upstream')
 FORMAT (' --> Scalar advection via Bott-Chlond-Scheme')
 FORMAT (' --> Galilei-Transform applied to horizontal advection', &
             '     Translation velocity = ',A/ &
+FORMAT (' --> Galilei-Transform applied to horizontal advection:'/ &
+            '     translation velocity = ',A/ &
             '     distance advected ',A,':  ',F8.3,' km(x)  ',F8.3,' km(y)')
 FORMAT (' --> Time differencing scheme: ',A)
 …
 FORMAT (//' Run time and time step information:'/ &
              ' ----------------------------------'/)
 FORMAT ( ' Timestep:          variable     maximum value: ',F6.3,' s', &
+FORMAT ( ' Timestep:             variable     maximum value: ',F6.3,' s', &
              '    CFL-factor: ',F4.2)
 FORMAT ( ' Timestep:       dt = ',F6.3,' s'/)
 FORMAT ( ' Start time:       ',F9.3,' s'/ &
              ' End time:         ',F9.3,' s')
+FORMAT ( ' Timestep:          dt = ',F6.3,' s'/)
+FORMAT ( ' Start time:          ',F9.3,' s'/ &
+             ' End time:            ',F9.3,' s')
 FORMAT ( A,F9.3,' s')
 FORMAT ( A,F9.3,' s',5X,'restart every',17X,F9.3,' s')
 FORMAT (/' Time reached:     ',F9.3,' s'/ &
              ' CPU-time used:    ',F9.3,' s     per timestep:               ', &
                '  ',F9.3,' s'/                                                 &
              '                                   per second of simulated tim', &
+FORMAT (/' Time reached:        ',F9.3,' s'/ &
+             ' CPU-time used:       ',F9.3,' s     per timestep:               ', &
+               '  ',F9.3,' s'/                                                    &
+             '                                   per second of simulated tim',    &
                'e: ',F9.3,' s')
 FORMAT ( A/' Coupling start time:',F9.3,' s')
+FORMAT ( ' Coupling start time: ',F9.3,' s')
 FORMAT (//' Computational grid and domain size:'/ &
               ' ----------------------------------'// &

palm/trunk/SOURCE/microphysics.f90

-                      r1093
+                      r1106
 ! Current revisions:
 ! ------------------
+!
+! small changes in code formatting
+!
 ! Former revisions:
 …
     END SUBROUTINE dsd_properties
     SUBROUTINE autoconversion
 …
     END SUBROUTINE autoconversion
     SUBROUTINE accretion
 …
     END SUBROUTINE accretion
     SUBROUTINE selfcollection_breakup
 …
     END SUBROUTINE selfcollection_breakup
     SUBROUTINE evaporation_rain
 …
     END SUBROUTINE evaporation_rain
     SUBROUTINE sedimentation_cloud
 …
     END SUBROUTINE sedimentation_cloud
     SUBROUTINE sedimentation_rain
 …
           ENDDO
        ENDDO
     END SUBROUTINE sedimentation_rain
 …
     END SUBROUTINE dsd_properties_ij
     SUBROUTINE autoconversion_ij( i, j )
 …
        REAL    ::  k_au, autocon, phi_au, tau_cloud, xc, nu_c, rc,   &
                    l_mix, re_lambda, alpha_cc, r_cc, sigma_cc, epsilon
        k_au = k_cc / ( 20.0 * x0 )
 …
 !--          Tendencies for q, qr, nr, pt:
              tend_qr(k,j,i) = tend_qr(k,j,i) + autocon
              tend_q(k,j,i)  = tend_q(k,j,i) - autocon
+             tend_q(k,j,i)  = tend_q(k,j,i)  - autocon
              tend_nr(k,j,i) = tend_nr(k,j,i) + autocon / x0 * hyrho(k)
+             tend_pt(k,j,i) = tend_pt(k,j,i) + autocon * l_d_cp * pt_d_t(k)
+             tend_pt(k,j,i) = tend_pt(k,j,i) + autocon * l_d_cp * pt_d_t(k)
           ENDIF
 …
     END SUBROUTINE autoconversion_ij
     SUBROUTINE accretion_ij( i, j )
 …
        DO  k = nzb_2d(j,i)+1, nzt
           IF ( ( ql(k,j,i) > 0.0 )  .AND.  ( qr(k,j,i) > eps_sb ) )  THEN
+!
 …
 !--          Tendencies for q, qr, pt:
              tend_qr(k,j,i) = tend_qr(k,j,i) + accr
              tend_q(k,j,i)  = tend_q(k,j,i) - accr
+             tend_q(k,j,i)  = tend_q(k,j,i)  - accr
              tend_pt(k,j,i) = tend_pt(k,j,i) + accr * l_d_cp * pt_d_t(k)
           ENDIF
        ENDDO
 …
        REAL    ::  selfcoll, breakup, phi_br, phi_sc
        DO  k = nzb_2d(j,i)+1, nzt
           IF ( qr(k,j,i) > eps_sb )  THEN
+!
 …
 !--          Tendency for nr:
              tend_nr(k,j,i) = tend_nr(k,j,i) + selfcoll
           ENDIF
        ENDDO
     END SUBROUTINE selfcollection_breakup_ij
     SUBROUTINE evaporation_rain_ij( i, j )
 …
                    mu_r_2, mu_r_5d2, nr_0
        DO  k = nzb_2d(j,i)+1, nzt
           IF ( qr(k,j,i) > eps_sb )  THEN
+!
 …
              q_s = q_s * ( 1.0 + alpha * q(k,j,i) ) / ( 1.0 + alpha * q_s )
+!
 !--          Oversaturation:
+!--          Supersaturation:
              sat = MIN( 0.0, ( q(k,j,i) - ql(k,j,i) ) / q_s - 1.0 )
+!
 …
              evap = 2.0 * pi * nr_0 * g_evap * f_vent * sat /    &
                     hyrho(k)
              evap = MAX( evap, -qr(k,j,i) / ( dt_3d *                         &
+             evap = MAX( evap, -qr(k,j,i) / ( dt_3d *            &
                          weight_substep(intermediate_timestep_count) ) )
+!
 !--          Tendencies for q, qr, nr, pt:
              tend_qr(k,j,i) = tend_qr(k,j,i) + evap
              tend_q(k,j,i)  = tend_q(k,j,i) - evap
+             tend_q(k,j,i)  = tend_q(k,j,i)  - evap
              tend_nr(k,j,i) = tend_nr(k,j,i) + c_evap * evap / xr(k) * hyrho(k)
              tend_pt(k,j,i) = tend_pt(k,j,i) + evap * l_d_cp * pt_d_t(k)
           ENDIF
        ENDDO
     END SUBROUTINE evaporation_rain_ij
     SUBROUTINE sedimentation_cloud_ij( i, j )
 …
        INTEGER ::  i, j, k
        REAL    ::  sed_q_const, sigma_gc = 1.3, k_st = 1.2E8
+!
 !--    Sedimentation of cloud droplets (Heus et al., 2010):
 …
        ENDDO
+!
 !--   Tendency for q, pt:
+!--    Tendency for q, pt:
        DO  k = nzb_2d(j,i)+1, nzt
           tend_q(k,j,i)  = tend_q(k,j,i) + ( sed_q(k+1) - sed_q(k) ) *        &
 …
     END SUBROUTINE sedimentation_cloud_ij
     SUBROUTINE sedimentation_rain_ij( i, j )
 …
 !--    Computation of sedimentation flux. Implementation according to Stevens
 !--    and Seifert (2008).
        IF ( intermediate_timestep_count == 1 )  prr(:,j,i) = 0.0
 …
        ELSE
           nr_slope = 0.0
           qr_slope = 0.0
        ENDIF
+!
 …
           k_run = k
           c_run = MIN( 1.0, c_qr(k) )
           DO WHILE ( c_run > 0.0  .AND.  k_run <= nzt-1 )
              flux  = flux + hyrho(k_run) *                                    &
                      ( qr(k_run,j,i) + qr_slope(k_run) * ( 1.0 - c_run ) *    &
 …
              k_run = k_run + 1
              c_run = MIN( 1.0, c_qr(k_run) - z_run * ddzu(k_run) )
           ENDDO
+!
 …
     END SUBROUTINE sedimentation_rain_ij
+!
 …
        REAL    ::  gamm, ser, tmp, x_gamm, xx, y_gamm
        INTEGER ::  j
        x_gamm = xx
 …
        tmp = ( x_gamm + 0.5 ) * LOG( tmp ) - tmp
        ser = 1.000000000190015
+       do j = 1, 6
+       DO  j = 1, 6
           y_gamm = y_gamm + 1.0
           ser    = ser + cof( j ) / y_gamm
+       enddo
+       ENDDO
+!
 !--    Until this point the algorithm computes the logarithm of the gamma
 …
 !       gamm = EXP( tmp + LOG( stp * ser / x_gamm ) )
        gamm = EXP( tmp ) * stp * ser / x_gamm
        RETURN

palm/trunk/SOURCE/modules.f90

-                      r1096
+                      r1106
 ! Current revisions:
 ! ------------------
+! test: different dpk definition for CUDA FFT
+! test: different dpk definition for CUDA FFT  removed!!!! delete this comment
+! before next check in
+! array_kind renamed precision_kind, pdims defined in serial code
+! bugfix: default value assigned to coupling_start_time
+!
 ! Former revisions:
 …
  MODULE array_kind
+ MODULE precision_kind
 !------------------------------------------------------------------------------!
 …
                            spk = SELECTED_REAL_KIND( 6 )
+!-- test for CUDA FFT
+!    INTEGER, PARAMETER ::  single_kind = KIND( 0.0 ), &
+!                           double_kind = KIND( 0.0D0 )
+!    INTEGER, PARAMETER ::  dpk = double_kind
+    SAVE
+ END MODULE array_kind
+    SAVE
+ END MODULE precision_kind
 …
 !------------------------------------------------------------------------------!
     USE array_kind
+    USE precision_kind
     REAL, DIMENSION(:), ALLOCATABLE ::                                         &
 …
              canyon_wall_left = 9999999.9, canyon_wall_south = 9999999.9, &
              cthf = 0.0, cfl_factor = -1.0, cos_alpha_surface, &
              coupling_start_time, disturbance_amplitude = 0.25, &
+             coupling_start_time = 0.0, disturbance_amplitude = 0.25, &
              disturbance_energy_limit = 0.01, &
              disturbance_level_b = -9999999.9, &
 …
 !------------------------------------------------------------------------------!
     USE array_kind
+    USE precision_kind
     CHARACTER (LEN=15)  ::  bc_par_lr = 'cyclic',  bc_par_ns = 'cyclic', &
 …
 #endif
 #endif
+    CHARACTER(LEN=5)       ::  myid_char = ''
+    INTEGER                ::  acc_rank, id_inflow = 0, id_recycling = 0,      &
+                               myid = 0, num_acc_per_node = 0,                 &
+                               target_id, npex = -1, npey = -1, numprocs = 1,  &
+                               numprocs_previous_run = -1,                     &
+                               tasks_per_node = -9999, threads_per_task = 1
+    CHARACTER(LEN=5) ::  myid_char = ''
+    INTEGER          ::  acc_rank, id_inflow = 0, id_recycling = 0,      &
+                         myid = 0, num_acc_per_node = 0,                 &
+                         target_id, npex = -1, npey = -1, numprocs = 1,  &
+                         numprocs_previous_run = -1,                     &
+                         tasks_per_node = -9999, threads_per_task = 1
+    INTEGER          ::  pdims(2) = 1
     INTEGER, DIMENSION(:,:), ALLOCATABLE ::  hor_index_bounds, &
 …
                 type_x, type_x_int, type_xy, type_y, type_y_int
     INTEGER ::  ibuf(12), pcoord(2), pdims(2)
+    INTEGER ::  ibuf(12), pcoord(2)
 #if ! defined ( __check )

palm/trunk/SOURCE/poisfft.f90

-                      r1104
+                      r1106
 ! Current revisions:
 ! -----------------
+!
+! routines fftx, ffty, fftxp, fftyp removed, calls replaced by fft_x, fft_y,
+! in the 1D-decomposition routines fft_x, ffty are replaced by fft_x_1d,
+! fft_y_1d
+!
 ! Former revisions:
 …
           CALL cpu_log( log_point_s(4), 'fft_x', 'start' )
           CALL fftxp( ar, 'forward' )
+          CALL fft_x( ar, 'forward' )
           CALL cpu_log( log_point_s(4), 'fft_x', 'pause' )
 …
           CALL cpu_log( log_point_s(7), 'fft_y', 'start' )
           CALL fftyp( ar, 'forward' )
+          CALL fft_y( ar, 'forward' )
           CALL cpu_log( log_point_s(7), 'fft_y', 'pause' )
 …
+!
 !--       Solve the Poisson equation in z-direction in cartesian space.
+!--       Solve the tridiagonal equation system along z
           CALL cpu_log( log_point_s(6), 'tridia', 'start' )
           CALL tridia( ar )
 …
           CALL cpu_log( log_point_s(7), 'fft_y', 'continue' )
           CALL fftyp( ar, 'backward' )
+          CALL fft_y( ar, 'backward' )
           CALL cpu_log( log_point_s(7), 'fft_y', 'stop' )
 …
           CALL cpu_log( log_point_s(4), 'fft_x', 'continue' )
           CALL fftxp( ar, 'backward' )
+          CALL fft_x( ar, 'backward' )
           CALL cpu_log( log_point_s(4), 'fft_x', 'stop' )
 …
+!
 !--    Two-dimensional Fourier Transformation along x- and y-direction.
+       CALL cpu_log( log_point_s(5), 'transpo forward', 'start' )
+       !$acc data copyin( ar, work )
+       CALL transpose_zx( ar, work, ar )
+       !$acc update host( ar )
+       CALL cpu_log( log_point_s(5), 'transpo forward', 'pause' )
        CALL cpu_log( log_point_s(4), 'fft_x', 'start' )
        CALL fftx( ar, 'forward' )
+       CALL fft_x( ar, 'forward' )
        CALL cpu_log( log_point_s(4), 'fft_x', 'pause' )
+       CALL cpu_log( log_point_s(5), 'transpo forward', 'continue' )
+       CALL transpose_xy( ar, work, ar )
+       CALL cpu_log( log_point_s(5), 'transpo forward', 'pause' )
        CALL cpu_log( log_point_s(7), 'fft_y', 'start' )
        CALL ffty( ar, 'forward' )
+       CALL fft_y( ar, 'forward' )
        CALL cpu_log( log_point_s(7), 'fft_y', 'pause' )
+!
+!--    Solve the Poisson equation in z-direction in cartesian space.
+!--    Solve the tridiagonal equation system along z
+       CALL cpu_log( log_point_s(5), 'transpo forward', 'continue' )
+       CALL transpose_yz( ar, work, ar )
+       CALL cpu_log( log_point_s(5), 'transpo forward', 'stop' )
        CALL cpu_log( log_point_s(6), 'tridia', 'start' )
        CALL tridia( ar )
        CALL cpu_log( log_point_s(6), 'tridia', 'stop' )
+       CALL cpu_log( log_point_s(8), 'transpo invers', 'start' )
+       CALL transpose_zy( ar, work, ar )
+       CALL cpu_log( log_point_s(8), 'transpo invers', 'pause' )
+!
 !--    Inverse Fourier Transformation.
        CALL cpu_log( log_point_s(7), 'fft_y', 'continue' )
        CALL ffty( ar, 'backward' )
+       CALL fft_y( ar, 'backward' )
        CALL cpu_log( log_point_s(7), 'fft_y', 'stop' )
+       CALL cpu_log( log_point_s(8), 'transpo invers', 'continue' )
+       CALL transpose_yx( ar, work, ar )
+       CALL cpu_log( log_point_s(8), 'transpo invers', 'pause' )
        CALL cpu_log( log_point_s(4), 'fft_x', 'continue' )
        CALL fftx( ar, 'backward' )
+       CALL fft_x( ar, 'backward' )
        CALL cpu_log( log_point_s(4), 'fft_x', 'stop' )
+       CALL cpu_log( log_point_s(8), 'transpo invers', 'continue' )
+       CALL transpose_xz( ar, work, ar )
+       CALL cpu_log( log_point_s(8), 'transpo invers', 'stop' )
+       !$acc end data
 #endif
 …
        REAL, DIMENSION(5,nxl_z:nxr_z,0:nz-1) ::  tri
-#if defined( __parallel )
        REAL    ::  ar(nxl_z:nxr_z,nys_z:nyn_z,1:nz)
-#else
-       REAL    ::  ar(1:nz,nys_z:nyn_z,nxl_z:nxr_z)
-#endif
 …
 !--       Forward substitution.
           DO  i = nxl_z, nxr_z
-#if defined( __parallel )
              ar1(i,0) = ar(i,j,1)
-#else
-             ar1(i,0) = ar(1,j,i)
-#endif
           ENDDO
           DO  k = 1, nz - 1
              DO  i = nxl_z, nxr_z
-#if defined( __parallel )
                 ar1(i,k) = ar(i,j,k+1) - tri(5,i,k) * ar1(i,k-1)
-#else
-                ar1(i,k) = ar(k+1,j,i) - tri(5,i,k) * ar1(i,k-1)
-#endif
              ENDDO
           ENDDO
 …
 !--       the model domain.
           DO  i = nxl_z, nxr_z
-#if defined( __parallel )
              ar(i,j,nz) = ar1(i,nz-1) / ( tri(4,i,nz-1) + 1.0E-20 )
-#else
-             ar(nz,j,i) = ar1(i,nz-1) / ( tri(4,i,nz-1) + 1.0E-20 )
-#endif
           ENDDO
           DO  k = nz-2, 0, -1
              DO  i = nxl_z, nxr_z
-#if defined( __parallel )
                 ar(i,j,k+1) = ( ar1(i,k) - tri(3,i,k) * ar(i,j,k+2) ) &
                               / tri(4,i,k)
-#else
-                ar(k+1,j,i) = ( ar1(i,k) - tri(3,i,k) * ar(k+2,j,i) ) &
-                              / tri(4,i,k)
-#endif
              ENDDO
           ENDDO
 …
           IF ( ibc_p_b == 1  .AND.  ibc_p_t == 1 )  THEN
              IF ( j == 0  .AND.  nxl_z == 0 )  THEN
-#if defined( __parallel )
                 DO  k = 1, nz
                    ar(nxl_z,j,k) = 0.0
                 ENDDO
-#else
-                DO  k = 1, nz
-                   ar(k,j,nxl_z) = 0.0
-                ENDDO
-#endif
              ENDIF
           ENDIF
 …
     END SUBROUTINE tridia
-#if defined( __parallel )
-    SUBROUTINE fftxp( ar, direction )
-!------------------------------------------------------------------------------!
-! Fourier-transformation along x-direction                 Parallelized version
-!------------------------------------------------------------------------------!
-       IMPLICIT NONE
-       CHARACTER (LEN=*) ::  direction
-       INTEGER           ::  j, k
-       REAL              ::  ar(0:nx,nys_x:nyn_x,nzb_x:nzt_x)
+!
-!--    Performing the fft with one of the methods implemented
-!$OMP  PARALLEL PRIVATE ( j, k )
-!$OMP  DO
-       DO  k = nzb_x, nzt_x
-          DO  j = nys_x, nyn_x
-             CALL fft_x( ar(0:nx,j,k), direction )
-          ENDDO
-       ENDDO
-!$OMP  END PARALLEL
-    END SUBROUTINE fftxp
-#else
-    SUBROUTINE fftx( ar, direction )
-!------------------------------------------------------------------------------!
-! Fourier-transformation along x-direction                 Non parallel version
-!------------------------------------------------------------------------------!
-       IMPLICIT NONE
-       CHARACTER (LEN=*) ::  direction
-       INTEGER           ::  i, j, k
-       REAL              ::  ar(1:nz,0:ny,0:nx)
+!
-!--    Performing the fft with one of the methods implemented
-!$OMP  PARALLEL PRIVATE ( j, k )
-!$OMP  DO
-       DO  k = 1, nz
-          DO  j = 0, ny
-             CALL fft_x( ar(k,j,0:nx), direction )
-          ENDDO
-       ENDDO
-!$OMP  END PARALLEL
-    END SUBROUTINE fftx
-#endif
-#if defined( __parallel )
-    SUBROUTINE fftyp( ar, direction )
-!------------------------------------------------------------------------------!
-! Fourier-transformation along y-direction                 Parallelized version
-!------------------------------------------------------------------------------!
-       IMPLICIT NONE
-       CHARACTER (LEN=*) ::  direction
-       INTEGER           ::  i, k
-       REAL              ::  ar(0:ny,nxl_y:nxr_y,nzb_y:nzt_y)
+!
-!--    Performing the fft with one of the methods implemented
-!$OMP  PARALLEL PRIVATE ( i, k )
-!$OMP  DO
-       DO  k = nzb_y, nzt_y
-          DO  i = nxl_y, nxr_y
-             CALL fft_y( ar(0:ny,i,k), direction )
-          ENDDO
-       ENDDO
-!$OMP  END PARALLEL
-    END SUBROUTINE fftyp
-#else
-    SUBROUTINE ffty( ar, direction )
-!------------------------------------------------------------------------------!
-! Fourier-transformation along y-direction                 Non parallel version
-!------------------------------------------------------------------------------!
-       IMPLICIT NONE
-       CHARACTER (LEN=*) ::  direction
-       INTEGER           ::  i, k
-       REAL              ::  ar(1:nz,0:ny,0:nx)
+!
-!--    Performing the fft with one of the methods implemented
-!$OMP  PARALLEL PRIVATE ( i, k )
-!$OMP  DO
-       DO  k = 1, nz
-          DO  i = 0, nx
-             CALL fft_y( ar(k,0:ny,i), direction )
-          ENDDO
-       ENDDO
-!$OMP  END PARALLEL
-    END SUBROUTINE ffty
-#endif
 #if defined( __parallel )
 …
 !--    1d-decomposition along x. Resort the data in a way that x becomes
 !--    the first index.
        CALL cpu_log( log_point_s(7), 'fft_y', 'start' )
+       CALL cpu_log( log_point_s(7), 'fft_y_1d', 'start' )
        IF ( host(1:3) == 'nec' )  THEN
 …
+!
 !--                FFT along y
                    CALL fft_y( work_ffty(:,ir), 'forward' )
+                   CALL fft_y_1d( work_ffty(:,ir), 'forward' )
                 ENDDO
 …
        ENDIF
        CALL cpu_log( log_point_s(7), 'fft_y', 'pause' )
+       CALL cpu_log( log_point_s(7), 'fft_y_1d', 'pause' )
+!
 …
 !--    Resort the data in a way that y becomes the first index and carry out the
 !--    backward fft along y.
        CALL cpu_log( log_point_s(7), 'fft_y', 'continue' )
+       CALL cpu_log( log_point_s(7), 'fft_y_1d', 'continue' )
        IF ( host(1:3) == 'nec' )  THEN
 …
 !--                FFT along y
                    ir = i-iouter+1  ! counter within a stride
                    CALL fft_y( work_ffty(:,ir), 'backward' )
+                   CALL fft_y_1d( work_ffty(:,ir), 'backward' )
                    DO  j = 0, ny
 …
        ENDIF
        CALL cpu_log( log_point_s(7), 'fft_y', 'stop' )
+       CALL cpu_log( log_point_s(7), 'fft_y_1d', 'stop' )
     END SUBROUTINE tr_xy_ffty
 …
        CALL cpu_log( log_point_s(33), 'fft_x + tridia', 'start' )
+       CALL cpu_log( log_point_s(33), 'fft_x_1d + tridia', 'start' )
        ALLOCATE( tri(5,0:nx,0:nz-1,0:threads_per_task-1) )
 …
                 ENDDO
                 CALL fft_x( work_fftx, 'forward' )
+                CALL fft_x_1d( work_fftx, 'forward' )
                 DO  i = 0, nx
 …
                 ENDDO
                 CALL fft_x( work_fftx, 'backward' )
+                CALL fft_x_1d( work_fftx, 'backward' )
                 m = 0
 …
        DEALLOCATE( tri )
        CALL cpu_log( log_point_s(33), 'fft_x + tridia', 'stop' )
+       CALL cpu_log( log_point_s(33), 'fft_x_1d + tridia', 'stop' )
     END SUBROUTINE fftx_tri_fftx
 …
 !--    1d-decomposition along y. Resort the data in a way that y becomes
 !--    the first index.
        CALL cpu_log( log_point_s(4), 'fft_x', 'start' )
+       CALL cpu_log( log_point_s(4), 'fft_x_1d', 'start' )
        IF ( host(1:3) == 'nec' )  THEN
 …
              DO  k = 1, nz
                 CALL fft_x( work_fftx(0:nx,k,j), 'forward' )
+                CALL fft_x_1d( work_fftx(0:nx,k,j), 'forward' )
                 DO  i = 0, nx
 …
        ENDIF
        CALL cpu_log( log_point_s(4), 'fft_x', 'pause' )
+       CALL cpu_log( log_point_s(4), 'fft_x_1d', 'pause' )
+!
 …
 !--    1d-decomposition along y. Resort the data in a way that y becomes
 !--    the first index.
        CALL cpu_log( log_point_s(4), 'fft_x', 'continue' )
+       CALL cpu_log( log_point_s(4), 'fft_x_1d', 'continue' )
        IF ( host(1:3) == 'nec' )  THEN
 …
                 ENDDO
                 CALL fft_x( work_fftx(0:nx,k,j), 'backward' )
+                CALL fft_x_1d( work_fftx(0:nx,k,j), 'backward' )
              ENDDO
 …
        ENDIF
        CALL cpu_log( log_point_s(4), 'fft_x', 'stop' )
+       CALL cpu_log( log_point_s(4), 'fft_x_1d', 'stop' )
     END SUBROUTINE tr_yx_fftx
 …
        CALL cpu_log( log_point_s(39), 'fft_y + tridia', 'start' )
+       CALL cpu_log( log_point_s(39), 'fft_y_1d + tridia', 'start' )
        ALLOCATE( tri(5,0:ny,0:nz-1,0:threads_per_task-1) )
 …
                 ENDDO
                 CALL fft_y( work_ffty, 'forward' )
+                CALL fft_y_1d( work_ffty, 'forward' )
                 DO  j = 0, ny
 …
                 ENDDO
                 CALL fft_y( work_ffty, 'backward' )
+                CALL fft_y_1d( work_ffty, 'backward' )
                 m = 0
 …
        DEALLOCATE( tri )
        CALL cpu_log( log_point_s(39), 'fft_y + tridia', 'stop' )
+       CALL cpu_log( log_point_s(39), 'fft_y_1d + tridia', 'stop' )
     END SUBROUTINE ffty_tri_ffty

palm/trunk/SOURCE/poisfft_hybrid.f90

-                      r1037
+                      r1106
 ! Current revisions:
 ! -----------------
+!
+! calls of fft_x, fft_y replaced by fft_x_1d, fft_y_1d
+!
 ! Former revisions:
 …
        CALL cpu_log( log_point_s(30), 'poisfft_hybrid_omp', 'start' )
        CALL cpu_log( log_point_s(7), 'fft_y', 'start' )
+       CALL cpu_log( log_point_s(7), 'fft_y_1d', 'start' )
 !$OMP  PARALLEL PRIVATE (i,iouter,ii,ir,iei,j,k,m,n,ffty_ar)
 …
                 ENDDO
                 CALL fft_y( ffty_ar(:,ir), 'forward' )
+                CALL fft_y_1d( ffty_ar(:,ir), 'forward' )
              ENDDO
 …
 !$OMP  END PARALLEL
        CALL cpu_log( log_point_s(7), 'fft_y', 'pause' )
+       CALL cpu_log( log_point_s(7), 'fft_y_1d', 'pause' )
 #if defined( __parallel )
 …
 #endif
        CALL cpu_log( log_point_s(33), 'fft_x + tridia', 'start' )
+       CALL cpu_log( log_point_s(33), 'fft_x_1d + tridia', 'start' )
 #if defined( __KKMP )
 …
              ENDDO
              CALL fft_x( fftx_ar, 'forward' )
+             CALL fft_x_1d( fftx_ar, 'forward' )
              DO  i = nxl_a, nxr_a
 …
              ENDDO
              CALL fft_x( fftx_ar, 'backward' )
+             CALL fft_x_1d( fftx_ar, 'backward' )
              m = nxl_a
 …
 #endif
        CALL cpu_log( log_point_s(33), 'fft_x + tridia', 'stop' )
+       CALL cpu_log( log_point_s(33), 'fft_x_1d + tridia', 'stop' )
 #if defined( __parallel )
 …
 #endif
        CALL cpu_log( log_point_s(7), 'fft_y', 'continue' )
+       CALL cpu_log( log_point_s(7), 'fft_y_1d', 'continue' )
 !$OMP  PARALLEL PRIVATE (i,iouter,ii,ir,iei,j,k,m,n,ffty_ar)
 …
                 ii = nxl + i
                 ir = i - iouter + 1
                 CALL fft_y( ffty_ar(:,ir), 'backward' )
+                CALL fft_y_1d( ffty_ar(:,ir), 'backward' )
                 DO  j = nys_a, nyn_a
 …
 !$OMP  END PARALLEL
        CALL cpu_log( log_point_s(7), 'fft_y', 'stop' )
+       CALL cpu_log( log_point_s(7), 'fft_y_1d', 'stop' )
        CALL cpu_log( log_point_s(30), 'poisfft_hybrid_omp', 'stop' )
 …
        CALL cpu_log( log_point_s(30), 'poisfft_hybrid_nodes', 'start' )
        CALL cpu_log( log_point_s(7), 'fft_y', 'start' )
+       CALL cpu_log( log_point_s(7), 'fft_y_1d', 'start' )
+!
 …
                 ENDDO
                 CALL fft_y( ffty_ar(:,ir), 'forward' )
+                CALL fft_y_1d( ffty_ar(:,ir), 'forward' )
              ENDDO
 …
        ENDDO
        CALL cpu_log( log_point_s(7), 'fft_y', 'pause' )
+       CALL cpu_log( log_point_s(7), 'fft_y_1d', 'pause' )
        CALL cpu_log( log_point_s(32), 'alltoall_task', 'start' )
 …
           CALL cascade( 2, j, nys_p, nyn_p )
           CALL cpu_log( log_point_s(33), 'fft_x + tridia', 'start' )
+          CALL cpu_log( log_point_s(33), 'fft_x_1d + tridia', 'start' )
           DO  k = 1, nz
 …
              ENDDO
              CALL fft_x( fftx_ar, 'forward' )
+             CALL fft_x_1d( fftx_ar, 'forward' )
              DO  i = nxl_a, nxr_a
 …
              ENDDO
              CALL fft_x( fftx_ar, 'backward' )
+             CALL fft_x_1d( fftx_ar, 'backward' )
              m = nxl_a
 …
           ENDDO
           CALL cpu_log( log_point_s(33), 'fft_x + tridia', 'stop' )
+          CALL cpu_log( log_point_s(33), 'fft_x_1d + tridia', 'stop' )
           nw2 = nw1 * SIZE( work1, 3 )
           CALL cpu_log( log_point_s(37), 'alltoall_node', 'continue' )
 …
        CALL cpu_log( log_point_s(32), 'alltoall_task', 'stop' )
        CALL cpu_log( log_point_s(7), 'fft_y', 'continue' )
+       CALL cpu_log( log_point_s(7), 'fft_y_1d', 'continue' )
        DO  iouter = nxl_p, nxr_p, istride
 …
                 ii = nxl + i
                 ir = i - iouter + 1
                 CALL fft_y( ffty_ar(:,ir), 'backward' )
+                CALL fft_y_1d( ffty_ar(:,ir), 'backward' )
                 DO  j = nys_a, nyn_a
 …
        ENDDO
        CALL cpu_log( log_point_s(7), 'fft_y', 'stop' )
+       CALL cpu_log( log_point_s(7), 'fft_y_1d', 'stop' )
        CALL cpu_log( log_point_s(30), 'poisfft_hybrid_nodes', 'stop' )

palm/trunk/SOURCE/prognostic_equations.f90

-                      r1093
+                      r1106
 ! Current revisions:
 ! ------------------
+!
+! small changes in code formatting
+!
 ! Former revisions:
 …
+!
+!--      If required, calculate tendencies for total water content, rain water
+!--      content, rain drop concentration and liquid temperature
+         IF ( cloud_physics .AND. icloud_scheme == 0 )  THEN
+            tend_q(:,j,i)  = 0.0
+            tend_qr(:,j,i) = 0.0
+            tend_nr(:,j,i) = 0.0
+            tend_pt(:,j,i) = 0.0
+!
+!--         Droplet size distribution (dsd) properties are needed for the
+!--         computation of selfcollection, breakup, evaporation and
+!--         sedimentation of rain.
+            IF ( precipitation )  THEN
+               CALL dsd_properties( i,j )
+               CALL autoconversion( i,j )
+               CALL accretion( i,j )
+               CALL selfcollection_breakup( i,j )
+               CALL evaporation_rain( i,j )
+               CALL sedimentation_rain( i,j )
+            ENDIF
+            IF ( drizzle )  CALL sedimentation_cloud( i,j )
+         ENDIF
+!--       If required, calculate tendencies for total water content, rain water
+!--       content, rain drop concentration and liquid temperature
+          IF ( cloud_physics  .AND.  icloud_scheme == 0 )  THEN
+             tend_q(:,j,i)  = 0.0
+             tend_qr(:,j,i) = 0.0
+             tend_nr(:,j,i) = 0.0
+             tend_pt(:,j,i) = 0.0
+!
+!--          Droplet size distribution (dsd) properties are needed for the
+!--          computation of selfcollection, breakup, evaporation and
+!--          sedimentation of rain
+             IF ( precipitation )  THEN
+                CALL dsd_properties( i,j )
+                CALL autoconversion( i,j )
+                CALL accretion( i,j )
+                CALL selfcollection_breakup( i,j )
+                CALL evaporation_rain( i,j )
+                CALL sedimentation_rain( i,j )
+             ENDIF
+             IF ( drizzle )  CALL sedimentation_cloud( i,j )
+          ENDIF
+!
 …
              ENDIF
+!
 !--          Using microphysical tendencies (latent heat)
              IF ( cloud_physics )  THEN
                 IF ( icloud_scheme == 0 )  THEN
                    tend(:,j,i) = tend(:,j,i) + tend_pt(:,j,i)
                 ELSEIF ( icloud_scheme == 1 .AND. precipitation)  THEN
+                ELSEIF ( icloud_scheme == 1  .AND.  precipitation )  THEN
                    CALL impact_of_latent_heat( i, j )
                 ENDIF
 …
+!
 !--          Consideration of heat sources within the plant canopy
              IF ( plant_canopy .AND. ( cthf /= 0.0 ) ) THEN
+             IF ( plant_canopy  .AND.  cthf /= 0.0 )  THEN
                 CALL plant_canopy_model( i, j, 4 )
              ENDIF
+!
 !--          If required, compute influence of large-scale subsidence/ascent
+!--          If required, compute effect of large-scale subsidence/ascent
              IF ( large_scale_subsidence )  THEN
                 CALL subsidence( i, j, tend, pt, pt_init )
              ENDIF
              CALL user_actions( i, j, 'pt-tendency' )
 …
                 IF ( icloud_scheme == 0 )  THEN
                    tend(:,j,i) = tend(:,j,i) + tend_q(:,j,i)
                 ELSEIF ( icloud_scheme == 1 .AND. precipitation )  THEN
+                ELSEIF ( icloud_scheme == 1  .AND.  precipitation )  THEN
                    CALL calc_precipitation( i, j )
                 ENDIF
 …
+!
 !--          Sink or source of scalar concentration due to canopy elements
              IF ( plant_canopy ) CALL plant_canopy_model( i, j, 5 )
+             IF ( plant_canopy )  CALL plant_canopy_model( i, j, 5 )
+!
 …
 !--          If required, calculate prognostic equations for rain water content
 !--          and rain drop concentration
              IF ( cloud_physics .AND. icloud_scheme == 0 )  THEN
+             IF ( cloud_physics  .AND.  icloud_scheme == 0 )  THEN
+!
 !--             Calculate prognostic equation for rain water content
 …
                 IF ( timestep_scheme(1:5) == 'runge' )  THEN
                    IF ( ws_scheme_sca )  THEN
                       CALL advec_s_ws( i, j, nr, 'nr', flux_s_nr,    &
                                     diss_s_nr, flux_l_nr, diss_l_nr, &
                                     i_omp_start, tn )
+                      CALL advec_s_ws( i, j, nr, 'nr', flux_s_nr,       &
+                                       diss_s_nr, flux_l_nr, diss_l_nr, &
+                                       i_omp_start, tn )
                    ELSE
                       CALL advec_s_pw( i, j, nr )

palm/trunk/SOURCE/transpose.f90

-                      r1093
+                      r1106
 ! Current revisions:
 ! -----------------
+!
+! preprocessor lines rearranged so that routines can also be used in serial
+! (non-parallel) mode
+!
 ! Former revisions:
 …
              work(nnx*nny*nnz)
-#if defined( __parallel )
+!
 !-- Rearrange indices of input array in order to make data to be sent
 …
 !$OMP  END PARALLEL
+!
+!-- Transpose array
+    CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
+    IF ( collective_wait )  CALL MPI_BARRIER( comm2d, ierr )
+    CALL MPI_ALLTOALL( f_inv(nys_x,nzb_x,0), sendrecvcount_xy, MPI_REAL, &
+                       work(1),              sendrecvcount_xy, MPI_REAL, &
+                       comm1dy, ierr )
+    CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
+!
+!-- Reorder transposed array
+    IF ( numprocs /= 1 )  THEN
+#if defined( __parallel )
+!
+!--    Transpose array
+       CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
+       IF ( collective_wait )  CALL MPI_BARRIER( comm2d, ierr )
+       CALL MPI_ALLTOALL( f_inv(nys_x,nzb_x,0), sendrecvcount_xy, MPI_REAL, &
+                          work(1),              sendrecvcount_xy, MPI_REAL, &
+                          comm1dy, ierr )
+       CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
+!
+!--    Reorder transposed array
 !$OMP  PARALLEL PRIVATE ( i, j, k, l, m, ys )
 !$OMP  DO
+    DO  l = 0, pdims(2) - 1
+       m  = l * ( nxr_y - nxl_y + 1 ) * ( nzt_y - nzb_y + 1 ) * &
+                ( nyn_x - nys_x + 1 )
+       ys = 0 + l * ( nyn_x - nys_x + 1 )
+       DO  i = nxl_y, nxr_y
+          DO  k = nzb_y, nzt_y
+             DO  j = ys, ys + nyn_x - nys_x
+                m = m + 1
+                f_out(j,i,k) = work(m)
+             ENDDO
+          ENDDO
+       ENDDO
+    ENDDO
+!$OMP  END PARALLEL
+       DO  l = 0, pdims(2) - 1
+          m  = l * ( nxr_y - nxl_y + 1 ) * ( nzt_y - nzb_y + 1 ) * &
+                   ( nyn_x - nys_x + 1 )
+          ys = 0 + l * ( nyn_x - nys_x + 1 )
+          DO  i = nxl_y, nxr_y
+             DO  k = nzb_y, nzt_y
+                DO  j = ys, ys + nyn_x - nys_x
+                   m = m + 1
+                   f_out(j,i,k) = work(m)
+                ENDDO
+             ENDDO
+          ENDDO
+       ENDDO
+!$OMP  END PARALLEL
 #endif
+    ELSE
+!
+!--    Reorder transposed array
+!$OMP  PARALLEL PRIVATE ( i, j, k )
+!$OMP  DO
+       DO  k = nzb_y, nzt_y
+          DO  i = nxl_y, nxr_y
+             DO  j = 0, ny
+                f_out(j,i,k) = f_inv(j,k,i)
+             ENDDO
+          ENDDO
+       ENDDO
+!$OMP  END PARALLEL
+    ENDIF
  END SUBROUTINE transpose_xy
 …
              work(nnx*nny*nnz)
-#if defined( __parallel )
+!
 …
 !-- reordered locally and therefore no transposition has to be done.
     IF ( pdims(1) /= 1 )  THEN
+#if defined( __parallel )
+!
 !--    Reorder input array for transposition
 …
        ENDDO
 !$OMP  END PARALLEL
+#endif
     ELSE
+!
 !--    Reorder the array in a way that the z index is in first position
 …
     ENDIF
-#endif
  END SUBROUTINE transpose_xz
 …
              work(nnx*nny*nnz)
+    IF ( numprocs /= 1 )  THEN
 #if defined( __parallel )
+!
+!-- Reorder input array for transposition
+!
+!--    Reorder input array for transposition
 !$OMP  PARALLEL PRIVATE ( i, j, k, l, m, ys )
 !$OMP  DO
+    DO  l = 0, pdims(2) - 1
+       m  = l * ( nxr_y - nxl_y + 1 ) * ( nzt_y - nzb_y + 1 ) * &
+                ( nyn_x - nys_x + 1 )
+       ys = 0 + l * ( nyn_x - nys_x + 1 )
+       DO  l = 0, pdims(2) - 1
+          m  = l * ( nxr_y - nxl_y + 1 ) * ( nzt_y - nzb_y + 1 ) * &
+                   ( nyn_x - nys_x + 1 )
+          ys = 0 + l * ( nyn_x - nys_x + 1 )
+          DO  i = nxl_y, nxr_y
+             DO  k = nzb_y, nzt_y
+                DO  j = ys, ys + nyn_x - nys_x
+                   m = m + 1
+                   work(m) = f_in(j,i,k)
+                ENDDO
+             ENDDO
+          ENDDO
+       ENDDO
+!$OMP  END PARALLEL
+!
+!--    Transpose array
+       CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
+       IF ( collective_wait )  CALL MPI_BARRIER( comm2d, ierr )
+       CALL MPI_ALLTOALL( work(1),              sendrecvcount_xy, MPI_REAL, &
+                          f_inv(nys_x,nzb_x,0), sendrecvcount_xy, MPI_REAL, &
+                          comm1dy, ierr )
+       CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
+#endif
+    ELSE
+!
+!--    Reorder array f_in the same way as ALLTOALL did it
+!$OMP  PARALLEL PRIVATE ( i, j, k )
+!$OMP  DO
        DO  i = nxl_y, nxr_y
           DO  k = nzb_y, nzt_y
+             DO  j = ys, ys + nyn_x - nys_x
+                m = m + 1
+                work(m) = f_in(j,i,k)
+             ENDDO
+          ENDDO
+       ENDDO
+    ENDDO
+!$OMP  END PARALLEL
+!
+!-- Transpose array
+    CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
+    IF ( collective_wait )  CALL MPI_BARRIER( comm2d, ierr )
+    CALL MPI_ALLTOALL( work(1),              sendrecvcount_xy, MPI_REAL, &
+                       f_inv(nys_x,nzb_x,0), sendrecvcount_xy, MPI_REAL, &
+                       comm1dy, ierr )
+    CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
+             DO  j = 0, ny
+                f_inv(j,k,i) = f_in(j,i,k)
+             ENDDO
+          ENDDO
+       ENDDO
+!$OMP  END PARALLEL
+    ENDIF
+!
 …
     ENDDO
 !$OMP  END PARALLEL
-#endif
  END SUBROUTINE transpose_yx
 …
              work(nnx*nny*nnz)
-#if defined( __parallel )
+!
 !-- Rearrange indices of input array in order to make data to be sent
 …
 !-- of the data is necessary and no transposition has to be done.
     IF ( pdims(1) == 1 )  THEN
 !$OMP  PARALLEL PRIVATE ( i, j, k )
 !$OMP  DO
 …
        ENDDO
 !$OMP  END PARALLEL
+       RETURN
+    ENDIF
+!
+!-- Transpose array
+    CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
+    IF ( collective_wait )  CALL MPI_BARRIER( comm2d, ierr )
+    CALL MPI_ALLTOALL( f_inv(nxl_y,nzb_y,0), sendrecvcount_yz, MPI_REAL, &
+                       work(1),              sendrecvcount_yz, MPI_REAL, &
+                       comm1dx, ierr )
+    CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
+!
+!-- Reorder transposed array
+    ELSE
+#if defined( __parallel )
+!
+!--    Transpose array
+       CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
+       IF ( collective_wait )  CALL MPI_BARRIER( comm2d, ierr )
+       CALL MPI_ALLTOALL( f_inv(nxl_y,nzb_y,0), sendrecvcount_yz, MPI_REAL, &
+                          work(1),              sendrecvcount_yz, MPI_REAL, &
+                          comm1dx, ierr )
+       CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
+!
+!--    Reorder transposed array
 !$OMP  PARALLEL PRIVATE ( i, j, k, l, m, zs )
 !$OMP  DO
+    DO  l = 0, pdims(1) - 1
+       m  = l * ( nyn_z - nys_z + 1 ) * ( nzt_y - nzb_y + 1 ) * &
+                ( nxr_z - nxl_z + 1 )
+       zs = 1 + l * ( nzt_y - nzb_y + 1 )
+       DO  j = nys_z, nyn_z
+          DO  k = zs, zs + nzt_y - nzb_y
+             DO  i = nxl_z, nxr_z
+                m = m + 1
+                f_out(i,j,k) = work(m)
+             ENDDO
+          ENDDO
+       ENDDO
+    ENDDO
+!$OMP  END PARALLEL
+       DO  l = 0, pdims(1) - 1
+          m  = l * ( nyn_z - nys_z + 1 ) * ( nzt_y - nzb_y + 1 ) * &
+                   ( nxr_z - nxl_z + 1 )
+          zs = 1 + l * ( nzt_y - nzb_y + 1 )
+          DO  j = nys_z, nyn_z
+             DO  k = zs, zs + nzt_y - nzb_y
+                DO  i = nxl_z, nxr_z
+                   m = m + 1
+                   f_out(i,j,k) = work(m)
+                ENDDO
+             ENDDO
+          ENDDO
+       ENDDO
+!$OMP  END PARALLEL
 #endif
+   ENDIF
  END SUBROUTINE transpose_yz
 …
     INTEGER ::  i, j, k, l, m, xs
+    REAL ::  f_in(1:nz,nys:nyn,nxl:nxr), f_inv(nys:nyn,nxl:nxr,1:nz), &
+             f_out(0:nx,nys_x:nyn_x,nzb_x:nzt_x),                     &
+    REAL ::  f_in(1:nz,nys:nyn,nxl:nxr), f_out(0:nx,nys_x:nyn_x,nzb_x:nzt_x), &
              work(nnx*nny*nnz)
+#if defined( __parallel )
+    !$acc declare create ( f_inv )
+    REAL ::  f_inv(nys:nyn,nxl:nxr,1:nz)
+!
 …
 !$OMP  PARALLEL PRIVATE ( i, j, k )
 !$OMP  DO
+    !$acc kernels present( f_in )
+    !$acc loop
     DO  k = 1,nz
        DO  i = nxl, nxr
+          !$acc loop vector( 32 )
           DO  j = nys, nyn
              f_inv(j,i,k) = f_in(k,j,i)
 …
 !-- of the data is necessary and no transposition has to be done.
     IF ( pdims(1) == 1 )  THEN
+!$OMP  PARALLEL PRIVATE ( i, j, k )
+!$OMP  DO
+!$OMP  PARALLEL PRIVATE ( i, j, k )
+!$OMP  DO
+       !$acc kernels present( f_out )
+       !$acc loop
        DO  k = 1, nz
           DO  i = nxl, nxr
+             !$acc loop vector( 32 )
              DO  j = nys, nyn
                 f_out(i,j,k) = f_inv(j,i,k)
 …
        ENDDO
 !$OMP  END PARALLEL
+       RETURN
+    ELSE
+#if defined( __parallel )
+!
+!--    Transpose array
+       CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
+       IF ( collective_wait )  CALL MPI_BARRIER( comm2d, ierr )
+       CALL MPI_ALLTOALL( f_inv(nys,nxl,1), sendrecvcount_zx, MPI_REAL, &
+                          work(1),          sendrecvcount_zx, MPI_REAL, &
+                          comm1dx, ierr )
+       CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
+!
+!--    Reorder transposed array
+!$OMP  PARALLEL PRIVATE ( i, j, k, l, m, xs )
+!$OMP  DO
+       DO  l = 0, pdims(1) - 1
+          m  = l * ( nzt_x - nzb_x + 1 ) * nnx * ( nyn_x - nys_x + 1 )
+          xs = 0 + l * nnx
+          DO  k = nzb_x, nzt_x
+             DO  i = xs, xs + nnx - 1
+                DO  j = nys_x, nyn_x
+                   m = m + 1
+                   f_out(i,j,k) = work(m)
+                ENDDO
+             ENDDO
+          ENDDO
+       ENDDO
+!$OMP  END PARALLEL
+#endif
     ENDIF
+!
-!-- Transpose array
-    CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
-    IF ( collective_wait )  CALL MPI_BARRIER( comm2d, ierr )
-    CALL MPI_ALLTOALL( f_inv(nys,nxl,1), sendrecvcount_zx, MPI_REAL, &
-                       work(1),          sendrecvcount_zx, MPI_REAL, &
-                       comm1dx, ierr )
-    CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
+!
-!-- Reorder transposed array
-!$OMP  PARALLEL PRIVATE ( i, j, k, l, m, xs )
-!$OMP  DO
-    DO  l = 0, pdims(1) - 1
-       m  = l * ( nzt_x - nzb_x + 1 ) * nnx * ( nyn_x - nys_x + 1 )
-       xs = 0 + l * nnx
-       DO  k = nzb_x, nzt_x
-          DO  i = xs, xs + nnx - 1
-             DO  j = nys_x, nyn_x
-                m = m + 1
-                f_out(i,j,k) = work(m)
-             ENDDO
-          ENDDO
-       ENDDO
-    ENDDO
-!$OMP  END PARALLEL
-#endif
  END SUBROUTINE transpose_zx
 …
              work(nnx*nny*nnz)
-#if defined( __parallel )
+!
 !-- If the PE grid is one-dimensional along y, the array has only to be
 !-- reordered locally and therefore no transposition has to be done.
     IF ( pdims(1) /= 1 )  THEN
+#if defined( __parallel )
+!
 !--    Reorder input array for transposition
 …
                           comm1dx, ierr )
        CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
+!
+!--    Reorder transposed array in a way that the y index is in first position
+!$OMP  PARALLEL PRIVATE ( i, j, k )
+!$OMP  DO
+       DO  j = 0, ny
+          DO  k = nzb_y, nzt_y
+             DO  i = nxl_y, nxr_y
+                f_out(j,i,k) = f_inv(i,k,j)
+             ENDDO
+          ENDDO
+       ENDDO
+!$OMP  END PARALLEL
+#endif
     ELSE
+!
 !--    Reorder the array in a way that the y index is in first position
+!--    Reorder the array in the same way as ALLTOALL did it
 !$OMP  PARALLEL PRIVATE ( i, j, k )
 !$OMP  DO
 …
        ENDDO
 !$OMP  END PARALLEL
+!
-!--    Move data to output array
-!$OMP  PARALLEL PRIVATE ( i, j, k )
-!$OMP  DO
-       DO  k = nzb_y, nzt_y
-          DO  i = nxl_y, nxr_y
-             DO  j = 0, ny
-                f_out(j,i,k) = f_inv(i,k,j)
-             ENDDO
-          ENDDO
-       ENDDO
-!$OMP  END PARALLEL
     ENDIF
+#endif
+!
+!-- Reorder transposed array in a way that the y index is in first position
+!$OMP  PARALLEL PRIVATE ( i, j, k )
+!$OMP  DO
+    DO  k = nzb_y, nzt_y
+       DO  i = nxl_y, nxr_y
+          DO  j = 0, ny
+             f_out(j,i,k) = f_inv(i,k,j)
+          ENDDO
+       ENDDO
+    ENDDO
+!$OMP  END PARALLEL
  END SUBROUTINE transpose_zy

palm/trunk/SOURCE/user_data_output_3d.f90

-                      r1037
+                      r1106
+!
 ! Current revisions:
+! -----------------
+! ------------------
+! array_kind renamed precision_kind
+!
 ! Former revisions:
 …
 !------------------------------------------------------------------------------!
-    USE array_kind
     USE indices
+    USE precision_kind
     USE user

Context Navigation

Legend:

palm/trunk/INSTALL/example_cbl_rc

palm/trunk/SCRIPTS/mrun

palm/trunk/SOURCE/Makefile

palm/trunk/SOURCE/Makefile_check

palm/trunk/SOURCE/check_open.f90

palm/trunk/SOURCE/data_output_3d.f90

palm/trunk/SOURCE/data_output_profiles.f90

palm/trunk/SOURCE/fft_xy.f90

palm/trunk/SOURCE/header.f90

palm/trunk/SOURCE/microphysics.f90

palm/trunk/SOURCE/modules.f90

palm/trunk/SOURCE/poisfft.f90

palm/trunk/SOURCE/poisfft_hybrid.f90

palm/trunk/SOURCE/prognostic_equations.f90

palm/trunk/SOURCE/transpose.f90

palm/trunk/SOURCE/user_data_output_3d.f90

Download in other formats: