Changeset 164
Timestamp: May 15, 2008 8:46:15 AM
Location: palm/trunk
Files: 11 edited
palm/trunk/SCRIPTS/mrun  (r149 → r164)

      # true, mrun tries "ln -f" on local output and resorts
      # to "cp" or "cp -r" on error
+     # 15/04/08 - Siggi - argument -c introduced to most of the subjob calls,
+     #                    which allows the user to choose his own job catalog
+     #                    by setting job_catalog in the configuration file
+     #                    (default is ~/job_queue),
+     #                    workaround for mpiexec with -env option,
+     #                    adjustments for lcxt4 (Bergen Center for Computational
+     #                    Science)

      # VARIABLENVEREINBARUNGEN + DEFAULTWERTE
 ...
  input_list=""
  interpreted_config_file=""
+ job_catalog="~/job_queue"
  job_on_file=""
  keep_data_from_previous_run=false
 ...
  lopts="$lopts $netcdf_lib $dvrp_lib"
  ROPTS="$ropts"
- if [[ ( $(echo $host | cut -c1-3) = nec  ||  $(echo $host | cut -c1-3) = ibm  ||  $host = lctit  ||  $host = lcfimm )  &&  -n $numprocs ]]
+ if [[ ( $(echo $host | cut -c1-3) = nec  ||  $(echo $host | cut -c1-3) = ibm  ||  $host = lctit  ||  $host = lcfimm  ||  $host = lcxt4 )  &&  -n $numprocs ]]
  then
     XOPT="-X $numprocs"
 ...
  then
     printf "\n\n"
-    mpiexec  -machinefile hostfile  -n $ii  a.out  $ROPTS
+    if [[ $host = lcxt4 ]]
+    then
+       aprun  -n $ii  -N $tasks_per_node  a.out  $ROPTS
+    else
+       mpiexec  -machinefile hostfile  -n $ii  a.out  $ROPTS
+    fi
  else
     (( iii = ii / 2 ))
+    echo "atmosphere_to_ocean"  >  runfile_atmos
+    echo "ocean_to_atmosphere"  >  runfile_ocean
+
     printf "\n      coupled run ($iii atmosphere, $iii ocean)"
     printf "\n\n"
-    mpiexec  -machinefile hostfile  -n $iii  -env coupling_mode atmosphere_to_ocean  a.out  $ROPTS  &
-    mpiexec  -machinefile hostfile  -n $iii  -env coupling_mode ocean_to_atmosphere  a.out  $ROPTS  &
+
+    if [[ $host == lcxt4 ]]
+    then
+       aprun  -n $iii  -N $tasks_per_node  a.out  <  runfile_atmos  $ROPTS  &
+       aprun  -n $iii  -N $tasks_per_node  a.out  <  runfile_ocean  $ROPTS  &
+    else
+
+          # WORKAROUND BECAUSE mpiexec WITH -env OPTION IS NOT AVAILABLE ON
+          # SOME SYSTEMS
+       mpiexec  -machinefile hostfile  -n $iii  a.out  $ROPTS  <  runfile_atmos  &
+       mpiexec  -machinefile hostfile  -n $iii  a.out  $ROPTS  <  runfile_ocean  &
+    #  mpiexec  -machinefile hostfile  -n $iii  -env coupling_mode atmosphere_to_ocean  a.out  $ROPTS  &
+    #  mpiexec  -machinefile hostfile  -n $iii  -env coupling_mode ocean_to_atmosphere  a.out  $ROPTS  &
+    fi
     wait
  fi
 ...
  if [[ "$LOGNAME" = b323013 ]]
  then
-    subjob  -v  -q c1  -X 0  -m 1000  -t 900  transfer_${localout[$i]}
+    subjob  -v  -q c1  -X 0  -m 1000  -t 900  -c $job_catalog  transfer_${localout[$i]}
  else
-    subjob  -d  -v  -q c1  -X 0  -m 1000  -t 900  transfer_${localout[$i]}
+    subjob  -d  -v  -q c1  -X 0  -m 1000  -t 900  -c $job_catalog  transfer_${localout[$i]}
  fi
 ...
  if [[ $LOGNAME = b323013 ]]
  then
-    subjob  -v  -q c1  -X 0  -m 1000  -t 900  transfer_${localout[$i]}
+    subjob  -v  -q c1  -X 0  -m 1000  -t 900  -c $job_catalog  transfer_${localout[$i]}
  else
-    subjob  -d  -v  -q c1  -X 0  -m 1000  -t 900  transfer_${localout[$i]}
+    subjob  -d  -v  -q c1  -X 0  -m 1000  -t 900  -c $job_catalog  transfer_${localout[$i]}
  fi
 ...
  if [[ $localhost = ibmh  ||  $localhost = ibmb ]]
  then
-#    subjob  -d  -v  -q cdata  -X 0  -m 1000  -t 43200  archive_${frelout[$i]}
-     subjob  -v  -q cdata  -X 0  -m 1000  -t 43200  archive_${frelout[$i]}
+#    subjob  -d  -v  -q cdata  -X 0  -m 1000  -t 43200  -c $job_catalog  archive_${frelout[$i]}
+     subjob  -v  -q cdata  -X 0  -m 1000  -t 43200  -c $job_catalog  archive_${frelout[$i]}
  elif [[ $localhost = nech ]]
  then
 ...
- subjob  -v  -d  -q cdata  -X 0  -m 1000  -t 43200  archive_${frelout[$i]}
+ subjob  -v  -d  -q cdata  -X 0  -m 1000  -t 43200  -c $job_catalog  archive_${frelout[$i]}
  printf "  Archiving of $tmp_data_catalog/${frelout[$i]} initiated (batch job submitted)\n"
  file_saved=true
 ...
- subjob  $job_on_file  -h $host  -u $remote_username  -g $group_number  -q $queue  -m $memory  -N $node_usage  -t $cpumax  $XOPT  $TOPT  $OOPT  -n $fname  -v  $jobfile
+ subjob  $job_on_file  -h $host  -u $remote_username  -g $group_number  -q $queue  -m $memory  -N $node_usage  -t $cpumax  $XOPT  $TOPT  $OOPT  -n $fname  -v  -c $job_catalog  $jobfile
  rm -rf  $jobfile
palm/trunk/SCRIPTS/subjob  (r129 → r164)

  # 19/10/07 - Siggi - a ";" was missing in the last change done by Marcus
  # 30/10/07 - Marcus- further adjustments for queues on lctit
+ # 15/05/08 - Siggi - adjustments for lcxt4 (Bergen Center for Computational
+ #                    Science)
 ...
  (gregale)               local_addres=130.75.105.109; local_host=lcmuk;;
  (hababai)               local_addres=130.75.105.108; local_host=lcmuk;;
+ (hexagon.bccs.uib.no)   local_addres=129.177.20.113; local_host=lcxt4;;
  (hreg*-en0|hanni*-en0)  local_addres=130.75.4.10;    local_host=ibmh;;
  (irifi)                 local_addres=130.75.105.104; local_host=lcmuk;;
  (levanto)               local_addres=130.75.105.45;  local_host=lcmuk;;
  (maestro)               local_addres=130.75.105.2;   local_host=lcmuk;;
+ (nid*)                  local_addres=129.177.20.113; local_host=lcxt4;;
  (nobel*)                local_addres=150.183.5.101;  local_host=ibms;;
  (orkan)                 local_addres=130.75.105.3;   local_host=lcmuk;;
 ...
  (lcfimm)  remote_addres=172.20.4.2; submcom=/opt/torque/bin/qsub;;
  (lctit)   queue=lctit; remote_addres=172.17.75.161; submcom=/n1ge/TITECH_GRID/tools/bin/n1ge;;
+ (lcxt4)   remote_addres=129.177.20.113; submcom=/opt/torque/2.3.0/bin/qsub;;
  (nech)    qsubmem=memsz_job; qsubtime=cputim_job; remote_addres=136.172.44.147; submcom="/usr/local/bin/qsub";;
  (neck)    qsubmem=memsz_job; qsubtime=cputim_job; remote_addres=133.5.178.11; submcom="/usr/bin/nqsII/qsub";;
 ...
 %%END%%

+ elif [[ $remote_host = lcxt4 ]]
+ then
+
+    if [[ $numprocs != 0 ]]
+    then
+       cat > $job_to_send << %%END%%
+#!/bin/ksh
+#PBS -S /bin/ksh
+#PBS -N $job_name
+#PBS -A nersc
+#PBS -l walltime=$timestring
+#PBS -l mppwidth=${numprocs}
+#PBS -l mppnppn=${tasks_per_node}
+#PBS -m abe
+#PBS -M igore@nersc.no
+#PBS -o $remote_dayfile
+#PBS -e $remote_dayfile
+
+%%END%%
+
  else
     cat > $job_to_send << %%END%%
 ...
  then
     eval  $submcom  $job_on_remhost
- elif [[ $local_host = lcfimm ]]
- then
-    eval  $submcom  $job_on_remhost
-    echo "$submcom  $job_on_remhost"
-    chmod  u+x  $job_on_remhost
- elif [[ $local_host = lctit ]]
+ elif [[ $local_host = lcfimm  ||  $local_host = lctit  ||  $localhost = lcxt4 ]]
  then
palm/trunk/SOURCE/CURRENT_MODIFICATIONS  (r158 → r164)

 User-defined spectra.

+Argument -c introduced to most of the subjob calls, which allows the user to
+choose his own job catalog by setting job_catalog in the configuration file
+(default is ~/job_queue). Workaround for mpiexec with -env option.
+Adjustments for lcxt4 (Bergen Center for Computational Science). (mrun, subjob)
+
 advec_particles, calc_spectra, check_open, check_parameters, data_output_spectra, header, init_particles, init_pegrid, init_3d_model, modules, netcdf, parin, particle_boundary_conds, plant_canopy_model, prognostic_equations, read_var_list, read_3d_binary, time_integration, user_interface, write_var_list, write_3d_binary
 ...
 information is now contained in file _0000. (parin, check_open)

-check_open, init_3d_model, modules, parin, read_var_list, read_3d_binary, write_var_list, write_3d_binary
+Transpositions for the 2D domain decomposition have been optimized by using
+f_inv as an automatic array instead of providing the memory via a dummy
+argument. This spares one copy loop per transposition. The order of indices in
+the 3D loops of some of the transpose routines has been rearranged for better
+cache utilization. Both changes were suggested by Roland Richter (SGI) as part
+of the HLRN-II benchmark process. (transpose)
+
+Workaround for getting information about the coupling mode. (palm)
+
+advec_s_ups, advec_u_ups, advec_v_ups, advec_w_ups, calc_spectra, check_open, init_3d_model, modules, palm, parin, poisfft, read_var_list, read_3d_binary, transpose, write_var_list, write_3d_binary
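The transposition change is easiest to see in isolation: in r4 each transpose routine re-ordered f_in into a work array and then copied that work array into the dummy argument f_inv before calling MPI_ALLTOALL, while in r164 f_inv is a local automatic array that is filled directly. The following self-checking sketch illustrates the pattern with made-up fixed extents; it is not the PALM code, and the MPI_ALLTOALL step is omitted so the example stays serial and runnable:

   ! Minimal sketch of the r164 transposition change (made-up extents,
   ! not the PALM routine): the intermediate field f_inv is an automatic
   ! array and the re-ordering from the input happens in one step.
   PROGRAM automatic_array_sketch

      IMPLICIT NONE

      INTEGER, PARAMETER ::  nx = 4, ny = 3, nz = 2
      REAL ::  f_in(nz,ny,nx), f_out(nz,ny,nx)

      CALL RANDOM_NUMBER( f_in )
      CALL reorder_roundtrip( f_in, f_out )
      PRINT*, 'max deviation after round trip:', MAXVAL( ABS( f_out - f_in ) )

   CONTAINS

      SUBROUTINE reorder_roundtrip( a_in, a_out )

         REAL, INTENT(IN)  ::  a_in(nz,ny,nx)
         REAL, INTENT(OUT) ::  a_out(nz,ny,nx)
   !
   !--    Automatic array: memory is provided on entry, so no dummy argument
   !--    is needed. In r4 this field came in as an argument and was filled
   !--    by an extra copy loop from a separate work array.
         REAL ::  f_inv(ny,nx,nz)

         INTEGER ::  i, j, k
   !
   !--    Re-order from a_in to f_inv in one step (r4 needed two loop nests)
         DO  k = 1, nz
            DO  i = 1, nx
               DO  j = 1, ny
                  f_inv(j,i,k) = a_in(k,j,i)
               ENDDO
            ENDDO
         ENDDO
   !
   !--    In PALM, MPI_ALLTOALL would act on f_inv here; the sketch just
   !--    re-orders back so the program can check itself
         DO  k = 1, nz
            DO  i = 1, nx
               DO  j = 1, ny
                  a_out(k,j,i) = f_inv(j,i,k)
               ENDDO
            ENDDO
         ENDDO

      END SUBROUTINE reorder_roundtrip

   END PROGRAM automatic_array_sketch

Because an automatic array is sized and allocated on entry, the caller no longer has to pass scratch memory, which is what lets the five-argument transpose calls in the diffs below shrink to three arguments.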
palm/trunk/SOURCE/advec_s_ups.f90  (r4 → r164)

 ! Actual revisions:
 ! -----------------
-!
+! Arguments removed from transpose routines
 !
 ! Former revisions:
 ...
 !
 !-- Transpose the component to be advected: z --> x
-    CALL transpose_zx( v_ad, tend, v_ad, tend, v_ad )
+    CALL transpose_zx( v_ad, tend, v_ad )

 #else
 ...
 !
 !-- Transpose the advecting component: z --> x
-    CALL transpose_zx( d, tend, d, tend, d )
+    CALL transpose_zx( d, tend, d )

 #endif
 ...
 !
 !-- Transpose the advecting component: z --> y
-    CALL transpose_zx( d, tend, d, tend, d )
-    CALL transpose_xy( d, tend, d, tend, d )
+    CALL transpose_zx( d, tend, d )
+    CALL transpose_xy( d, tend, d )

 !
 !-- Transpose the component to be advected: x --> y
-    CALL transpose_xy( v_ad, tend, v_ad, tend, v_ad )
+    CALL transpose_xy( v_ad, tend, v_ad )

 #endif
 ...
 !
 !-- Transpose the component to be advected: y --> z (= y --> x + x --> z)
-    CALL transpose_yx( v_ad, tend, v_ad, tend, v_ad )
-    CALL transpose_xz( v_ad, tend, v_ad, tend, v_ad )
+    CALL transpose_yx( v_ad, tend, v_ad )
+    CALL transpose_xz( v_ad, tend, v_ad )
palm/trunk/SOURCE/advec_u_ups.f90  (r4 → r164)

 ! Actual revisions:
 ! -----------------
-!
+! Arguments removed from transpose routines
 !
 ! Former revisions:
 ...
 !
 !-- Transpose the component to be advected: z --> x
-    CALL transpose_zx( v_ad, tend, v_ad, tend, v_ad )
+    CALL transpose_zx( v_ad, tend, v_ad )

 !
 ...
 !
 !-- Transpose the advecting component: z --> y
-    CALL transpose_zx( d, tend, d, tend, d )
-    CALL transpose_xy( d, tend, d, tend, d )
+    CALL transpose_zx( d, tend, d )
+    CALL transpose_xy( d, tend, d )

 !
 !-- Transpose the component to be advected: x --> y
-    CALL transpose_xy( v_ad, tend, v_ad, tend, v_ad )
+    CALL transpose_xy( v_ad, tend, v_ad )

 #endif
 ...
 !
 !-- Transpose the component to be advected: y --> z (= y --> x + x --> z)
-    CALL transpose_yx( v_ad, tend, v_ad, tend, v_ad )
-    CALL transpose_xz( v_ad, tend, v_ad, tend, v_ad )
+    CALL transpose_yx( v_ad, tend, v_ad )
+    CALL transpose_xz( v_ad, tend, v_ad )
palm/trunk/SOURCE/advec_v_ups.f90  (r4 → r164)

 ! Actual revisions:
 ! -----------------
-!
+! Arguments removed from transpose routines
 !
 ! Former revisions:
 ...
 !
 !-- Transpose the component to be advected: z --> x
-    CALL transpose_zx( v_ad, tend, v_ad, tend, v_ad )
+    CALL transpose_zx( v_ad, tend, v_ad )

 #else
 ...
 !
 !-- Transpose the advecting component: z --> x
-    CALL transpose_zx( d, tend, d, tend, d )
+    CALL transpose_zx( d, tend, d )

 #endif
 ...
 !
 !-- Transpose the advecting component: z --> y
-    CALL transpose_zx( d, tend, d, tend, d )
-    CALL transpose_xy( d, tend, d, tend, d )
+    CALL transpose_zx( d, tend, d )
+    CALL transpose_xy( d, tend, d )

 !
 !-- Transpose the component to be advected: x --> y
-    CALL transpose_xy( v_ad, tend, v_ad, tend, v_ad )
+    CALL transpose_xy( v_ad, tend, v_ad )

 #endif
 ...
 !
 !-- Transpose the component to be advected: y --> z (= y --> x + x --> z)
-    CALL transpose_yx( v_ad, tend, v_ad, tend, v_ad )
-    CALL transpose_xz( v_ad, tend, v_ad, tend, v_ad )
+    CALL transpose_yx( v_ad, tend, v_ad )
+    CALL transpose_xz( v_ad, tend, v_ad )
palm/trunk/SOURCE/advec_w_ups.f90  (r4 → r164)

 ! Actual revisions:
 ! -----------------
-!
+! Arguments removed from transpose routines
 !
 ! Former revisions:
 ...
 !
 !-- Transpose the component to be advected: z --> x
-    CALL transpose_zx( v_ad, tend, v_ad, tend, v_ad )
+    CALL transpose_zx( v_ad, tend, v_ad )

 #else
 ...
 !
 !-- Transpose the component to be advected: z --> x
-    CALL transpose_zx( d, tend, d, tend, d )
+    CALL transpose_zx( d, tend, d )

 #endif
 ...
 !
 !-- Transpose the advecting component: z --> y
-    CALL transpose_zx( d, tend, d, tend, d )
-    CALL transpose_xy( d, tend, d, tend, d )
+    CALL transpose_zx( d, tend, d )
+    CALL transpose_xy( d, tend, d )

 !
 !-- Transpose the component to be advected: x --> y
-    CALL transpose_xy( v_ad, tend, v_ad, tend, v_ad )
+    CALL transpose_xy( v_ad, tend, v_ad )

 #endif
 ...
 !
 !-- Transpose the component to be advected: y --> z (= y --> x + x --> z)
-    CALL transpose_yx( v_ad, tend, v_ad, tend, v_ad )
-    CALL transpose_xz( v_ad, tend, v_ad, tend, v_ad )
+    CALL transpose_yx( v_ad, tend, v_ad )
+    CALL transpose_xz( v_ad, tend, v_ad )
palm/trunk/SOURCE/calc_spectra.f90  (r146 → r164)

 ! Actual revisions:
 ! -----------------
-! user-defined spectra
+! user-defined spectra, arguments removed from transpose routines
 !
 ! Former revisions:
 ...
 #if defined( __parallel )
     IF ( pdims(2) /= 1 )  THEN
-       CALL transpose_zx( d, tend, d, tend, d )
+       CALL transpose_zx( d, tend, d )
     ELSE
-       CALL transpose_yxd( d, tend, d, tend, d )
+       CALL transpose_yxd( d, tend, d )
     ENDIF
     CALL calc_spectra_x( d, pr, m )
 ...
 #if defined( __parallel )
-    CALL transpose_zyd( d, tend, d, tend, d )
+    CALL transpose_zyd( d, tend, d )
     CALL calc_spectra_y( d, pr, m )
 #else
palm/trunk/SOURCE/palm.f90  (r114 → r164)

 ! Actual revisions:
 ! -----------------
-!
+! Workaround for getting information about the coupling mode
 !
 ! Former revisions:
 ...
 !
 !-- Get information about the coupling mode from the environment variable
-!-- which has been set by the mpiexec command
-    CALL local_getenv( 'coupling_mode', 13, coupling_mode, i )
-    IF ( i == 0 )  coupling_mode = 'uncoupled'
+!-- which has been set by the mpiexec command.
+!-- This method is currently not used because the mpiexec command is not
+!-- available on some machines
+!    CALL local_getenv( 'coupling_mode', 13, coupling_mode, i )
+!    IF ( i == 0 )  coupling_mode = 'uncoupled'
+!    IF ( coupling_mode == 'ocean_to_atmosphere' )  coupling_char = '_O'
+
+!
+!-- Get information about the coupling mode from standard input (PE0 only) and
+!-- distribute it to the other PEs
+    CALL MPI_COMM_RANK( MPI_COMM_WORLD, myid, ierr )
+    IF ( myid == 0 )  THEN
+       READ (*,*,ERR=10,END=10)  coupling_mode
+10     IF ( TRIM( coupling_mode ) == 'atmosphere_to_ocean' )  THEN
+          i = 1
+       ELSEIF ( TRIM( coupling_mode ) == 'ocean_to_atmosphere' )  THEN
+          i = 2
+       ELSE
+          i = 0
+       ENDIF
+    ENDIF
+    CALL MPI_BCAST( i, 1, MPI_INTEGER, 0, MPI_COMM_WORLD, ierr )
+    IF ( i == 0 )  THEN
+       coupling_mode = 'uncoupled'
+    ELSEIF ( i == 1 )  THEN
+       coupling_mode = 'atmosphere_to_ocean'
+    ELSEIF ( i == 2 )  THEN
+       coupling_mode = 'ocean_to_atmosphere'
+    ENDIF
     IF ( coupling_mode == 'ocean_to_atmosphere' )  coupling_char = '_O'
 #endif
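This pairs with the mrun change above: each coupled executable is started with its runfile redirected to standard input, PE0 reads its role from there, and a single integer is broadcast to the remaining PEs. Below is a minimal standalone sketch of the same handshake (a hypothetical program, not the PALM source; compile with an MPI Fortran compiler and run, e.g., echo atmosphere_to_ocean | mpiexec -n 4 ./a.out):

   ! Standalone sketch of the stdin-plus-broadcast handshake introduced
   ! in r164 (hypothetical program, not the PALM source)
   PROGRAM coupling_mode_sketch

      IMPLICIT NONE

      INCLUDE 'mpif.h'

      CHARACTER (LEN=20) ::  coupling_mode = 'uncoupled'
      INTEGER            ::  i, ierr, myid

      CALL MPI_INIT( ierr )
      CALL MPI_COMM_RANK( MPI_COMM_WORLD, myid, ierr )

      i = 0
      IF ( myid == 0 )  THEN
   !
   !--    Only PE0 reads standard input; a READ error or an empty input
   !--    keeps the default i = 0 ('uncoupled')
         READ (*,*,ERR=10,END=10)  coupling_mode
         IF ( TRIM( coupling_mode ) == 'atmosphere_to_ocean' )  THEN
            i = 1
         ELSEIF ( TRIM( coupling_mode ) == 'ocean_to_atmosphere' )  THEN
            i = 2
         ENDIF
   10    CONTINUE
      ENDIF
   !
   !--  A single integer is broadcast instead of the string itself
      CALL MPI_BCAST( i, 1, MPI_INTEGER, 0, MPI_COMM_WORLD, ierr )

      IF ( i == 1 )  THEN
         coupling_mode = 'atmosphere_to_ocean'
      ELSEIF ( i == 2 )  THEN
         coupling_mode = 'ocean_to_atmosphere'
      ELSE
         coupling_mode = 'uncoupled'
      ENDIF

      IF ( myid == 0 )  PRINT*, 'coupling_mode = ', TRIM( coupling_mode )

      CALL MPI_FINALIZE( ierr )

   END PROGRAM coupling_mode_sketch

In mrun's coupled branch each executable gets its own runfile on standard input (a.out < runfile_atmos), so PE0 of each program reads only its own role.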
palm/trunk/SOURCE/poisfft.f90  (r139 → r164)

 ! Actual revisions:
 ! -----------------
-!
+! Arguments removed from transpose routines
 !
 ! Former revisions:
 ...
 !-- Transposition z --> x
     CALL cpu_log( log_point_s(5), 'transpo forward', 'start' )
-    CALL transpose_zx( ar, work, ar, work, ar )
+    CALL transpose_zx( ar, work, ar )
     CALL cpu_log( log_point_s(5), 'transpo forward', 'pause' )
 ...
 !-- Transposition x --> y
     CALL cpu_log( log_point_s(5), 'transpo forward', 'continue' )
-    CALL transpose_xy( ar, work, ar, work, ar )
+    CALL transpose_xy( ar, work, ar )
     CALL cpu_log( log_point_s(5), 'transpo forward', 'pause' )
 ...
 !-- Transposition y --> z
     CALL cpu_log( log_point_s(5), 'transpo forward', 'continue' )
-    CALL transpose_yz( ar, work, ar, work, ar )
+    CALL transpose_yz( ar, work, ar )
     CALL cpu_log( log_point_s(5), 'transpo forward', 'stop' )
 ...
 !-- Transposition z --> y
     CALL cpu_log( log_point_s(8), 'transpo invers', 'start' )
-    CALL transpose_zy( ar, work, ar, work, ar )
+    CALL transpose_zy( ar, work, ar )
     CALL cpu_log( log_point_s(8), 'transpo invers', 'pause' )
 ...
 !-- Transposition y --> x
     CALL cpu_log( log_point_s(8), 'transpo invers', 'continue' )
-    CALL transpose_yx( ar, work, ar, work, ar )
+    CALL transpose_yx( ar, work, ar )
     CALL cpu_log( log_point_s(8), 'transpo invers', 'pause' )
 ...
 !-- Transposition x --> z
     CALL cpu_log( log_point_s(8), 'transpo invers', 'continue' )
-    CALL transpose_xz( ar, work, ar, work, ar )
+    CALL transpose_xz( ar, work, ar )
     CALL cpu_log( log_point_s(8), 'transpo invers', 'stop' )
palm/trunk/SOURCE/transpose.f90  (r4 → r164)

-SUBROUTINE transpose_xy( f_in, work1, f_inv, work2, f_out )
+SUBROUTINE transpose_xy( f_in, work, f_out )

 !------------------------------------------------------------------------------!
 ! Actual revisions:
 ! -----------------
-!
+! f_inv changed from subroutine argument to automatic array in order to do
+! re-ordering from f_in to f_inv in one step, one array work is needed instead
+! of work1 and work2
 !
 ! Former revisions:
 ...
     REAL ::  f_in(0:nxa,nys_x:nyn_xa,nzb_x:nzt_xa),   &
              f_inv(nys_x:nyn_xa,nzb_x:nzt_xa,0:nxa),  &
              f_out(0:nya,nxl_y:nxr_ya,nzb_y:nzt_ya),  &
-             work1(nys_x:nyn_xa,nzb_x:nzt_xa,0:nxa), work2(nnx*nny*nnz)
+             work(nnx*nny*nnz)

 #if defined( __parallel )

 !
 !-- Rearrange indices of input array in order to make data to be send
 !-- by MPI contiguous
-    DO  k = nzb_x, nzt_xa
-       DO  j = nys_x, nyn_xa
-          DO  i = 0, nxa
-             work1(j,k,i) = f_in(i,j,k)
-          ENDDO
-       ENDDO
-    ENDDO
-
-!
-!-- Move data to different array, because memory location of work1 is
-!-- needed further below (work1 = work2)
     DO  i = 0, nxa
        DO  k = nzb_x, nzt_xa
           DO  j = nys_x, nyn_xa
-             f_inv(j,k,i) = work1(j,k,i)
+             f_inv(j,k,i) = f_in(i,j,k)
           ENDDO
        ENDDO
     ENDDO
 ...
     CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
     CALL MPI_ALLTOALL( f_inv(nys_x,nzb_x,0), sendrecvcount_xy, MPI_REAL, &
-                       work2(1),             sendrecvcount_xy, MPI_REAL, &
+                       work(1),              sendrecvcount_xy, MPI_REAL, &
                        comm1dy, ierr )
     CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
 ...
           DO  j = ys, ys + nyn_xa - nys_x
              m = m + 1
-             f_out(j,i,k) = work2(m)
+             f_out(j,i,k) = work(m)
           ENDDO
 ...

-SUBROUTINE transpose_xz( f_in, work1, f_inv, work2, f_out )
+SUBROUTINE transpose_xz( f_in, work, f_out )
 ...
     REAL ::  f_in(0:nxa,nys_x:nyn_xa,nzb_x:nzt_xa),  &
-             f_inv(nxl:nxra,nys:nyna,1:nza),         &
+             f_inv(nys:nyna,nxl:nxra,1:nza),         &
              f_out(1:nza,nys:nyna,nxl:nxra),         &
-             work1(1:nza,nys:nyna,nxl:nxra), work2(nnx*nny*nnz)
+             work(nnx*nny*nnz)
 ...
        xs = 0 + l * nnx
        DO  k = nzb_x, nzt_xa
-          DO  j = nys_x, nyn_xa
-             DO  i = xs, xs + nnx - 1
+          DO  i = xs, xs + nnx - 1
+             DO  j = nys_x, nyn_xa
                 m = m + 1
-                work2(m) = f_in(i,j,k)
+                work(m) = f_in(i,j,k)
              ENDDO
           ENDDO
 ...
     CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
-    CALL MPI_ALLTOALL( work2(1),         sendrecvcount_zx, MPI_REAL, &
-                       f_inv(nxl,nys,1), sendrecvcount_zx, MPI_REAL, &
+    CALL MPI_ALLTOALL( work(1),          sendrecvcount_zx, MPI_REAL, &
+                       f_inv(nys,nxl,1), sendrecvcount_zx, MPI_REAL, &
                        comm1dx, ierr )
     CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )

 !
 !-- Reorder transposed array in a way that the z index is in first position
-    DO  i = nxl, nxra
-       DO  j = nys, nyna
-          DO  k = 1, nza
-             work1(k,j,i) = f_inv(i,j,k)
+    DO  k = 1, nza
+       DO  i = nxl, nxra
+          DO  j = nys, nyna
+             f_out(k,j,i) = f_inv(j,i,k)
           ENDDO
        ENDDO
     ENDDO
 ...
     ELSE
        DO  i = nxl, nxra
           DO  j = nys, nyna
              DO  k = 1, nza
-                work1(k,j,i) = f_in(i,j,k)
+                f_inv(j,i,k) = f_in(i,j,k)
              ENDDO
           ENDDO
        ENDDO
+
+       DO  k = 1, nza
+          DO  i = nxl, nxra
+             DO  j = nys, nyna
+                f_out(k,j,i) = f_inv(j,i,k)
+             ENDDO
+          ENDDO
+       ENDDO
+
     ENDIF

-!
-!-- Move data to output array
-    DO  i = nxl, nxra
-       DO  j = nys, nyna
-          DO  k = 1, nza
-             f_out(k,j,i) = work1(k,j,i)
-          ENDDO
-       ENDDO
-    ENDDO
 ...

-SUBROUTINE transpose_yx( f_in, work1, f_inv, work2, f_out )
+SUBROUTINE transpose_yx( f_in, work, f_out )
 ...
              f_inv(nys_x:nyn_xa,nzb_x:nzt_xa,0:nxa),  &
              f_out(0:nxa,nys_x:nyn_xa,nzb_x:nzt_xa),  &
-             work1(0:nxa,nys_x:nyn_xa,nzb_x:nzt_xa), work2(nnx*nny*nnz)
+             work(nnx*nny*nnz)
 ...
           DO  j = ys, ys + nyn_xa - nys_x
              m = m + 1
-             work2(m) = f_in(j,i,k)
+             work(m) = f_in(j,i,k)
           ENDDO
 ...
     CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
-    CALL MPI_ALLTOALL( work2(1),             sendrecvcount_xy, MPI_REAL, &
+    CALL MPI_ALLTOALL( work(1),              sendrecvcount_xy, MPI_REAL, &
                        f_inv(nys_x,nzb_x,0), sendrecvcount_xy, MPI_REAL, &
                        comm1dy, ierr )
 ...
     DO  i = 0, nxa
        DO  k = nzb_x, nzt_xa
           DO  j = nys_x, nyn_xa
-             work1(i,j,k) = f_inv(j,k,i)
-          ENDDO
-       ENDDO
-    ENDDO
-
-!
-!-- Move data to output array
-    DO  k = nzb_x, nzt_xa
-       DO  j = nys_x, nyn_xa
-          DO  i = 0, nxa
-             f_out(i,j,k) = work1(i,j,k)
+             f_out(i,j,k) = f_inv(j,k,i)
           ENDDO
        ENDDO
     ENDDO
 ...

-SUBROUTINE transpose_yxd( f_in, work1, f_inv, work2, f_out )
+SUBROUTINE transpose_yxd( f_in, work, f_out )
 ...
     REAL ::  f_in(1:nza,nys:nyna,nxl:nxra), f_inv(nxl:nxra,1:nza,nys:nyna), &
              f_out(0:nxa,nys_x:nyn_xa,nzb_x:nzt_xa),                        &
-             work1(nxl:nxra,1:nza,nys:nyna), work2(nnx*nny*nnz)
+             work(nnx*nny*nnz)
 ...
     DO  k = 1, nza
        DO  j = nys, nyna
           DO  i = nxl, nxra
-             work1(i,k,j) = f_in(k,j,i)
-          ENDDO
-       ENDDO
-    ENDDO
-
-!
-!-- Move data to different array, because memory location of work1 is
-!-- needed further below (work1 = work2)
-    DO  j = nys, nyna
-       DO  k = 1, nza
-          DO  i = nxl, nxra
-             f_inv(i,k,j) = work1(i,k,j)
+             f_inv(i,k,j) = f_in(k,j,i)
           ENDDO
        ENDDO
     ENDDO
 ...
     CALL MPI_ALLTOALL( f_inv(nxl,1,nys), sendrecvcount_xy, MPI_REAL, &
-                       work2(1),         sendrecvcount_xy, MPI_REAL, &
+                       work(1),          sendrecvcount_xy, MPI_REAL, &
                        comm1dx, ierr )
 ...
           DO  i = xs, xs + nnx - 1
              m = m + 1
-             f_out(i,j,k) = work2(m)
+             f_out(i,j,k) = work(m)
           ENDDO
 ...

-SUBROUTINE transpose_yz( f_in, work1, f_inv, work2, f_out )
+SUBROUTINE transpose_yz( f_in, work, f_out )
 ...
              f_inv(nxl_y:nxr_ya,nzb_y:nzt_ya,0:nya),  &
              f_out(nxl_z:nxr_za,nys_z:nyn_za,1:nza),  &
-             work1(nxl_y:nxr_ya,nzb_y:nzt_ya,0:nya), work2(nnx*nny*nnz)
+             work(nnx*nny*nnz)
 ...
 !
 !-- Rearrange indices of input array in order to make data to be send
 !-- by MPI contiguous
-    DO  k = nzb_y, nzt_ya
-       DO  i = nxl_y, nxr_ya
-          DO  j = 0, nya
-             work1(i,k,j) = f_in(j,i,k)
+    DO  j = 0, nya
+       DO  k = nzb_y, nzt_ya
+          DO  i = nxl_y, nxr_ya
+             f_inv(i,k,j) = f_in(j,i,k)
           ENDDO
        ENDDO
     ENDDO
 ...
        DO  k = nzb_y, nzt_ya
           DO  i = nxl_y, nxr_ya
-             f_out(i,j,k) = work1(i,k,j)
+             f_out(i,j,k) = f_inv(i,k,j)
           ENDDO
        ENDDO
     ENDDO
     RETURN
-    ELSE
-       DO  j = 0, nya
-          DO  k = nzb_y, nzt_ya
-             DO  i = nxl_y, nxr_ya
-                f_inv(i,k,j) = work1(i,k,j)
-             ENDDO
-          ENDDO
-       ENDDO
     ENDIF
 ...
     CALL MPI_ALLTOALL( f_inv(nxl_y,nzb_y,0), sendrecvcount_yz, MPI_REAL, &
-                       work2(1),             sendrecvcount_yz, MPI_REAL, &
+                       work(1),              sendrecvcount_yz, MPI_REAL, &
                        comm1dx, ierr )
 ...
           DO  i = nxl_z, nxr_za
              m = m + 1
-             f_out(i,j,k) = work2(m)
+             f_out(i,j,k) = work(m)
           ENDDO
 ...

-SUBROUTINE transpose_zx( f_in, work1, f_inv, work2, f_out )
+SUBROUTINE transpose_zx( f_in, work, f_out )
 ...
-    REAL ::  f_in(1:nza,nys:nyna,nxl:nxra), f_inv(nxl:nxra,nys:nyna,1:nza), &
+    REAL ::  f_in(1:nza,nys:nyna,nxl:nxra), f_inv(nys:nyna,nxl:nxra,1:nza), &
              f_out(0:nxa,nys_x:nyn_xa,nzb_x:nzt_xa),                        &
-             work1(nxl:nxra,nys:nyna,1:nza), work2(nnx*nny*nnz)
+             work(nnx*nny*nnz)
 ...
 !
 !-- Rearrange indices of input array in order to make data to be send
 !-- by MPI contiguous
-    DO  i = nxl, nxra
-       DO  j = nys, nyna
-          DO  k = 1, nza
-             work1(i,j,k) = f_in(k,j,i)
+    DO  k = 1, nza
+       DO  i = nxl, nxra
+          DO  j = nys, nyna
+             f_inv(j,i,k) = f_in(k,j,i)
           ENDDO
        ENDDO
     ENDDO
 ...
     IF ( pdims(1) == 1 )  THEN
        DO  k = 1, nza
-          DO  j = nys, nyna
-             DO  i = nxl, nxra
-                f_out(i,j,k) = work1(i,j,k)
+          DO  i = nxl, nxra
+             DO  j = nys, nyna
+                f_out(i,j,k) = f_inv(j,i,k)
              ENDDO
           ENDDO
        ENDDO
        RETURN
-    ELSE
-       DO  k = 1, nza
-          DO  j = nys, nyna
-             DO  i = nxl, nxra
-                f_inv(i,j,k) = work1(i,j,k)
-             ENDDO
-          ENDDO
-       ENDDO
     ENDIF
 ...
-    CALL MPI_ALLTOALL( f_inv(nxl,nys,1), sendrecvcount_zx, MPI_REAL, &
-                       work2(1),         sendrecvcount_zx, MPI_REAL, &
+    CALL MPI_ALLTOALL( f_inv(nys,nxl,1), sendrecvcount_zx, MPI_REAL, &
+                       work(1),          sendrecvcount_zx, MPI_REAL, &
                        comm1dx, ierr )
 ...
        xs = 0 + l * nnx
        DO  k = nzb_x, nzt_xa
-          DO  j = nys_x, nyn_xa
-             DO  i = xs, xs + nnx - 1
+          DO  i = xs, xs + nnx - 1
+             DO  j = nys_x, nyn_xa
                 m = m + 1
-                f_out(i,j,k) = work2(m)
+                f_out(i,j,k) = work(m)
              ENDDO
           ENDDO
 ...

-SUBROUTINE transpose_zy( f_in, work1, f_inv, work2, f_out )
+SUBROUTINE transpose_zy( f_in, work, f_out )
 ...
              f_inv(nxl_y:nxr_ya,nzb_y:nzt_ya,0:nya),  &
              f_out(0:nya,nxl_y:nxr_ya,nzb_y:nzt_ya),  &
-             work1(0:nya,nxl_y:nxr_ya,nzb_y:nzt_ya), work2(nnx*nny*nnz)
+             work(nnx*nny*nnz)
 ...
           DO  i = nxl_z, nxr_za
              m = m + 1
-             work2(m) = f_in(i,j,k)
+             work(m) = f_in(i,j,k)
           ENDDO
 ...
-    CALL MPI_ALLTOALL( work2(1),             sendrecvcount_yz, MPI_REAL, &
+    CALL MPI_ALLTOALL( work(1),              sendrecvcount_yz, MPI_REAL, &
                        f_inv(nxl_y,nzb_y,0), sendrecvcount_yz, MPI_REAL, &
                        comm1dx, ierr )
 ...
 !
 !--    Reorder transposed array in a way that the y index is in first position
-       DO  k = nzb_y, nzt_ya
-          DO  i = nxl_y, nxr_ya
-             DO  j = 0, nya
-                work1(j,i,k) = f_inv(i,k,j)
+       DO  j = 0, nya
+          DO  k = nzb_y, nzt_ya
+             DO  i = nxl_y, nxr_ya
+                f_out(j,i,k) = f_inv(i,k,j)
              ENDDO
           ENDDO
        ENDDO
     ELSE
 !
 !--    Reorder the array in a way that the y index is in first position
        DO  k = nzb_y, nzt_ya
-          DO  i = nxl_y, nxr_ya
-             DO  j = 0, nya
-                work1(j,i,k) = f_in(i,j,k)
-             ENDDO
-          ENDDO
-       ENDDO
-    ENDIF
-
-!
-!-- Move data to output array
-    DO  k = nzb_y, nzt_ya
-       DO  i = nxl_y, nxr_ya
-          DO  j = 0, nya
-             f_out(j,i,k) = work1(j,i,k)
-          ENDDO
-       ENDDO
-    ENDDO
+          DO  j = 0, nya
+             DO  i = nxl_y, nxr_ya
+                f_inv(i,k,j) = f_in(i,j,k)
+             ENDDO
+          ENDDO
+       ENDDO
+!
+!--    Move data to output array
+       DO  k = nzb_y, nzt_ya
+          DO  i = nxl_y, nxr_ya
+             DO  j = 0, nya
+                f_out(j,i,k) = f_inv(i,k,j)
+             ENDDO
+          ENDDO
+       ENDDO
+
+    ENDIF
 ...

-SUBROUTINE transpose_zyd( f_in, work1, f_inv, work2, f_out )
+SUBROUTINE transpose_zyd( f_in, work, f_out )
 ...
     REAL ::  f_in(1:nza,nys:nyna,nxl:nxra), f_inv(nys:nyna,nxl:nxra,1:nza), &
              f_out(0:nya,nxl_yd:nxr_yda,nzb_yd:nzt_yda),                    &
-             work1(nys:nyna,nxl:nxra,1:nza), work2(nnx*nny*nnz)
+             work(nnx*nny*nnz)
 ...
     DO  i = nxl, nxra
        DO  j = nys, nyna
           DO  k = 1, nza
-             work1(j,i,k) = f_in(k,j,i)
+             f_inv(j,i,k) = f_in(k,j,i)
           ENDDO
        ENDDO
     ENDDO
 ...
        DO  i = nxl, nxra
           DO  j = nys, nyna
-             f_out(j,i,k) = work1(j,i,k)
+             f_out(j,i,k) = f_inv(j,i,k)
           ENDDO
        ENDDO
     ENDDO
     RETURN
-    ELSE
-       DO  k = 1, nza
-          DO  i = nxl, nxra
-             DO  j = nys, nyna
-                f_inv(j,i,k) = work1(j,i,k)
-             ENDDO
-          ENDDO
-       ENDDO
     ENDIF
 ...
     CALL MPI_ALLTOALL( f_inv(nys,nxl,1), sendrecvcount_zyd, MPI_REAL, &
-                       work2(1),         sendrecvcount_zyd, MPI_REAL, &
+                       work(1),          sendrecvcount_zyd, MPI_REAL, &
                        comm1dy, ierr )
 ...
           DO  j = ys, ys + nny - 1
              m = m + 1
-             f_out(j,i,k) = work2(m)
+             f_out(j,i,k) = work(m)
           ENDDO
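For readers unfamiliar with alltoall-based transpositions, the following toy program shows the complete pack / MPI_ALLTOALL / unpack cycle on a 2D matrix distributed by columns. It is a deliberately simplified stand-in for the 3D PALM routines above (square n x n matrix, n divisible by the number of PEs, all names hypothetical):

   ! Toy MPI_ALLTOALL matrix transposition, in the spirit of PALM's
   ! transpose routines (2D instead of 3D; not the PALM code itself).
   ! Run with a number of PEs that divides n, e.g. 1, 2, 4 or 8.
   PROGRAM alltoall_transpose_sketch

      IMPLICIT NONE

      INCLUDE 'mpif.h'

      INTEGER, PARAMETER ::  n = 8
      INTEGER ::  i, ierr, j, m, myid, nloc, npes, s
      REAL, ALLOCATABLE ::  a(:,:), sendbuf(:), recvbuf(:), t(:,:)

      CALL MPI_INIT( ierr )
      CALL MPI_COMM_SIZE( MPI_COMM_WORLD, npes, ierr )
      CALL MPI_COMM_RANK( MPI_COMM_WORLD, myid, ierr )

      IF ( MOD( n, npes ) /= 0 )  THEN
         IF ( myid == 0 )  PRINT*, 'number of PEs must divide', n
         CALL MPI_FINALIZE( ierr )
         STOP
      ENDIF

      nloc = n / npes          ! number of columns owned by each PE
      ALLOCATE( a(n,nloc), t(n,nloc), sendbuf(n*nloc), recvbuf(n*nloc) )
   !
   !-- Fill the local block with globally unique values:
   !-- a(i, j_global) = i + n * j_global
      DO  j = 1, nloc
         DO  i = 1, n
            a(i,j) = REAL( i + n * ( myid * nloc + j ) )
         ENDDO
      ENDDO
   !
   !-- Pack: the rows destined for PE s must form one contiguous block of
   !-- the send buffer (this is what the re-ordering into f_inv achieves)
      m = 0
      DO  s = 0, npes-1
         DO  j = 1, nloc
            DO  i = 1, nloc
               m = m + 1
               sendbuf(m) = a(s*nloc+i,j)
            ENDDO
         ENDDO
      ENDDO

      CALL MPI_ALLTOALL( sendbuf, nloc*nloc, MPI_REAL,                  &
                         recvbuf, nloc*nloc, MPI_REAL, MPI_COMM_WORLD, &
                         ierr )
   !
   !-- Unpack: the block received from PE s holds that PE's part of the
   !-- rows which become this PE's columns of the transposed matrix
      m = 0
      DO  s = 0, npes-1
         DO  j = 1, nloc
            DO  i = 1, nloc
               m = m + 1
               t(s*nloc+j,i) = recvbuf(m)
            ENDDO
         ENDDO
      ENDDO
   !
   !-- Verify: t(i, j_global) must equal a(j_global, i) = j_global + n * i
      DO  j = 1, nloc
         DO  i = 1, n
            IF ( t(i,j) /= REAL( ( myid * nloc + j ) + n * i ) )  THEN
               PRINT*, 'PE', myid, ': transposition failed at', i, j
            ENDIF
         ENDDO
      ENDDO
      IF ( myid == 0 )  PRINT*, 'transposition checked'

      CALL MPI_FINALIZE( ierr )

   END PROGRAM alltoall_transpose_sketch

The pack loop plays the role of the re-ordering into f_inv: MPI_ALLTOALL exchanges equal-sized contiguous blocks, so whatever each destination PE needs must first be gathered into one contiguous stretch of the send buffer.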