Changeset 4617
- Timestamp: Jul 22, 2020 9:48:50 AM
- Location: palm/trunk/SOURCE
- Files: 3 edited
palm/trunk/SOURCE/read_restart_data_mod.f90
r4590  r4617

  25     25   ! -----------------
  26     26   ! $Id$
         27   ! check, if boundary conditions in the prerun are both set to cyclic
         28   !
         29   ! 4590 2020-07-06 14:34:59Z suehring
  27     30   ! Bugfix in allocation of hom and hom_sum in case of mpi-io restart when
  28     31   ! chemistry or salsa are employed
   …      …
1176   1179
1177   1180      CHARACTER (LEN=10) ::  version_on_file
       1181      CHARACTER (LEN=20) ::  bc_lr_on_file
       1182      CHARACTER (LEN=20) ::  bc_ns_on_file
1178   1183      CHARACTER (LEN=20) ::  momentum_advec_check
1179   1184      CHARACTER (LEN=20) ::  scalar_advec_check
   …      …
1308   1313         ENDIF
1309   1314
       1315         CASE ( 'bc_lr' )
       1316            READ ( 13 )  bc_lr_on_file
       1317            IF ( TRIM( bc_lr_on_file ) /= 'cyclic' )  THEN
       1318               message_string = 'bc_lr in the prerun was set /= "cyclic"'
       1319               CALL message( 'rrd_read_parts_of_global', 'PA0498', 1, 2, 0, 6, 0 )
       1320            ENDIF
       1321
       1322         CASE ( 'bc_ns' )
       1323            READ ( 13 )  bc_ns_on_file
       1324            IF ( TRIM( bc_ns_on_file ) /= 'cyclic' )  THEN
       1325               message_string = 'bc_ns in the prerun was set /= "cyclic"'
       1326               CALL message( 'rrd_read_parts_of_global', 'PA0498', 1, 2, 0, 6, 0 )
       1327            ENDIF
       1328
1310   1329         CASE ( 'hom' )
1311   1330            ALLOCATE( hom_on_file(0:nz+1,2,pr_palm+max_pr_user_on_file, &
   …      …
1445   1464      ENDIF
1446   1465
1447             CALL rrd_mpi_io( 'nx', nx_on_file )
1448             CALL rrd_mpi_io( 'ny', ny_on_file )
1449             CALL rrd_mpi_io_global_array( 'ref_state', ref_state )
       1466      CALL rrd_mpi_io( 'bc_lr', bc_lr_on_file )
       1467      CALL rrd_mpi_io( 'bc_ns', bc_ns_on_file )
       1468      IF ( TRIM( bc_lr_on_file ) /= 'cyclic'  .OR.  TRIM( bc_ns_on_file ) /= 'cyclic' )  THEN
       1469         message_string = 'bc_lr and/or bc_ns in the prerun was set /= "cyclic"'
       1470         CALL message( 'rrd_read_parts_of_global', 'PA0498', 1, 2, 0, 6, 0 )
       1471      ENDIF
1450   1472
1451   1473      scalar_advec_check = scalar_advec
   …      …
1457   1479         CALL message( 'rrd_read_parts_of_global', 'PA0101', 1, 2, 0, 6, 0 )
1458   1480      ENDIF
       1481
       1482      CALL rrd_mpi_io( 'nx', nx_on_file )
       1483      CALL rrd_mpi_io( 'ny', ny_on_file )
       1484      CALL rrd_mpi_io_global_array( 'ref_state', ref_state )
1459   1485
1460   1486   !
   …      …
2270   2296
2271   2297   !
2272          !-- Read global restart data using MPI-IO
       2298   !-- Read local restart data using MPI-IO
2273   2299   !
2274   2300   !-- Open the MPI-IO restart file.
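The checks added above protect the cyclic-fill initialization: the main run tiles the prerun's restart data cyclically over its larger domain, which is only consistent if the prerun itself used cyclic lateral boundary conditions in both directions. Below is a hedged sketch of the prerun settings that the new check expects to find recorded in the restart file; it assumes the standard PALM initialization_parameters namelist and a main run that requests cyclic-fill initialization (typically via initializing_actions = 'cyclic_fill'), and the domain sizes are made up for illustration.

    !-- Hypothetical prerun namelist excerpt (illustration only)
    &initialization_parameters
        nx     = 39,           ! small, cyclic prerun domain
        ny     = 39,
        bc_lr  = 'cyclic',     ! both must be 'cyclic'; otherwise the main run now
        bc_ns  = 'cyclic',     ! aborts in rrd_read_parts_of_global with PA0498
    /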
palm/trunk/SOURCE/restart_data_mpi_io_mod.f90
r4598 r4617 25 25 ! ----------------- 26 26 ! $Id$ 27 ! Cyclic fill mode implemented 28 ! 29 ! 4598 2020-07-10 10:13:23Z suehring 27 30 ! Bugfix in treatment of 3D soil arrays 28 31 ! 29 32 ! 4591 2020-07-06 15:56:08Z raasch 30 33 ! File re-formatted to follow the PALM coding standard 31 !32 34 ! 33 35 ! 4539 2020-05-18 14:05:17Z raasch … … 101 103 nxl, & 102 104 nxlg, & 105 nx_on_file, & 103 106 nxr, & 104 107 nxrg, & … … 106 109 nyn, & 107 110 nyng, & 111 ny_on_file, & 108 112 nys, & 109 113 nysg, & … … 118 122 comm1dy, & 119 123 comm2d, & 124 communicator_configurations, & 120 125 myid, & 121 126 myidx, & … … 127 132 128 133 USE shared_memory_io_mod, & 129 ONLY: local_boundaries,&134 ONLY: domain_decomposition_grid_features, & 130 135 sm_class 131 136 … … 197 202 198 203 ! 199 !-- Handling of outer boundaries 200 TYPE(local_boundaries) :: lb !< 204 !-- Variable to store the grid features (index bounds) of the temporary arrays that are used 205 !-- to read and write the restart data. They differ depending on if the outer boundary of the 206 !-- total domain is contained in the restart data or not. iog stands for IO-grid. 207 TYPE(domain_decomposition_grid_features) :: iog !< 201 208 202 209 ! … … 237 244 CHARACTER(LEN=32), DIMENSION(max_nr_arrays) :: array_names 238 245 INTEGER(KIND=rd_offset_kind), DIMENSION(max_nr_arrays) :: array_offset 246 247 ! 248 !-- Variables to handle the cyclic fill initialization mode 249 INTEGER :: comm_cyclic_fill !< communicator for cyclic fill PEs 250 INTEGER :: rmawin_2di !< RMA window 2d INTEGER 251 INTEGER :: rmawin_2d !< RMA window 2d REAL 252 INTEGER :: rmawin_3d !< RMA window 3d 253 254 INTEGER(iwp), ALLOCATABLE, DIMENSION(:,:) :: remote_pe 255 INTEGER(iwp), ALLOCATABLE, DIMENSION(:,:) :: remote_pe_s 256 INTEGER(iwp), ALLOCATABLE, DIMENSION(:,:) :: rma_offset 257 INTEGER(iwp), ALLOCATABLE, DIMENSION(:,:) :: rma_offset_s 258 INTEGER(iwp), ALLOCATABLE, DIMENSION(:,:) :: rmabuf_2di 259 260 LOGICAL :: cyclic_fill_mode !< arrays are filled cyclically with data from prerun 261 LOGICAL :: pe_active_for_read = .TRUE. !< this PE is active for reading data from prerun or 262 !< restart run. For restarts all PEs are active. 263 264 REAL(wp), ALLOCATABLE, DIMENSION(:,:) :: rmabuf_2d 265 REAL(wp), ALLOCATABLE, DIMENSION(:,:,:) :: rmabuf_3d 266 267 TYPE(domain_decomposition_grid_features) :: mainrun_grid !< grid variables for the main run 268 TYPE(domain_decomposition_grid_features) :: prerun_grid !< grid variables for the prerun 269 270 239 271 SAVE 240 272 … … 352 384 #endif 353 385 386 ! write(9,*) 'Here is rd_mpi_io_open',nx,nx_on_file,ny,ny_on_file,TRIM(action) !kk may become Debug Output 354 387 355 388 offset = 0 … … 387 420 ENDIF 388 421 389 CALL sm_io%sm_init_comm( io_on_limited_cores_per_node ) 422 ! 423 !-- Determine, if prerun data shall be read and mapped cyclically to the mainrun arrays. 424 !-- In cyclic fill mode only a subset of the PEs will read. 425 cyclic_fill_mode = .FALSE. 426 pe_active_for_read = .TRUE. 427 428 IF ( rd_flag .AND. .NOT. PRESENT( open_for_global_io_only ) .AND. & 429 nx_on_file < nx .AND. ny_on_file < ny ) THEN 430 cyclic_fill_mode = .TRUE. 431 CALL setup_cyclic_fill 432 ! 433 !-- Shared memory IO on limited cores is not allowed for cyclic fill mode 434 CALL sm_io%sm_init_comm( .FALSE. ) ! 435 ELSE 436 CALL sm_io%sm_init_comm( io_on_limited_cores_per_node ) 437 ENDIF 438 439 ! 440 !-- TODO: add a more detailed meaningful comment about what is happening here 441 !-- activate model grid 442 IF( cyclic_fill_mode .AND. .NOT. 
pe_active_for_read ) THEN 443 CALL mainrun_grid%activate_grid_from_this_class() 444 RETURN 445 ENDIF 446 390 447 391 448 ! … … 393 450 IF( sm_io%is_sm_active() ) THEN 394 451 comm_io = sm_io%comm_io 452 ELSEIF ( cyclic_fill_mode ) THEN 453 comm_io = comm_cyclic_fill 395 454 ELSE 396 455 comm_io = comm2d … … 671 730 #endif 672 731 673 ENDIF 732 733 ENDIF 734 735 ! 736 !-- TODO: describe in more detail what is happening here 737 !-- activate model grid 738 IF ( cyclic_fill_mode ) CALL mainrun_grid%activate_grid_from_this_class() 739 740 CONTAINS 741 742 SUBROUTINE setup_cyclic_fill 743 744 IMPLICIT NONE 745 746 INTEGER :: color !< used to set the IO PEs for MPI_COMM_SPLIT 747 INTEGER :: ierr !< 748 INTEGER(iwp) :: i !< 749 INTEGER(iwp) :: j !< 750 INTEGER(KIND=MPI_ADDRESS_KIND) :: winsize !< size of RMA window 751 752 ! 753 !-- TODO: describe in more detail what is done here and why it is done 754 !-- save grid of main run 755 CALL mainrun_grid%save_grid_into_this_class() 756 757 ALLOCATE( remote_pe(0:nx_on_file,0:ny_on_file) ) 758 ALLOCATE( remote_pe_s(0:nx_on_file,0:ny_on_file) ) 759 ALLOCATE( rma_offset(0:nx_on_file,0:ny_on_file) ) 760 ALLOCATE( rma_offset_s(0:nx_on_file,0:ny_on_file) ) 761 762 remote_pe_s = 0 763 rma_offset_s = 0 764 ! 765 !-- Determine, if gridpoints of the prerun are located on this thread. 766 !-- Set the (cyclic) prerun grid. 767 nxr = MIN( nxr, nx_on_file ) 768 IF ( nxl > nx_on_file ) THEN 769 nxl = -99 770 nxr = -99 771 nnx = 0 772 ELSE 773 nnx =nxr-nxl+1 774 ENDIF 775 776 nyn = MIN( nyn, ny_on_file ) 777 IF ( nys > ny_on_file ) THEN 778 nys = -99 779 nyn = -99 780 nny = 0 781 ELSE 782 nny = nyn-nys+1 783 ENDIF 784 785 nx = nx_on_file 786 ny = ny_on_file 787 ! 788 !-- Determine, if this thread is doing IO 789 IF ( nnx > 0 .AND. nny > 0 ) THEN 790 color = 1 791 pe_active_for_read = .TRUE. 792 remote_pe_s(nxl:nxr,nys:nyn) = myid ! myid from comm2d 793 DO j = nys, nyn 794 DO i = nxl, nxr 795 rma_offset_s(i,j) = ( j-nys ) + ( i-nxl ) * nny 796 ENDDO 797 ENDDO 798 ELSE 799 color = MPI_UNDEFINED 800 pe_active_for_read = .FALSE. 801 ENDIF 802 803 #if defined( __parallel ) 804 CALL MPI_ALLREDUCE( remote_pe_s, remote_pe, SIZE(remote_pe_s), MPI_INTEGER, MPI_SUM, & 805 comm2d, ierr ) 806 CALL MPI_ALLREDUCE( rma_offset_s, rma_offset, SIZE(rma_offset_s), MPI_INTEGER, MPI_SUM, & 807 comm2d, ierr ) 808 CALL MPI_COMM_SPLIT( comm2d, color, 0, comm_cyclic_fill, ierr ) 809 810 IF ( pe_active_for_read ) THEN 811 CALL MPI_COMM_SIZE( comm_cyclic_fill, numprocs, ierr ) 812 CALL MPI_COMM_RANK( comm_cyclic_fill, myid, ierr ) 813 ENDIF 814 #else 815 remote_pe = remote_pe_s 816 rma_offset = rma_offset_s 817 myid = 0 818 numprocs = 1 819 #endif 820 ! 821 !-- Allocate 2d buffers as RMA window, accessible on all threads 822 IF ( pe_active_for_read ) THEN 823 ALLOCATE( rmabuf_2di(nys:nyn,nxl:nxr) ) 824 ELSE 825 ALLOCATE( rmabuf_2di(1,1) ) 826 ENDIF 827 winsize = SIZE( rmabuf_2di ) * iwp 828 829 #if defined( __parallel ) 830 CALL MPI_WIN_CREATE( rmabuf_2di, winsize, iwp, MPI_INFO_NULL, comm2d, rmawin_2di, ierr ) 831 CALL MPI_WIN_FENCE( 0, rmawin_2di, ierr ) 832 #endif 833 834 IF ( pe_active_for_read ) THEN 835 ALLOCATE( rmabuf_2d(nys:nyn,nxl:nxr) ) 836 ELSE 837 ALLOCATE( rmabuf_2d(1,1) ) 838 ENDIF 839 winsize = SIZE( rmabuf_2d ) * wp 840 841 #if defined( __parallel ) 842 CALL MPI_WIN_CREATE( rmabuf_2d, winsize, wp, MPI_INFO_NULL, comm2d, rmawin_2d, ierr ) 843 CALL MPI_WIN_FENCE( 0, rmawin_2d, ierr ) 844 #endif 845 846 ! 
847 !-- Allocate 3d buffer as RMA window, accessable on all threads 848 IF ( pe_active_for_read ) THEN 849 ALLOCATE( rmabuf_3d(nzb:nzt+1,nys:nyn,nxl:nxr) ) 850 ELSE 851 ALLOCATE( rmabuf_3d(1,1,1) ) 852 ENDIF 853 winsize = SIZE( rmabuf_3d ) * wp 854 855 #if defined( __parallel ) 856 CALL MPI_WIN_CREATE( rmabuf_3d, winsize, wp, MPI_INFO_NULL, comm2d, rmawin_3d, ierr ) 857 CALL MPI_WIN_FENCE( 0, rmawin_3d, ierr ) 858 #endif 859 860 ! 861 !-- TODO: comment in more detail, what is done here, and why 862 !-- save small grid 863 CALL prerun_grid%save_grid_into_this_class() 864 prerun_grid%comm2d = comm_cyclic_fill 865 866 DEALLOCATE( remote_pe_s, rma_offset_s ) 867 868 END SUBROUTINE setup_cyclic_fill 674 869 675 870 END SUBROUTINE rd_mpi_io_open … … 813 1008 814 1009 IF ( found ) THEN 815 #if defined( __parallel ) 816 CALL sm_io%sm_node_barrier() ! Has no effect if I/O on limited number of cores is inactive 817 IF ( sm_io%iam_io_pe ) THEN 1010 1011 IF ( cyclic_fill_mode ) THEN 1012 1013 CALL rrd_mpi_io_real_2d_cyclic_fill 1014 1015 ELSE 1016 1017 #if defined( __parallel ) 1018 CALL sm_io%sm_node_barrier() ! Has no effect if I/O on limited # of cores is inactive 1019 IF ( sm_io%iam_io_pe ) THEN 1020 CALL MPI_FILE_SET_VIEW( fh, array_position, MPI_REAL, ft_2d, 'native', MPI_INFO_NULL, & 1021 ierr ) 1022 CALL MPI_FILE_READ_ALL( fh, array_2d, SIZE( array_2d ), MPI_REAL, status, ierr ) 1023 ENDIF 1024 CALL sm_io%sm_node_barrier() 1025 #else 1026 CALL posix_lseek( fh, array_position ) 1027 CALL posix_read( fh, array_2d, SIZE( array_2d ) ) 1028 #endif 1029 1030 IF ( include_total_domain_boundaries ) THEN 1031 DO i = iog%nxl, iog%nxr 1032 data(iog%nys-nbgp:iog%nyn-nbgp,i-nbgp) = array_2d(i,iog%nys:iog%nyn) 1033 ENDDO 1034 IF ( debug_level >= 2) THEN 1035 WRITE(9,*) 'r2f_ob ', TRIM(name),' ', SUM( data(nys:nyn,nxl:nxr) ) 1036 ENDIF 1037 ELSE 1038 DO i = nxl, nxr 1039 data(nys:nyn,i) = array_2d(i,nys:nyn) 1040 ENDDO 1041 IF ( debug_level >= 2) THEN 1042 WRITE(9,*) 'r2f ', TRIM( name ),' ', SUM( data(nys:nyn,nxl:nxr) ) 1043 ENDIF 1044 ENDIF 1045 1046 ENDIF 1047 1048 CALL exchange_horiz_2d( data ) 1049 1050 ELSE 1051 message_string = '2d-REAL array "' // TRIM( name ) // '" not found in restart file' 1052 CALL message( 'rrd_mpi_io_int', 'PA0722', 3, 2, 0, 6, 0 ) 1053 ENDIF 1054 1055 1056 CONTAINS 1057 1058 SUBROUTINE rrd_mpi_io_real_2d_cyclic_fill 1059 1060 IMPLICIT NONE 1061 1062 INTEGER(iwp) :: i !< 1063 INTEGER(iwp) :: ie !< 1064 INTEGER(iwp) :: ierr !< 1065 INTEGER(iwp) :: is !< 1066 INTEGER(iwp) :: i_remote !< 1067 INTEGER(iwp) :: j !< 1068 INTEGER(iwp) :: je !< 1069 INTEGER(iwp) :: js !< 1070 INTEGER(iwp) :: j_remote !< 1071 INTEGER(iwp) :: nval !< 1072 INTEGER(iwp) :: rem_pe !< 1073 1074 INTEGER(KIND=MPI_ADDRESS_KIND) :: rem_offs !< 1075 1076 1077 !kk write(9,*) 'Here is rma_cylic_fill_real_2d ',nxl,nxr,nys,nyn; FLUSH(9) 1078 1079 ! 1080 !-- Reading 2d real array on prerun grid 1081 CALL prerun_grid%activate_grid_from_this_class() 1082 1083 IF ( pe_active_for_read ) THEN 1084 818 1085 CALL MPI_FILE_SET_VIEW( fh, array_position, MPI_REAL, ft_2d, 'native', MPI_INFO_NULL, & 819 1086 ierr ) 820 1087 CALL MPI_FILE_READ_ALL( fh, array_2d, SIZE( array_2d ), MPI_REAL, status, ierr ) 821 ENDIF 822 CALL sm_io%sm_node_barrier() 1088 1089 DO i = nxl, nxr 1090 rmabuf_2d(nys:nyn,i) = array_2d(i,nys:nyn) 1091 ENDDO 1092 data(nys:nyn,nxl:nxr) = rmabuf_2d ! copy prerund data directly into output array data 1093 ENDIF 1094 1095 CALL mainrun_grid%activate_grid_from_this_class() 1096 1097 #if defined( __parallel ) 1098 ! 
1099 !-- Close RMA window to allow remote access 1100 CALL MPI_WIN_FENCE( 0, rmawin_2d, ierr ) 1101 #endif 1102 1103 ! 1104 !-- TODO: describe in more detail what is happening in this IF/ELSE clause 1105 IF ( .NOT. pe_active_for_read ) THEN 1106 1107 is = nxl 1108 ie = nxr 1109 js = nys 1110 je = nyn 1111 1112 ELSE 1113 ! 1114 !-- Extra get for cyclic data north of prerun data 1115 is = nxl 1116 ie = nxr 1117 js = prerun_grid%nys+1 1118 je = nyn 1119 DO i = is, ie 1120 DO j = js, je 1121 i_remote = MOD(i,nx_on_file+1) 1122 j_remote = MOD(j,ny_on_file+1) 1123 rem_pe = remote_pe(i_remote,j_remote) 1124 rem_offs = rma_offset(i_remote,j_remote) 1125 nval = 1 1126 1127 #if defined( __parallel ) 1128 IF ( rem_pe /= myid ) THEN 1129 CALL MPI_GET( data(j,i), nval, MPI_REAL, rem_pe, rem_offs, nval, MPI_REAL, & 1130 rmawin_2d, ierr ) 1131 ELSE 1132 data(j,i) = rmabuf_2d(j_remote,i_remote) 1133 ENDIF 823 1134 #else 824 CALL posix_lseek( fh, array_position ) 825 CALL posix_read( fh, array_2d, SIZE( array_2d ) ) 826 #endif 827 828 IF ( include_total_domain_boundaries ) THEN 829 DO i = lb%nxl, lb%nxr 830 data(lb%nys-nbgp:lb%nyn-nbgp,i-nbgp) = array_2d(i,lb%nys:lb%nyn) 1135 data(j,i) = array_2d(i_remote,j_remote) 1136 #endif 1137 ENDDO 831 1138 ENDDO 832 IF ( debug_level >= 2) WRITE(9,*) 'r2f_ob ', TRIM(name),' ', SUM( data(nys:nyn,nxl:nxr) ) 833 ELSE 834 DO i = nxl, nxr 835 data(nys:nyn,i) = array_2d(i,nys:nyn) 1139 ! 1140 !-- Prepare setup for stripe right of prerun data 1141 is = prerun_grid%nxr+1 1142 ie = nxr 1143 js = nys 1144 je = nyn 1145 1146 ENDIF 1147 1148 DO i = is, ie 1149 DO j = js, je 1150 i_remote = MOD(i,nx_on_file+1) 1151 j_remote = MOD(j,ny_on_file+1) 1152 rem_pe = remote_pe(i_remote,j_remote) 1153 rem_offs = rma_offset(i_remote,j_remote) 1154 nval = 1 1155 1156 #if defined( __parallel ) 1157 IF ( rem_pe /= myid ) THEN 1158 CALL MPI_GET( data(j,i), nval, MPI_REAL, rem_pe, rem_offs, nval, MPI_REAL, & 1159 rmawin_2d, ierr ) 1160 ELSE 1161 data(j,i) = rmabuf_2d(j_remote,i_remote) 1162 ENDIF 1163 #else 1164 data(j,i) = array_2d(i_remote,j_remote) 1165 #endif 836 1166 ENDDO 837 IF ( debug_level >= 2) WRITE(9,*) 'r2f ', TRIM( name ),' ', SUM( data(nys:nyn,nxl:nxr) )838 ENDIF 839 840 CALL exchange_horiz_2d( data ) 841 842 ELSE843 message_string = '2d-REAL array "' // TRIM( name ) // '" not found in restart file' 844 CALL message( 'rrd_mpi_io_int', 'PA0722', 3, 2, 0, 6, 0 ) 845 END IF1167 ENDDO 1168 1169 #if defined( __parallel ) 1170 ! 1171 !-- Reopen RMA window to allow filling 1172 CALL MPI_WIN_FENCE( 0, rmawin_2d, ierr ) 1173 #endif 1174 1175 END SUBROUTINE rrd_mpi_io_real_2d_cyclic_fill 846 1176 847 1177 END SUBROUTINE rrd_mpi_io_real_2d … … 899 1229 !-- This kind of array is dimensioned in the caller subroutine 900 1230 !-- INTEGER, DIMENSION(nys:nyn,nxl:nxr) :: data 901 902 #if defined( __parallel ) 903 CALL sm_io%sm_node_barrier() ! Has no effect if I/O on limited number of cores is inactive 904 IF ( sm_io%iam_io_pe ) THEN 905 CALL MPI_FILE_SET_VIEW( fh, array_position, MPI_INTEGER, ft_2di_nb, 'native', & 906 MPI_INFO_NULL, ierr ) 907 CALL MPI_FILE_READ_ALL( fh, array_2di, SIZE( array_2di ), MPI_INTEGER, status, ierr ) 1231 IF ( cyclic_fill_mode ) THEN 1232 1233 CALL rrd_mpi_io_int_2d_cyclic_fill 1234 1235 ELSE 1236 1237 #if defined( __parallel ) 1238 CALL sm_io%sm_node_barrier() ! 
Has no effect if I/O on limited # of cores is inactive 1239 IF ( sm_io%iam_io_pe ) THEN 1240 CALL MPI_FILE_SET_VIEW( fh, array_position, MPI_INTEGER, ft_2di_nb, 'native', & 1241 MPI_INFO_NULL, ierr ) 1242 CALL MPI_FILE_READ_ALL( fh, array_2di, SIZE( array_2di ), MPI_INTEGER, status, & 1243 ierr ) 1244 ENDIF 1245 CALL sm_io%sm_node_barrier() 1246 #else 1247 CALL posix_lseek( fh, array_position ) 1248 CALL posix_read( fh, array_2di, SIZE( array_2di ) ) 1249 #endif 1250 DO j = nys, nyn 1251 DO i = nxl, nxr 1252 data(j-nys+1,i-nxl+1) = array_2di(i,j) 1253 ENDDO 1254 ENDDO 1255 908 1256 ENDIF 909 CALL sm_io%sm_node_barrier()910 #else911 CALL posix_lseek( fh, array_position )912 CALL posix_read( fh, array_2di, SIZE( array_2di ) )913 #endif914 915 DO j = nys, nyn916 DO i = nxl, nxr917 data(j-nys+1,i-nxl+1) = array_2di(i,j)918 ENDDO919 ENDDO920 1257 921 1258 ELSE … … 934 1271 ENDIF 935 1272 1273 1274 CONTAINS 1275 1276 SUBROUTINE rrd_mpi_io_int_2d_cyclic_fill 1277 1278 IMPLICIT NONE 1279 1280 INTEGER(iwp) :: i !< 1281 INTEGER(iwp) :: ie !< 1282 INTEGER(iwp) :: ierr !< 1283 INTEGER(iwp) :: is !< 1284 INTEGER(iwp) :: i_remote !< 1285 INTEGER(iwp) :: j !< 1286 INTEGER(iwp) :: je !< 1287 INTEGER(iwp) :: js !< 1288 INTEGER(iwp) :: j_remote !< 1289 INTEGER(iwp) :: nval !< 1290 INTEGER(iwp) :: rem_pe !< 1291 1292 INTEGER(KIND=MPI_ADDRESS_KIND) :: rem_offs !< 1293 1294 1295 CALL prerun_grid%activate_grid_from_this_class() 1296 1297 IF ( pe_active_for_read ) THEN 1298 CALL MPI_FILE_SET_VIEW( fh, array_position, MPI_INTEGER, ft_2di_nb, 'native', & 1299 MPI_INFO_NULL, ierr ) 1300 CALL MPI_FILE_READ_ALL( fh, array_2di, SIZE( array_2di ), MPI_INTEGER, status, ierr ) 1301 1302 DO i = nxl, nxr 1303 rmabuf_2di(nys:nyn,i) = array_2di(i,nys:nyn) 1304 ENDDO 1305 data(1:nny,1:nnx) = rmabuf_2di 1306 ENDIF 1307 1308 CALL mainrun_grid%activate_grid_from_this_class() 1309 1310 #if defined( __parallel ) 1311 ! 1312 !-- Close RMA window to allow remote access 1313 CALL MPI_WIN_FENCE( 0, rmawin_2di, ierr ) 1314 #endif 1315 1316 IF ( .NOT. pe_active_for_read ) THEN 1317 1318 is = nxl 1319 ie = nxr 1320 js = nys 1321 je = nyn 1322 1323 ELSE 1324 1325 is = nxl 1326 ie = nxr 1327 js = prerun_grid%nys+1 1328 je = nyn 1329 DO i = is, ie 1330 DO j = js, je 1331 i_remote = MOD(i,nx_on_file+1) 1332 j_remote = MOD(j,ny_on_file+1) 1333 rem_pe = remote_pe(i_remote,j_remote) 1334 rem_offs = rma_offset(i_remote,j_remote) 1335 nval = 1 1336 1337 #if defined( __parallel ) 1338 IF ( rem_pe /= myid ) THEN 1339 CALL MPI_GET( data(j-nys+1,i-nxl+1), nval, MPI_INTEGER, rem_pe, rem_offs, nval, & 1340 MPI_INTEGER, rmawin_2di, ierr ) 1341 ELSE 1342 data(j-nys+1,i-nxl+1) = rmabuf_2di(j_remote,i_remote) 1343 ENDIF 1344 #else 1345 data(j-nys+1,i-nxl+1) = array_2di(i_remote,j_remote) 1346 #endif 1347 ENDDO 1348 ENDDO 1349 is = prerun_grid%nxr+1 1350 ie = nxr 1351 js = nys 1352 je = nyn 1353 1354 ENDIF 1355 1356 DO i = is, ie 1357 DO j = js, je 1358 i_remote = MOD(i,nx_on_file+1) 1359 j_remote = MOD(j,ny_on_file+1) 1360 rem_pe = remote_pe(i_remote,j_remote) 1361 rem_offs = rma_offset(i_remote,j_remote) 1362 nval = 1 1363 #if defined( __parallel ) 1364 IF ( rem_pe /= myid ) THEN 1365 CALL MPI_GET( data(j-nys+1,i-nxl+1), nval, MPI_INTEGER, rem_pe, rem_offs, nval, & 1366 MPI_INTEGER, rmawin_2di, ierr) 1367 ELSE 1368 data(j-nys+1,i-nxl+1) = rmabuf_2di(j_remote,i_remote) 1369 ENDIF 1370 #else 1371 data(j-nys+1,i-nxl+1) = array_2di(i_remote,j_remote) 1372 #endif 1373 ENDDO 1374 ENDDO 1375 1376 #if defined( __parallel ) 1377 ! 
1378 !-- Reopen RMA window to allow filling 1379 CALL MPI_WIN_FENCE( 0, rmawin_2di, ierr ) 1380 #endif 1381 1382 END SUBROUTINE rrd_mpi_io_int_2d_cyclic_fill 1383 936 1384 END SUBROUTINE rrd_mpi_io_int_2d 937 1385 … … 950 1398 951 1399 INTEGER(iwp) :: i !< 1400 INTEGER(iwp) :: j !< 952 1401 953 1402 #if defined( __parallel ) … … 961 1410 962 1411 found = .FALSE. 1412 data = -1.0 963 1413 964 1414 DO i = 1, tgh%nr_arrays … … 971 1421 972 1422 IF ( found ) THEN 973 #if defined( __parallel ) 974 CALL sm_io%sm_node_barrier() ! Has no effect if I/O on limited number of cores is inactive 975 IF( sm_io%iam_io_pe ) THEN 1423 1424 IF ( cyclic_fill_mode ) THEN 1425 1426 CALL rrd_mpi_io_real_3d_cyclic_fill 1427 ! 1428 !-- Cyclic fill mode requires to use the "cyclic" communicator, in order to initialize 1429 !-- grid points at the outer boundaries (ghost layers) of the total domain. These points 1430 !-- are not contained in the prerun data, because the prerun used cyclic boundary conditions. 1431 CALL exchange_horiz( data, nbgp, alternative_communicator = 1 ) 1432 1433 ELSE 1434 #if defined( __parallel ) 1435 CALL sm_io%sm_node_barrier() ! Has no effect if I/O on limited # of cores is inactive 1436 IF( sm_io%iam_io_pe ) THEN 1437 CALL MPI_FILE_SET_VIEW( fh, array_position, MPI_REAL, ft_3d, 'native', MPI_INFO_NULL, & 1438 ierr ) 1439 CALL MPI_FILE_READ_ALL( fh, array_3d, SIZE( array_3d ), MPI_REAL, status, ierr ) 1440 ENDIF 1441 CALL sm_io%sm_node_barrier() 1442 #else 1443 CALL posix_lseek( fh, array_position ) 1444 CALL posix_read(fh, array_3d, SIZE( array_3d ) ) 1445 #endif 1446 IF ( include_total_domain_boundaries ) THEN 1447 DO i = iog%nxl, iog%nxr 1448 data(:,iog%nys-nbgp:iog%nyn-nbgp,i-nbgp) = array_3d(:,i,iog%nys:iog%nyn) 1449 ENDDO 1450 ELSE 1451 DO i = nxl, nxr 1452 data(:,nys:nyn,i) = array_3d(:,i,nys:nyn) 1453 ENDDO 1454 ENDIF 1455 1456 CALL exchange_horiz( data, nbgp ) 1457 1458 ENDIF 1459 1460 ELSE 1461 1462 message_string = '3d-REAL array "' // TRIM( name ) // '" not found in restart file' 1463 CALL message( 'rrd_mpi_io_real_3d', 'PA0722', 3, 2, 0, 6, 0 ) 1464 1465 ENDIF 1466 1467 1468 CONTAINS 1469 1470 SUBROUTINE rrd_mpi_io_real_3d_cyclic_fill 1471 1472 IMPLICIT NONE 1473 1474 INTEGER(iwp) :: i !< 1475 INTEGER(iwp) :: ie !< 1476 INTEGER(iwp) :: ierr !< 1477 INTEGER(iwp) :: is !< 1478 INTEGER(iwp) :: i_remote !< 1479 INTEGER(iwp) :: j !< 1480 INTEGER(iwp) :: je !< 1481 INTEGER(iwp) :: js !< 1482 INTEGER(iwp) :: j_remote !< 1483 INTEGER(iwp) :: nval !< 1484 INTEGER(iwp) :: rem_pe !< 1485 1486 INTEGER(KIND=MPI_ADDRESS_KIND) :: rem_offs !< 1487 1488 1489 CALL prerun_grid%activate_grid_from_this_class() 1490 1491 IF ( pe_active_for_read ) THEN 976 1492 CALL MPI_FILE_SET_VIEW( fh, array_position, MPI_REAL, ft_3d, 'native', MPI_INFO_NULL, & 977 1493 ierr ) 978 1494 CALL MPI_FILE_READ_ALL( fh, array_3d, SIZE( array_3d ), MPI_REAL, status, ierr ) 979 ENDIF 980 CALL sm_io%sm_node_barrier() 1495 1496 DO i = nxl, nxr 1497 rmabuf_3d(:,nys:nyn,i) = array_3d(:,i,nys:nyn) 1498 ENDDO 1499 data(:,nys:nyn,nxl:nxr) = rmabuf_3d 1500 ENDIF 1501 CALL mainrun_grid%activate_grid_from_this_class () 1502 1503 #if defined( __parallel ) 1504 ! 1505 !-- Close RMA window to allow remote access 1506 CALL MPI_WIN_FENCE( 0, rmawin_3d, ierr ) 1507 #endif 1508 1509 IF ( .NOT. 
pe_active_for_read ) THEN 1510 1511 is = nxl 1512 ie = nxr 1513 js = nys 1514 je = nyn 1515 1516 ELSE 1517 1518 is = nxl 1519 ie = nxr 1520 js = prerun_grid%nys+1 1521 je = nyn 1522 1523 DO i = is, ie 1524 DO j = js, je 1525 i_remote = MOD(i,nx_on_file+1) 1526 j_remote = MOD(j,ny_on_file+1) 1527 rem_pe = remote_pe(i_remote,j_remote) 1528 rem_offs = rma_offset(i_remote,j_remote)*(nzt-nzb+2) 1529 nval = nzt-nzb+2 1530 1531 #if defined( __parallel ) 1532 IF(rem_pe /= myid) THEN 1533 CALL MPI_GET( data(nzb,j,i), nval, MPI_REAL, rem_pe, rem_offs, nval, MPI_REAL, & 1534 rmawin_3d, ierr) 1535 ELSE 1536 data(:,j,i) = rmabuf_3d(:,j_remote,i_remote) 1537 ENDIF 981 1538 #else 982 CALL posix_lseek( fh, array_position ) 983 CALL posix_read(fh, array_3d, SIZE( array_3d ) ) 984 #endif 985 IF ( include_total_domain_boundaries ) THEN 986 DO i = lb%nxl, lb%nxr 987 data(:,lb%nys-nbgp:lb%nyn-nbgp,i-nbgp) = array_3d(:,i,lb%nys:lb%nyn) 1539 data(:,j,i) = array_3d(:,i_remote,j_remote) 1540 #endif 1541 ENDDO 988 1542 ENDDO 989 ELSE 990 DO i = nxl, nxr 991 data(:,nys:nyn,i) = array_3d(:,i,nys:nyn) 1543 is = prerun_grid%nxr+1 1544 ie = nxr 1545 js = nys 1546 je = nyn 1547 1548 ENDIF 1549 1550 DO i = is, ie 1551 DO j = js, je 1552 i_remote = MOD(i,nx_on_file+1) 1553 j_remote = MOD(j,ny_on_file+1) 1554 rem_pe = remote_pe(i_remote,j_remote) 1555 rem_offs = rma_offset(i_remote,j_remote) * ( nzt-nzb+2 ) 1556 nval = nzt-nzb+2 1557 1558 #if defined( __parallel ) 1559 IF ( rem_pe /= myid ) THEN 1560 CALL MPI_GET( data(nzb,j,i), nval, MPI_REAL, rem_pe, rem_offs, nval, MPI_REAL, & 1561 rmawin_3d, ierr) 1562 ELSE 1563 data(:,j,i) = rmabuf_3d(:,j_remote,i_remote) 1564 ENDIF 1565 #else 1566 data(:,j,i) = array_3d(:,i_remote,j_remote) 1567 #endif 992 1568 ENDDO 993 ENDIF 994 995 CALL exchange_horiz( data, nbgp ) 996 997 ELSE 998 999 message_string = '3d-REAL array "' // TRIM( name ) // '" not found in restart file' 1000 CALL message( 'rrd_mpi_io_real_3d', 'PA0722', 3, 2, 0, 6, 0 ) 1001 1002 ENDIF 1569 ENDDO 1570 1571 #if defined( __parallel ) 1572 ! 1573 !-- Reopen RMA window to allow filling 1574 CALL MPI_WIN_FENCE( 0, rmawin_3d, ierr ) 1575 #endif 1576 1577 END SUBROUTINE rrd_mpi_io_real_3d_cyclic_fill 1003 1578 1004 1579 END SUBROUTINE rrd_mpi_io_real_3d … … 1028 1603 1029 1604 LOGICAL :: found !< 1605 INTEGER(iwp) :: ierr !< 1030 1606 1031 1607 REAL(wp), INTENT(INOUT), DIMENSION(nzb_soil:nzt_soil,nysg:nyng,nxlg:nxrg) :: data !< 1032 1608 1609 1610 ! 1611 !-- Prerun data is not allowed to contain soil information so far 1612 IF ( cyclic_fill_mode ) THEN 1613 message_string = 'prerun data is not allowed to contain soil information' 1614 CALL message( 'rrd_mpi_io_real_3d_soil', 'PA0729', 3, 2, -1, 6, 0 ) 1615 ENDIF 1033 1616 1034 1617 found = .FALSE. … … 1058 1641 #endif 1059 1642 IF ( include_total_domain_boundaries ) THEN 1060 DO i = lb%nxl, lb%nxr1061 data(:, lb%nys-nbgp:lb%nyn-nbgp,i-nbgp) = array_3d_soil(:,i,lb%nys:lb%nyn)1643 DO i = iog%nxl, iog%nxr 1644 data(:,iog%nys-nbgp:iog%nyn-nbgp,i-nbgp) = array_3d_soil(:,i,iog%nys:iog%nyn) 1062 1645 ENDDO 1063 1646 ELSE … … 1222 1805 ! 
1223 1806 !-- Prepare output with outer boundaries 1224 DO i = lb%nxl, lb%nxr1225 array_2d(i, lb%nys:lb%nyn) = data(lb%nys-nbgp:lb%nyn-nbgp,i-nbgp)1807 DO i = iog%nxl, iog%nxr 1808 array_2d(i,iog%nys:iog%nyn) = data(iog%nys-nbgp:iog%nyn-nbgp,i-nbgp) 1226 1809 ENDDO 1227 1810 … … 1230 1813 !-- Prepare output without outer boundaries 1231 1814 DO i = nxl,nxr 1232 array_2d(i, lb%nys:lb%nyn) = data(nys:nyn,i)1815 array_2d(i,iog%nys:iog%nyn) = data(nys:nyn,i) 1233 1816 ENDDO 1234 1817 … … 1249 1832 !-- Type conversion required, otherwise right hand side brackets are calculated assuming 4 byte INT. 1250 1833 !-- Maybe a compiler problem. 1251 array_position = array_position + ( INT( lb%ny, KIND=rd_offset_kind ) + 1 ) *&1252 ( INT( lb%nx, KIND=rd_offset_kind ) + 1 ) * wp1834 array_position = array_position + ( INT( iog%ny, KIND=rd_offset_kind ) + 1 ) * & 1835 ( INT( iog%nx, KIND=rd_offset_kind ) + 1 ) * wp 1253 1836 1254 1837 END SUBROUTINE wrd_mpi_io_real_2d … … 1365 1948 !-- index order of the array in the same way, i.e. the first dimension should be along x and the 1366 1949 !-- second along y. For this reason, the original PALM data need to be swaped. 1367 DO i = lb%nxl, lb%nxr1368 array_3d(:,i, lb%nys:lb%nyn) = data(:,lb%nys-nbgp:lb%nyn-nbgp,i-nbgp)1950 DO i = iog%nxl, iog%nxr 1951 array_3d(:,i,iog%nys:iog%nyn) = data(:,iog%nys-nbgp:iog%nyn-nbgp,i-nbgp) 1369 1952 ENDDO 1370 1953 … … 1373 1956 !-- Prepare output of 3d-REAL-array without ghost layers 1374 1957 DO i = nxl, nxr 1375 array_3d(:,i, lb%nys:lb%nyn) = data(:,nys:nyn,i)1958 array_3d(:,i,iog%nys:iog%nyn) = data(:,nys:nyn,i) 1376 1959 ENDDO 1377 1960 … … 1391 1974 !-- Type conversion required, otherwise right hand side brackets are calculated assuming 4 byte INT. 1392 1975 !-- Maybe a compiler problem. 1393 array_position = array_position + INT( (nz+2), KIND = rd_offset_kind ) *&1394 INT( ( lb%ny+1), KIND = rd_offset_kind ) *&1395 INT( ( lb%nx+1), KIND = rd_offset_kind ) * wp1976 array_position = array_position + INT( (nz+2), KIND = rd_offset_kind ) * & 1977 INT( (iog%ny+1), KIND = rd_offset_kind ) * & 1978 INT( (iog%nx+1), KIND = rd_offset_kind ) * wp 1396 1979 1397 1980 END SUBROUTINE wrd_mpi_io_real_3d … … 1435 2018 #endif 1436 2019 1437 IF ( include_total_domain_boundaries 2020 IF ( include_total_domain_boundaries) THEN 1438 2021 ! 1439 2022 !-- Prepare output of 3d-REAL-array with ghost layers. In the virtual PE grid, the first … … 1441 2024 !-- index order of the array in the same way, i.e. the first dimension should be along x and the 1442 2025 !-- second along y. For this reason, the original PALM data need to be swaped. 1443 DO i = lb%nxl, lb%nxr1444 array_3d_soil(:,i, lb%nys:lb%nyn) = data(:,lb%nys-nbgp:lb%nyn-nbgp,i-nbgp)2026 DO i = iog%nxl, iog%nxr 2027 array_3d_soil(:,i,iog%nys:iog%nyn) = data(:,iog%nys-nbgp:iog%nyn-nbgp,i-nbgp) 1445 2028 ENDDO 1446 2029 … … 1449 2032 !-- Prepare output of 3d-REAL-array without ghost layers 1450 2033 DO i = nxl, nxr 1451 array_3d_soil(:,i, lb%nys:lb%nyn) = data(:,nys:nyn,i)2034 array_3d_soil(:,i,iog%nys:iog%nyn) = data(:,nys:nyn,i) 1452 2035 ENDDO 1453 2036 … … 1469 2052 !-- Maybe a compiler problem. 
1470 2053 array_position = array_position + INT( (nzt_soil-nzb_soil+1), KIND = rd_offset_kind ) * & 1471 INT( ( lb%ny+1),KIND = rd_offset_kind ) * &1472 INT( ( lb%nx+1),KIND = rd_offset_kind ) * wp2054 INT( (iog%ny+1), KIND = rd_offset_kind ) * & 2055 INT( (iog%nx+1), KIND = rd_offset_kind ) * wp 1473 2056 1474 2057 END SUBROUTINE wrd_mpi_io_real_3d_soil … … 1566 2149 ENDDO 1567 2150 2151 1568 2152 IF ( found ) THEN 2153 1569 2154 ! 1570 2155 !-- Set default view 1571 2156 #if defined( __parallel ) 1572 IF ( sm_io%iam_io_pe ) THEN 1573 CALL MPI_FILE_SET_VIEW( fh, offset, MPI_BYTE, MPI_BYTE, 'native', MPI_INFO_NULL, ierr ) 1574 CALL MPI_FILE_SEEK( fh, array_position, MPI_SEEK_SET, ierr ) 1575 CALL MPI_FILE_READ_ALL( fh, data, SIZE( data ), MPI_REAL, status, ierr ) 1576 ENDIF 1577 IF ( sm_io%is_sm_active() ) THEN 1578 CALL MPI_BCAST( data, SIZE( data ), MPI_REAL, 0, sm_io%comm_shared, ierr ) 2157 IF ( cyclic_fill_mode ) THEN !kk This may be the general solution for all cases 2158 IF ( pe_active_for_read ) THEN 2159 CALL MPI_FILE_SET_VIEW( fh, offset, MPI_BYTE, MPI_BYTE, 'native', MPI_INFO_NULL, ierr ) 2160 CALL MPI_FILE_SEEK( fh, array_position, MPI_SEEK_SET, ierr ) 2161 CALL MPI_FILE_READ_ALL( fh, data, SIZE( data ), MPI_REAL, status, ierr ) 2162 ENDIF 2163 CALL MPI_BCAST( data, SIZE( data ), MPI_REAL, 0, comm2d, ierr ) 2164 ELSE 2165 IF ( sm_io%iam_io_pe ) THEN 2166 CALL MPI_FILE_SET_VIEW( fh, offset, MPI_BYTE, MPI_BYTE, 'native', MPI_INFO_NULL, ierr ) 2167 CALL MPI_FILE_SEEK( fh, array_position, MPI_SEEK_SET, ierr ) 2168 CALL MPI_FILE_READ_ALL( fh, data, SIZE( data ), MPI_REAL, status, ierr ) 2169 ENDIF 2170 IF ( sm_io%is_sm_active() ) THEN 2171 CALL MPI_BCAST( data, SIZE( data ), MPI_REAL, 0, sm_io%comm_shared, ierr ) 2172 ENDIF 1579 2173 ENDIF 1580 2174 #else … … 1723 2317 !-- Set default view 1724 2318 #if defined( __parallel ) 1725 IF ( sm_io%iam_io_pe ) THEN 1726 CALL MPI_FILE_SET_VIEW( fh, offset, MPI_BYTE, MPI_BYTE, 'native', MPI_INFO_NULL, ierr ) 1727 CALL MPI_FILE_SEEK( fh, array_position, MPI_SEEK_SET, ierr ) 1728 CALL MPI_FILE_READ_ALL( fh, data, SIZE( data), MPI_INTEGER, status, ierr ) 1729 ENDIF 1730 IF ( sm_io%is_sm_active() ) THEN 1731 CALL MPI_BCAST( data, SIZE( data ), MPI_INTEGER, 0, sm_io%comm_shared, ierr ) 2319 IF ( cyclic_fill_mode ) THEN !kk This may be the general solution for all cases 2320 IF ( pe_active_for_read ) THEN 2321 CALL MPI_FILE_SET_VIEW( fh, offset, MPI_BYTE, MPI_BYTE, 'native', MPI_INFO_NULL, ierr ) 2322 CALL MPI_FILE_SEEK( fh, array_position, MPI_SEEK_SET, ierr ) 2323 CALL MPI_FILE_READ_ALL( fh, data, SIZE( data), MPI_INTEGER, status, ierr ) 2324 ENDIF 2325 CALL MPI_BCAST( data, SIZE( data ), MPI_REAL, 0, comm2d, ierr ) 2326 ELSE 2327 IF ( sm_io%iam_io_pe ) THEN 2328 CALL MPI_FILE_SET_VIEW( fh, offset, MPI_BYTE, MPI_BYTE, 'native', MPI_INFO_NULL, ierr ) 2329 CALL MPI_FILE_SEEK( fh, array_position, MPI_SEEK_SET, ierr ) 2330 CALL MPI_FILE_READ_ALL( fh, data, SIZE( data), MPI_INTEGER, status, ierr ) 2331 ENDIF 2332 IF ( sm_io%is_sm_active() ) THEN 2333 CALL MPI_BCAST( data, SIZE( data ), MPI_INTEGER, 0, sm_io%comm_shared, ierr ) 2334 ENDIF 1732 2335 ENDIF 1733 2336 #else … … 1976 2579 lo_first_index = 1 1977 2580 1978 IF ( MAXVAL( m_global_start ) == -1 ) RETURN ! 
Nothing to do on this PE1979 1980 2581 IF ( PRESENT( first_index ) ) THEN 1981 2582 lo_first_index = first_index … … 1996 2597 IF ( found ) THEN 1997 2598 1998 DO i = nxl, nxr 1999 DO j = nys, nyn 2000 2001 IF ( m_global_start(j,i) > 0 ) THEN 2002 disp = array_position+(m_global_start(j,i)-1) * wp 2003 nr_words = m_end_index(j,i)-m_start_index(j,i)+1 2004 nr_bytes = nr_words * wp 2005 ENDIF 2006 IF ( disp >= 0 .AND. disp_f == -1 ) THEN ! First entry 2007 disp_f = disp 2008 nr_bytes_f = 0 2009 i_f = i 2010 j_f = j 2011 ENDIF 2012 IF ( j == nyn .AND. i == nxr ) THEN ! Last entry 2013 disp_n = -1 2014 IF ( nr_bytes > 0 ) THEN 2015 nr_bytes_f = nr_bytes_f+nr_bytes 2599 IF ( cyclic_fill_mode ) THEN 2600 2601 CALL rrd_mpi_io_surface_cyclic_fill 2602 2603 ELSE 2604 2605 IF ( MAXVAL( m_global_start ) == -1 ) RETURN ! Nothing to do on this PE 2606 DO i = nxl, nxr 2607 DO j = nys, nyn 2608 2609 IF ( m_global_start(j,i) > 0 ) THEN 2610 disp = array_position+(m_global_start(j,i)-1) * wp 2611 nr_words = m_end_index(j,i)-m_start_index(j,i)+1 2612 nr_bytes = nr_words * wp 2016 2613 ENDIF 2017 ELSEIF ( j == nyn ) THEN ! Next x 2018 IF ( m_global_start(nys,i+1) > 0 .AND. disp > 0 ) THEN 2019 disp_n = array_position + ( m_global_start(nys,i+1) - 1 ) * wp 2614 IF ( disp >= 0 .AND. disp_f == -1 ) THEN ! First entry 2615 disp_f = disp 2616 nr_bytes_f = 0 2617 i_f = i 2618 j_f = j 2619 ENDIF 2620 IF ( j == nyn .AND. i == nxr ) THEN ! Last entry 2621 disp_n = -1 2622 IF ( nr_bytes > 0 ) THEN 2623 nr_bytes_f = nr_bytes_f+nr_bytes 2624 ENDIF 2625 ELSEIF ( j == nyn ) THEN ! Next x 2626 IF ( m_global_start(nys,i+1) > 0 .AND. disp > 0 ) THEN 2627 disp_n = array_position + ( m_global_start(nys,i+1) - 1 ) * wp 2628 ELSE 2629 CYCLE 2630 ENDIF 2020 2631 ELSE 2021 CYCLE 2632 IF ( m_global_start(j+1,i) > 0 .AND. disp > 0 ) THEN 2633 disp_n = array_position + ( m_global_start(j+1,i) - 1 ) * wp 2634 ELSE 2635 CYCLE 2636 ENDIF 2022 2637 ENDIF 2023 ELSE 2024 IF ( m_global_start(j+1,i) > 0 .AND. disp > 0 ) THEN 2025 disp_n = array_position + ( m_global_start(j+1,i) - 1 ) * wp 2026 ELSE 2027 CYCLE 2638 2639 2640 IF ( disp + nr_bytes == disp_n ) THEN ! Contiguous block 2641 nr_bytes_f = nr_bytes_f + nr_bytes 2642 ELSE ! Read 2643 #if defined( __parallel ) 2644 CALL MPI_FILE_SEEK( fhs, disp_f, MPI_SEEK_SET, ierr ) 2645 nr_words = nr_bytes_f / wp 2646 CALL MPI_FILE_READ( fhs, data(m_start_index(j_f,i_f)), nr_words, MPI_REAL, status, & 2647 ierr ) 2648 #else 2649 CALL posix_lseek( fh, disp_f ) 2650 CALL posix_read( fh, data(m_start_index(j_f,i_f):), nr_bytes_f ) 2651 #endif 2652 disp_f = disp 2653 nr_bytes_f = nr_bytes 2654 i_f = i 2655 j_f = j 2028 2656 ENDIF 2029 ENDIF 2030 2031 2032 IF ( disp + nr_bytes == disp_n ) THEN ! Contiguous block 2033 nr_bytes_f = nr_bytes_f + nr_bytes 2034 ELSE ! Read 2035 #if defined( __parallel ) 2036 CALL MPI_FILE_SEEK( fhs, disp_f, MPI_SEEK_SET, ierr ) 2037 nr_words = nr_bytes_f / wp 2038 CALL MPI_FILE_READ( fhs, data(m_start_index(j_f,i_f)), nr_words, MPI_REAL, status, & 2039 ierr ) 2040 #else 2041 CALL posix_lseek( fh, disp_f ) 2042 CALL posix_read( fh, data(m_start_index(j_f,i_f):), nr_bytes_f ) 2043 #endif 2044 disp_f = disp 2045 nr_bytes_f = nr_bytes 2046 i_f = i 2047 j_f = j 2048 ENDIF 2049 2657 2658 ENDDO 2050 2659 ENDDO 2051 ENDDO 2660 ENDIF 2661 2052 2662 2053 2663 ELSE … … 2064 2674 ! lo_first_index,nr_val, SUM( data(1:nr_val) ) 2065 2675 ! 
ENDIF 2676 2677 2678 CONTAINS 2679 2680 SUBROUTINE rrd_mpi_io_surface_cyclic_fill 2681 2682 IMPLICIT NONE 2683 2684 INTEGER(iwp) :: i !< 2685 INTEGER(iwp) :: ie !< 2686 INTEGER(iwp) :: ierr !< 2687 INTEGER(iwp) :: is !< 2688 INTEGER(iwp) :: i_remote !< 2689 INTEGER(iwp) :: j !< 2690 INTEGER(iwp) :: je !< 2691 INTEGER(iwp) :: js !< 2692 INTEGER(iwp) :: j_remote !< 2693 INTEGER(iwp) :: nval !< 2694 INTEGER(iwp) :: rem_pe !< 2695 2696 INTEGER(KIND=MPI_ADDRESS_KIND) :: rem_offs !< 2697 2698 LOGICAL :: write_done !< 2699 2700 2701 ! 2702 !-- In the current version, there is only 1 value per grid cell allowed. 2703 !-- In this special case, the cyclical repetition can be done with the same method as for 2d-real 2704 !-- array. 2705 CALL prerun_grid%activate_grid_from_this_class() 2706 2707 IF ( pe_active_for_read ) THEN 2708 rmabuf_2d = -1.0 2709 DO i = nxl, nxr 2710 DO j = nys, nyn 2711 2712 IF ( m_global_start(j,i) > 0 ) THEN 2713 disp = array_position+(m_global_start(j,i)-1) * wp 2714 nr_words = m_end_index(j,i)-m_start_index(j,i)+1 2715 nr_bytes = nr_words * wp 2716 ENDIF 2717 IF ( disp >= 0 .AND. disp_f == -1 ) THEN ! First entry 2718 disp_f = disp 2719 nr_bytes_f = 0 2720 write_done = .TRUE. 2721 ENDIF 2722 IF( write_done ) THEN 2723 i_f = i 2724 j_f = j 2725 write_done = .FALSE. 2726 ENDIF 2727 2728 IF ( j == nyn .AND. i == nxr ) THEN ! Last entry 2729 disp_n = -1 2730 IF ( nr_bytes > 0 ) THEN 2731 nr_bytes_f = nr_bytes_f+nr_bytes 2732 ENDIF 2733 ELSEIF ( j == nyn ) THEN ! Next x 2734 IF ( m_global_start(nys,i+1) > 0 .AND. disp > 0 ) THEN 2735 disp_n = array_position + ( m_global_start(nys,i+1) - 1 ) * wp 2736 ELSE 2737 CYCLE 2738 ENDIF 2739 ELSE 2740 IF ( m_global_start(j+1,i) > 0 .AND. disp > 0 ) THEN 2741 disp_n = array_position + ( m_global_start(j+1,i) - 1 ) * wp 2742 ELSE 2743 CYCLE 2744 ENDIF 2745 ENDIF 2746 2747 2748 IF ( disp + nr_bytes == disp_n ) THEN ! Contiguous block 2749 nr_bytes_f = nr_bytes_f + nr_bytes 2750 ELSE ! Read 2751 #if defined( __parallel ) 2752 CALL MPI_FILE_SEEK( fhs, disp_f, MPI_SEEK_SET, ierr ) 2753 nr_words = nr_bytes_f / wp 2754 CALL MPI_FILE_READ( fhs, rmabuf_2d(j_f,i_f), nr_words, MPI_REAL, status, ierr ) 2755 #else 2756 CALL posix_lseek( fh, disp_f ) 2757 CALL posix_read( fh, rmabuf_2d(j_f,i_f), nr_bytes_f ) 2758 #endif 2759 2760 disp_f = disp 2761 nr_bytes_f = nr_bytes 2762 write_done = .TRUE. 2763 ENDIF 2764 2765 ENDDO 2766 ENDDO 2767 2768 ENDIF 2769 2770 CALL mainrun_grid%activate_grid_from_this_class() 2771 2772 #if defined( __parallel ) 2773 ! 2774 !-- Close RMA window to allow remote access 2775 CALL MPI_WIN_FENCE( 0, rmawin_2d, ierr ) 2776 #endif 2777 2778 IF ( .NOT. 
pe_active_for_read ) THEN 2779 2780 is = nxl 2781 ie = nxr 2782 js = nys 2783 je = nyn 2784 2785 ELSE 2786 2787 is = nxl 2788 ie = nxr 2789 js = prerun_grid%nys+1 2790 je = nyn 2791 2792 DO i = is, ie 2793 DO j = js, je 2794 i_remote = MOD(i,nx_on_file+1) 2795 j_remote = MOD(j,ny_on_file+1) 2796 rem_pe = remote_pe(i_remote,j_remote) 2797 rem_offs = rma_offset(i_remote,j_remote) 2798 nval = 1 2799 2800 #if defined( __parallel ) 2801 IF ( rem_pe /= myid ) THEN 2802 CALL MPI_GET( data(m_start_index(j,i)), nval, MPI_REAL, rem_pe, rem_offs, nval, & 2803 MPI_REAL, rmawin_2d, ierr) 2804 ELSE 2805 data(m_start_index(j,i)) = rmabuf_2d(j_remote,i_remote) 2806 ENDIF 2807 #else 2808 data(m_start_index(j,i)) = array_2d(i_remote,j_remote) 2809 #endif 2810 ENDDO 2811 ENDDO 2812 is = prerun_grid%nxr+1 2813 ie = nxr 2814 js = nys 2815 je = nyn 2816 2817 ENDIF 2818 2819 DO i = is, ie 2820 DO j = js, je 2821 i_remote = MOD(i,nx_on_file+1) 2822 j_remote = MOD(j,ny_on_file+1) 2823 rem_pe = remote_pe(i_remote,j_remote) 2824 rem_offs = rma_offset(i_remote,j_remote) 2825 nval = 1 2826 2827 #if defined( __parallel ) 2828 IF ( rem_pe /= myid ) THEN 2829 CALL MPI_GET( data(m_start_index(j,i)), nval, MPI_REAL, rem_pe, rem_offs, nval, & 2830 MPI_REAL, rmawin_2d, ierr) 2831 ELSE 2832 data(m_start_index(j,i)) = rmabuf_2d(j_remote,i_remote) 2833 ENDIF 2834 #else 2835 data(m_tart_index(j,i)) = array_2d(i_remote,j_remote) 2836 #endif 2837 ENDDO 2838 ENDDO 2839 2840 #if defined( __parallel ) 2841 ! 2842 !-- Reopen RMA window to allow filling 2843 CALL MPI_WIN_FENCE( 0, rmawin_2d, ierr ) 2844 #endif 2845 2846 END SUBROUTINE rrd_mpi_io_surface_cyclic_fill 2066 2847 2067 2848 END SUBROUTINE rrd_mpi_io_surface … … 2254 3035 tgh%nr_real = header_real_index - 1 2255 3036 tgh%nr_arrays = header_array_index - 1 2256 tgh%total_nx = lb%nx + 12257 tgh%total_ny = lb%ny + 13037 tgh%total_nx = iog%nx + 1 3038 tgh%total_ny = iog%ny + 1 2258 3039 IF ( include_total_domain_boundaries ) THEN ! Not sure, if LOGICAL interpretation is the same for all compilers, 2259 3040 tgh%i_outer_bound = 1 ! therefore store as INTEGER in general header … … 2368 3149 ENDIF 2369 3150 #endif 2370 3151 ! 3152 !-- Free RMA windows 3153 IF ( cyclic_fill_mode ) THEN 3154 CALL MPI_WIN_FREE( rmawin_2di, ierr ) 3155 CALL MPI_WIN_FREE( rmawin_2d, ierr ) 3156 CALL MPI_WIN_FREE( rmawin_3d, ierr ) 3157 ENDIF 3158 3159 IF (.NOT. pe_active_for_read ) RETURN 3160 ! 3161 !-- TODO: better explain the following message 3162 !-- In case on non cyclic read, pe_active_for_read is set .TRUE. 2371 3163 IF ( sm_io%iam_io_pe ) THEN 2372 3164 … … 2402 3194 2403 3195 INTEGER(iwp) :: i !< loop index 3196 INTEGER(iwp) :: j !< loop index 2404 3197 INTEGER(KIND=rd_offset_kind) :: offset !< 2405 3198 … … 2412 3205 2413 3206 2414 INTEGER, INTENT(IN ), DIMENSION(nys:nyn,nxl:nxr) :: end_index !<2415 INTEGER, INTENT(OUT), DIMENSION(nys:nyn,nxl:nxr) :: global_start !<2416 INTEGER, INTENT(IN ), DIMENSION(nys:nyn,nxl:nxr) :: start_index !<3207 INTEGER, INTENT(INOUT), DIMENSION(nys:nyn,nxl:nxr) :: end_index !< 3208 INTEGER, INTENT(OUT), DIMENSION(nys:nyn,nxl:nxr) :: global_start !< 3209 INTEGER, INTENT(INOUT), DIMENSION(nys:nyn,nxl:nxr) :: start_index !< 2417 3210 2418 3211 LOGICAL, INTENT(OUT) :: data_to_write !< returns, if surface data have to be written 2419 2420 2421 offset = 02422 lo_nr_val= 02423 lo_nr_val(myid) = MAXVAL( end_index )2424 #if defined( __parallel )2425 CALL MPI_ALLREDUCE( lo_nr_val, all_nr_val, numprocs, MPI_INTEGER, MPI_SUM, comm2d, ierr )2426 IF ( ft_surf /= -1 .AND. 
sm_io%iam_io_pe ) THEN2427 CALL MPI_TYPE_FREE( ft_surf, ierr ) ! If set, free last surface filetype2428 ENDIF2429 2430 IF ( win_surf /= -1 ) THEN2431 IF ( sm_io%is_sm_active() ) THEN2432 CALL MPI_WIN_FREE( win_surf, ierr )2433 ENDIF2434 win_surf = -12435 ENDIF2436 2437 IF ( sm_io%is_sm_active() .AND. rd_flag ) THEN2438 IF ( fhs == -1 ) THEN2439 CALL MPI_FILE_OPEN( comm2d, TRIM( io_file_name ), MPI_MODE_RDONLY, MPI_INFO_NULL, fhs, &2440 ierr )2441 ENDIF2442 ELSE2443 fhs = fh2444 ENDIF2445 #else2446 all_nr_val(myid) = lo_nr_val(myid)2447 #endif2448 nr_val = lo_nr_val(myid)2449 2450 total_number_of_surface_values = 02451 DO i = 0, numprocs-12452 IF ( i == myid ) THEN2453 glo_start = total_number_of_surface_values + 12454 ENDIF2455 total_number_of_surface_values = total_number_of_surface_values + all_nr_val(i)2456 ENDDO2457 3212 2458 3213 ! 2459 3214 !-- Actions during reading 2460 3215 IF ( rd_flag ) THEN 3216 ! 3217 !-- Set start index and end index for the mainrun grid. 3218 !-- ATTENTION: This works only for horizontal surfaces with one vale per grid cell!!! 3219 IF ( cyclic_fill_mode ) THEN 3220 DO i = nxl, nxr 3221 DO j = nys, nyn 3222 start_index (j,i) = (i-nxl) * nny + j - nys + 1 3223 end_index (j,i) = start_index(j,i) 3224 ENDDO 3225 ENDDO 3226 ENDIF 3227 2461 3228 IF ( .NOT. ALLOCATED( m_start_index ) ) ALLOCATE( m_start_index(nys:nyn,nxl:nxr) ) 2462 3229 IF ( .NOT. ALLOCATED( m_end_index ) ) ALLOCATE( m_end_index(nys:nyn,nxl:nxr) ) … … 2469 3236 nr_val = MAXVAL( end_index ) 2470 3237 3238 ENDIF 3239 3240 IF ( .NOT. pe_active_for_read ) RETURN 3241 3242 IF ( cyclic_fill_mode ) CALL prerun_grid%activate_grid_from_this_class() 3243 3244 offset = 0 3245 lo_nr_val= 0 3246 lo_nr_val(myid) = MAXVAL( end_index ) 3247 #if defined( __parallel ) 3248 CALL MPI_ALLREDUCE( lo_nr_val, all_nr_val, numprocs, MPI_INTEGER, MPI_SUM, comm2d, ierr ) 3249 IF ( ft_surf /= -1 .AND. sm_io%iam_io_pe ) THEN 3250 CALL MPI_TYPE_FREE( ft_surf, ierr ) ! If set, free last surface filetype 3251 ENDIF 3252 3253 IF ( win_surf /= -1 ) THEN 3254 IF ( sm_io%is_sm_active() ) THEN 3255 CALL MPI_WIN_FREE( win_surf, ierr ) 3256 ENDIF 3257 win_surf = -1 3258 ENDIF 3259 3260 IF ( sm_io%is_sm_active() .AND. rd_flag ) THEN 3261 IF ( fhs == -1 ) THEN 3262 CALL MPI_FILE_OPEN( comm2d, TRIM( io_file_name ), MPI_MODE_RDONLY, MPI_INFO_NULL, fhs, & 3263 ierr ) 3264 ENDIF 3265 ELSE 3266 fhs = fh 3267 ENDIF 3268 #else 3269 all_nr_val(myid) = lo_nr_val(myid) 3270 #endif 3271 nr_val = lo_nr_val(myid) 3272 3273 total_number_of_surface_values = 0 3274 DO i = 0, numprocs-1 3275 IF ( i == myid ) THEN 3276 glo_start = total_number_of_surface_values + 1 3277 ENDIF 3278 total_number_of_surface_values = total_number_of_surface_values + all_nr_val(i) 3279 ENDDO 3280 3281 ! 3282 !-- Actions during reading 3283 IF ( rd_flag ) THEN 3284 2471 3285 #if defined( __parallel ) 2472 3286 CALL MPI_FILE_SET_VIEW( fhs, offset, MPI_BYTE, MPI_BYTE, 'native', MPI_INFO_NULL, ierr ) 2473 3287 #endif 2474 3288 ENDIF 3289 3290 IF ( cyclic_fill_mode ) CALL mainrun_grid%activate_grid_from_this_class() 2475 3291 2476 3292 ! … … 2563 3379 INTEGER, DIMENSION(3) :: start3 !< 2564 3380 2565 TYPE( local_boundaries) :: save_io_grid !< temporary variable to store grid settings3381 TYPE(domain_decomposition_grid_features) :: save_io_grid !< temporary variable to store grid settings 2566 3382 2567 3383 … … 2569 3385 save_io_grid = sm_io%io_grid 2570 3386 ENDIF 3387 3388 IF( .NOT. 
pe_active_for_read ) RETURN 3389 3390 IF ( cyclic_fill_mode ) CALL prerun_grid%activate_grid_from_this_class() 2571 3391 2572 3392 ! … … 2576 3396 IF ( include_total_domain_boundaries ) THEN 2577 3397 2578 lb%nxl = nxl + nbgp2579 lb%nxr = nxr + nbgp2580 lb%nys = nys + nbgp2581 lb%nyn = nyn + nbgp2582 lb%nnx = nnx2583 lb%nny = nny2584 lb%nx = nx + 2 * nbgp2585 lb%ny = ny + 2 * nbgp3398 iog%nxl = nxl + nbgp 3399 iog%nxr = nxr + nbgp 3400 iog%nys = nys + nbgp 3401 iog%nyn = nyn + nbgp 3402 iog%nnx = nnx 3403 iog%nny = nny 3404 iog%nx = nx + 2 * nbgp 3405 iog%ny = ny + 2 * nbgp 2586 3406 IF ( myidx == 0 ) THEN 2587 lb%nxl = lb%nxl - nbgp2588 lb%nnx = lb%nnx + nbgp3407 iog%nxl = iog%nxl - nbgp 3408 iog%nnx = iog%nnx + nbgp 2589 3409 ENDIF 2590 3410 IF ( myidx == npex-1 .OR. npex == -1 ) THEN ! npex == 1 if -D__parallel not set 2591 lb%nxr = lb%nxr + nbgp2592 lb%nnx = lb%nnx + nbgp3411 iog%nxr = iog%nxr + nbgp 3412 iog%nnx = iog%nnx + nbgp 2593 3413 ENDIF 2594 3414 IF ( myidy == 0 ) THEN 2595 lb%nys = lb%nys - nbgp2596 lb%nny = lb%nny + nbgp3415 iog%nys = iog%nys - nbgp 3416 iog%nny = iog%nny + nbgp 2597 3417 ENDIF 2598 3418 IF ( myidy == npey-1 .OR. npey == -1 ) THEN ! npey == 1 if -D__parallel not set 2599 lb%nyn = lb%nyn + nbgp2600 lb%nny = lb%nny + nbgp3419 iog%nyn = iog%nyn + nbgp 3420 iog%nny = iog%nny + nbgp 2601 3421 ENDIF 2602 3422 … … 2605 3425 ELSE 2606 3426 2607 lb%nxl = nxl2608 lb%nxr = nxr2609 lb%nys = nys2610 lb%nyn = nyn2611 lb%nnx = nnx2612 lb%nny = nny2613 lb%nx = nx2614 lb%ny = ny3427 iog%nxl = nxl 3428 iog%nxr = nxr 3429 iog%nys = nys 3430 iog%nyn = nyn 3431 iog%nnx = nnx 3432 iog%nny = nny 3433 iog%nx = nx 3434 iog%ny = ny 2615 3435 2616 3436 ENDIF … … 2626 3446 #endif 2627 3447 ELSE 2628 ALLOCATE( array_2d( lb%nxl:lb%nxr,lb%nys:lb%nyn) )3448 ALLOCATE( array_2d(iog%nxl:iog%nxr,iog%nys:iog%nyn) ) 2629 3449 ALLOCATE( array_2di(nxl:nxr,nys:nyn) ) 2630 ALLOCATE( array_3d(nzb:nzt+1, lb%nxl:lb%nxr,lb%nys:lb%nyn) )2631 sm_io%io_grid = lb3450 ALLOCATE( array_3d(nzb:nzt+1,iog%nxl:iog%nxr,iog%nys:iog%nyn) ) 3451 sm_io%io_grid = iog 2632 3452 ENDIF 2633 3453 2634 3454 ! 2635 3455 !-- Create filetype for 2d-REAL array with ghost layers around the total domain 2636 dims2(1) = lb%nx + 12637 dims2(2) = lb%ny + 13456 dims2(1) = iog%nx + 1 3457 dims2(2) = iog%ny + 1 2638 3458 2639 3459 lize2(1) = sm_io%io_grid%nnx … … 2683 3503 !-- Create filetype for 3d-REAL array 2684 3504 dims3(1) = nz + 2 2685 dims3(2) = lb%nx + 12686 dims3(3) = lb%ny + 13505 dims3(2) = iog%nx + 1 3506 dims3(3) = iog%ny + 1 2687 3507 2688 3508 lize3(1) = dims3(1) … … 2701 3521 ENDIF 2702 3522 #endif 3523 3524 IF ( cyclic_fill_mode ) CALL mainrun_grid%activate_grid_from_this_class() 2703 3525 2704 3526 END SUBROUTINE rd_mpi_io_create_filetypes … … 2730 3552 win_3ds ) 2731 3553 ELSE 2732 ALLOCATE( array_3d_soil(nzb_soil:nzt_soil, lb%nxl:lb%nxr,lb%nys:lb%nyn) )2733 sm_io%io_grid = lb3554 ALLOCATE( array_3d_soil(nzb_soil:nzt_soil,iog%nxl:iog%nxr,iog%nys:iog%nyn) ) 3555 sm_io%io_grid = iog 2734 3556 ENDIF 2735 3557 … … 2737 3559 !-- Create filetype for 3d-soil array 2738 3560 dims3(1) = nzt_soil - nzb_soil + 1 2739 dims3(2) = lb%nx + 12740 dims3(3) = lb%ny + 13561 dims3(2) = iog%nx + 1 3562 dims3(3) = iog%ny + 1 2741 3563 2742 3564 lize3(1) = dims3(1) -
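For orientation, the heart of the new *_cyclic_fill read routines above is an index mapping: every point (j,i) of the main-run grid is folded back onto the prerun grid with MOD, and remote_pe / rma_offset then give the PE that holds that prerun point and its position inside that PE's RMA buffer, from which the value is fetched with MPI_GET (or copied locally). The following is a minimal, serial sketch of the mapping alone, with no MPI and no RMA windows; the domain sizes and the driver program are illustrative, only the MOD arithmetic is taken from the changeset.

    PROGRAM cyclic_fill_mapping_sketch
       IMPLICIT NONE
       INTEGER, PARAMETER ::  nx_on_file = 19   !< prerun grid: 0:nx_on_file
       INTEGER, PARAMETER ::  ny_on_file = 19   !< prerun grid: 0:ny_on_file
       INTEGER, PARAMETER ::  nx = 59           !< larger main-run grid
       INTEGER, PARAMETER ::  ny = 39
       INTEGER ::  i, i_remote, j, j_remote

       DO  i = 0, nx
          DO  j = 0, ny
    !
    !--      Same arithmetic as in rrd_mpi_io_real_2d_cyclic_fill etc.: main-run point
    !--      (j,i) is filled from prerun point (j_remote,i_remote). In the real code,
    !--      remote_pe(i_remote,j_remote) and rma_offset(i_remote,j_remote) then select
    !--      the owning PE and the offset for the MPI_GET from the RMA window.
             i_remote = MOD( i, nx_on_file+1 )
             j_remote = MOD( j, ny_on_file+1 )
          ENDDO
       ENDDO

       WRITE( *, * )  'main-run column i = ', nx, ' maps to prerun column ', MOD( nx, nx_on_file+1 )

    END PROGRAM cyclic_fill_mapping_sketch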
palm/trunk/SOURCE/shared_memory_io_mod.f90
r4591 r4617 25 25 ! $Id$ 26 26 ! 27 ! Additions for cyclic fill mode 28 ! 29 ! 27 30 ! File re-formatted to follow the PALM coding standard 28 31 ! 29 32 ! 30 !31 33 ! Initial version (Klaus Ketelsen) 32 34 ! … … 76 78 77 79 USE kinds, & 78 ONLY: iwp, & 80 ONLY: dp, & 81 iwp, & 82 sp, & 79 83 wp 80 81 82 USE transpose_indices, &83 ONLY: nxl_z, &84 nxr_z, &85 nyn_x, &86 nyn_z, &87 nys_x, &88 nys_z89 90 91 84 92 85 USE pegrid, & … … 121 114 122 115 ! 123 !-- Type to store grid information 124 TYPE, PUBLIC :: local_boundaries !< 125 126 INTEGER(iwp) :: nnx !< 127 INTEGER(iwp) :: nny !< 128 INTEGER(iwp) :: nx !< 129 INTEGER(iwp) :: nxl !< 130 INTEGER(iwp) :: nxr !< 131 INTEGER(iwp) :: ny !< 132 INTEGER(iwp) :: nyn !< 133 INTEGER(iwp) :: nys !< 134 135 136 137 138 END TYPE local_boundaries 116 !-- Type to store information about the domain decomposition grid 117 TYPE, PUBLIC :: domain_decomposition_grid_features !< 118 119 INTEGER(iwp) :: comm2d !< 120 INTEGER(iwp) :: myid !< 121 INTEGER(iwp) :: nnx !< 122 INTEGER(iwp) :: nny !< 123 INTEGER(iwp) :: nx !< 124 INTEGER(iwp) :: nxl !< 125 INTEGER(iwp) :: nxr !< 126 INTEGER(iwp) :: ny !< 127 INTEGER(iwp) :: nyn !< 128 INTEGER(iwp) :: nys !< 129 INTEGER(iwp) :: numprocs !< 130 131 CONTAINS 132 133 PROCEDURE, PASS(this), PUBLIC :: activate_grid_from_this_class 134 PROCEDURE, PASS(this), PUBLIC :: save_grid_into_this_class 135 136 END TYPE domain_decomposition_grid_features 139 137 140 138 ! … … 145 143 INTEGER(iwp) :: nr_io_pe_per_node = 2 !< typical configuration, 2 sockets per node 146 144 LOGICAL :: no_shared_Memory_in_this_run !< 145 146 INTEGER(iwp) :: comm_model !< communicator of this model run 147 147 ! 148 148 !-- Variables for the shared memory communicator 149 INTEGER(iwp), PUBLIC :: comm_shared !< Communicator for processes with shared array149 INTEGER(iwp), PUBLIC :: comm_shared !< communicator for processes with shared array 150 150 INTEGER(iwp), PUBLIC :: sh_npes !< 151 151 INTEGER(iwp), PUBLIC :: sh_rank !< … … 157 157 INTEGER(iwp), PUBLIC :: io_npes !< 158 158 INTEGER(iwp), PUBLIC :: io_rank !< 159 160 TYPE( local_boundaries ), PUBLIC :: io_grid161 162 159 ! 
163 160 !-- Variables for the node local communicator … … 167 164 INTEGER(iwp) :: n_rank !< 168 165 169 CONTAINS 170 171 PRIVATE 172 173 PROCEDURE, PASS(this), PUBLIC :: is_sm_active !< 174 PROCEDURE, PASS(this), PUBLIC :: sm_adjust_outer_boundary !< 175 PROCEDURE, PASS(this), PUBLIC :: sm_free_shared !< 176 PROCEDURE, PASS(this), PUBLIC :: sm_init_comm !< 177 PROCEDURE, PASS(this), PUBLIC :: sm_node_barrier !< 166 TYPE(domain_decomposition_grid_features), PUBLIC :: io_grid !< io grid features, depending on reading from prerun or restart run 167 168 169 CONTAINS 170 171 PRIVATE 172 173 PROCEDURE, PASS(this), PUBLIC :: is_sm_active 174 PROCEDURE, PASS(this), PUBLIC :: sm_adjust_outer_boundary 175 PROCEDURE, PASS(this), PUBLIC :: sm_free_shared 176 PROCEDURE, PASS(this), PUBLIC :: sm_init_comm 177 PROCEDURE, PASS(this), PUBLIC :: sm_init_part 178 PROCEDURE, PASS(this), PUBLIC :: sm_node_barrier 178 179 #if defined( __parallel ) 179 PROCEDURE, PASS(this), PUBLIC :: sm_allocate_shared_1d !< 180 PROCEDURE, PASS(this), PUBLIC :: sm_allocate_shared_2d !< 181 PROCEDURE, PASS(this), PUBLIC :: sm_allocate_shared_2di !< 182 PROCEDURE, PASS(this), PUBLIC :: sm_allocate_shared_3d !< 183 184 GENERIC, PUBLIC :: sm_allocate_shared => sm_allocate_shared_1d, sm_allocate_shared_2d, & 185 sm_allocate_shared_2di, sm_allocate_shared_3d !< 180 PROCEDURE, PASS(this), PUBLIC :: sm_allocate_shared_1d_64 181 PROCEDURE, PASS(this), PUBLIC :: sm_allocate_shared_1d_32 182 PROCEDURE, PASS(this), PUBLIC :: sm_allocate_shared_1di 183 PROCEDURE, PASS(this), PUBLIC :: sm_allocate_shared_2d_64 184 PROCEDURE, PASS(this), PUBLIC :: sm_allocate_shared_2d_32 185 PROCEDURE, PASS(this), PUBLIC :: sm_allocate_shared_2di 186 PROCEDURE, PASS(this), PUBLIC :: sm_allocate_shared_3d_64 187 PROCEDURE, PASS(this), PUBLIC :: sm_allocate_shared_3d_32 188 189 GENERIC, PUBLIC :: sm_allocate_shared => & 190 sm_allocate_shared_1d_64, sm_allocate_shared_1d_32, & 191 sm_allocate_shared_2d_64, sm_allocate_shared_2d_32, & 192 sm_allocate_shared_2di, sm_allocate_shared_3d_64, & 193 sm_allocate_shared_3d_32, sm_allocate_shared_1di 186 194 #endif 187 195 END TYPE sm_class … … 197 205 !> Setup the grid for shared memory IO. 198 206 !--------------------------------------------------------------------------------------------------! 199 SUBROUTINE sm_init_comm( this, sm_active ) 200 201 IMPLICIT NONE 202 203 CLASS(sm_class), INTENT(INOUT) :: this !< pointer to access internal variables of this call 207 SUBROUTINE sm_init_comm( this, sm_active, comm_input ) 208 209 IMPLICIT NONE 210 211 CLASS(sm_class), INTENT(INOUT) :: this !< pointer to access internal variables of this call 212 INTEGER, INTENT(IN), OPTIONAL :: comm_input !< main model communicator (comm2d) can optional be set 204 213 205 214 #if defined( __parallel ) 206 INTEGER :: color !<207 INTEGER :: max_n_npes !< Maximum number of PEs/node215 INTEGER :: color 216 INTEGER :: max_n_npes !< maximum number of PEs/node 208 217 #endif 209 218 210 LOGICAL, INTENT(IN) :: sm_active !< Flag to activate shared-memory IO 211 219 LOGICAL, INTENT(IN) :: sm_active !< flag to activate shared-memory IO 220 221 IF ( PRESENT( comm_input ) ) THEN 222 this%comm_model = comm_input 223 ELSE 224 this%comm_model = comm2d 225 ENDIF 212 226 213 227 this%no_shared_memory_in_this_run = .NOT. sm_active 228 this%comm_io = this%comm_model ! 
preset in case of non shared-memory-IO 214 229 215 230 IF ( this%no_shared_memory_in_this_run ) THEN … … 222 237 !-- Determine, how many MPI threads are running on a node 223 238 this%iam_io_pe = .FALSE. 224 CALL MPI_COMM_SPLIT_TYPE( comm2d, MPI_COMM_TYPE_SHARED, 0, MPI_INFO_NULL, this%comm_node, ierr ) 239 CALL MPI_COMM_SPLIT_TYPE( this%comm_model, MPI_COMM_TYPE_SHARED, 0, MPI_INFO_NULL, & 240 this%comm_node, ierr ) 225 241 CALL MPI_COMM_SIZE( this%comm_node, this%n_npes, ierr ) 226 242 CALL MPI_COMM_RANK( this%comm_node, this%n_rank, ierr ) 227 243 228 CALL MPI_ALLREDUCE( this%n_npes, max_n_npes, 1, MPI_INTEGER, MPI_MAX, comm2d, ierr )244 CALL MPI_ALLREDUCE( this%n_npes, max_n_npes, 1, MPI_INTEGER, MPI_MAX, this%comm_model, ierr ) 229 245 ! 230 246 !-- Decide, if the configuration can run with shared-memory IO … … 267 283 !-- All threads with shared memory rank 0 will be I/O threads. 268 284 color = this%sh_rank 269 CALL MPI_COMM_SPLIT( comm2d, color, 0, this%comm_io, ierr )285 CALL MPI_COMM_SPLIT( this%comm_model, color, 0, this%comm_io, ierr ) 270 286 271 287 IF ( this%comm_io /= MPI_COMM_NULL ) THEN … … 287 303 #endif 288 304 305 ! write(9,'(a,8i7)') ' end of sm_init_comm ',this%sh_rank,this%sh_npes,this%io_rank,this%io_npes,this%io_pe_global_rank 306 ! write(9,*) 'This process is IO Process ',this%iam_io_pe 307 289 308 #if defined( __parallel ) 290 309 CONTAINS … … 305 324 INTEGER(iwp), DIMENSION(4,0:this%n_npes-1) :: local_dim_r !< 306 325 307 TYPE( local_boundaries), DIMENSION(32) :: node_grid !<326 TYPE(domain_decomposition_grid_features), DIMENSION(32) :: node_grid !< 308 327 309 328 ! … … 381 400 382 401 402 ! 403 !-- TODO: short description required, about the meaning of the following routine 404 !-- part must be renamed particles! 405 SUBROUTINE sm_init_part( this ) 406 407 IMPLICIT NONE 408 409 CLASS(sm_class), INTENT(INOUT) :: this !< pointer to access internal variables of this call 410 411 #if defined( __parallel ) 412 INTEGER(iwp) :: color !< 413 INTEGER(iwp) :: comm_shared_base !< 414 INTEGER(iwp) :: ierr !< 415 INTEGER(iwp) :: max_n_npes !< maximum number of PEs/node 416 417 LOGICAL :: sm_active !< 418 #endif 419 420 421 sm_active = .TRUE. ! particle IO always uses shared memory 422 this%comm_model = comm2d 423 424 this%no_shared_memory_in_this_run = .NOT. sm_active 425 this%comm_io = this%comm_model ! preset in case of non shared-memory-IO 426 427 IF ( this%no_shared_memory_in_this_run ) THEN 428 this%iam_io_pe = .TRUE. 429 RETURN 430 ENDIF 431 432 #if defined( __parallel ) 433 ! 434 !-- Determine, how many MPI threads are running on a node 435 this%iam_io_pe = .FALSE. 436 CALL MPI_COMM_SPLIT_TYPE( this%comm_model, MPI_COMM_TYPE_SHARED, 0, MPI_INFO_NULL, & 437 this%comm_node, ierr ) 438 CALL MPI_COMM_SIZE( this%comm_node, this%n_npes, ierr ) 439 CALL MPI_COMM_RANK( this%comm_node, this%n_rank, ierr ) 440 441 CALL MPI_ALLREDUCE( this%n_npes, max_n_npes, 1, MPI_INTEGER, MPI_MAX, this%comm_model, ierr ) 442 443 ! 444 !-- TODO: better explanation 445 !-- It has to be testet, if using memory blocks for an IO process (MPI shared Memory), or if it is 446 !-- even better to use the complete node for MPI shared memory (this%nr_io_pe_per_node = 1). 447 !- In the latter case, the access to the MPI shared memory buffer is slower, the number of 448 !-- particles to move between threads will be much smaller. 449 IF ( max_n_npes > 64 ) THEN 450 ! 
451 !-- Special configuration on the HLRN-IV system with 4 shared memory blocks/node 452 this%nr_io_pe_per_node = 4 453 ENDIF 454 455 IF ( this%nr_io_pe_per_node == 1 ) THEN 456 ! 457 !-- This branch is not realized so far 458 this%iam_io_pe = ( this%n_rank == 0 ) 459 this%comm_shared = this%comm_node 460 CALL MPI_COMM_SIZE( this%comm_shared, this%sh_npes, ierr ) 461 CALL MPI_COMM_RANK( this%comm_shared, this%sh_rank, ierr ) 462 463 ELSEIF( this%nr_io_pe_per_node == 2 ) THEN 464 465 this%iam_io_pe = ( this%n_rank == 0 .OR. this%n_rank == this%n_npes/2 ) 466 IF ( this%n_rank < this%n_npes/2 ) THEN 467 color = 1 468 ELSE 469 color = 2 470 ENDIF 471 CALL MPI_COMM_SPLIT( this%comm_node, color, 0, this%comm_shared, ierr ) 472 CALL MPI_COMM_SIZE( this%comm_shared, this%sh_npes, ierr ) 473 CALL MPI_COMM_RANK( this%comm_shared, this%sh_rank, ierr ) 474 475 ELSEIF( this%nr_io_pe_per_node == 4 ) THEN 476 477 this%iam_io_pe = ( this%n_rank == 0 .OR. this%n_rank == this%n_npes/4 .OR. & 478 this%n_rank == this%n_npes/2 .OR. this%n_rank == (3*this%n_npes)/4 ) 479 IF ( this%n_rank < this%n_npes/4 ) THEN 480 color = 1 481 ELSEIF( this%n_rank < this%n_npes/2 ) THEN 482 color = 2 483 ELSEIF( this%n_rank < (3*this%n_npes)/4 ) THEN 484 color = 3 485 ELSE 486 color = 4 487 ENDIF 488 CALL MPI_COMM_SPLIT( this%comm_node, color, 0, this%comm_shared, ierr ) 489 CALL MPI_COMM_SIZE( this%comm_shared, this%sh_npes, ierr ) 490 CALL MPI_COMM_RANK( this%comm_shared, this%sh_rank, ierr ) 491 492 ELSE 493 494 WRITE( *, * ) 'shared_memory_io_mod: internal error' 495 WRITE( *, * ) 'only 2 or 4 shared memory groups per node are allowed' 496 STOP 497 498 ENDIF 499 500 ! 501 !-- Setup the shared memory area 502 CALL MPI_COMM_SPLIT( this%comm_node, color, 0, this%comm_shared, ierr ) 503 CALL MPI_COMM_SIZE( this%comm_shared, this%sh_npes, ierr ) 504 CALL MPI_COMM_RANK( this%comm_shared, this%sh_rank, ierr ) 505 506 ! 507 !-- Setup the communicator across the nodes depending on the shared memory rank. 508 !-- All threads with shared memory rank 0 will be I/O threads. 509 color = this%sh_rank 510 CALL MPI_COMM_SPLIT( this%comm_model, color, 0, this%comm_io, ierr ) 511 512 IF ( this%comm_io /= MPI_COMM_NULL ) THEN 513 CALL MPI_COMM_SIZE( this%comm_io, this%io_npes, ierr ) 514 CALL MPI_COMM_RANK( this%comm_io, this%io_rank, ierr ) 515 ELSE 516 this%io_npes = -1 517 this%io_rank = -1 518 ENDIF 519 520 IF ( this%sh_rank == 0 ) THEN 521 this%iam_io_pe = .TRUE. 522 this%io_pe_global_rank = myid 523 ENDIF 524 CALL MPI_BCAST( this%io_pe_global_rank, 1, MPI_INTEGER, 0, this%comm_shared, ierr ) 525 526 #else 527 this%iam_io_pe = .FALSE. 528 #endif 529 530 ! write(9,'(a,8i7)') 'sm_init_comm_part ',this%sh_rank,this%sh_npes,this%io_rank,this%io_npes 531 532 END SUBROUTINE sm_init_part 383 533 384 534 !--------------------------------------------------------------------------------------------------! … … 401 551 402 552 #if defined( __parallel ) 403 !--------------------------------------------------------------------------------------------------! 404 ! Description: 405 ! ------------ 406 !> Allocate shared 1d-REAL array on ALL threads 407 !--------------------------------------------------------------------------------------------------! 
408 SUBROUTINE sm_allocate_shared_1d( this, p1, d1, d2, win ) 409 410 IMPLICIT NONE 411 412 CLASS(sm_class), INTENT(inout) :: this !< 413 !< 414 INTEGER(iwp) :: disp_unit !< 415 INTEGER(iwp), INTENT(IN) :: d1 !< 416 INTEGER(iwp), INTENT(IN) :: d2 !< 417 INTEGER(iwp), SAVE :: pe_from = 0 !< 418 INTEGER(KIND=MPI_ADDRESS_KIND) :: rem_size !< 419 INTEGER(iwp), INTENT(OUT) :: win !< 420 INTEGER(KIND=MPI_ADDRESS_KIND) :: wsize !< 421 422 INTEGER, DIMENSION(1) :: buf_shape !< 423 424 REAL(wp), DIMENSION(:), POINTER :: buf !< 425 REAL(wp), DIMENSION(:), POINTER :: p1 !< 426 427 TYPE(C_PTR), SAVE :: base_ptr !< 428 TYPE(C_PTR), SAVE :: rem_ptr !< 553 554 !--------------------------------------------------------------------------------------------------! 555 ! Description: 556 ! ------------ 557 !> Allocate shared 1d-REAL (64 Bit) array on ALL threads 558 !--------------------------------------------------------------------------------------------------! 559 SUBROUTINE sm_allocate_shared_1d_64( this, p1, d1, d2, win ) 560 561 IMPLICIT NONE 562 563 CLASS(sm_class), INTENT(inout) :: this 564 565 INTEGER(iwp) :: disp_unit 566 INTEGER(iwp), INTENT(IN) :: d1 567 INTEGER(iwp), INTENT(IN) :: d2 568 INTEGER(iwp), SAVE :: pe_from = 0 569 INTEGER(iwp), INTENT(OUT) :: win 570 571 INTEGER(KIND=MPI_ADDRESS_KIND) :: rem_size 572 INTEGER(KIND=MPI_ADDRESS_KIND) :: wsize 573 574 INTEGER, DIMENSION(1) :: buf_shape 575 576 REAL(dp), DIMENSION(:), POINTER :: buf 577 REAL(dp), DIMENSION(:), POINTER :: p1 578 579 TYPE(C_PTR), SAVE :: base_ptr 580 TYPE(C_PTR), SAVE :: rem_ptr 429 581 430 582 … … 437 589 wsize = 1 438 590 ENDIF 439 wsize = wsize * 8 ! Please note, size is always in bytes, independently of the displacement440 ! unit441 442 CALL MPI_WIN_ALLOCATE_SHARED( wsize, 8, MPI_INFO_NULL, this%comm_shared,base_ptr, win, ierr )591 wsize = wsize * dp ! please note, size is always in bytes, independently of the displacement 592 ! unit 593 594 CALL MPI_WIN_ALLOCATE_SHARED( wsize, dp, MPI_INFO_NULL, this%comm_shared,base_ptr, win, ierr ) 443 595 ! 444 596 !-- Get C-pointer of the memory located on node-rank pe_from (sh_rank == pe_from) … … 453 605 pe_from = MOD( pe_from, this%sh_npes ) 454 606 455 END SUBROUTINE sm_allocate_shared_1d 456 457 458 !--------------------------------------------------------------------------------------------------! 459 ! Description: 460 ! ------------ 461 !> Allocate shared 2d-REAL array on ALL threads 462 !--------------------------------------------------------------------------------------------------! 463 SUBROUTINE sm_allocate_shared_2d( this, p2, n_nxlg, n_nxrg, n_nysg, n_nyng, win ) 464 465 IMPLICIT NONE 466 467 CLASS(sm_class), INTENT(INOUT) :: this !< 468 469 INTEGER(iwp) :: disp_unit !< 470 INTEGER(iwp), INTENT(IN) :: n_nxlg !< 471 INTEGER(iwp), INTENT(IN) :: n_nxrg !< 472 INTEGER(iwp), INTENT(IN) :: n_nyng !< 473 INTEGER(iwp), INTENT(IN) :: n_nysg !< 474 INTEGER(iwp), SAVE :: pe_from = 0 !< 475 INTEGER(KIND=MPI_ADDRESS_KIND) :: rem_size !< 476 INTEGER(iwp), INTENT(OUT) :: win !< 477 INTEGER(KIND=MPI_ADDRESS_KIND) :: wsize !< 478 479 INTEGER(iwp), DIMENSION(2) :: buf_shape !< 480 481 REAL(wp), DIMENSION(:,:), POINTER :: buf !< 482 REAL(wp), DIMENSION(:,:), POINTER :: p2 !< 483 484 TYPE(C_PTR),SAVE :: base_ptr !< 485 TYPE(C_PTR),SAVE :: rem_ptr !< 607 END SUBROUTINE sm_allocate_shared_1d_64 608 609 610 !--------------------------------------------------------------------------------------------------! 611 ! Description: 612 ! 
------------ 613 !> Allocate shared 1d-REAL (32 Bit) array on ALL threads 614 !--------------------------------------------------------------------------------------------------! 615 SUBROUTINE sm_allocate_shared_1d_32( this, p1, d1, d2, win ) 616 617 IMPLICIT NONE 618 619 CLASS(sm_class), INTENT(inout) :: this 620 621 INTEGER(iwp) :: disp_unit 622 INTEGER(iwp), INTENT(IN) :: d1 623 INTEGER(iwp), INTENT(IN) :: d2 624 INTEGER(iwp), SAVE :: pe_from = 0 625 INTEGER(iwp), INTENT(OUT) :: win 626 627 INTEGER(KIND=MPI_ADDRESS_KIND) :: rem_size 628 INTEGER(KIND=MPI_ADDRESS_KIND) :: wsize 629 630 INTEGER, DIMENSION(1) :: buf_shape 631 632 REAL(sp), DIMENSION(:), POINTER :: buf 633 REAL(sp), DIMENSION(:), POINTER :: p1 634 635 TYPE(C_PTR), SAVE :: base_ptr 636 TYPE(C_PTR), SAVE :: rem_ptr 637 638 639 IF ( this%no_shared_memory_in_this_run ) RETURN 640 ! 641 !-- Allocate shared memory on node rank 0 threads. 642 IF ( this%sh_rank == pe_from ) THEN 643 wsize = d2 - d1 + 1 644 ELSE 645 wsize = 1 646 ENDIF 647 wsize = wsize * sp ! Please note, size is always in bytes, independently of the displacement 648 ! unit 649 650 CALL MPI_WIN_ALLOCATE_SHARED( wsize, sp, MPI_INFO_NULL, this%comm_shared,base_ptr, win, ierr ) 651 ! 652 !-- Get C-pointer of the memory located on node-rank pe_from (sh_rank == pe_from) 653 CALL MPI_WIN_SHARED_QUERY( win, pe_from, rem_size, disp_unit, rem_ptr, ierr ) 654 ! 655 !-- Convert C- to Fortran-pointer 656 buf_shape(1) = d2 - d1 + 1 657 CALL C_F_POINTER( rem_ptr, buf, buf_shape ) 658 p1(d1:) => buf 659 ! 660 !-- Allocate shared memory in round robin on all PEs of a node. 661 pe_from = MOD( pe_from, this%sh_npes ) 662 663 END SUBROUTINE sm_allocate_shared_1d_32 664 665 666 !--------------------------------------------------------------------------------------------------! 667 ! Description: 668 ! ------------ 669 !> Allocate shared 1d-INTEGER array on ALL threads 670 !--------------------------------------------------------------------------------------------------! 671 SUBROUTINE sm_allocate_shared_1di( this, p1, d1, d2, win ) 672 673 IMPLICIT NONE 674 675 CLASS(sm_class), INTENT(inout) :: this 676 677 INTEGER(iwp) :: disp_unit 678 INTEGER(iwp), INTENT(IN) :: d1 679 INTEGER(iwp), INTENT(IN) :: d2 680 INTEGER(iwp), SAVE :: pe_from = 0 681 INTEGER(iwp), INTENT(OUT) :: win 682 683 INTEGER(KIND=MPI_ADDRESS_KIND) :: rem_size 684 INTEGER(KIND=MPI_ADDRESS_KIND) :: wsize 685 686 INTEGER, DIMENSION(1) :: buf_shape 687 688 INTEGER(iwp), DIMENSION(:), POINTER :: buf 689 INTEGER(iwp), DIMENSION(:), POINTER :: p1 690 691 TYPE(C_PTR), SAVE :: base_ptr 692 TYPE(C_PTR), SAVE :: rem_ptr 693 694 695 IF ( this%no_shared_memory_in_this_run ) RETURN 696 ! 697 !-- Allocate shared memory on node rank 0 threads. 698 IF ( this%sh_rank == pe_from ) THEN 699 wsize = d2 - d1 + 1 700 ELSE 701 wsize = 1 702 ENDIF 703 wsize = wsize * iwp ! Please note, size is always in bytes, independently of the displacement 704 ! unit 705 706 CALL MPI_WIN_ALLOCATE_SHARED( wsize, iwp, MPI_INFO_NULL, this%comm_shared,base_ptr, win, ierr ) 707 ! 708 !-- Get C-pointer of the memory located on node-rank pe_from (sh_rank == pe_from) 709 CALL MPI_WIN_SHARED_QUERY( win, pe_from, rem_size, disp_unit, rem_ptr, ierr ) 710 ! 711 !-- Convert C- to Fortran-pointer 712 buf_shape(1) = d2 - d1 + 1 713 CALL C_F_POINTER( rem_ptr, buf, buf_shape ) 714 p1(d1:) => buf 715 ! 716 !-- Allocate shared memory in round robin on all PEs of a node. 
717 pe_from = MOD( pe_from, this%sh_npes ) 718 719 END SUBROUTINE sm_allocate_shared_1di 720 721 722 !--------------------------------------------------------------------------------------------------! 723 ! Description: 724 ! ------------ 725 !> Allocate shared 2d-REAL array on ALL threads (64 Bit) 726 !--------------------------------------------------------------------------------------------------! 727 SUBROUTINE sm_allocate_shared_2d_64( this, p2, n_nxlg, n_nxrg, n_nysg, n_nyng, win ) 728 729 IMPLICIT NONE 730 731 CLASS(sm_class), INTENT(INOUT) :: this 732 733 INTEGER(iwp) :: disp_unit 734 INTEGER(iwp), INTENT(IN) :: n_nxlg 735 INTEGER(iwp), INTENT(IN) :: n_nxrg 736 INTEGER(iwp), INTENT(IN) :: n_nyng 737 INTEGER(iwp), INTENT(IN) :: n_nysg 738 INTEGER(iwp), SAVE :: pe_from = 0 739 INTEGER(iwp), INTENT(OUT) :: win 740 741 INTEGER(KIND=MPI_ADDRESS_KIND) :: rem_size 742 INTEGER(KIND=MPI_ADDRESS_KIND) :: wsize 743 744 INTEGER(iwp), DIMENSION(2) :: buf_shape 745 746 REAL(dp), DIMENSION(:,:), POINTER :: buf 747 REAL(dp), DIMENSION(:,:), POINTER :: p2 748 749 TYPE(C_PTR), SAVE :: base_ptr 750 TYPE(C_PTR), SAVE :: rem_ptr 486 751 487 752 … … 495 760 ENDIF 496 761 497 wsize = wsize * 8! Please note, size is always in bytes, independently of the displacement498 ! unit762 wsize = wsize * dp ! Please note, size is always in bytes, independently of the displacement 763 ! unit 499 764 500 765 CALL MPI_WIN_ALLOCATE_SHARED( wsize, 8, MPI_INFO_NULL, this%comm_shared, base_ptr, win, ierr ) … … 512 777 pe_from = MOD( pe_from, this%sh_npes ) 513 778 514 END SUBROUTINE sm_allocate_shared_2d 779 END SUBROUTINE sm_allocate_shared_2d_64 780 781 782 !--------------------------------------------------------------------------------------------------! 783 ! Description: 784 ! ------------ 785 !> Allocate shared 2d-REAL (32 Bit) array on ALL threads 786 !--------------------------------------------------------------------------------------------------! 787 SUBROUTINE sm_allocate_shared_2d_32( this, p2, n_nxlg, n_nxrg, n_nysg, n_nyng, win ) 788 789 IMPLICIT NONE 790 791 CLASS(sm_class), INTENT(INOUT) :: this 792 793 INTEGER(iwp) :: disp_unit 794 INTEGER(iwp), INTENT(IN) :: n_nxlg 795 INTEGER(iwp), INTENT(IN) :: n_nxrg 796 INTEGER(iwp), INTENT(IN) :: n_nyng 797 INTEGER(iwp), INTENT(IN) :: n_nysg 798 INTEGER(iwp), SAVE :: pe_from = 0 799 INTEGER(iwp), INTENT(OUT) :: win 800 801 INTEGER(KIND=MPI_ADDRESS_KIND) :: rem_size 802 INTEGER(KIND=MPI_ADDRESS_KIND) :: wsize 803 804 INTEGER(iwp), DIMENSION(2) :: buf_shape 805 806 REAL(sp), DIMENSION(:,:), POINTER :: buf 807 REAL(sp), DIMENSION(:,:), POINTER :: p2 808 809 TYPE(C_PTR), SAVE :: base_ptr 810 TYPE(C_PTR), SAVE :: rem_ptr 811 812 813 IF ( this%no_shared_memory_in_this_run ) RETURN 814 ! 815 !-- Allocate shared memory on node rank 0 threads. 816 IF ( this%sh_rank == pe_from ) THEN 817 wsize = ( n_nyng - n_nysg + 1 ) * ( n_nxrg - n_nxlg + 1 ) 818 ELSE 819 wsize = 1 820 ENDIF 821 822 wsize = wsize * sp ! Please note, size is always in bytes, independently of the displacement 823 ! unit 824 825 CALL MPI_WIN_ALLOCATE_SHARED( wsize, dp, MPI_INFO_NULL, this%comm_shared, base_ptr, win, ierr ) 826 ! 827 !-- Get C-pointer of the memory located on node-rank pe_from (sh_rank == pe_from) 828 CALL MPI_WIN_SHARED_QUERY( win, pe_from, rem_size, disp_unit, rem_ptr, ierr ) 829 ! 830 !-- Convert C- to Fortran-pointer 831 buf_shape(2) = n_nyng - n_nysg + 1 832 buf_shape(1) = n_nxrg - n_nxlg + 1 833 CALL C_F_POINTER( rem_ptr, buf, buf_shape ) 834 p2(n_nxlg:, n_nysg:) => buf 835 ! 
836 !-- Allocate shared memory in round robin on all PEs of a node. 837 pe_from = MOD( pe_from, this%sh_npes ) 838 839 END SUBROUTINE sm_allocate_shared_2d_32 515 840 516 841 … … 532 857 INTEGER(iwp), INTENT(IN) :: n_nysg !< 533 858 INTEGER(iwp), SAVE :: pe_from = 0 !< 859 INTEGER(iwp), INTENT(OUT) :: win !< 860 534 861 INTEGER(kind=MPI_ADDRESS_KIND) :: rem_size !< 535 INTEGER(iwp), INTENT(OUT) :: win !<536 862 INTEGER(kind=MPI_ADDRESS_KIND) :: wsize !< 537 863 … … 541 867 INTEGER(iwp), DIMENSION(:,:), POINTER :: p2i !< 542 868 543 TYPE(C_PTR), SAVE:: base_ptr !<544 TYPE(C_PTR), SAVE:: rem_ptr !<869 TYPE(C_PTR), SAVE :: base_ptr !< 870 TYPE(C_PTR), SAVE :: rem_ptr !< 545 871 546 872 … … 577 903 ! Description: 578 904 ! ------------ 579 !> Allocate shared 3d-REAL array on ALL threads580 !--------------------------------------------------------------------------------------------------! 581 SUBROUTINE sm_allocate_shared_3d ( this, p3, d1s, d1e, d2s, d2e, d3s, d3e, win )905 !> Allocate shared 3d-REAL (64 Bit) array on ALL threads 906 !--------------------------------------------------------------------------------------------------! 907 SUBROUTINE sm_allocate_shared_3d_64( this, p3, d1s, d1e, d2s, d2e, d3s, d3e, win ) 582 908 583 909 IMPLICIT NONE … … 593 919 INTEGER, INTENT(IN) :: d3s !< 594 920 INTEGER, SAVE :: pe_from = 0 !< 921 INTEGER, INTENT(OUT) :: win !< 922 595 923 INTEGER(KIND=MPI_ADDRESS_KIND) :: rem_size !< 596 INTEGER, INTENT(OUT) :: win !<597 924 INTEGER(KIND=MPI_ADDRESS_KIND) :: wsize !< 598 925 599 926 INTEGER, DIMENSION(3) :: buf_shape !< 600 927 601 REAL( wp), DIMENSION(:,:,:), POINTER :: buf !<602 REAL( wp), DIMENSION(:,:,:), POINTER :: p3 !<928 REAL(dp), DIMENSION(:,:,:), POINTER :: buf !< 929 REAL(dp), DIMENSION(:,:,:), POINTER :: p3 !< 603 930 604 931 TYPE(C_PTR), SAVE :: base_ptr !< … … 615 942 ENDIF 616 943 617 wsize = wsize * 8! Please note, size is always in bytes, independently of the displacement944 wsize = wsize * dp ! Please note, size is always in bytes, independently of the displacement 618 945 ! unit 619 946 620 CALL MPI_WIN_ALLOCATE_SHARED( wsize, 8, MPI_INFO_NULL, this%comm_shared, base_ptr, win, ierr )947 CALL MPI_WIN_ALLOCATE_SHARED( wsize, dp, MPI_INFO_NULL, this%comm_shared, base_ptr, win, ierr ) 621 948 ! 622 949 !-- Get C-pointer of the memory located on node-rank pe_from (sh_rank == pe_from) … … 633 960 pe_from = MOD( pe_from, this%sh_npes ) 634 961 635 END SUBROUTINE sm_allocate_shared_3d 962 END SUBROUTINE sm_allocate_shared_3d_64 963 964 965 !--------------------------------------------------------------------------------------------------! 966 ! Description: 967 ! ------------ 968 !> Allocate shared 3d-REAL (32 Bit) array on ALL threads 969 !--------------------------------------------------------------------------------------------------! 
970 SUBROUTINE sm_allocate_shared_3d_32( this, p3, d1s, d1e, d2s, d2e, d3s, d3e, win ) 971 972 IMPLICIT NONE 973 974 CLASS(sm_class), INTENT(inout) :: this 975 976 INTEGER :: disp_unit 977 INTEGER, INTENT(IN) :: d1e 978 INTEGER, INTENT(IN) :: d1s 979 INTEGER, INTENT(IN) :: d2e 980 INTEGER, INTENT(IN) :: d2s 981 INTEGER, INTENT(IN) :: d3e 982 INTEGER, INTENT(IN) :: d3s 983 INTEGER, SAVE :: pe_from = 0 984 INTEGER, INTENT(OUT) :: win 985 986 INTEGER(KIND=MPI_ADDRESS_KIND) :: rem_size 987 INTEGER(KIND=MPI_ADDRESS_KIND) :: wsize 988 989 INTEGER, DIMENSION(3) :: buf_shape 990 991 REAL(sp), DIMENSION(:,:,:), POINTER :: buf 992 REAL(sp), DIMENSION(:,:,:), POINTER :: p3 993 994 TYPE(C_PTR), SAVE :: base_ptr 995 TYPE(C_PTR), SAVE :: rem_ptr 996 997 998 IF ( this%no_shared_memory_in_this_run ) RETURN 999 ! 1000 !-- Allocate shared memory on node rank 0 threads. 1001 IF ( this%sh_rank == pe_from ) THEN 1002 wsize = ( d3e - d3s + 1 ) * ( d2e - d2s + 1 ) * ( d1e - d1s + 1 ) 1003 ELSE 1004 wsize = 1 1005 ENDIF 1006 1007 wsize = wsize * sp ! Please note, size is always in bytes, independently of the displacement 1008 ! unit 1009 1010 CALL MPI_WIN_ALLOCATE_SHARED( wsize, sp, MPI_INFO_NULL, this%comm_shared, base_ptr, win, ierr ) 1011 ! 1012 !-- Get C-pointer of the memory located on node-rank pe_from (sh_rank == pe_from) 1013 CALL MPI_WIN_SHARED_QUERY( win, pe_from, rem_size, disp_unit, rem_ptr, ierr ) 1014 ! 1015 !-- Convert C- to Fortran-pointer 1016 buf_shape(3) = d3e - d3s + 1 1017 buf_shape(2) = d2e - d2s + 1 1018 buf_shape(1) = d1e - d1s + 1 1019 CALL C_F_POINTER( rem_ptr, buf, buf_shape ) 1020 p3(d1s:,d2s:,d3s:) => buf 1021 ! 1022 !-- Allocate shared memory in round robin on all PEs of a node. 1023 pe_from = MOD( pe_from, this%sh_npes ) 1024 1025 END SUBROUTINE sm_allocate_shared_3d_32 1026 636 1027 #endif 637 1028 … … 694 1085 INTEGER(iwp), INTENT(INOUT) :: win !< 695 1086 696 IF ( this%no_shared_memory_in_this_run .OR. win == -1234567890 ) RETURN 697 ! win is used just to avoid compile errors because of unused arguments 1087 IF ( this%no_shared_memory_in_this_run ) RETURN 698 1088 #if defined( __parallel ) 699 1089 CALL MPI_WIN_FREE( win, ierr ) 700 1090 #endif 1091 win = -1 701 1092 702 1093 END SUBROUTINE sm_free_shared … … 723 1114 END SUBROUTINE sm_node_barrier 724 1115 1116 1117 SUBROUTINE save_grid_into_this_class( this ) 1118 1119 IMPLICIT NONE 1120 1121 CLASS(domain_decomposition_grid_features), INTENT(inout) :: this !< 1122 1123 this%myid = myid !< 1124 this%nnx = nnx !< 1125 this%nny = nny !< 1126 this%nx = nx !< 1127 this%nxl = nxl !< 1128 this%nxr = nxr !< 1129 this%ny = ny !< 1130 this%nyn = nyn !< 1131 this%nys = nys !< 1132 this%numprocs = numprocs !< 1133 this%comm2d = comm2d !< 1134 1135 END SUBROUTINE save_grid_into_this_class 1136 1137 1138 SUBROUTINE activate_grid_from_this_class( this ) 1139 1140 IMPLICIT NONE 1141 1142 CLASS(domain_decomposition_grid_features), INTENT(inout) :: this !< 1143 1144 myid = this%myid !< 1145 nnx = this%nnx !< 1146 nny = this%nny !< 1147 nx = this%nx !< 1148 nxl = this%nxl !< 1149 nxr = this%nxr !< 1150 ny = this%ny !< 1151 nyn = this%nyn !< 1152 nys = this%nys !< 1153 numprocs = this%numprocs !< 1154 comm2d = this%comm2d !< 1155 1156 END SUBROUTINE activate_grid_from_this_class 1157 725 1158 END MODULE shared_memory_io_mod
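Editor's note: to make the new class-based interface easier to follow, a minimal usage sketch of the shared-memory allocation API is given below. It is not part of changeset 4617: the program frame and the names field, win_2d and sm_io are illustrative assumptions, and it presumes that the PALM modules kinds (providing dp and iwp) and shared_memory_io_mod are compiled alongside; only the type sm_class, its type-bound procedures and their argument lists are taken from the code shown above.

!> Minimal usage sketch of sm_class (not part of the changeset; names marked as illustrative are
!> assumptions).
PROGRAM sm_class_usage_sketch

   USE MPI

   USE kinds,                                                                                     &
       ONLY:  dp, iwp            ! assumption: kind parameters as used throughout the diff above

   USE shared_memory_io_mod,                                                                      &
       ONLY:  sm_class

   IMPLICIT NONE

   INTEGER(iwp)                      ::  ierr              !<
   INTEGER(iwp)                      ::  win_2d            !< window handle returned by sm_allocate_shared
   REAL(dp), DIMENSION(:,:), POINTER ::  field => NULL()   !< points into the node-local shared segment
   TYPE(sm_class)                    ::  sm_io             !< shared-memory I/O handler (illustrative name)

   CALL MPI_INIT( ierr )
!
!-- Split the given communicator into node-local (shared-memory) and I/O communicators. The second
!-- argument uses the optional comm_input feature introduced in this changeset; inside PALM,
!-- comm2d is used when it is omitted.
   CALL sm_io%sm_init_comm( .TRUE., MPI_COMM_WORLD )
!
!-- Allocate one 64-bit 2d buffer per shared-memory group. The REAL(dp) pointer resolves the
!-- generic sm_allocate_shared to sm_allocate_shared_2d_64. The bounds (0:99,0:99) are placeholders.
   CALL sm_io%sm_allocate_shared( field, 0, 99, 0, 99, win_2d )
!
!-- Synchronize the node before and after touching the buffer. In PALM each thread would fill only
!-- its own subdomain; here every thread writes the same constant for simplicity. The pointer stays
!-- unassociated if shared memory is not active in this run.
   CALL sm_io%sm_node_barrier( )
   IF ( ASSOCIATED( field ) )  field = 0.0_dp
   CALL sm_io%sm_node_barrier( )
!
!-- Release the shared-memory window again.
   CALL sm_io%sm_free_shared( win_2d )

   CALL MPI_FINALIZE( ierr )

END PROGRAM sm_class_usage_sketch

Freeing the window with sm_free_shared after the last access mirrors the MPI_WIN_FREE call added in this changeset; the handle is set to -1 afterwards, so it can no longer be reused accidentally.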
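Editor's note: the two helper routines added at the end of the module, save_grid_into_this_class and activate_grid_from_this_class, only copy the global decomposition variables (nnx, nny, nx, nxl, nxr, ny, nyn, nys, numprocs, myid, comm2d) into and out of a domain_decomposition_grid_features object. The sketch below shows the switching pattern they appear to be designed for. It is not part of the changeset: the names run_grid, file_grid and the placeholder read call are assumptions, and it further assumes that both routines are bound to the type (the type definition itself lies outside this hunk).

!> Illustrative grid-switching pattern (not part of the changeset).
SUBROUTINE switch_decomposition_sketch

   USE shared_memory_io_mod,                                                                      &
       ONLY:  domain_decomposition_grid_features

   IMPLICIT NONE

   TYPE(domain_decomposition_grid_features) ::  run_grid   !< decomposition of the current run
   TYPE(domain_decomposition_grid_features) ::  file_grid  !< decomposition matching the data on file

!
!-- file_grid is assumed to have been filled beforehand with the index bounds valid for the data
!-- on file (e.g. from a prerun); that step is omitted here.
!
!-- Remember the decomposition of the current run before the global variables are overwritten.
   CALL run_grid%save_grid_into_this_class( )
!
!-- Temporarily activate the file decomposition, do the work that relies on the global index
!-- variables (nxl, nxr, nys, nyn, ...), then restore the decomposition of the current run.
   CALL file_grid%activate_grid_from_this_class( )
!  CALL do_something_with_file_bounds( )        ! placeholder, illustrative only
   CALL run_grid%activate_grid_from_this_class( )

END SUBROUTINE switch_decomposition_sketch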