Changeset 4893 for palm/trunk/SOURCE/restart_data_mpi_io_mod.f90
- Timestamp: Mar 2, 2021 4:39:14 PM (3 years ago)
- File: palm/trunk/SOURCE/restart_data_mpi_io_mod.f90 (1 edited)
palm/trunk/SOURCE/restart_data_mpi_io_mod.f90
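The revision note in the diff below states that the output of surface data via MPI-IO was revised for better performance: surface values are gathered on the responsible I/O PEs and written as contiguous slices of one global 1-d record through a subarray filetype with a single collective call. The following standalone sketch only illustrates that write pattern; it is not PALM code, and the decomposition, the file name, and the variable names (nr_local, glo_start, ft_surf, array_position) are assumptions made for the example.

      PROGRAM surface_write_sketch
      !
      !-- Not PALM code: minimal sketch of a collective 1-d surface write with a subarray filetype.
         USE MPI

         IMPLICIT NONE

         INTEGER, PARAMETER :: wp = KIND( 1.0d0 )   !< working precision (assumption)

         INTEGER :: fh         !< MPI-IO file handle
         INTEGER :: ft_surf    !< subarray filetype describing the local slice
         INTEGER :: glo_start  !< global start index of the local slice (1-based)
         INTEGER :: ierr, myid, numprocs
         INTEGER :: nr_local   !< number of surface values owned by this PE (assumption)
         INTEGER :: nr_total   !< global number of surface values

         INTEGER, DIMENSION(1) :: dims1, lsize1, start1
         INTEGER, DIMENSION(MPI_STATUS_SIZE) :: status

         INTEGER(KIND=MPI_OFFSET_KIND) :: array_position  !< byte offset of this record in the file

         REAL(wp), DIMENSION(:), ALLOCATABLE :: surf_data

         CALL MPI_INIT( ierr )
         CALL MPI_COMM_RANK( MPI_COMM_WORLD, myid, ierr )
         CALL MPI_COMM_SIZE( MPI_COMM_WORLD, numprocs, ierr )
      !
      !-- Hypothetical decomposition: equally sized, contiguous slices per PE.
         nr_local  = 1000
         nr_total  = nr_local * numprocs
         glo_start = myid * nr_local + 1
         ALLOCATE( surf_data(nr_local) )
         surf_data = REAL( myid, KIND = wp )
      !
      !-- 1-d subarray filetype: global size, local size, global start (0-based).
         dims1(1)  = nr_total
         lsize1(1) = nr_local
         start1(1) = glo_start - 1
         CALL MPI_TYPE_CREATE_SUBARRAY( 1, dims1, lsize1, start1, MPI_ORDER_FORTRAN,               &
                                        MPI_DOUBLE_PRECISION, ft_surf, ierr )
         CALL MPI_TYPE_COMMIT( ft_surf, ierr )
      !
      !-- Byte offset of this record; a hypothetical 4096-byte header precedes it. Products like
      !-- "number of values * value size" must be computed in 64-bit integers to avoid overflow
      !-- for large domains (compare the INTEGER(8) remark in the diff below).
         array_position = 4096_MPI_OFFSET_KIND

         CALL MPI_FILE_OPEN( MPI_COMM_WORLD, 'surface_sketch.bin',                                 &
                             MPI_MODE_CREATE + MPI_MODE_WRONLY, MPI_INFO_NULL, fh, ierr )
         CALL MPI_FILE_SET_VIEW( fh, array_position, MPI_DOUBLE_PRECISION, ft_surf, 'native',      &
                                 MPI_INFO_NULL, ierr )
         CALL MPI_FILE_WRITE_ALL( fh, surf_data, nr_local, MPI_DOUBLE_PRECISION, status, ierr )
         CALL MPI_FILE_CLOSE( fh, ierr )

         CALL MPI_TYPE_FREE( ft_surf, ierr )
         CALL MPI_FINALIZE( ierr )

      END PROGRAM surface_write_sketch

Because every PE writes a disjoint, contiguous slice of the same global record, one MPI_FILE_WRITE_ALL replaces many small individual writes, which is typically where the performance gain of collective I/O comes from.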
r4857 r4893 25 25 ! ----------------- 26 26 ! $Id$ 27 ! revised output of surface data via MPI-IO for better performance 28 ! 29 ! 4857 2021-01-26 07:24:41Z raasch 27 30 ! bugfix: allocation of 3d-int4 array moved from particle output to standard output 28 31 ! … … 155 158 myidx, & 156 159 myidy, & 157 npex, &158 npey, &159 160 numprocs, & 160 161 pdims … … 183 184 INTEGER(iwp) :: fh = -1 !< MPI-IO file handle 184 185 #if defined( __parallel ) 185 INTEGER(iwp) :: fhs = -1 !< MPI-IO file handle to open file with comm2d always186 #endif187 186 INTEGER(iwp) :: ft_surf = -1 !< MPI filetype surface data 188 #if defined( __parallel )189 187 INTEGER(iwp) :: ft_2di_nb !< MPI filetype 2D array INTEGER no outer boundary 190 188 INTEGER(iwp) :: ft_2d !< MPI filetype 2D array REAL with outer boundaries … … 196 194 INTEGER(iwp) :: glo_start !< global start index on this PE 197 195 #if defined( __parallel ) 198 INTEGER(iwp) :: local_start !<199 #endif200 INTEGER(iwp) :: nr_iope !<201 INTEGER(iwp) :: nr_val !< local number of values in x and y direction202 #if defined( __parallel )203 196 INTEGER(iwp) :: win_2di !< 204 197 INTEGER(iwp) :: win_2dr !< … … 207 200 INTEGER(iwp) :: win_3dr !< 208 201 INTEGER(iwp) :: win_3ds !< 202 INTEGER(iwp) :: win_end = -1 !< 203 INTEGER(iwp) :: win_glost = -1 !< 204 INTEGER(iwp) :: win_out = -1 !< 205 INTEGER(iwp) :: win_start = -1 !< 209 206 INTEGER(iwp) :: win_surf = -1 !< 210 207 #endif … … 216 213 INTEGER(iwp), DIMENSION(:,:), POINTER, CONTIGUOUS :: array_2di !< 217 214 218 INTEGER(iwp), DIMENSION(:,:), ALLOCATABLE :: m_end_index !< 219 INTEGER(iwp), DIMENSION(:,:), ALLOCATABLE :: m_global_start !< 215 INTEGER(iwp), DIMENSION(:,:), ALLOCATABLE :: e_end_index !< extended end index, every grid cell has at least one value 216 INTEGER(iwp), DIMENSION(:,:), ALLOCATABLE :: e_start_index !< 217 INTEGER(iwp), DIMENSION(:,:), ALLOCATABLE :: m_end_index !< module copy of end_index 220 218 INTEGER(iwp), DIMENSION(:,:), ALLOCATABLE :: m_start_index !< 219 INTEGER(iwp), DIMENSION(:), ALLOCATABLE :: thread_index !< 220 INTEGER(iwp), DIMENSION(:,:), ALLOCATABLE :: target_thread !< 221 INTEGER(iwp), DIMENSION(:,:), ALLOCATABLE :: transfer_index !< 222 INTEGER(iwp), DIMENSION(:), ALLOCATABLE :: thread_values !< 223 ! 
224 !-- Indices for cyclic fill 225 INTEGER(iwp), DIMENSION(:,:), ALLOCATABLE :: o_start_index !< 226 INTEGER(iwp), DIMENSION(:,:), ALLOCATABLE :: c_start_index !< 227 !#if defined( __parallel ) 228 INTEGER(iwp), DIMENSION(:,:), ALLOCATABLE :: o_end_index !< extended end index, every grid cell has at least one value 229 INTEGER(iwp), DIMENSION(:,:), ALLOCATABLE :: c_end_index !< extended end index, every grid cell has at least one value 230 INTEGER(iwp), DIMENSION(:,:), ALLOCATABLE :: c_global_start !< 231 INTEGER(iwp), DIMENSION(:,:), ALLOCATABLE :: c_global_end !< 232 !#endif 221 233 222 234 INTEGER(isp), DIMENSION(:,:,:), POINTER, CONTIGUOUS :: array_3di4 !< 223 235 INTEGER(idp), DIMENSION(:,:,:), POINTER, CONTIGUOUS :: array_3di8 !< 224 236 225 LOGICAL :: all_pes_write !< all PEs have data to write226 237 LOGICAL :: filetypes_created !< 227 238 LOGICAL :: io_on_limited_cores_per_node !< switch to shared memory MPI-IO … … 229 240 LOGICAL :: wr_flag !< file is opened for write 230 241 242 #if defined( __parallel ) 243 INTEGER(iwp), DIMENSION(:,:), ALLOCATABLE :: local_indices 244 #endif 245 246 REAL(wp), DIMENSION(:), POINTER, CONTIGUOUS :: array_out !< 231 247 #if defined( __parallel ) 232 248 REAL(wp), DIMENSION(:), POINTER, CONTIGUOUS :: array_1d !< … … 251 267 INTEGER(iwp) :: nr_int !< number of INTEGER entries in header 252 268 INTEGER(iwp) :: nr_real !< number of REAL entries in header 269 INTEGER(iwp) :: pes_along_x !< number of PEs along x-direction during writing restart file 270 INTEGER(iwp) :: pes_along_y !< number of PEs along y-direction during writing restart file 253 271 INTEGER(iwp) :: total_nx !< total number of points in x-direction 254 272 INTEGER(iwp) :: total_ny !< total number of points in y-direction 255 273 END TYPE general_header 256 274 257 TYPE(general_header), TARGET :: tgh !<275 TYPE(general_header), TARGET, PUBLIC :: tgh !< 258 276 259 277 TYPE(sm_class) :: sm_io !< … … 421 439 wrd_mpi_io_surface 422 440 423 424 441 CONTAINS 425 442 … … 452 469 TYPE(C_PTR) :: buf_ptr !< 453 470 #endif 471 454 472 455 473 offset = 0 … … 467 485 io_file_name = file_name 468 486 ! 469 !-- Setup for IO on a limited number of threads per node (using shared memory MPI)487 !-- Setup for IO on a limited number of PEs per node (using shared memory MPI) 470 488 IF ( rd_flag ) THEN 471 489 set_filetype = .TRUE. … … 822 840 823 841 ! 824 !-- TODO: describe in more detail what is done here and why it is done825 !-- save grid of main run842 !-- Save grid information of the mainrun, i.e. grid variables like nxl, nxr, nys, nyn and other 843 !-- values are stored within the mainrun_grid structure 826 844 CALL mainrun_grid%save_grid_into_this_class() 827 845 … … 834 852 rma_offset_s = 0 835 853 ! 836 !-- Determine, if gridpoints of the prerun are located on this thread.854 !-- Determine, if gridpoints of the prerun are located on this PE. 837 855 !-- Set the (cyclic) prerun grid. 838 856 nxr = MIN( nxr, nx_on_file ) … … 857 875 ny = ny_on_file 858 876 ! 859 !-- Determine, if this threadis doing IO877 !-- Determine, if this PE is doing IO 860 878 IF ( nnx > 0 .AND. nny > 0 ) THEN 861 879 color = 1 … … 892 910 #endif 893 911 ! 894 !-- Allocate 2d buffers as RMA window, accessible on all threads912 !-- Allocate 2d buffers as RMA window, accessible on all PEs 895 913 IF ( pe_active_for_read ) THEN 896 914 ALLOCATE( rmabuf_2di(nys:nyn,nxl:nxr) ) … … 918 936 919 937 ! 
920 !-- Allocate 3d buffer as RMA window, accessable on all threads938 !-- Allocate 3d buffer as RMA window, accessable on all PEs 921 939 IF ( pe_active_for_read ) THEN 922 940 ALLOCATE( rmabuf_3d(nzb:nzt+1,nys:nyn,nxl:nxr) ) … … 932 950 933 951 ! 934 !-- TODO: comment in more detail, what is done here, and why 935 !-- save small grid 952 !-- Save grid of the prerun, i.e. grid variables like nxl, nxr, nys, nyn and other values 953 !-- are stored within the prerun_grid structure. 954 !-- The prerun grid can later be activated by calling prerun_grid%activate_grid_from_this_class() 936 955 CALL prerun_grid%save_grid_into_this_class() 937 956 prerun_grid%comm2d = comm_cyclic_fill … … 1152 1171 1153 1172 1154 !kk write(9,*) 'Here is rma_cylic_fill_real_2d ',nxl,nxr,nys,nyn; FLUSH(9)1155 1156 1173 ! 1157 1174 !-- Reading 2d real array on prerun grid … … 1297 1314 !-- array would be dimensioned in the caller subroutine like this: 1298 1315 !-- INTEGER, DIMENSION(nysg:nyng,nxlg:nxrg):: data 1299 message_string = '2d-INTEGER array "' // TRIM( name ) // '" to be read from restart ' //&1300 'f ile is defined with illegal dimensions in the PALM code'1316 message_string = '2d-INTEGER array with nbgp "' // TRIM( name ) // '" to be read ' // & 1317 'from restart file is defined with illegal dimensions in the PALM code' 1301 1318 CALL message( 'rrd_mpi_io_int_2d', 'PA0723', 3, 2, 0, 6, 0 ) 1302 1319 … … 1374 1391 1375 1392 1376 CALL prerun_grid%activate_grid_from_this_class()1377 1378 1393 IF ( pe_active_for_read ) THEN 1394 CALL prerun_grid%activate_grid_from_this_class() 1395 1379 1396 #if defined( __parallel ) 1380 1397 CALL MPI_FILE_SET_VIEW( fh, array_position, MPI_INTEGER, ft_2di_nb, 'native', & 1381 1398 MPI_INFO_NULL, ierr ) 1382 1399 CALL MPI_FILE_READ_ALL( fh, array_2di, SIZE( array_2di ), MPI_INTEGER, status, ierr ) 1400 #else 1401 CALL posix_lseek( fh, array_position ) 1402 CALL posix_read( fh, array_2di, SIZE( array_2di ) ) 1383 1403 #endif 1384 1404 DO i = nxl, nxr … … 1386 1406 ENDDO 1387 1407 data(1:nny,1:nnx) = rmabuf_2di 1388 ENDIF 1389 1390 CALL mainrun_grid%activate_grid_from_this_class()1408 1409 CALL mainrun_grid%activate_grid_from_this_class() 1410 ENDIF 1391 1411 1392 1412 #if defined( __parallel ) … … 1396 1416 #endif 1397 1417 1398 IF ( .NOT. pe_active_for_read ) THEN 1399 1400 is = nxl 1401 ie = nxr 1402 js = nys 1403 je = nyn 1404 1405 ELSE 1406 1407 is = nxl 1408 ie = nxr 1409 js = prerun_grid%nys+1 1410 je = nyn 1411 DO i = is, ie 1412 DO j = js, je 1413 i_remote = MOD(i,nx_on_file+1) 1414 j_remote = MOD(j,ny_on_file+1) 1415 rem_pe = remote_pe(i_remote,j_remote) 1416 rem_offs = rma_offset(i_remote,j_remote) 1417 nval = 1 1418 1419 #if defined( __parallel ) 1420 IF ( rem_pe /= myid ) THEN 1421 CALL MPI_GET( data(j-nys+1,i-nxl+1), nval, MPI_INTEGER, rem_pe, rem_offs, nval, & 1422 MPI_INTEGER, rmawin_2di, ierr ) 1423 ELSE 1424 data(j-nys+1,i-nxl+1) = rmabuf_2di(j_remote,i_remote) 1425 ENDIF 1426 #else 1427 data(j-nys+1,i-nxl+1) = array_2di(i_remote,j_remote) 1428 #endif 1429 ENDDO 1430 ENDDO 1431 is = prerun_grid%nxr+1 1432 ie = nxr 1433 js = nys 1434 je = nyn 1435 1436 ENDIF 1418 is = nxl 1419 ie = nxr 1420 js = nys 1421 je = nyn 1437 1422 1438 1423 DO i = is, ie … … 1717 1702 ierr ) 1718 1703 CALL MPI_FILE_READ_ALL( fh, array_3d, SIZE( array_3d ), MPI_REAL, status, ierr ) 1704 #else 1705 CALL posix_lseek( fh, array_position ) 1706 CALL posix_read( fh, array_3d, SIZE( array_3d ) ) 1719 1707 #endif 1720 1708 DO i = nxl, nxr … … 1727 1715 #if defined( __parallel ) 1728 1716 ! 
1729 !-- Close RMA window to allow remote access 1730 CALL MPI_WIN_FENCE( 0, rmawin_3d, ierr ) 1731 #endif 1732 1733 IF ( .NOT. pe_active_for_read ) THEN 1734 1735 is = nxl 1736 ie = nxr 1737 js = nys 1738 je = nyn 1739 1740 ELSE 1741 1742 is = nxl 1743 ie = nxr 1744 js = prerun_grid%nys+1 1745 je = nyn 1746 1747 DO i = is, ie 1748 DO j = js, je 1749 i_remote = MOD(i,nx_on_file+1) 1750 j_remote = MOD(j,ny_on_file+1) 1751 rem_pe = remote_pe(i_remote,j_remote) 1752 rem_offs = rma_offset(i_remote,j_remote)*(nzt-nzb+2) 1753 nval = nzt-nzb+2 1754 1755 #if defined( __parallel ) 1756 IF(rem_pe /= myid) THEN 1757 CALL MPI_GET( data(nzb,j,i), nval, MPI_REAL, rem_pe, rem_offs, nval, MPI_REAL, & 1758 rmawin_3d, ierr) 1759 ELSE 1760 data(:,j,i) = rmabuf_3d(:,j_remote,i_remote) 1761 ENDIF 1762 #else 1763 data(:,j,i) = array_3d(:,i_remote,j_remote) 1764 #endif 1765 ENDDO 1766 ENDDO 1767 is = prerun_grid%nxr+1 1768 ie = nxr 1769 js = nys 1770 je = nyn 1771 1772 ENDIF 1717 !-- Close RMA window to allow remote access 1718 CALL MPI_WIN_FENCE( 0, rmawin_3d, ierr ) 1719 #endif 1720 1721 is = nxl 1722 ie = nxr 1723 js = nys 1724 je = nyn 1773 1725 1774 1726 DO i = is, ie 1775 1727 DO j = js, je 1776 i_remote = MOD( i,nx_on_file+1)1777 j_remote = MOD( j,ny_on_file+1)1728 i_remote = MOD( i, nx_on_file+1 ) 1729 j_remote = MOD( j, ny_on_file+1 ) 1778 1730 rem_pe = remote_pe(i_remote,j_remote) 1779 1731 rem_offs = rma_offset(i_remote,j_remote) * ( nzt-nzb+2 ) … … 1850 1802 1851 1803 IF ( found ) THEN 1852 #if defined( __parallel )1853 1804 CALL rd_mpi_io_create_filetypes_3dsoil( nzb_soil, nzt_soil ) 1805 #if defined( __parallel ) 1854 1806 CALL sm_io%sm_node_barrier() ! Has no effect if I/O on limited number of cores is inactive 1855 1807 IF ( sm_io%iam_io_pe ) THEN … … 1874 1826 ENDIF 1875 1827 1828 #if defined( __parallel ) 1829 IF ( sm_io%is_sm_active() ) THEN 1830 CALL MPI_WIN_FREE( win_3ds, ierr ) 1831 ELSE 1832 DEALLOCATE( array_3d_soil ) 1833 ENDIF 1834 #else 1835 DEALLOCATE( array_3d_soil ) 1836 #endif 1837 1876 1838 ELSE 1877 1839 … … 2042 2004 2043 2005 IF ( header_array_index == max_nr_arrays ) THEN 2044 STOP '+++ maximum number of 2d/3d-array entries in restart file header exceeded' 2006 message_string = 'maximum number of 2d/3d-array entries in restart file header exceeded' 2007 CALL message( 'wrd_mpi_io_real_2d', 'PA0585', 1, 2, 0, 6, 0 ) 2045 2008 ENDIF 2046 2009 … … 2107 2070 2108 2071 IF ( header_array_index == max_nr_arrays ) THEN 2109 STOP '+++ maximum number of 2d/3d-array entries in restart file header exceeded' 2072 message_string = 'maximum number of 2d/3d-array entries in restart file header exceeded' 2073 CALL message( 'wrd_mpi_io_int_2d', 'PA0585', 1, 2, 0, 6, 0 ) 2110 2074 ENDIF 2111 2075 … … 2182 2146 2183 2147 IF ( header_array_index == max_nr_arrays ) THEN 2184 STOP '+++ maximum number of 2d/3d-array entries in restart file header exceeded' 2148 message_string = 'maximum number of 2d/3d-array entries in restart file header exceeded' 2149 CALL message( 'wrd_mpi_io_int4_3d', 'PA0585', 1, 2, 0, 6, 0 ) 2185 2150 ENDIF 2186 2151 … … 2225 2190 INT( (iog%nx+1), KIND = rd_offset_kind ) * isp 2226 2191 2227 write(9,*) 'array_position int4_3d ',trim(name),' ',array_position2228 2229 2192 END SUBROUTINE wrd_mpi_io_int4_3d 2230 2193 … … 2250 2213 2251 2214 IF ( header_array_index == max_nr_arrays ) THEN 2252 STOP '+++ maximum number of 2d/3d-array entries in restart file header exceeded' 2215 message_string = 'maximum number of 2d/3d-array entries in restart file header exceeded' 2216 CALL 
message( 'wrd_mpi_io_int8_3d', 'PA0585', 1, 2, 0, 6, 0 ) 2253 2217 ENDIF 2254 2218 … … 2293 2257 INT( (iog%nx+1), KIND = rd_offset_kind ) * dp 2294 2258 2295 write(9,*) 'array_position int8_3d ',trim(name),' ',array_position2296 2297 2259 END SUBROUTINE wrd_mpi_io_int8_3d 2298 2260 … … 2318 2280 2319 2281 IF ( header_array_index == max_nr_arrays ) THEN 2320 STOP '+++ maximum number of 2d/3d-array entries in restart file header exceeded' 2282 message_string = 'maximum number of 2d/3d-array entries in restart file header exceeded' 2283 CALL message( 'wrd_mpi_io_real_3d', 'PA0585', 1, 2, 0, 6, 0 ) 2321 2284 ENDIF 2322 2285 … … 2395 2358 2396 2359 IF ( header_array_index == max_nr_arrays ) THEN 2397 STOP '+++ maximum number of 2d/3d-array entries in restart file header exceeded' 2360 message_string = 'maximum number of 2d/3d-array entries in restart file header exceeded' 2361 CALL message( 'wrd_mpi_io_real_3d_soil', 'PA0585', 1, 2, 0, 6, 0 ) 2398 2362 ENDIF 2399 2363 … … 2402 2366 header_array_index = header_array_index + 1 2403 2367 2404 #if defined( __parallel )2405 2368 CALL rd_mpi_io_create_filetypes_3dsoil( nzb_soil, nzt_soil ) 2406 #endif2407 2369 2408 2370 IF ( include_total_domain_boundaries) THEN … … 2432 2394 ENDIF 2433 2395 CALL sm_io%sm_node_barrier() 2396 2397 IF ( sm_io%is_sm_active() ) THEN 2398 CALL MPI_WIN_FREE( win_3ds, ierr ) 2399 ELSE 2400 DEALLOCATE( array_3d_soil ) 2401 ENDIF 2402 IF ( sm_io%iam_io_pe ) THEN 2403 CALL MPI_TYPE_FREE( ft_3dsoil, ierr ) 2404 ENDIF 2434 2405 #else 2435 2406 CALL posix_lseek( fh, array_position ) 2436 2407 CALL posix_write( fh, array_3d_soil, SIZE( array_3d_soil ) ) 2408 DEALLOCATE( array_3d_soil ) 2437 2409 #endif 2438 2410 ! … … 2589 2561 CALL MPI_BCAST( data, SIZE( data ), MPI_REAL, 0, comm2d, ierr ) 2590 2562 ELSE 2591 IF 2563 IF( sm_io%iam_io_pe ) THEN 2592 2564 CALL MPI_FILE_SET_VIEW( fh, offset, MPI_BYTE, MPI_BYTE, 'native', MPI_INFO_NULL, ierr ) 2565 ENDIF 2566 IF ( myid == 0 ) THEN 2593 2567 CALL MPI_FILE_SEEK( fh, array_position, MPI_SEEK_SET, ierr ) 2594 CALL MPI_FILE_READ _ALL( fh, data, SIZE( data ), MPI_REAL, status, ierr )2568 CALL MPI_FILE_READ( fh, data, SIZE( data ), MPI_REAL, status, ierr ) 2595 2569 ENDIF 2596 IF ( sm_io%is_sm_active() ) THEN 2597 CALL MPI_BCAST( data, SIZE( data ), MPI_REAL, 0, sm_io%comm_shared, ierr ) 2598 ENDIF 2570 CALL MPI_BCAST( data, SIZE( data ), MPI_REAL, 0, comm2d, ierr ) 2599 2571 ENDIF 2600 2572 #else … … 2749 2721 CALL MPI_FILE_READ_ALL( fh, data, SIZE( data), MPI_INTEGER, status, ierr ) 2750 2722 ENDIF 2751 CALL MPI_BCAST( data, SIZE( data ), MPI_ REAL, 0, comm2d, ierr )2723 CALL MPI_BCAST( data, SIZE( data ), MPI_INTEGER, 0, comm2d, ierr ) 2752 2724 ELSE 2753 IF 2725 IF( sm_io%iam_io_pe ) THEN 2754 2726 CALL MPI_FILE_SET_VIEW( fh, offset, MPI_BYTE, MPI_BYTE, 'native', MPI_INFO_NULL, ierr ) 2727 ENDIF 2728 IF ( myid == 0 ) THEN 2755 2729 CALL MPI_FILE_SEEK( fh, array_position, MPI_SEEK_SET, ierr ) 2756 CALL MPI_FILE_READ _ALL( fh, data, SIZE( data), MPI_INTEGER, status, ierr )2730 CALL MPI_FILE_READ( fh, data, SIZE( data), MPI_INTEGER, status, ierr ) 2757 2731 ENDIF 2758 IF ( sm_io%is_sm_active() ) THEN 2759 CALL MPI_BCAST( data, SIZE( data ), MPI_INTEGER, 0, sm_io%comm_shared, ierr ) 2760 ENDIF 2761 ENDIF 2732 CALL MPI_BCAST( data, SIZE( data ), MPI_INTEGER, 0, comm2d, ierr ) 2733 ENDIF 2762 2734 #else 2763 2735 CALL posix_lseek( fh, array_position ) … … 2800 2772 2801 2773 IF ( header_array_index == max_nr_arrays ) THEN 2802 STOP '+++ maximum number of 2d/3d-array entries in restart file 
header exceeded' 2774 message_string = 'maximum number of 2d/3d-array entries in restart file header exceeded' 2775 CALL message( 'wrd_mpi_io_global_array_real_1d', 'PA0585', 1, 2, 0, 6, 0 ) 2803 2776 ENDIF 2804 2777 … … 2939 2912 2940 2913 IF ( header_array_index == max_nr_arrays ) THEN 2941 STOP '+++ maximum number of 2d/3d-array entries in restart file header exceeded' 2914 message_string = 'maximum number of 2d/3d-array entries in restart file header exceeded' 2915 CALL message( 'wrd_mpi_io_global_array_int_1d', 'PA0585', 1, 2, 0, 6, 0 ) 2942 2916 ENDIF 2943 2917 … … 3030 3004 ENDDO 3031 3005 3032 write(9,*) 'particle_size_read ',particle_size,array_size,array_position,sum(prt_global_index)3033 3034 3006 ALLOCATE( prt_data(MAX(array_size,1)) ) 3035 3007 … … 3078 3050 array_position = prt_nr_bytes 3079 3051 3080 write(9,*) 'array_position after particle read ',array_position,prt_nr_bytes,rs3081 3082 3052 DEALLOCATE( prt_data ) 3083 3053 … … 3092 3062 ! ------------ 3093 3063 !> Read 1d-REAL surface data array with MPI-IO. 3094 !--------------------------------------------------------------------------------------------------! 3095 SUBROUTINE rrd_mpi_io_surface( name, data, first_index ) 3064 !> This is a recursive subroutine. In case of cyclic fill mode it may call itself for reading parts 3065 !> of the prerun grid. 3066 !--------------------------------------------------------------------------------------------------! 3067 RECURSIVE SUBROUTINE rrd_mpi_io_surface( name, data, first_index ) 3096 3068 3097 3069 IMPLICIT NONE … … 3099 3071 CHARACTER(LEN=*), INTENT(IN) :: name !< 3100 3072 3073 INTEGER(iwp), OPTIONAL :: first_index !< 3074 INTEGER(iwp) :: i !< 3075 INTEGER(iwp) :: j !< 3076 INTEGER(iwp) :: lo_first_index !< 3077 3078 #if defined( __parallel ) 3079 INTEGER(iwp) :: buf_start !< 3101 3080 INTEGER(KIND=rd_offset_kind) :: disp !< displacement of actual indices 3102 INTEGER(KIND=rd_offset_kind) :: disp_f !< displacement in file 3103 INTEGER(KIND=rd_offset_kind) :: disp_n !< displacement of next column 3104 INTEGER(iwp), OPTIONAL :: first_index !< 3105 3106 INTEGER(iwp) :: i !< 3107 INTEGER(iwp) :: i_f !< 3108 INTEGER(iwp) :: j !< 3109 INTEGER(iwp) :: j_f !< 3110 INTEGER(iwp) :: lo_first_index !< 3111 INTEGER(iwp) :: nr_bytes !< 3112 INTEGER(iwp) :: nr_bytes_f !< 3113 INTEGER(iwp) :: nr_words !< 3114 #if defined( __parallel ) 3115 INTEGER, DIMENSION(rd_status_size) :: status !< 3116 #else 3117 TYPE(C_PTR) :: buf !< 3118 #endif 3119 3120 LOGICAL :: found !< 3081 INTEGER(iwp) :: ie !< 3082 INTEGER(iwp) :: ind_gb !< 3083 INTEGER(iwp) :: ind_out !< 3084 INTEGER(iwp) :: is !< 3085 INTEGER(iwp) :: n !< 3086 INTEGER(iwp) :: n_trans !< 3087 3088 INTEGER(iwp),DIMENSION(0:numprocs-1) :: lo_index !< 3089 INTEGER, DIMENSION(rd_status_size) :: status !< 3090 #endif 3091 LOGICAL :: found !< 3121 3092 3122 3093 REAL(wp), INTENT(OUT), DIMENSION(:), TARGET :: data !< 3094 #if defined( __parallel ) 3095 REAL(wp),DIMENSION(:),ALLOCATABLE :: put_buffer !< 3096 #endif 3123 3097 3124 3098 … … 3132 3106 DO i = 1, tgh%nr_arrays 3133 3107 IF ( TRIM( array_names(i) ) == TRIM( name ) ) THEN 3108 ! 3109 !-- ATTENTION: The total_number_of_surface_values and wp MUST be INTERGER(8). 3110 !-- The compiler (at least Intel) first computes total_number_of_surface_values*wp 3111 !-- and then does the conversion to INTEGER(8). 
3112 !-- This may lead to wrong results when total_number_of_surface_values*wp is > 2*10**6 3134 3113 array_position = array_offset(i) + ( lo_first_index - 1 ) * & 3135 total_number_of_surface_values * wp3114 INT( total_number_of_surface_values, idp ) * INT( wp, idp ) 3136 3115 found = .TRUE. 3137 3116 EXIT … … 3139 3118 ENDDO 3140 3119 3141 disp = -1 3142 disp_f = -1 3143 disp_n = -1 3120 ! 3121 !-- In case of 2d-data, name is written only once 3122 IF ( lo_first_index == 1 ) THEN 3123 3124 IF ( header_array_index == max_nr_arrays ) THEN 3125 message_string = 'maximum number of 2d/3d-array entries in restart file header exceeded' 3126 CALL message( 'rrd_mpi_io_surface', 'PA0585', 1, 2, 0, 6, 0 ) 3127 ENDIF 3128 3129 array_names(header_array_index) = name 3130 array_offset(header_array_index) = array_position 3131 header_array_index = header_array_index + 1 3132 3133 ENDIF 3134 3144 3135 IF ( found ) THEN 3145 3146 3136 IF ( cyclic_fill_mode ) THEN 3147 3137 3148 3138 CALL rrd_mpi_io_surface_cyclic_fill 3139 RETURN 3149 3140 3150 3141 ELSE 3151 3152 IF ( MAXVAL( m_global_start ) == -1 ) RETURN ! Nothing to do on this PE 3142 #if defined( __parallel ) 3143 ! 3144 !-- Read data from restart file 3145 CALL sm_io%sm_node_barrier() ! has no effect if I/O on limited number of cores is inactive 3146 IF ( sm_io%iam_io_pe ) THEN 3147 CALL MPI_FILE_SET_VIEW( fh, array_position, MPI_REAL, ft_surf, 'native', & 3148 MPI_INFO_NULL, ierr ) 3149 CALL MPI_FILE_READ_ALL ( fh, array_out, SIZE(array_out), MPI_REAL, status, ierr ) 3150 ENDIF 3151 CALL sm_io%sm_node_barrier() 3152 3153 ! 3154 !-- Copy data into transfer buffer. Data is organized in a way that only one MPI_PUT to the 3155 !-- respective PE ist required. 3156 ALLOCATE( put_buffer(SUM( transfer_index(4,:) )) ) 3157 3158 ind_gb = 1 3159 DO i = 1, SIZE( local_indices, 2 ) 3160 ind_out = local_indices(1,i) 3161 DO j = 1, local_indices(2,i) 3162 put_buffer(ind_gb) = array_out(ind_out) 3163 ind_out = ind_out + 1 3164 ind_gb = ind_gb + 1 3165 ENDDO 3166 ENDDO 3167 ! 3168 !-- Transfer data from I/O PEs to the respective PEs to which they belong. 3169 CALL MPI_WIN_FENCE( 0, win_surf, ierr ) 3170 3171 buf_start = 1 3172 DO n = 0, numprocs-1 3173 n_trans = transfer_index(4,n) 3174 IF ( n_trans > 0 ) THEN 3175 disp = transfer_index(3,n) - 1 3176 CALL MPI_PUT( put_buffer(buf_start), n_trans, MPI_REAL, n, disp, n_trans, MPI_REAL,& 3177 win_surf, ierr) 3178 buf_start = buf_start + n_trans 3179 ENDIF 3180 ENDDO 3181 3182 CALL MPI_WIN_FENCE( 0, win_surf, ierr ) 3183 DEALLOCATE( put_buffer ) 3184 ! 3185 !-- Copy from RMA window into output array (data) to allow transfering data to target PEs. 3186 !-- Check, if the number of surface values per grid cell match the index setup. 3187 lo_index = thread_values 3153 3188 DO i = nxl, nxr 3154 3189 DO j = nys, nyn 3155 3156 IF ( m_global_start(j,i) > 0 ) THEN 3157 disp = array_position+(m_global_start(j,i)-1) * wp 3158 nr_words = m_end_index(j,i)-m_start_index(j,i)+1 3159 nr_bytes = nr_words * wp 3190 is = lo_index(target_thread(j,i)) + 1 3191 ie = is + m_end_index(j,i) - m_start_index(j,i) 3192 data(m_start_index(j,i):m_end_index(j,i)) = array_1d(is:ie) 3193 lo_index(target_thread(j,i)) = lo_index(target_thread(j,i)) + & 3194 e_end_index(j,i) - e_start_index(j,i) + 1 3195 ! 3196 !-- TODO: Test can be removed later. 
3197 IF ( e_end_index(j,i)-e_start_index(j,i)+1 /= NINT( array_1d(is-1) ) ) THEN 3198 WRITE( 9, '(A,6I8)' ) 'Nr surface values does not match ', j, i, & 3199 e_start_index(j,i), e_end_index(j,i), & 3200 e_end_index(j,i)-e_start_index(j,i)+1 , & 3201 NINT( array_1d(is-1) ) 3202 FLUSH( 9 ) 3203 CALL MPI_ABORT( comm2d, 1, ierr ) 3160 3204 ENDIF 3161 IF ( disp >= 0 .AND. disp_f == -1 ) THEN ! First entry3162 disp_f = disp3163 nr_bytes_f = 03164 i_f = i3165 j_f = j3166 ENDIF3167 IF ( j == nyn .AND. i == nxr ) THEN ! Last entry3168 disp_n = -13169 IF ( nr_bytes > 0 ) THEN3170 nr_bytes_f = nr_bytes_f+nr_bytes3171 ENDIF3172 ELSEIF ( j == nyn ) THEN ! Next x3173 IF ( m_global_start(nys,i+1) > 0 .AND. disp > 0 ) THEN3174 disp_n = array_position + ( m_global_start(nys,i+1) - 1 ) * wp3175 ELSE3176 CYCLE3177 ENDIF3178 ELSE3179 IF ( m_global_start(j+1,i) > 0 .AND. disp > 0 ) THEN3180 disp_n = array_position + ( m_global_start(j+1,i) - 1 ) * wp3181 ELSE3182 CYCLE3183 ENDIF3184 ENDIF3185 3186 3187 IF ( disp + nr_bytes == disp_n ) THEN ! Contiguous block3188 nr_bytes_f = nr_bytes_f + nr_bytes3189 ELSE ! Read3190 #if defined( __parallel )3191 CALL MPI_FILE_SEEK( fhs, disp_f, MPI_SEEK_SET, ierr )3192 nr_words = nr_bytes_f / wp3193 CALL MPI_FILE_READ( fhs, data(m_start_index(j_f,i_f)), nr_words, MPI_REAL, status, &3194 ierr )3195 #else3196 !3197 !-- Use C_PTR here, because posix read does not work with indexed array3198 buf = C_LOC( data(m_start_index(j_f,i_f)) )3199 CALL posix_lseek( fh, disp_f )3200 CALL posix_read( fh, buf, nr_bytes_f )3201 #endif3202 disp_f = disp3203 nr_bytes_f = nr_bytes3204 i_f = i3205 j_f = j3206 ENDIF3207 3208 3205 ENDDO 3209 3206 ENDDO 3210 ENDIF 3211 3212 3213 ELSE 3214 3215 message_string = 'surface array "' // TRIM( name ) // '" not found in restart file' 3216 CALL message( 'rrd_mpi_io_surface', 'PA0722', 3, 2, 0, 6, 0 ) 3207 3208 3209 #else 3210 CALL posix_lseek( fh, array_position ) 3211 CALL posix_read( fh, array_out, SIZE(array_out) ) 3212 3213 DO i = nxl, nxr 3214 DO j = nys, nyn 3215 data(m_start_index(j,i):m_end_index(j,i)) = & 3216 array_out(e_start_index(j,i)+1:e_end_index(j,i)) 3217 ! 3218 !-- TODO: Test can be removed later. 3219 IF ( e_end_index(j,i)-e_start_index(j,i)+1 /= NINT(array_out(e_start_index(j,i))) )& 3220 THEN 3221 WRITE( 9, '(A,6I8)' ) 'Nr surface values does not match ', j, i, & 3222 e_start_index(j,i), e_end_index(j,i), & 3223 e_end_index(j,i)-e_start_index(j,i)+1, & 3224 NINT( array_out(e_start_index(j,i)) ) 3225 FLUSH( 9 ) 3226 CALL ABORT() 3227 ENDIF 3228 ENDDO 3229 ENDDO 3230 #endif 3231 ENDIF 3217 3232 3218 3233 ENDIF … … 3226 3241 INTEGER(iwp) :: i !< 3227 3242 INTEGER(iwp) :: ie !< 3228 #if defined( __parallel )3229 INTEGER(iwp) :: ierr !<3230 #endif3231 3243 INTEGER(iwp) :: is !< 3232 3244 INTEGER(iwp) :: i_remote !< … … 3241 3253 INTEGER(KIND=MPI_ADDRESS_KIND) :: rem_offs !< 3242 3254 #else 3243 INTEGER(idp) :: rem_offs 3244 #endif 3245 3246 LOGICAL :: write_done !< 3247 3248 3249 ! 3250 !-- In the current version, there is only 1 value per grid cell allowed. 3251 !-- In this special case, the cyclical repetition can be done with the same method as for 2d-real 3252 !-- array. 3255 INTEGER(idp) :: rem_offs !< 3256 #endif 3257 3258 REAL(wp), DIMENSION(:), ALLOCATABLE :: c_data !< 3259 3260 3261 ! 3262 !-- ATTENTION: This version allows only 1 surface element per grid cell. 3263 ! 3264 !-- Activate grid of the smaller prerun, i.e. grid variables like nxl, nxr, nys, nyn and other 3265 !-- values are set according to the prerun settings. 
3253 3266 CALL prerun_grid%activate_grid_from_this_class() 3254 3267 3255 3268 IF ( pe_active_for_read ) THEN 3256 rmabuf_2d = -1.0 3269 3270 IF ( MAXVAL( m_end_index ) <= 0 ) THEN 3271 CALL mainrun_grid%activate_grid_from_this_class() 3272 IF ( debug_output ) THEN 3273 CALL debug_message( 'PE inactive for reading restart or prerun data', 'start' ) 3274 ENDIF 3275 RETURN 3276 ENDIF 3277 3278 ALLOCATE( c_data(MAXVAL( m_end_index )) ) 3279 3280 ! 3281 !-- Recursive CALL of rrd_mpi_io_surface. 3282 !-- rrd_mpi_io_surface is called with cyclic_fill_mode = .FALSE. on the smaller prerun grid. 3283 cyclic_fill_mode = .FALSE. 3284 CALL rrd_mpi_io_surface( name, c_data ) 3285 cyclic_fill_mode = .TRUE. 3286 3257 3287 DO i = nxl, nxr 3258 3288 DO j = nys, nyn 3259 3260 IF ( m_global_start(j,i) > 0 ) THEN 3261 disp = array_position+(m_global_start(j,i)-1) * wp 3262 nr_words = m_end_index(j,i)-m_start_index(j,i)+1 3263 nr_bytes = nr_words * wp 3264 ENDIF 3265 IF ( disp >= 0 .AND. disp_f == -1 ) THEN ! First entry 3266 disp_f = disp 3267 nr_bytes_f = 0 3268 write_done = .TRUE. 3269 ENDIF 3270 IF( write_done ) THEN 3271 i_f = i 3272 j_f = j 3273 write_done = .FALSE. 3274 ENDIF 3275 3276 IF ( j == nyn .AND. i == nxr ) THEN ! Last entry 3277 disp_n = -1 3278 IF ( nr_bytes > 0 ) THEN 3279 nr_bytes_f = nr_bytes_f+nr_bytes 3280 ENDIF 3281 ELSEIF ( j == nyn ) THEN ! Next x 3282 IF ( m_global_start(nys,i+1) > 0 .AND. disp > 0 ) THEN 3283 disp_n = array_position + ( m_global_start(nys,i+1) - 1 ) * wp 3284 ELSE 3285 CYCLE 3286 ENDIF 3287 ELSE 3288 IF ( m_global_start(j+1,i) > 0 .AND. disp > 0 ) THEN 3289 disp_n = array_position + ( m_global_start(j+1,i) - 1 ) * wp 3290 ELSE 3291 CYCLE 3292 ENDIF 3293 ENDIF 3294 3295 3296 IF ( disp + nr_bytes == disp_n ) THEN ! Contiguous block 3297 nr_bytes_f = nr_bytes_f + nr_bytes 3298 ELSE ! Read 3299 #if defined( __parallel ) 3300 CALL MPI_FILE_SEEK( fhs, disp_f, MPI_SEEK_SET, ierr ) 3301 nr_words = nr_bytes_f / wp 3302 CALL MPI_FILE_READ( fhs, rmabuf_2d(j_f,i_f), nr_words, MPI_REAL, status, ierr ) 3303 #else 3304 CALL posix_lseek( fh, disp_f ) 3305 CALL posix_read( fh, rmabuf_2d(j_f:,i_f:), nr_bytes_f ) 3306 #endif 3307 3308 disp_f = disp 3309 nr_bytes_f = nr_bytes 3310 write_done = .TRUE. 3311 ENDIF 3312 3289 rmabuf_2d(j,i) = c_data(c_start_index(j,i)) 3313 3290 ENDDO 3314 3291 ENDDO 3315 3292 3316 3293 ENDIF 3317 3294 ! 3295 !-- Activate grid of the mainrun, i.e. grid variables like nxl, nxr, nys, nyn and other values 3296 !-- are set according to the mainrun settings. 3318 3297 CALL mainrun_grid%activate_grid_from_this_class() 3319 3298 … … 3324 3303 #endif 3325 3304 3326 IF ( .NOT. pe_active_for_read ) THEN 3327 3328 is = nxl 3329 ie = nxr 3330 js = nys 3331 je = nyn 3332 3333 ELSE 3334 3335 is = nxl 3336 ie = nxr 3337 js = prerun_grid%nys+1 3338 je = nyn 3339 3340 DO i = is, ie 3341 DO j = js, je 3342 i_remote = MOD(i,nx_on_file+1) 3343 j_remote = MOD(j,ny_on_file+1) 3344 rem_pe = remote_pe(i_remote,j_remote) 3345 rem_offs = rma_offset(i_remote,j_remote) 3346 nval = 1 3347 3348 #if defined( __parallel ) 3349 IF ( rem_pe /= myid ) THEN 3350 CALL MPI_GET( data(m_start_index(j,i)), nval, MPI_REAL, rem_pe, rem_offs, nval, & 3351 MPI_REAL, rmawin_2d, ierr) 3352 ELSE 3353 data(m_start_index(j,i)) = rmabuf_2d(j_remote,i_remote) 3354 ENDIF 3355 #else 3356 data(m_start_index(j,i)) = array_2d(i_remote,j_remote) 3357 #endif 3358 ENDDO 3359 ENDDO 3360 is = prerun_grid%nxr+1 3361 ie = nxr 3362 js = nys 3363 je = nyn 3364 3365 ENDIF 3305 ! 
3306 !-- After reading surface data on the small grid, map these data in a cyclic way to all respective 3307 !-- grid points of the main run. 3308 is = nxl 3309 ie = nxr 3310 js = nys 3311 je = nyn 3366 3312 3367 3313 DO i = is, ie 3368 3314 DO j = js, je 3369 i_remote = MOD( i,nx_on_file+1)3370 j_remote = MOD( j,ny_on_file+1)3315 i_remote = MOD( i, nx_on_file+1 ) 3316 j_remote = MOD( j, ny_on_file+1 ) 3371 3317 rem_pe = remote_pe(i_remote,j_remote) 3372 3318 rem_offs = rma_offset(i_remote,j_remote) … … 3375 3321 #if defined( __parallel ) 3376 3322 IF ( rem_pe /= myid ) THEN 3377 CALL MPI_GET( data( m_start_index(j,i)), nval, MPI_REAL, rem_pe, rem_offs, nval, &3323 CALL MPI_GET( data(o_start_index(j,i)), nval, MPI_REAL, rem_pe, rem_offs, nval, & 3378 3324 MPI_REAL, rmawin_2d, ierr) 3379 3325 ELSE 3380 data( m_start_index(j,i)) = rmabuf_2d(j_remote,i_remote)3326 data(o_start_index(j,i)) = rmabuf_2d(j_remote,i_remote) 3381 3327 ENDIF 3382 3328 #else 3383 data( m_start_index(j,i)) = array_2d(i_remote,j_remote)3329 data(o_start_index(j,i)) = array_2d(i_remote,j_remote) 3384 3330 #endif 3385 3331 ENDDO … … 3391 3337 CALL MPI_WIN_FENCE( 0, rmawin_2d, ierr ) 3392 3338 #endif 3339 3340 IF ( ALLOCATED( c_data ) ) DEALLOCATE( c_data ) 3393 3341 3394 3342 END SUBROUTINE rrd_mpi_io_surface_cyclic_fill … … 3539 3487 array_position = prt_nr_bytes 3540 3488 3541 write(9,*) 'array_position after particle ',array_position,prt_nr_bytes,rs3542 3543 3489 DEALLOCATE( prt_data ) 3544 3490 … … 3556 3502 IMPLICIT NONE 3557 3503 3558 CHARACTER(LEN=*), INTENT(IN) :: name !< 3559 3560 #if defined( __parallel ) 3561 INTEGER(KIND=rd_offset_kind) :: disp !< 3562 #endif 3563 INTEGER(iwp), OPTIONAL :: first_index !< 3564 #if defined( __parallel ) 3565 INTEGER(iwp) :: i !< 3566 #endif 3567 INTEGER(iwp) :: lo_first_index !< 3568 INTEGER(KIND=rd_offset_kind) :: offset !< 3569 3570 #if defined( __parallel ) 3571 INTEGER, DIMENSION(rd_status_size) :: status !< 3572 #endif 3573 3574 REAL(wp), INTENT(IN), DIMENSION(:), TARGET :: data !< 3575 3576 3577 offset = 0 3504 CHARACTER(LEN=*), INTENT(IN) :: name !< 3505 3506 INTEGER(iwp), OPTIONAL :: first_index !< 3507 INTEGER(iwp) :: i !< 3508 INTEGER(iwp) :: j !< 3509 INTEGER(iwp) :: lo_first_index !< 3510 #if defined( __parallel ) 3511 INTEGER(iwp) :: buf_start !< 3512 INTEGER(iwp) :: ie !< 3513 INTEGER(iwp) :: is !< 3514 INTEGER(iwp) :: ind_gb !< 3515 INTEGER(iwp) :: ind_out !< 3516 INTEGER(iwp) :: n !< 3517 INTEGER(iwp) :: n_trans !< 3518 #endif 3519 3520 #if defined( __parallel ) 3521 INTEGER(KIND=MPI_ADDRESS_KIND) :: disp !< displacement in RMA window 3522 INTEGER(KIND=rd_offset_kind) :: offset !< 3523 3524 INTEGER(iwp), DIMENSION(0:numprocs-1) :: lo_index !< 3525 INTEGER(iwp), DIMENSION(rd_status_size) :: status !< 3526 #endif 3527 3528 REAL(wp), INTENT(IN), DIMENSION(:), TARGET :: data !< 3529 #if defined( __parallel ) 3530 REAL(wp), DIMENSION(:), ALLOCATABLE :: get_buffer !< 3531 #endif 3532 3533 3578 3534 lo_first_index = 1 3579 3535 … … 3581 3537 lo_first_index = first_index 3582 3538 ENDIF 3539 3583 3540 ! 
3584 3541 !-- In case of 2d-data, name is written only once … … 3586 3543 3587 3544 IF ( header_array_index == max_nr_arrays ) THEN 3588 STOP '+++ maximum number of 2d/3d-array entries in restart file header exceeded' 3545 message_string = 'maximum number of 2d/3d-array entries in restart file header exceeded' 3546 CALL message( 'wrd_mpi_io_surface', 'PA0585', 1, 2, 0, 6, 0 ) 3589 3547 ENDIF 3590 3548 … … 3596 3554 3597 3555 #if defined( __parallel ) 3598 IF ( sm_io%is_sm_active() ) THEN 3599 DO i = 1, nr_val 3600 array_1d(i+local_start) = data(i) 3556 offset = 0 3557 3558 ALLOCATE( get_buffer(SUM( transfer_index(4,:) )) ) 3559 ! 3560 !-- Copy from input array (data) to RMA window to allow the target PEs to get the appropiate data. 3561 !-- At this point, a dummy surface element is added. This makes sure that every x-y grid cell owns 3562 !-- at least one surface element. This way, bookkeeping becomes much easier. 3563 lo_index = thread_values 3564 DO i = nxl, nxr 3565 DO j = nys, nyn 3566 is = lo_index(target_thread(j,i)) + 1 3567 ie = is + m_end_index(j,i) - m_start_index(j,i) 3568 ! 3569 !-- Store number of surface elements in dummy additional surface element 3570 array_1d(is-1) = e_end_index(j,i) - e_start_index(j,i) + 1 3571 array_1d(is:ie) = data(m_start_index(j,i):m_end_index(j,i)) 3572 lo_index(target_thread(j,i)) = lo_index(target_thread(j,i)) + & 3573 e_end_index(j,i) - e_start_index(j,i) + 1 3601 3574 ENDDO 3602 ELSE 3603 ! array_1d => data !kk Did not work in all cases why??? 3604 ALLOCATE( array_1d( SIZE( data ) ) ) 3605 array_1d = data 3606 ENDIF 3607 3608 CALL sm_io%sm_node_barrier() ! Has no effect if I/O on limited number of cores is inactive 3575 ENDDO 3576 ! 3577 !-- On target PE, get data from source PEs which are assigned for output on this PE. 3578 CALL MPI_WIN_FENCE( 0, win_surf, ierr ) 3579 3580 buf_start = 1 3581 DO n = 0, numprocs-1 3582 n_trans = transfer_index(4,n) 3583 IF ( n_trans > 0 ) THEN 3584 disp = transfer_index(3,n) - 1 3585 CALL MPI_GET( get_buffer(buf_start), n_trans, MPI_REAL, n, disp, n_trans, MPI_REAL, & 3586 win_surf, ierr ) 3587 buf_start = buf_start + n_trans 3588 ENDIF 3589 ENDDO 3590 3591 CALL MPI_WIN_FENCE( 0, win_surf, ierr ) 3592 ! 3593 !-- Copy data to output buffer. Here, the outpuf buffer matches the indices global_start and 3594 !-- global_end. 3595 ind_gb = 1 3596 DO i = 1, SIZE( local_indices, 2 ) 3597 ind_out = local_indices(1,i) 3598 DO j = 1, local_indices(2,i) 3599 array_out(ind_out) = get_buffer(ind_gb) 3600 ind_out = ind_out+1 3601 ind_gb = ind_gb+1 3602 ENDDO 3603 ENDDO 3604 3605 DEALLOCATE( get_buffer ) 3606 3607 ! 3608 !-- Write data to disk. 3609 CALL sm_io%sm_node_barrier() ! 
has no effect if I/O on limited number of cores is inactive 3609 3610 IF ( sm_io%iam_io_pe ) THEN 3610 IF ( all_pes_write ) THEN 3611 CALL MPI_FILE_SET_VIEW( fh, array_position, MPI_REAL, ft_surf, 'native', MPI_INFO_NULL, & 3612 ierr ) 3613 CALL MPI_FILE_WRITE_ALL( fh, array_1d, nr_iope, MPI_REAL, status, ierr ) 3614 ELSE 3615 CALL MPI_FILE_SET_VIEW( fh, offset, MPI_BYTE, MPI_BYTE, 'native', MPI_INFO_NULL, ierr ) 3616 IF ( nr_val > 0 ) THEN 3617 disp = array_position + 8 * ( glo_start - 1 ) 3618 CALL MPI_FILE_SEEK( fh, disp, MPI_SEEK_SET, ierr ) 3619 CALL MPI_FILE_WRITE( fh, array_1d, nr_iope, MPI_REAL, status, ierr ) 3620 ENDIF 3621 ENDIF 3611 CALL MPI_FILE_SET_VIEW( fh, array_position, MPI_REAL, ft_surf, 'native', MPI_INFO_NULL, & 3612 ierr ) 3613 CALL MPI_FILE_WRITE_ALL( fh, array_out, SIZE( array_out ), MPI_REAL, status, ierr ) 3622 3614 ENDIF 3623 3615 CALL sm_io%sm_node_barrier() 3624 IF( .NOT. sm_io%is_sm_active() ) DEALLOCATE( array_1d ) 3625 #else 3616 #else 3617 DO i = nxl, nxr 3618 DO j = nys, nyn 3619 array_out(e_start_index(j,i)) = e_end_index(j,i) - e_start_index(j,i) + 1 3620 array_out(e_start_index(j,i)+1:e_end_index(j,i)) = & 3621 data(m_start_index(j,i):m_end_index(j,i)) 3622 ENDDO 3623 ENDDO 3624 3626 3625 CALL posix_lseek( fh, array_position ) 3627 CALL posix_write( fh, data, nr_val ) 3628 #endif 3626 CALL posix_write( fh, array_out, SIZE(array_out) ) 3627 #endif 3628 3629 3629 array_position = array_position + total_number_of_surface_values * wp 3630 3630 3631 ! IF ( lo_first_index == 1 ) THEN3632 ! IF ( debug_level >= 2 .AND. nr_val > 0 ) WRITE(9,*) 'w_surf_1 ', TRIM( name ), ' ', nr_val, SUM( data(1:nr_val) )3633 ! ELSE3634 ! IF ( debug_level >= 2 .AND. nr_val > 0 ) WRITE(9,*) 'w_surf_n ', TRIM( name ), ' ', &3635 ! lo_first_index, nr_val, SUM( data(1:nr_val) )3636 ! ENDIF3637 3638 3631 END SUBROUTINE wrd_mpi_io_surface 3639 3640 3632 3641 3633 … … 3690 3682 IF ( wr_flag .AND. sm_io%iam_io_pe ) THEN 3691 3683 3692 tgh%nr_int = header_int_index - 1 3693 tgh%nr_char = header_char_index - 1 3694 tgh%nr_real = header_real_index - 1 3695 tgh%nr_arrays = header_array_index - 1 3696 tgh%total_nx = iog%nx + 1 3697 tgh%total_ny = iog%ny + 1 3684 tgh%nr_int = header_int_index - 1 3685 tgh%nr_char = header_char_index - 1 3686 tgh%nr_real = header_real_index - 1 3687 tgh%nr_arrays = header_array_index - 1 3688 tgh%total_nx = iog%nx + 1 3689 tgh%total_ny = iog%ny + 1 3690 tgh%pes_along_x = pdims(1) 3691 tgh%pes_along_y = pdims(2) 3698 3692 IF ( include_total_domain_boundaries ) THEN ! Not sure, if LOGICAL interpretation is the same for all compilers, 3699 3693 tgh%i_outer_bound = 1 ! therefore store as INTEGER in general header … … 3802 3796 !-- Close MPI-IO files 3803 3797 #if defined( __parallel ) 3804 !3805 !-- Restart file has been opened with comm2d3806 IF ( fhs /= -1 ) THEN3807 CALL MPI_FILE_CLOSE( fhs, ierr )3808 ENDIF3809 3798 ! 3810 3799 !-- Free RMA windows … … 3816 3805 #endif 3817 3806 3818 IF (.NOT. pe_active_for_read ) RETURN 3807 IF ( ALLOCATED( e_start_index ) ) DEALLOCATE( e_start_index ) 3808 IF ( ALLOCATED( e_end_index ) ) DEALLOCATE( e_end_index ) 3809 IF ( ALLOCATED( m_start_index ) ) DEALLOCATE( m_start_index ) 3810 IF ( ALLOCATED( m_end_index ) ) DEALLOCATE( m_end_index ) 3811 IF ( ALLOCATED( target_thread ) ) DEALLOCATE( target_thread ) 3812 IF ( ALLOCATED( thread_index ) ) DEALLOCATE( thread_index ) 3813 IF ( ALLOCATED( thread_values ) ) DEALLOCATE( thread_values ) 3814 IF ( ALLOCATED( transfer_index ) ) DEALLOCATE( transfer_index ) 3815 3816 IF ( .NOT. 
pe_active_for_read ) RETURN 3819 3817 ! 3820 3818 !-- TODO: better explain the following message … … 3860 3858 !> data is not time critical (data size is comparably small), it will be read by all cores. 3861 3859 !--------------------------------------------------------------------------------------------------! 3862 SUBROUTINE rd_mpi_io_surface_filetypes( start_index, end_index, data_to_write, global_start ) 3860 RECURSIVE SUBROUTINE rd_mpi_io_surface_filetypes( start_index, end_index, data_to_write, & 3861 global_start, global_end ) 3863 3862 3864 3863 IMPLICIT NONE 3865 3864 3866 INTEGER(iwp) :: i !< loop index 3867 INTEGER(iwp) :: j !< loop index 3868 INTEGER(KIND=rd_offset_kind) :: offset !< 3869 3870 INTEGER(iwp), DIMENSION(1) :: dims1 !< 3871 INTEGER(iwp), DIMENSION(1) :: lize1 !< 3872 INTEGER(iwp), DIMENSION(1) :: start1 !< 3873 3874 INTEGER(iwp), DIMENSION(0:numprocs-1) :: all_nr_val !< number of values for all PEs 3875 INTEGER(iwp), DIMENSION(0:numprocs-1) :: lo_nr_val !< local number of values in x and y direction 3876 3877 3878 INTEGER, INTENT(INOUT), DIMENSION(nys:nyn,nxl:nxr) :: end_index !< 3879 INTEGER, INTENT(OUT), DIMENSION(nys:nyn,nxl:nxr) :: global_start !< 3880 INTEGER, INTENT(INOUT), DIMENSION(nys:nyn,nxl:nxr) :: start_index !< 3881 3882 LOGICAL, INTENT(OUT) :: data_to_write !< returns, if surface data have to be written 3883 3884 ! 3885 !-- Actions during reading 3886 IF ( rd_flag ) THEN 3887 ! 3888 !-- Set start index and end index for the mainrun grid. 3889 !-- ATTENTION: This works only for horizontal surfaces with one vale per grid cell!!! 3890 IF ( cyclic_fill_mode ) THEN 3891 DO i = nxl, nxr 3892 DO j = nys, nyn 3893 start_index (j,i) = (i-nxl) * nny + j - nys + 1 3894 end_index (j,i) = start_index(j,i) 3895 ENDDO 3865 INTEGER(iwp) :: e_lo_start !< 3866 INTEGER(iwp) :: i !< loop index 3867 INTEGER(iwp) :: j !< loop index 3868 INTEGER(iwp) :: index_offset !< 3869 INTEGER(iwp) :: last_end_index !< 3870 INTEGER(iwp) :: lo_start !< 3871 INTEGER(iwp) :: nr_surfcells_pe !< 3872 INTEGER(iwp) :: rest_cells_pe !< 3873 INTEGER(iwp) :: rest_bound !< 3874 #if defined( __parallel ) 3875 INTEGER(iwp) :: io_end_index !< 3876 INTEGER(iwp) :: io_start_index !< 3877 INTEGER(iwp) :: n !< loop index 3878 INTEGER(iwp) :: nr_previous !< 3879 #endif 3880 3881 INTEGER(iwp), DIMENSION(0:numprocs-1,2) :: nr_surfcells_all_s !< 3882 INTEGER(iwp), DIMENSION(0:numprocs-1,2) :: nr_surfcells_all_r !< 3883 #if defined( __parallel ) 3884 INTEGER(iwp), DIMENSION(1) :: dims1 !< global dimension for MPI_TYPE_CREATE_SUBARRAY 3885 INTEGER(iwp), DIMENSION(1) :: lsize1 !< local size for MPI_TYPE_CREATE_SUBARRAY 3886 INTEGER(iwp), DIMENSION(0:numprocs-1) :: nr_cells_to_thread !< 3887 INTEGER(iwp), DIMENSION(0:pdims(1)) :: nr_surf_cells_x !< 3888 INTEGER(iwp), DIMENSION(0:pdims(1)) :: nr_surf_cells_x_s !< 3889 INTEGER(iwp), DIMENSION(0:numprocs-1) :: nr_values_to_thread !< 3890 INTEGER(iwp), DIMENSION(1) :: start1 !< start index for MPI_TYPE_CREATE_SUBARRAY 3891 INTEGER(iwp), DIMENSION(nxl:nxr) :: sum_y !< 3892 #endif 3893 3894 INTEGER(iwp), INTENT(INOUT), DIMENSION(nys:nyn,nxl:nxr) :: end_index !< local end indx 3895 INTEGER(iwp), INTENT(INOUT), DIMENSION(nys:nyn,nxl:nxr) :: global_start !< global start index 3896 INTEGER(iwp), INTENT(INOUT), DIMENSION(nys:nyn,nxl:nxr) :: global_end !< global end index 3897 INTEGER(iwp), INTENT(INOUT), DIMENSION(nys:nyn,nxl:nxr) :: start_index !< local start index 3898 #if defined( __parallel ) 3899 INTEGER(iwp), DIMENSION(0:myidy,nxl:nxr) :: nr_previous_y !< 3900 
INTEGER(iwp), DIMENSION(0:pdims(2),nxl:nxr) :: nr_surf_cells_y !< 3901 INTEGER(iwp), DIMENSION(0:pdims(2),nxl:nxr) :: nr_surf_cells_y_s !< 3902 INTEGER(iwp), DIMENSION(4,0:numprocs-1) :: transfer_index_s !< 3903 #endif 3904 3905 LOGICAL, INTENT(OUT) :: data_to_write !< returns .TRUE., if surface data have been written 3906 LOGICAL :: only_dummy_values !< only dummy values, i.e. no data to write 3907 3908 3909 ! 3910 !-- Clean up previous calls. 3911 #if defined( __parallel ) 3912 IF ( win_surf /= -1 ) THEN 3913 CALL MPI_WIN_FREE( win_surf, ierr ) 3914 DEALLOCATE( array_1d ) 3915 win_surf = -1 3916 ENDIF 3917 IF ( ft_surf /= -1 .AND. sm_io%iam_io_pe ) THEN 3918 CALL MPI_TYPE_FREE( ft_surf, ierr ) 3919 ENDIF 3920 ft_surf = -1 3921 IF ( sm_io%is_sm_active() ) THEN 3922 IF ( win_out /= -1 ) THEN 3923 CALL MPI_WIN_FREE( win_out, ierr ) 3924 win_out = -1 3925 ENDIF 3926 ELSE 3927 IF ( ASSOCIATED( array_out ) ) DEALLOCATE( array_out ) 3928 ENDIF 3929 #else 3930 IF ( ASSOCIATED( array_out ) ) DEALLOCATE( array_out ) 3931 #endif 3932 3933 IF ( cyclic_fill_mode ) THEN 3934 CALL cyclic_fill_surface_filetype 3935 RETURN 3936 ELSE 3937 IF ( .NOT. ALLOCATED( e_end_index ) ) ALLOCATE( e_end_index(nys:nyn,nxl:nxr) ) 3938 IF ( .NOT. ALLOCATED( e_start_index ) ) ALLOCATE( e_start_index(nys:nyn,nxl:nxr) ) 3939 IF ( .NOT. ALLOCATED( m_end_index ) ) ALLOCATE( m_end_index(nys:nyn,nxl:nxr) ) 3940 IF ( .NOT. ALLOCATED( m_start_index ) ) ALLOCATE( m_start_index(nys:nyn,nxl:nxr) ) 3941 IF ( .NOT. ALLOCATED( target_thread ) ) ALLOCATE( target_thread(nys:nyn,nxl:nxr) ) 3942 IF ( .NOT. ALLOCATED( thread_index ) ) ALLOCATE( thread_index(0:numprocs-1) ) 3943 IF ( .NOT. ALLOCATED( thread_values ) ) ALLOCATE( thread_values(0:numprocs-1) ) 3944 IF ( .NOT. ALLOCATED( transfer_index ) ) ALLOCATE( transfer_index(4,0:numprocs-1) ) 3945 ENDIF 3946 3947 IF ( wr_flag) THEN 3948 ! 3949 !-- Add one dummy value at every grid box. 3950 !-- This allows to use MPI_FILE_WRITE_ALL and MPI_FILE_READ_ALL with subarray file type. 3951 index_offset = 0 3952 last_end_index = 0 3953 DO i = nxl, nxr 3954 DO j = nys, nyn 3955 e_start_index(j,i) = start_index (j,i) + index_offset 3956 IF ( end_index (j,i) - start_index(j,i) < 0 ) THEN 3957 e_end_index (j,i) = last_end_index+1 3958 last_end_index = last_end_index+1 3959 ELSE 3960 e_end_index (j,i) = end_index(j,i) + index_offset + 1 3961 last_end_index = e_end_index (j,i) 3962 ENDIF 3963 index_offset = index_offset + 1 3964 ENDDO 3965 ENDDO 3966 #if defined( __parallel ) 3967 ! 3968 !-- Compute indices for global, PE independent 1-d surface element array. 3969 nr_surf_cells_y_s = 0 3970 ! 3971 !-- Count number of surface elements in y-direction for every x. 3972 DO i = nxl, nxr 3973 nr_surf_cells_y_s(myidy,i) = SUM( e_end_index (:,i) - e_start_index (:,i) + 1 ) 3974 ENDDO 3975 ! 3976 !-- Distribute these values to all PEs along y. 3977 CALL MPI_ALLREDUCE( nr_surf_cells_y_s, nr_surf_cells_y, SIZE( nr_surf_cells_y ), & 3978 MPI_INTEGER, MPI_SUM, comm1dy, ierr ) 3979 ! 3980 !-- Sum all surface elements along y for individual x PEs 3981 nr_surf_cells_x_s = 0 3982 nr_surf_cells_x_s(myidx) = SUM( nr_surf_cells_y ) 3983 ! 3984 !-- Distribute to all PEs along x. 3985 CALL MPI_ALLREDUCE( nr_surf_cells_x_s, nr_surf_cells_x, SIZE( nr_surf_cells_x ), & 3986 MPI_INTEGER, MPI_SUM, comm1dx, ierr ) 3987 DO i = nxl, nxr 3988 nr_previous_y(:,i) = 0 3989 DO n = 1, myidy 3990 nr_previous_y(n,i) = nr_previous_y(n-1,i) + nr_surf_cells_y(n-1,i) 3896 3991 ENDDO 3897 ENDIF 3898 3899 IF ( .NOT. 
ALLOCATED( m_start_index ) ) ALLOCATE( m_start_index(nys:nyn,nxl:nxr) ) 3900 IF ( .NOT. ALLOCATED( m_end_index ) ) ALLOCATE( m_end_index(nys:nyn,nxl:nxr) ) 3901 IF ( .NOT. ALLOCATED( m_global_start ) ) ALLOCATE( m_global_start(nys:nyn,nxl:nxr) ) 3902 ! 3903 !-- Save arrays for later reading 3904 m_start_index = start_index 3905 m_end_index = end_index 3906 m_global_start = global_start 3907 nr_val = MAXVAL( end_index ) 3908 3909 ENDIF 3910 3911 IF ( .NOT. pe_active_for_read ) RETURN 3912 3913 IF ( cyclic_fill_mode ) CALL prerun_grid%activate_grid_from_this_class() 3914 3915 offset = 0 3916 lo_nr_val= 0 3917 lo_nr_val(myid) = MAXVAL( end_index ) 3918 #if defined( __parallel ) 3919 CALL MPI_ALLREDUCE( lo_nr_val, all_nr_val, numprocs, MPI_INTEGER, MPI_SUM, comm2d, ierr ) 3920 IF ( ft_surf /= -1 .AND. sm_io%iam_io_pe ) THEN 3921 CALL MPI_TYPE_FREE( ft_surf, ierr ) ! If set, free last surface filetype 3922 ENDIF 3923 3924 IF ( win_surf /= -1 ) THEN 3925 IF ( sm_io%is_sm_active() ) THEN 3926 CALL MPI_WIN_FREE( win_surf, ierr ) 3927 ENDIF 3928 win_surf = -1 3929 ENDIF 3930 3931 IF ( sm_io%is_sm_active() .AND. rd_flag ) THEN 3932 IF ( fhs == -1 ) THEN 3933 CALL MPI_FILE_OPEN( comm2d, TRIM( io_file_name ), MPI_MODE_RDONLY, MPI_INFO_NULL, fhs, & 3934 ierr ) 3935 ENDIF 3992 ENDDO 3993 3994 sum_y(nxl) = SUM( nr_surf_cells_y(:,nxl) ) 3995 DO i = nxl, nxr 3996 IF ( i > nxl ) THEN 3997 sum_y(i) = sum_y(i-1) + SUM( nr_surf_cells_y(:,i) ) 3998 ENDIF 3999 ENDDO 4000 4001 nr_previous = 0 4002 IF ( myidx >= 1 ) THEN 4003 nr_previous = SUM(nr_surf_cells_x(0:myidx-1)) 4004 ENDIF 4005 4006 global_start(nys,nxl) = 1 + nr_previous + nr_previous_y(myidy,nxl) 4007 DO j = nys+1, nyn 4008 global_start(j,nxl) = global_start(j-1,nxl) + e_end_index(j-1,nxl) - & 4009 e_start_index(j-1,nxl) + 1 4010 ENDDO 4011 4012 DO i = nxl+1, nxr 4013 global_start(nys,i) = 1 + nr_previous + nr_previous_y(myidy,i) + sum_y(i-1) 4014 DO j = nys+1, nyn 4015 global_start(j,i) = global_start(j-1,i) + e_end_index(j-1,i) - e_start_index(j-1,i) + 1 4016 ENDDO 4017 ENDDO 4018 #else 4019 global_start = e_start_index 4020 #endif 4021 DO i = nxl, nxr 4022 DO j = nys, nyn 4023 global_end(j,i) = global_start(j,i) + e_end_index (j,i) - e_start_index (j,i) 4024 ENDDO 4025 ENDDO 4026 3936 4027 ELSE 3937 fhs = fh 3938 ENDIF 3939 #else 3940 all_nr_val(myid) = lo_nr_val(myid) 3941 #endif 3942 nr_val = lo_nr_val(myid) 4028 ! 4029 !-- In case of read, compute e_start_index and e_end_index for current processor grid. 4030 !-- This data contains one extra value for every i and j. 4031 e_lo_start = 1 4032 lo_start = 1 4033 DO i = nxl, nxr 4034 DO j = nys, nyn 4035 e_start_index(j,i) = e_lo_start 4036 e_end_index(j,i) = e_lo_start + global_end(j,i) - global_start(j,i) 4037 e_lo_start = e_lo_start + global_end(j,i) - global_start(j,i) + 1 4038 start_index(j,i) = lo_start 4039 end_index(j,i) = lo_start + global_end(j,i) - global_start(j,i) - 1 4040 lo_start = lo_start + global_end(j,i) - global_start(j,i) 4041 ENDDO 4042 ENDDO 4043 4044 ENDIF 4045 4046 nr_surfcells_all_s = 0 4047 nr_surfcells_all_s(myid,1) = MAXVAL( e_end_index ) ! 
don't split surface elements of one gridbox 4048 nr_surfcells_all_s(myid,2) = MAXVAL( e_end_index - e_start_index ) 4049 4050 #if defined( __parallel ) 4051 CALL MPI_ALLREDUCE( nr_surfcells_all_s, nr_surfcells_all_r, SIZE( nr_surfcells_all_s ), & 4052 MPI_INTEGER, MPI_SUM, comm2d, ierr ) 4053 #else 4054 nr_surfcells_all_r = nr_surfcells_all_s 4055 #endif 3943 4056 3944 4057 total_number_of_surface_values = 0 … … 3947 4060 glo_start = total_number_of_surface_values + 1 3948 4061 ENDIF 3949 total_number_of_surface_values = total_number_of_surface_values + all_nr_val(i)4062 total_number_of_surface_values = total_number_of_surface_values + nr_surfcells_all_r(i,1) 3950 4063 ENDDO 3951 3952 ! 3953 !-- Actions during reading 3954 IF ( rd_flag ) THEN 3955 3956 #if defined( __parallel ) 3957 CALL MPI_FILE_SET_VIEW( fhs, offset, MPI_BYTE, MPI_BYTE, 'native', MPI_INFO_NULL, ierr ) 3958 #endif 3959 ENDIF 3960 3961 IF ( cyclic_fill_mode ) CALL mainrun_grid%activate_grid_from_this_class() 3962 3963 ! 3964 !-- Actions during writing 3965 IF ( wr_flag ) THEN 3966 ! 3967 !-- Create surface filetype 3968 ft_surf = -1 3969 global_start = start_index + glo_start - 1 3970 3971 WHERE ( end_index < start_index ) 3972 global_start = -1 3973 ENDWHERE 3974 3975 #if defined( __parallel ) 3976 IF ( sm_io%is_sm_active() ) THEN 3977 IF ( sm_io%iam_io_pe ) THEN 3978 ! 3979 !-- Calculate number of values of all PEs of an I/O group 3980 nr_iope = 0 3981 DO i = myid, myid+sm_io%sh_npes-1 3982 nr_iope = nr_iope + all_nr_val(i) 4064 only_dummy_values = ( MAXVAL( nr_surfcells_all_r(:,2) ) <= 0 ) 4065 4066 ! 4067 !-- Compute indices of equally distributed surface elements. 4068 !-- Number of surface values scheduled for ouput on this PE: 4069 nr_surfcells_pe = total_number_of_surface_values / numprocs 4070 rest_cells_pe = MOD( total_number_of_surface_values, numprocs ) 4071 rest_bound = rest_cells_pe * ( nr_surfcells_pe + 1 ) 4072 m_start_index = start_index 4073 m_end_index = end_index 4074 4075 ! 4076 !-- Compute number of elements on source PE, which have to be send to the corresponding target PE. 4077 #if defined( __parallel ) 4078 nr_cells_to_thread = 0 4079 nr_values_to_thread = 0 4080 DO i = nxl, nxr 4081 DO j = nys, nyn 4082 IF ( rest_cells_pe == 0 ) THEN 4083 target_thread(j,i) = ( global_start(j,i) - 1 ) / nr_surfcells_pe 4084 ELSE 4085 IF ( global_start(j,i) <= rest_bound ) THEN 4086 target_thread(j,i) = ( global_start(j,i) - 1 ) / ( nr_surfcells_pe + 1 ) 4087 ELSE 4088 target_thread(j,i) = ( global_start(j,i) - rest_bound - 1 ) / nr_surfcells_pe 4089 target_thread(j,i) = target_thread(j,i) + rest_cells_pe 4090 ENDIF 4091 ! 4092 !-- TODO: Test output, to be removed later. 4093 IF ( target_thread(j,i) >= numprocs ) THEN 4094 WRITE( 9,'(A,8I8)' ) 'target_thread ', j, i, target_thread(j,i), & 4095 global_start(j,i) , nr_surfcells_pe 4096 FLUSH( 9 ) 4097 CALL MPI_ABORT( comm2d, 1, ierr ) 4098 ENDIF 4099 ENDIF 4100 nr_cells_to_thread(target_thread(j,i)) = nr_cells_to_thread(target_thread(j,i)) + 1 4101 nr_values_to_thread(target_thread(j,i)) = nr_values_to_thread(target_thread(j,i)) + & 4102 e_end_index(j,i) - e_start_index(j,i) + 1 4103 ENDDO 4104 ENDDO 4105 4106 ! 4107 !-- Compute start index in the transfer buffer on the source side for the corresponding target PE. 4108 thread_index(0) = 1 4109 thread_values(0) = 1 4110 DO n = 1, numprocs-1 4111 thread_index(n) = thread_index(n-1) + nr_cells_to_thread(n-1) 4112 thread_values(n) = thread_values(n-1) + nr_values_to_thread(n-1) 4113 ENDDO 4114 ! 
4115 !-- Buffer distribution on the source side. 4116 DO n = 0, numprocs-1 4117 transfer_index_s(1,n) = thread_index(n) 4118 transfer_index_s(2,n) = nr_cells_to_thread(n) 4119 transfer_index_s(3,n) = thread_values(n) 4120 transfer_index_s(4,n) = nr_values_to_thread(n) 4121 ENDDO 4122 4123 CALL MPI_ALLTOALL( transfer_index_s, 4, MPI_INTEGER, transfer_index, 4, MPI_INTEGER, comm2d, & 4124 ierr) 4125 ! 4126 !-- Buffer distribution on the target side side. 4127 CALL get_remote_indices() 4128 ! 4129 !-- Create surface element file type. 4130 IF ( total_number_of_surface_values > 0 .AND. .NOT. only_dummy_values) THEN 4131 data_to_write = .TRUE. 4132 ELSE 4133 data_to_write = .FALSE. 4134 ENDIF 4135 4136 CALL MPI_ALLREDUCE( global_end(nyn,nxr), dims1(1), 1, MPI_INTEGER, MPI_MAX, comm2d, ierr ) 4137 start1(1) = MINVAL( local_indices(1,:) ) - 1 4138 IF ( sm_io%is_sm_active() ) THEN 4139 CALL MPI_ALLREDUCE( SUM( local_indices(2,:) ), lsize1(1), 1, MPI_INTEGER, MPI_SUM, & 4140 sm_io%comm_shared, ierr ) 4141 ELSE 4142 lsize1(1) = SUM( local_indices(2,:) ) 4143 ENDIF 4144 4145 IF ( sm_io%iam_io_pe ) THEN 4146 IF ( total_number_of_surface_values > 0 ) THEN 4147 CALL MPI_TYPE_CREATE_SUBARRAY( 1, dims1, lsize1, start1, MPI_ORDER_FORTRAN, MPI_REAL, & 4148 ft_surf, ierr ) 4149 CALL MPI_TYPE_COMMIT( ft_surf, ierr ) 4150 ENDIF 4151 ENDIF 4152 ! 4153 !-- Allocate rma window to supply surface data to other PEs. 4154 CALL rd_alloc_rma_mem( array_1d, SUM( nr_values_to_thread ), win_surf ) 4155 ! 4156 !-- Allocate shared array on IO-PE to supply data for MPI-IO (write or read). 4157 IF ( sm_io%is_sm_active() ) THEN 4158 IF ( sm_io%iam_io_pe ) THEN 4159 io_start_index = start1(1) + 1 4160 io_end_index = start1(1) + lsize1(1) 4161 ENDIF 4162 CALL MPI_BCAST( io_start_index, 1, MPI_INTEGER, 0, sm_io%comm_shared, ierr ) 4163 CALL MPI_BCAST( io_end_index, 1, MPI_INTEGER, 0, sm_io%comm_shared, ierr ) 4164 CALL sm_io%sm_allocate_shared( array_out, io_start_index, io_end_index, win_out ) 4165 ELSE 4166 ALLOCATE( array_out(start1(1)+1:start1(1)+lsize1(1)) ) 4167 ENDIF 4168 #else 4169 IF ( total_number_of_surface_values > 0 .AND. .NOT. only_dummy_values ) THEN 4170 data_to_write = .TRUE. 4171 ELSE 4172 data_to_write = .FALSE. 4173 ENDIF 4174 ALLOCATE( array_out(1:total_number_of_surface_values) ) 4175 #endif 4176 4177 CONTAINS 4178 4179 SUBROUTINE cyclic_fill_surface_filetype 4180 4181 INTEGER(iwp) :: i !< loop index 4182 INTEGER(iwp) :: j !< loop index 4183 4184 4185 IF ( .NOT. ALLOCATED( o_start_index ) ) ALLOCATE( o_start_index(nys:nyn,nxl:nxr) ) 4186 IF ( .NOT. ALLOCATED( o_end_index ) ) ALLOCATE( o_end_index(nys:nyn,nxl:nxr) ) 4187 4188 lo_start = 1 4189 DO i = nxl, nxr 4190 DO j = nys, nyn 4191 o_start_index(j,i) = lo_start 4192 o_end_index(j,i) = lo_start 4193 lo_start = lo_start + 1 4194 ENDDO 4195 ENDDO 4196 start_index = o_start_index 4197 end_index = o_end_index 4198 4199 IF ( MAXVAL( global_end-global_start ) > 1 ) THEN 4200 message_string = 'cylic-fill method does not allow more than one surface element ' // & 4201 'per grid box' 4202 CALL message( 'cyclic_fill_surface_filetype', 'PA0742', 1, 2, 0, 6, 0 ) 4203 ENDIF 4204 ! 4205 !-- Activate grid of the smaller prerun, i.e. grid variables like nxl, nxr, nys, nyn and others 4206 !-- are set according to the prerun layout. 4207 CALL prerun_grid%activate_grid_from_this_class() 4208 4209 IF ( pe_active_for_read ) THEN 4210 4211 IF ( .NOT. ALLOCATED( c_global_start ) ) ALLOCATE( c_global_start(nys:nyn,nxl:nxr) ) 4212 IF ( .NOT. 
ALLOCATED( c_global_end ) ) ALLOCATE( c_global_end(nys:nyn,nxl:nxr) ) 4213 IF ( .NOT. ALLOCATED( c_start_index ) ) ALLOCATE( c_start_index(nys:nyn,nxl:nxr) ) 4214 IF ( .NOT. ALLOCATED( c_end_index ) ) ALLOCATE( c_end_index(nys:nyn,nxl:nxr) ) 4215 4216 DO i = nxl, nxr 4217 DO j = nys, nyn 4218 c_global_start(j,i) = global_start(j,i) 4219 c_global_end(j,i) = global_end(j,i) 3983 4220 ENDDO 3984 ELSE 3985 local_start = 0 3986 DO i = myid-sm_io%sh_rank, myid-1 3987 local_start = local_start + all_nr_val(i) 3988 ENDDO 4221 ENDDO 4222 ! 4223 !-- Recursive call of rd_mpi_io_surface_filetypes. 4224 !-- Prerun data are read, but they are treated as if they are mainrun data, just on a smaller 4225 !-- grid. 4226 cyclic_fill_mode = .FALSE. 4227 CALL rd_mpi_io_surface_filetypes( c_start_index, c_end_index, data_to_write, & 4228 c_global_start, c_global_end ) 4229 cyclic_fill_mode = .TRUE. 4230 4231 ENDIF 4232 ! 4233 !-- Activate grid of the mainrun, i.e. grid variables like nxl, nxr, nys, nyn and others 4234 !-- are set according to the mainrun layout. 4235 CALL mainrun_grid%activate_grid_from_this_class() 4236 4237 #if defined( __parallel ) 4238 CALL MPI_BCAST( data_to_write, 1, MPI_LOGICAL, 0, comm2d, ierr ) 4239 #endif 4240 4241 END SUBROUTINE cyclic_fill_surface_filetype 4242 4243 #if defined( __parallel ) 4244 ! 4245 !-- Get the indices of the surface elements inside the RMA window on the remote PE. 4246 !-- This information is required to fetch the surface element data on remote PEs 4247 !-- in rrd_mpi_io_surface and wrd_mpi_io_surface. 4248 SUBROUTINE get_remote_indices 4249 4250 INTEGER(iwp) :: buf_start !< 4251 INTEGER(iwp) :: i !< 4252 INTEGER(iwp) :: j !< 4253 INTEGER(iwp) :: n !< 4254 INTEGER(iwp) :: n_trans !< 4255 INTEGER(iwp) :: win_ind !< 4256 4257 INTEGER(KIND=MPI_ADDRESS_KIND) :: disp !< displacement in RMA window 4258 INTEGER(KIND=MPI_ADDRESS_KIND) :: winsize !< size of RMA window 4259 4260 INTEGER(iwp), DIMENSION(0:numprocs-1) :: lo_index !< 4261 4262 INTEGER(iwp), POINTER, DIMENSION(:,:) :: surf_val_index !< 4263 4264 4265 IF ( ALLOCATED( local_indices ) ) DEALLOCATE( local_indices ) 4266 ALLOCATE( local_indices(2,MAX( SUM( transfer_index(2,:) ), 2 ))) 4267 4268 local_indices(1,:) = 0 4269 local_indices(2,:) = 0 4270 4271 winsize = MAX( 2 * SUM( nr_cells_to_thread ), 2 ) 4272 4273 ALLOCATE( surf_val_index(2,winsize) ) 4274 winsize = winsize * iwp 4275 CALL MPI_WIN_CREATE( surf_val_index, winsize, iwp, MPI_INFO_NULL, comm2d, win_ind, ierr ) 4276 CALL MPI_WIN_FENCE( 0, win_ind, ierr ) 4277 4278 lo_index = thread_index 4279 DO i = nxl, nxr 4280 DO j = nys, nyn 4281 surf_val_index(1,lo_index(target_thread(j,i))) = global_start(j,i) 4282 surf_val_index(2,lo_index(target_thread(j,i))) = global_end(j,i) - global_start(j,i) & 4283 + 1 4284 lo_index(target_thread(j,i)) = lo_index(target_thread(j,i)) + 1 4285 ENDDO 4286 ENDDO 4287 4288 CALL MPI_WIN_FENCE( 0, win_ind, ierr ) 4289 4290 buf_start = 1 4291 DO n = 0, numprocs-1 4292 n_trans = transfer_index(2,n) 4293 IF ( n_trans > 0 ) THEN 4294 disp = 2 * ( transfer_index(1,n) - 1 ) 4295 CALL MPI_GET( local_indices(1,buf_start), 2*n_trans, MPI_INTEGER, n, disp, 2*n_trans, & 4296 MPI_INTEGER, win_ind, ierr ) 4297 buf_start = buf_start + n_trans 3989 4298 ENDIF 3990 ! 
3991 !-- Get the size of shared memory window on all PEs 3992 CALL MPI_BCAST( nr_iope, 1, MPI_INTEGER, 0, sm_io%comm_shared, ierr ) 3993 CALL sm_io%sm_allocate_shared( array_1d, 1, MAX( 1, nr_iope ), win_surf ) 3994 ELSE 3995 nr_iope = nr_val 3996 ENDIF 3997 #else 3998 nr_iope = nr_val 3999 #endif 4000 4001 ! 4002 !-- Check, if surface data exist on this PE 4003 data_to_write = .TRUE. 4004 IF ( total_number_of_surface_values == 0 ) THEN 4005 data_to_write = .FALSE. 4006 RETURN 4007 ENDIF 4008 4009 IF ( sm_io%iam_io_pe ) THEN 4010 4011 all_pes_write = ( MINVAL( all_nr_val ) > 0 ) 4012 4013 IF ( all_pes_write ) THEN 4014 dims1(1) = total_number_of_surface_values 4015 lize1(1) = nr_iope 4016 start1(1) = glo_start-1 4017 4018 #if defined( __parallel ) 4019 IF ( total_number_of_surface_values > 0 ) THEN 4020 CALL MPI_TYPE_CREATE_SUBARRAY( 1, dims1, lize1, start1, MPI_ORDER_FORTRAN, & 4021 MPI_REAL, ft_surf, ierr ) 4022 CALL MPI_TYPE_COMMIT( ft_surf, ierr ) 4023 ENDIF 4024 #endif 4299 ENDDO 4300 4301 CALL MPI_WIN_FENCE( 0, win_ind, ierr ) 4302 4303 buf_start = 1 4304 DO n = 0, numprocs-1 4305 n_trans = transfer_index(2,n) 4306 IF ( n_trans > 0 ) THEN 4307 disp = transfer_index(1,n) - 1 4308 buf_start = buf_start + n_trans 4025 4309 ENDIF 4026 4310 ENDDO 4027 4311 4028 ENDIF 4312 CALL MPI_WIN_FREE( win_ind, ierr ) 4313 4314 DEALLOCATE( surf_val_index ) 4315 4316 END SUBROUTINE get_remote_indices 4317 4318 !--------------------------------------------------------------------------------------------------! 4319 ! Description: 4320 ! ------------ 4321 !> Allocate memory and create window for one-sided communication (1-d REAL array) 4322 !--------------------------------------------------------------------------------------------------! 4323 SUBROUTINE rd_alloc_rma_mem( array, idim, win ) 4324 4325 IMPLICIT NONE 4326 4327 INTEGER(iwp), INTENT(IN) :: idim !< Dimension of this 1-D array 4328 INTEGER :: ierr !< MPI error code 4329 INTEGER(iwp), INTENT(OUT) :: win !< MPI window 4330 INTEGER(KIND=MPI_ADDRESS_KIND) :: winsize !< size of RMA window 4331 4332 REAL(wp), DIMENSION(:), POINTER, INTENT(INOUT) :: array !< array to access RMA window locally 4333 4334 4335 winsize = MAX( idim, 2 ) 4336 ALLOCATE( array(winsize) ) 4337 winsize = winsize * wp 4338 CALL MPI_WIN_CREATE( array, winsize, wp, MPI_INFO_NULL, comm2d, win, ierr ) 4339 array = -1 4340 CALL MPI_WIN_FENCE( 0, win, ierr ) 4341 4342 END SUBROUTINE rd_alloc_rma_mem 4343 #endif 4029 4344 4030 4345 END SUBROUTINE rd_mpi_io_surface_filetypes 4031 4032 4346 4033 4347 … … 4079 4393 iog%nnx = iog%nnx + nbgp 4080 4394 ENDIF 4081 IF ( myidx == npex-1 .OR. npex == -1 ) THEN ! npex == 1 if -D__parallel not set4395 IF ( myidx == pdims(1)-1 ) THEN 4082 4396 iog%nxr = iog%nxr + nbgp 4083 4397 iog%nnx = iog%nnx + nbgp … … 4087 4401 iog%nny = iog%nny + nbgp 4088 4402 ENDIF 4089 IF ( myidy == npey-1 .OR. npey == -1 ) THEN ! npey == 1 if -D__parallel not set4403 IF ( myidy == pdims(2)-1 ) THEN 4090 4404 iog%nyn = iog%nyn + nbgp 4091 4405 iog%nny = iog%nny + nbgp … … 4251 4565 iog%nnx = iog%nnx + nbgp 4252 4566 ENDIF 4253 IF ( myidx == npex-1 .OR. npex == -1 ) THEN ! npex == 1 if -D__parallel not set4567 IF ( myidx == pdims(1)-1 ) THEN 4254 4568 iog%nxr = iog%nxr + nbgp 4255 4569 iog%nnx = iog%nnx + nbgp … … 4259 4573 iog%nny = iog%nny + nbgp 4260 4574 ENDIF 4261 IF ( myidy == npey-1 .OR. npey == -1 ) THEN ! 
npey == 1 if -D__parallel not set4575 IF ( myidy == pdims(2)-1 ) THEN 4262 4576 iog%nyn = iog%nyn + nbgp 4263 4577 iog%nny = iog%nny + nbgp … … 4326 4640 !> to a single file that contains the global arrays. It is not required for the serial mode. 4327 4641 !--------------------------------------------------------------------------------------------------! 4328 #if defined( __parallel )4329 4642 SUBROUTINE rd_mpi_io_create_filetypes_3dsoil( nzb_soil, nzt_soil ) 4330 4643 … … 4334 4647 INTEGER, INTENT(IN) :: nzt_soil !< 4335 4648 4649 #if defined( __parallel ) 4336 4650 INTEGER, DIMENSION(3) :: dims3 !< 4337 4651 INTEGER, DIMENSION(3) :: lize3 !< … … 4367 4681 CALL MPI_TYPE_COMMIT( ft_3dsoil, ierr ) 4368 4682 ENDIF 4683 #else 4684 ALLOCATE( array_3d_soil(nzb_soil:nzt_soil,iog%nxl:iog%nxr,iog%nys:iog%nyn) ) 4685 sm_io%io_grid = iog 4686 #endif 4369 4687 4370 4688 END SUBROUTINE rd_mpi_io_create_filetypes_3dsoil 4371 #endif4372 4373 4374 4689 4375 4690 !--------------------------------------------------------------------------------------------------! … … 4381 4696 4382 4697 IMPLICIT NONE 4383 4384 4698 4385 4699 #if defined( __parallel ) … … 4401 4715 4402 4716 ENDIF 4717 4403 4718 ! 4404 4719 !-- Free last surface filetype … … 4415 4730 IF ( sm_io%iam_io_pe .AND. ft_3di4 /= -1 ) THEN 4416 4731 CALL MPI_TYPE_FREE( ft_3di4, ierr ) 4732 ft_3di4 = -1 4733 ENDIF 4734 IF ( sm_io%iam_io_pe .AND. ft_3di8 /= -1 ) THEN 4417 4735 CALL MPI_TYPE_FREE( ft_3di8, ierr ) 4736 ft_3di8 = -1 4418 4737 ENDIF 4419 4738 4420 4739 IF ( sm_io%is_sm_active() .AND. win_3di4 /= -1 ) THEN 4421 4740 CALL sm_io%sm_free_shared( win_3di4 ) 4741 win_3di4 = -1 4742 ENDIF 4743 IF ( sm_io%is_sm_active() .AND. win_3di8 /= -1 ) THEN 4422 4744 CALL sm_io%sm_free_shared( win_3di8 ) 4745 win_3di8 = -1 4746 ENDIF 4747 4748 IF ( win_start /= -1 ) THEN 4749 CALL sm_io%sm_free_shared( win_start) 4750 CALL sm_io%sm_free_shared( win_end) 4751 CALL sm_io%sm_free_shared( win_glost) 4752 win_start = -1 4753 win_end = -1 4754 win_glost = -1 4423 4755 ENDIF 4424 4756 … … 4426 4758 win_surf = -1 4427 4759 #else 4428 IF ( ASSOCIATED( array_2d) ) DEALLOCATE( array_2d )4429 IF ( ASSOCIATED( array_2di) ) DEALLOCATE( array_2di )4430 IF ( ASSOCIATED( array_3d) ) DEALLOCATE( array_3d )4431 IF ( ASSOCIATED( array_3di4) ) DEALLOCATE( array_3di4 )4432 IF ( ASSOCIATED( array_3di8) ) DEALLOCATE( array_3di8 )4760 IF ( ASSOCIATED( array_2d ) ) DEALLOCATE( array_2d ) 4761 IF ( ASSOCIATED( array_2di ) ) DEALLOCATE( array_2di ) 4762 IF ( ASSOCIATED( array_3d ) ) DEALLOCATE( array_3d ) 4763 IF ( ASSOCIATED( array_3di4 ) ) DEALLOCATE( array_3di4 ) 4764 IF ( ASSOCIATED( array_3di8 ) ) DEALLOCATE( array_3di8 ) 4433 4765 #endif 4434 4766
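
The central change of this revision is that the globally numbered surface values are redistributed evenly over all PEs before the collective MPI-IO write, using only integer arithmetic on the global start index. The following standalone sketch is not part of the changeset; it reproduces the target-PE computation used in rd_mpi_io_surface_filetypes above, with made-up values for numprocs and total_number_of_surface_values, so the mapping can be inspected in isolation.

   PROGRAM distribution_sketch
   !
   !-- Illustrative only: maps global surface-value indices onto target PEs in the same way as
   !-- the mainrun branch of rd_mpi_io_surface_filetypes. The two PARAMETER values are test
   !-- assumptions, not taken from the module.
      IMPLICIT NONE

      INTEGER, PARAMETER :: numprocs = 7                          !< assumed number of PEs
      INTEGER, PARAMETER :: total_number_of_surface_values = 23   !< assumed global value count

      INTEGER :: gs               !< global start index of a surface element (1-based)
      INTEGER :: nr_surfcells_pe  !< minimum number of values per PE
      INTEGER :: rest_cells_pe    !< number of PEs that receive one extra value
      INTEGER :: rest_bound       !< last global index handled by the "larger" PEs
      INTEGER :: target_thread    !< PE that will write this value

      nr_surfcells_pe = total_number_of_surface_values / numprocs
      rest_cells_pe   = MOD( total_number_of_surface_values, numprocs )
      rest_bound      = rest_cells_pe * ( nr_surfcells_pe + 1 )

      DO gs = 1, total_number_of_surface_values
         IF ( rest_cells_pe == 0 )  THEN
            target_thread = ( gs - 1 ) / nr_surfcells_pe
         ELSEIF ( gs <= rest_bound )  THEN
            target_thread = ( gs - 1 ) / ( nr_surfcells_pe + 1 )
         ELSE
            target_thread = rest_cells_pe + ( gs - rest_bound - 1 ) / nr_surfcells_pe
         ENDIF
         WRITE( *, '(A,I4,A,I3)' )  'global index ', gs, '  -> PE ', target_thread
      ENDDO

   END PROGRAM distribution_sketch

With 23 values on 7 PEs, the first two PEs receive 4 values each and the remaining five PEs receive 3 each, so no PE differs by more than one value from any other.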
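Both get_remote_indices and rd_alloc_rma_mem rely on MPI active-target one-sided communication: a window is created over a local buffer, a fence opens the exposure epoch, MPI_GET fetches remote data, and a closing fence guarantees completion. The minimal, self-contained sketch below shows only that pattern; it is not part of the changeset, and the rank count, window contents and neighbour choice are arbitrary illustration values.

   PROGRAM rma_fetch_sketch
   !
   !-- Illustrative only: every PE exposes one integer (its rank) in an RMA window and fetches
   !-- the value of its right neighbour with MPI_GET between two fences.
      USE MPI
      IMPLICIT NONE

      INTEGER :: ierr, myid, numprocs, win, right
      INTEGER :: my_value(1), remote_value(1)
      INTEGER(KIND=MPI_ADDRESS_KIND) :: winsize, disp

      CALL MPI_INIT( ierr )
      CALL MPI_COMM_RANK( MPI_COMM_WORLD, myid, ierr )
      CALL MPI_COMM_SIZE( MPI_COMM_WORLD, numprocs, ierr )

      my_value(1) = myid
      winsize     = 4                 ! window holds one default INTEGER (4 bytes assumed)
   !
   !-- Expose my_value to all other PEs
      CALL MPI_WIN_CREATE( my_value, winsize, 4, MPI_INFO_NULL, MPI_COMM_WORLD, win, ierr )
      CALL MPI_WIN_FENCE( 0, win, ierr )
   !
   !-- One-sided fetch from the right neighbour (displacement 0 inside its window)
      right = MOD( myid + 1, numprocs )
      disp  = 0
      CALL MPI_GET( remote_value, 1, MPI_INTEGER, right, disp, 1, MPI_INTEGER, win, ierr )
   !
   !-- The remote value is guaranteed to have arrived only after the closing fence
      CALL MPI_WIN_FENCE( 0, win, ierr )

      WRITE( *, '(A,I5,A,I5)' )  'PE ', myid, ' fetched value ', remote_value(1)

      CALL MPI_WIN_FREE( win, ierr )
      CALL MPI_FINALIZE( ierr )

   END PROGRAM rma_fetch_sketch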