Changeset 4893 for palm/trunk/SOURCE
- Timestamp: Mar 2, 2021 4:39:14 PM
- Location: palm/trunk/SOURCE
- Files: 9 edited
palm/trunk/SOURCE/init_pegrid.f90
r4848 → r4893 (a superfluous blank line ahead of the comment block is removed)

    358   358        CALL MPI_COMM_RANK( comm1dy, myidy, ierr )
    359   359
    360              (blank line removed)
    361   360   !
    362   361   !-- Calculate array bounds along x-direction for every PE.
palm/trunk/SOURCE/land_surface_model_mod.f90
r4876 r4893 24 24 ! ----------------- 25 25 ! $Id$ 26 ! revised output of surface data via MPI-IO for better performance 27 ! 28 ! 4876 2021-02-17 12:27:36Z raasch 26 29 ! bugfix for instantaneous c_liq output 27 30 ! … … 6249 6252 INTEGER(iwp) :: l !< index variable for surface orientation 6250 6253 6251 INTEGER(iwp),DIMENSION(nys:nyn,nxl:nxr) :: global_start_index !< index for surface data (MPI-IO) 6254 INTEGER(iwp),DIMENSION(nys:nyn,nxl:nxr) :: global_end_index !< end index for surface data (MPI-IO) 6255 INTEGER(iwp),DIMENSION(nys:nyn,nxl:nxr) :: global_start_index !< start index for surface data (MPI-IO) 6252 6256 6253 6257 LOGICAL :: surface_data_to_write !< switch for MPI-I/O if PE has surface data to write … … 6435 6439 6436 6440 CALL rd_mpi_io_surface_filetypes( surf_lsm_h(l)%start_index, surf_lsm_h(l)%end_index, & 6437 surface_data_to_write, global_start_index ) 6438 6439 CALL wrd_mpi_io( 'lsm_start_index_h_' // dum, surf_lsm_h(l)%start_index ) 6440 CALL wrd_mpi_io( 'lsm_end_index_h_' // dum, surf_lsm_h(l)%end_index ) 6441 surface_data_to_write, global_start_index, & 6442 global_end_index ) 6443 6441 6444 CALL wrd_mpi_io( 'lsm_global_start_index_h_' // dum, global_start_index ) 6445 CALL wrd_mpi_io( 'lsm_global_end_index_h_' // dum, global_end_index ) 6442 6446 6443 6447 IF ( .NOT. surface_data_to_write ) CYCLE 6444 6448 6445 6449 CALL wrd_mpi_io_surface( 't_soil_h(' // dum // ')', t_soil_h(l)%var_2d ) 6446 CALL wrd_mpi_io_surface( 'm_soil_h(' // dum // ')', 6450 CALL wrd_mpi_io_surface( 'm_soil_h(' // dum // ')', m_soil_h(l)%var_2d ) 6447 6451 CALL wrd_mpi_io_surface( 'm_liq_h(' // dum // ')', m_liq_h(l)%var_1d ) 6448 6452 CALL wrd_mpi_io_surface( 't_surface_h(' // dum // ')', t_surface_h(l)%var_1d ) … … 6454 6458 6455 6459 CALL rd_mpi_io_surface_filetypes ( surf_lsm_v(l)%start_index, surf_lsm_v(l)%end_index, & 6456 surface_data_to_write, global_start_index )6457 6458 CALL wrd_mpi_io( 'lsm_start_index_v_' // dum, surf_lsm_v(l)%start_index ) 6459 CALL wrd_mpi_io( 'lsm_ end_index_v_' // dum, surf_lsm_v(l)%end_index )6460 CALL wrd_mpi_io( 'lsm_global_ start_index_v_' // dum , global_start_index )6460 surface_data_to_write, global_start_index, & 6461 global_end_index ) 6462 6463 CALL wrd_mpi_io( 'lsm_global_start_index_v_' // dum, global_start_index ) 6464 CALL wrd_mpi_io( 'lsm_global_end_index_v_' // dum, global_end_index ) 6461 6465 6462 6466 IF ( .NOT. surface_data_to_write ) CYCLE … … 7133 7137 INTEGER(iwp) :: l !< running index surface orientation 7134 7138 7135 !INTEGER(iwp), DIMENSION(nys:nyn,nxl:nxr) :: end_index 7136 INTEGER(iwp), DIMENSION(nys:nyn,nxl:nxr) :: global_start 7137 !INTEGER(iwp), DIMENSION(nys:nyn,nxl:nxr) :: start_index 7138 7139 LOGICAL :: array_found 7140 LOGICAL :: ldum 7139 INTEGER(iwp), DIMENSION(nys:nyn,nxl:nxr) :: global_end_index 7140 INTEGER(iwp), DIMENSION(nys:nyn,nxl:nxr) :: global_start_index 7141 7142 LOGICAL :: array_found 7143 LOGICAL :: data_to_read !< switch to steer reading of data 7141 7144 7142 7145 … … 7205 7208 WRITE( dum, '(I1)') l 7206 7209 7207 CALL rrd_mpi_io( 'lsm_start_index_h_' // dum, surf_lsm_h(l)%start_index ) 7208 CALL rrd_mpi_io( 'lsm_end_index_h_' // dum, surf_lsm_h(l)%end_index ) 7209 CALL rrd_mpi_io( 'lsm_global_start_index_h_' // dum, global_start ) 7210 7211 CALL rd_mpi_io_surface_filetypes ( surf_lsm_h(l)%start_index, surf_lsm_h(l)%end_index, ldum,& 7212 global_start ) 7213 7214 IF ( MAXVAL( surf_lsm_h(l)%end_index ) <= 0 ) CYCLE 7210 ! 
7211 !-- surf_lsm_h(l)%start_index and surf_lsm_h(l)%end_index are already set and should not be read 7212 !-- from restart file. 7213 CALL rrd_mpi_io( 'lsm_global_start_index_h_' // dum, global_start_index ) 7214 CALL rrd_mpi_io( 'lsm_global_end_index_h_' // dum, global_end_index ) 7215 7216 CALL rd_mpi_io_surface_filetypes ( surf_lsm_h(l)%start_index, surf_lsm_h(l)%end_index, & 7217 data_to_read, global_start_index, global_end_index ) 7218 IF ( .NOT. data_to_read ) CYCLE 7215 7219 7216 7220 CALL rrd_mpi_io_surface( 't_soil_h(' // dum // ')', t_soil_h(l)%var_2d ) … … 7218 7222 CALL rrd_mpi_io_surface( 'm_liq_h(' // dum // ')', m_liq_h(l)%var_1d ) 7219 7223 CALL rrd_mpi_io_surface( 't_surface_h(' // dum // ')', t_surface_h(l)%var_1d ) 7224 7220 7225 ENDDO 7221 7226 … … 7224 7229 WRITE( dum, '(I1)') l 7225 7230 7226 !kk In case of nothing to do, the settings of start_index and end_index differ 7227 !kk between writing and reading restart file 7228 !kk 7229 !kk Has to be discussed with the developers 7230 7231 CALL rrd_mpi_io( 'lsm_start_index_v_' // dum, surf_lsm_v(l)%start_index ) 7232 CALL rrd_mpi_io( 'lsm_end_index_v_' // dum, surf_lsm_v(l)%end_index ) 7233 CALL rrd_mpi_io( 'lsm_global_start_index_v_' // dum , global_start ) 7234 7235 CALL rd_mpi_io_surface_filetypes( surf_lsm_v(l)%start_index, surf_lsm_v(l)%end_index, ldum, & 7236 global_start ) 7237 7238 IF ( MAXVAL( surf_lsm_v(l)%end_index ) <= 0 ) CYCLE 7231 ! 7232 !-- surf_lsm_v(l)%start_index and surf_lsm_v(l)%end_index are already set and should not be read 7233 !-- from restart file. 7234 CALL rrd_mpi_io( 'lsm_global_start_index_v_' // dum , global_start_index ) 7235 CALL rrd_mpi_io( 'lsm_global_end_index_v_' // dum , global_end_index ) 7236 7237 CALL rd_mpi_io_surface_filetypes( surf_lsm_v(l)%start_index, surf_lsm_v(l)%end_index, & 7238 data_to_read, global_start_index, global_end_index ) 7239 IF ( .NOT. data_to_read ) CYCLE 7239 7240 7240 7241 CALL rrd_mpi_io_surface( 't_soil_v(' // dum // ')', t_soil_v(l)%var_2d ) -
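The hunks above switch the land-surface module to the revised surface I/O interface: rd_mpi_io_surface_filetypes now also returns a global end index, only the two global index arrays are written to the restart file, and the local start/end indices are no longer read back. The following is a minimal sketch of that calling pattern for a single horizontal surface type; it is not part of the changeset, and the kind (iwp), the grid bounds (nys, nyn, nxl, nxr) and the surf/surf_var structures stand in for the host module's own variables.

 SUBROUTINE surface_restart_io_example( write_phase )

    USE restart_data_mpi_io_mod,                                                                   &
        ONLY:  rd_mpi_io_surface_filetypes, rrd_mpi_io, rrd_mpi_io_surface, wrd_mpi_io,            &
               wrd_mpi_io_surface

    IMPLICIT NONE

    LOGICAL, INTENT(IN) ::  write_phase  !< .TRUE. in the *_wrd_local phase, .FALSE. in *_rrd_local

    INTEGER(iwp), DIMENSION(nys:nyn,nxl:nxr) ::  global_end_index    !< end index for surface data (MPI-IO)
    INTEGER(iwp), DIMENSION(nys:nyn,nxl:nxr) ::  global_start_index  !< start index for surface data (MPI-IO)

    LOGICAL ::  surface_data_to_io  !< switch for MPI-I/O if PE has surface data to write/read

    IF ( write_phase )  THEN
 !
 !--   Create the surface filetypes from the local index arrays; the revised routine additionally
 !--   returns the global end index.
       CALL rd_mpi_io_surface_filetypes( surf%start_index, surf%end_index, surface_data_to_io,     &
                                         global_start_index, global_end_index )
 !
 !--   Only the global index arrays go into the restart file; the local start/end indices are not
 !--   written any more.
       CALL wrd_mpi_io( 'example_global_start_index_h_0', global_start_index )
       CALL wrd_mpi_io( 'example_global_end_index_h_0', global_end_index )

       IF ( surface_data_to_io )  THEN
          CALL wrd_mpi_io_surface( 'example_var_h(0)', surf_var%var_1d )
       ENDIF

    ELSE
 !
 !--   surf%start_index and surf%end_index are already set by the model and are not read from the
 !--   restart file.
       CALL rrd_mpi_io( 'example_global_start_index_h_0', global_start_index )
       CALL rrd_mpi_io( 'example_global_end_index_h_0', global_end_index )

       CALL rd_mpi_io_surface_filetypes( surf%start_index, surf%end_index, surface_data_to_io,     &
                                         global_start_index, global_end_index )

       IF ( surface_data_to_io )  THEN
          CALL rrd_mpi_io_surface( 'example_var_h(0)', surf_var%var_1d )
       ENDIF

    ENDIF

 END SUBROUTINE surface_restart_io_example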
palm/trunk/SOURCE/read_restart_data_mod.f90
r4848 r4893 25 25 ! ----------------- 26 26 ! $Id$ 27 ! revised output of surface data via MPI-IO for better performance, 28 ! therefore binary version number has changed 29 ! 30 ! 4848 2021-01-21 15:51:51Z gronemeier 27 31 ! bugfix: removed syn_turb_gen from restart files 28 32 ! … … 276 280 CALL location_message( 'read global restart data', 'start' ) 277 281 282 ! 283 !-- Caution: When any of the read instructions have been changed, the 284 !-- ------- version number stored in the variable binary_version_global has 285 !-- to be increased. The same changes must also be done in wrd_write_global. 286 binary_version_global = '5.3' 287 278 288 IF ( TRIM( restart_data_format_input ) == 'fortran_binary' ) THEN 279 289 ! … … 286 296 READ ( 13 ) version_on_file 287 297 288 binary_version_global = '5.2'289 298 IF ( TRIM( version_on_file ) /= TRIM( binary_version_global ) ) THEN 290 299 WRITE( message_string, * ) 'version mismatch concerning ', & … … 373 382 ! 374 383 !-- Now read all control parameters: 375 !-- Caution: When the following read instructions have been changed, the 376 !-- ------- version number stored in the variable binary_version_global has 377 !-- to be increased. The same changes must also be done in 378 !-- wrd_write_global. 384 379 385 READ ( 13 ) length 380 386 READ ( 13 ) restart_string(1:length) … … 873 879 !-- Read global restart data using MPI-IO 874 880 !-- ATTENTION: Arrays need to be read with routine rrd_mpi_io_global_array! 875 !-- Caution: When any of the following read instructions have been changed, the 876 !-- ------- version number stored in the variable binary_version_global has 877 !-- to be increased. The same changes must also be done in 878 !-- wrd_write_global. 881 879 882 ! 880 883 !-- Open the MPI-IO restart file. … … 886 889 CALL rrd_mpi_io( 'binary_version_global', version_on_file ) 887 890 888 binary_version_global = '5.1'889 891 IF ( TRIM( version_on_file ) /= TRIM( binary_version_global ) ) THEN 890 892 WRITE( message_string, * ) 'version mismatch concerning binary_version_global:', & -
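In read_restart_data_mod.f90 the binary version number is raised to '5.3' and is now set once, before the format branch, instead of separately inside the Fortran-binary ('5.2') and MPI-IO ('5.1') branches. The following is a condensed sketch of the resulting check, assembled from the hunks below; file opening is omitted and the error-message call is abbreviated.

 !
 !-- Caution: When any of the read instructions are changed, the version number stored in
 !-- binary_version_global has to be increased. The same change must also be made in
 !-- wrd_write_global.
    binary_version_global = '5.3'

    IF ( TRIM( restart_data_format_input ) == 'fortran_binary' )  THEN
       READ ( 13 )  version_on_file
    ELSE
 !
 !--   MPI-IO restart file
       CALL rrd_mpi_io( 'binary_version_global', version_on_file )
    ENDIF

    IF ( TRIM( version_on_file ) /= TRIM( binary_version_global ) )  THEN
       WRITE( message_string, * ) 'version mismatch concerning binary_version_global: ',           &
                                  TRIM( version_on_file ), ' /= ', TRIM( binary_version_global )
 !
 !--   Abort via CALL message( ... ) as in the original code
    ENDIF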
palm/trunk/SOURCE/restart_data_mpi_io_mod.f90
r4857 r4893 25 25 ! ----------------- 26 26 ! $Id$ 27 ! revised output of surface data via MPI-IO for better performance 28 ! 29 ! 4857 2021-01-26 07:24:41Z raasch 27 30 ! bugfix: allocation of 3d-int4 array moved from particle output to standard output 28 31 ! … … 155 158 myidx, & 156 159 myidy, & 157 npex, &158 npey, &159 160 numprocs, & 160 161 pdims … … 183 184 INTEGER(iwp) :: fh = -1 !< MPI-IO file handle 184 185 #if defined( __parallel ) 185 INTEGER(iwp) :: fhs = -1 !< MPI-IO file handle to open file with comm2d always186 #endif187 186 INTEGER(iwp) :: ft_surf = -1 !< MPI filetype surface data 188 #if defined( __parallel )189 187 INTEGER(iwp) :: ft_2di_nb !< MPI filetype 2D array INTEGER no outer boundary 190 188 INTEGER(iwp) :: ft_2d !< MPI filetype 2D array REAL with outer boundaries … … 196 194 INTEGER(iwp) :: glo_start !< global start index on this PE 197 195 #if defined( __parallel ) 198 INTEGER(iwp) :: local_start !<199 #endif200 INTEGER(iwp) :: nr_iope !<201 INTEGER(iwp) :: nr_val !< local number of values in x and y direction202 #if defined( __parallel )203 196 INTEGER(iwp) :: win_2di !< 204 197 INTEGER(iwp) :: win_2dr !< … … 207 200 INTEGER(iwp) :: win_3dr !< 208 201 INTEGER(iwp) :: win_3ds !< 202 INTEGER(iwp) :: win_end = -1 !< 203 INTEGER(iwp) :: win_glost = -1 !< 204 INTEGER(iwp) :: win_out = -1 !< 205 INTEGER(iwp) :: win_start = -1 !< 209 206 INTEGER(iwp) :: win_surf = -1 !< 210 207 #endif … … 216 213 INTEGER(iwp), DIMENSION(:,:), POINTER, CONTIGUOUS :: array_2di !< 217 214 218 INTEGER(iwp), DIMENSION(:,:), ALLOCATABLE :: m_end_index !< 219 INTEGER(iwp), DIMENSION(:,:), ALLOCATABLE :: m_global_start !< 215 INTEGER(iwp), DIMENSION(:,:), ALLOCATABLE :: e_end_index !< extended end index, every grid cell has at least one value 216 INTEGER(iwp), DIMENSION(:,:), ALLOCATABLE :: e_start_index !< 217 INTEGER(iwp), DIMENSION(:,:), ALLOCATABLE :: m_end_index !< module copy of end_index 220 218 INTEGER(iwp), DIMENSION(:,:), ALLOCATABLE :: m_start_index !< 219 INTEGER(iwp), DIMENSION(:), ALLOCATABLE :: thread_index !< 220 INTEGER(iwp), DIMENSION(:,:), ALLOCATABLE :: target_thread !< 221 INTEGER(iwp), DIMENSION(:,:), ALLOCATABLE :: transfer_index !< 222 INTEGER(iwp), DIMENSION(:), ALLOCATABLE :: thread_values !< 223 ! 
224 !-- Indices for cyclic fill 225 INTEGER(iwp), DIMENSION(:,:), ALLOCATABLE :: o_start_index !< 226 INTEGER(iwp), DIMENSION(:,:), ALLOCATABLE :: c_start_index !< 227 !#if defined( __parallel ) 228 INTEGER(iwp), DIMENSION(:,:), ALLOCATABLE :: o_end_index !< extended end index, every grid cell has at least one value 229 INTEGER(iwp), DIMENSION(:,:), ALLOCATABLE :: c_end_index !< extended end index, every grid cell has at least one value 230 INTEGER(iwp), DIMENSION(:,:), ALLOCATABLE :: c_global_start !< 231 INTEGER(iwp), DIMENSION(:,:), ALLOCATABLE :: c_global_end !< 232 !#endif 221 233 222 234 INTEGER(isp), DIMENSION(:,:,:), POINTER, CONTIGUOUS :: array_3di4 !< 223 235 INTEGER(idp), DIMENSION(:,:,:), POINTER, CONTIGUOUS :: array_3di8 !< 224 236 225 LOGICAL :: all_pes_write !< all PEs have data to write226 237 LOGICAL :: filetypes_created !< 227 238 LOGICAL :: io_on_limited_cores_per_node !< switch to shared memory MPI-IO … … 229 240 LOGICAL :: wr_flag !< file is opened for write 230 241 242 #if defined( __parallel ) 243 INTEGER(iwp), DIMENSION(:,:), ALLOCATABLE :: local_indices 244 #endif 245 246 REAL(wp), DIMENSION(:), POINTER, CONTIGUOUS :: array_out !< 231 247 #if defined( __parallel ) 232 248 REAL(wp), DIMENSION(:), POINTER, CONTIGUOUS :: array_1d !< … … 251 267 INTEGER(iwp) :: nr_int !< number of INTEGER entries in header 252 268 INTEGER(iwp) :: nr_real !< number of REAL entries in header 269 INTEGER(iwp) :: pes_along_x !< number of PEs along x-direction during writing restart file 270 INTEGER(iwp) :: pes_along_y !< number of PEs along y-direction during writing restart file 253 271 INTEGER(iwp) :: total_nx !< total number of points in x-direction 254 272 INTEGER(iwp) :: total_ny !< total number of points in y-direction 255 273 END TYPE general_header 256 274 257 TYPE(general_header), TARGET :: tgh !<275 TYPE(general_header), TARGET, PUBLIC :: tgh !< 258 276 259 277 TYPE(sm_class) :: sm_io !< … … 421 439 wrd_mpi_io_surface 422 440 423 424 441 CONTAINS 425 442 … … 452 469 TYPE(C_PTR) :: buf_ptr !< 453 470 #endif 471 454 472 455 473 offset = 0 … … 467 485 io_file_name = file_name 468 486 ! 469 !-- Setup for IO on a limited number of threads per node (using shared memory MPI)487 !-- Setup for IO on a limited number of PEs per node (using shared memory MPI) 470 488 IF ( rd_flag ) THEN 471 489 set_filetype = .TRUE. … … 822 840 823 841 ! 824 !-- TODO: describe in more detail what is done here and why it is done825 !-- save grid of main run842 !-- Save grid information of the mainrun, i.e. grid variables like nxl, nxr, nys, nyn and other 843 !-- values are stored within the mainrun_grid structure 826 844 CALL mainrun_grid%save_grid_into_this_class() 827 845 … … 834 852 rma_offset_s = 0 835 853 ! 836 !-- Determine, if gridpoints of the prerun are located on this thread.854 !-- Determine, if gridpoints of the prerun are located on this PE. 837 855 !-- Set the (cyclic) prerun grid. 838 856 nxr = MIN( nxr, nx_on_file ) … … 857 875 ny = ny_on_file 858 876 ! 859 !-- Determine, if this threadis doing IO877 !-- Determine, if this PE is doing IO 860 878 IF ( nnx > 0 .AND. nny > 0 ) THEN 861 879 color = 1 … … 892 910 #endif 893 911 ! 894 !-- Allocate 2d buffers as RMA window, accessible on all threads912 !-- Allocate 2d buffers as RMA window, accessible on all PEs 895 913 IF ( pe_active_for_read ) THEN 896 914 ALLOCATE( rmabuf_2di(nys:nyn,nxl:nxr) ) … … 918 936 919 937 ! 
920 !-- Allocate 3d buffer as RMA window, accessable on all threads938 !-- Allocate 3d buffer as RMA window, accessable on all PEs 921 939 IF ( pe_active_for_read ) THEN 922 940 ALLOCATE( rmabuf_3d(nzb:nzt+1,nys:nyn,nxl:nxr) ) … … 932 950 933 951 ! 934 !-- TODO: comment in more detail, what is done here, and why 935 !-- save small grid 952 !-- Save grid of the prerun, i.e. grid variables like nxl, nxr, nys, nyn and other values 953 !-- are stored within the prerun_grid structure. 954 !-- The prerun grid can later be activated by calling prerun_grid%activate_grid_from_this_class() 936 955 CALL prerun_grid%save_grid_into_this_class() 937 956 prerun_grid%comm2d = comm_cyclic_fill … … 1152 1171 1153 1172 1154 !kk write(9,*) 'Here is rma_cylic_fill_real_2d ',nxl,nxr,nys,nyn; FLUSH(9)1155 1156 1173 ! 1157 1174 !-- Reading 2d real array on prerun grid … … 1297 1314 !-- array would be dimensioned in the caller subroutine like this: 1298 1315 !-- INTEGER, DIMENSION(nysg:nyng,nxlg:nxrg):: data 1299 message_string = '2d-INTEGER array "' // TRIM( name ) // '" to be read from restart ' //&1300 'f ile is defined with illegal dimensions in the PALM code'1316 message_string = '2d-INTEGER array with nbgp "' // TRIM( name ) // '" to be read ' // & 1317 'from restart file is defined with illegal dimensions in the PALM code' 1301 1318 CALL message( 'rrd_mpi_io_int_2d', 'PA0723', 3, 2, 0, 6, 0 ) 1302 1319 … … 1374 1391 1375 1392 1376 CALL prerun_grid%activate_grid_from_this_class()1377 1378 1393 IF ( pe_active_for_read ) THEN 1394 CALL prerun_grid%activate_grid_from_this_class() 1395 1379 1396 #if defined( __parallel ) 1380 1397 CALL MPI_FILE_SET_VIEW( fh, array_position, MPI_INTEGER, ft_2di_nb, 'native', & 1381 1398 MPI_INFO_NULL, ierr ) 1382 1399 CALL MPI_FILE_READ_ALL( fh, array_2di, SIZE( array_2di ), MPI_INTEGER, status, ierr ) 1400 #else 1401 CALL posix_lseek( fh, array_position ) 1402 CALL posix_read( fh, array_2di, SIZE( array_2di ) ) 1383 1403 #endif 1384 1404 DO i = nxl, nxr … … 1386 1406 ENDDO 1387 1407 data(1:nny,1:nnx) = rmabuf_2di 1388 ENDIF 1389 1390 CALL mainrun_grid%activate_grid_from_this_class()1408 1409 CALL mainrun_grid%activate_grid_from_this_class() 1410 ENDIF 1391 1411 1392 1412 #if defined( __parallel ) … … 1396 1416 #endif 1397 1417 1398 IF ( .NOT. pe_active_for_read ) THEN 1399 1400 is = nxl 1401 ie = nxr 1402 js = nys 1403 je = nyn 1404 1405 ELSE 1406 1407 is = nxl 1408 ie = nxr 1409 js = prerun_grid%nys+1 1410 je = nyn 1411 DO i = is, ie 1412 DO j = js, je 1413 i_remote = MOD(i,nx_on_file+1) 1414 j_remote = MOD(j,ny_on_file+1) 1415 rem_pe = remote_pe(i_remote,j_remote) 1416 rem_offs = rma_offset(i_remote,j_remote) 1417 nval = 1 1418 1419 #if defined( __parallel ) 1420 IF ( rem_pe /= myid ) THEN 1421 CALL MPI_GET( data(j-nys+1,i-nxl+1), nval, MPI_INTEGER, rem_pe, rem_offs, nval, & 1422 MPI_INTEGER, rmawin_2di, ierr ) 1423 ELSE 1424 data(j-nys+1,i-nxl+1) = rmabuf_2di(j_remote,i_remote) 1425 ENDIF 1426 #else 1427 data(j-nys+1,i-nxl+1) = array_2di(i_remote,j_remote) 1428 #endif 1429 ENDDO 1430 ENDDO 1431 is = prerun_grid%nxr+1 1432 ie = nxr 1433 js = nys 1434 je = nyn 1435 1436 ENDIF 1418 is = nxl 1419 ie = nxr 1420 js = nys 1421 je = nyn 1437 1422 1438 1423 DO i = is, ie … … 1717 1702 ierr ) 1718 1703 CALL MPI_FILE_READ_ALL( fh, array_3d, SIZE( array_3d ), MPI_REAL, status, ierr ) 1704 #else 1705 CALL posix_lseek( fh, array_position ) 1706 CALL posix_read( fh, array_3d, SIZE( array_3d ) ) 1719 1707 #endif 1720 1708 DO i = nxl, nxr … … 1727 1715 #if defined( __parallel ) 1728 1716 ! 
1729 !-- Close RMA window to allow remote access 1730 CALL MPI_WIN_FENCE( 0, rmawin_3d, ierr ) 1731 #endif 1732 1733 IF ( .NOT. pe_active_for_read ) THEN 1734 1735 is = nxl 1736 ie = nxr 1737 js = nys 1738 je = nyn 1739 1740 ELSE 1741 1742 is = nxl 1743 ie = nxr 1744 js = prerun_grid%nys+1 1745 je = nyn 1746 1747 DO i = is, ie 1748 DO j = js, je 1749 i_remote = MOD(i,nx_on_file+1) 1750 j_remote = MOD(j,ny_on_file+1) 1751 rem_pe = remote_pe(i_remote,j_remote) 1752 rem_offs = rma_offset(i_remote,j_remote)*(nzt-nzb+2) 1753 nval = nzt-nzb+2 1754 1755 #if defined( __parallel ) 1756 IF(rem_pe /= myid) THEN 1757 CALL MPI_GET( data(nzb,j,i), nval, MPI_REAL, rem_pe, rem_offs, nval, MPI_REAL, & 1758 rmawin_3d, ierr) 1759 ELSE 1760 data(:,j,i) = rmabuf_3d(:,j_remote,i_remote) 1761 ENDIF 1762 #else 1763 data(:,j,i) = array_3d(:,i_remote,j_remote) 1764 #endif 1765 ENDDO 1766 ENDDO 1767 is = prerun_grid%nxr+1 1768 ie = nxr 1769 js = nys 1770 je = nyn 1771 1772 ENDIF 1717 !-- Close RMA window to allow remote access 1718 CALL MPI_WIN_FENCE( 0, rmawin_3d, ierr ) 1719 #endif 1720 1721 is = nxl 1722 ie = nxr 1723 js = nys 1724 je = nyn 1773 1725 1774 1726 DO i = is, ie 1775 1727 DO j = js, je 1776 i_remote = MOD( i,nx_on_file+1)1777 j_remote = MOD( j,ny_on_file+1)1728 i_remote = MOD( i, nx_on_file+1 ) 1729 j_remote = MOD( j, ny_on_file+1 ) 1778 1730 rem_pe = remote_pe(i_remote,j_remote) 1779 1731 rem_offs = rma_offset(i_remote,j_remote) * ( nzt-nzb+2 ) … … 1850 1802 1851 1803 IF ( found ) THEN 1852 #if defined( __parallel )1853 1804 CALL rd_mpi_io_create_filetypes_3dsoil( nzb_soil, nzt_soil ) 1805 #if defined( __parallel ) 1854 1806 CALL sm_io%sm_node_barrier() ! Has no effect if I/O on limited number of cores is inactive 1855 1807 IF ( sm_io%iam_io_pe ) THEN … … 1874 1826 ENDIF 1875 1827 1828 #if defined( __parallel ) 1829 IF ( sm_io%is_sm_active() ) THEN 1830 CALL MPI_WIN_FREE( win_3ds, ierr ) 1831 ELSE 1832 DEALLOCATE( array_3d_soil ) 1833 ENDIF 1834 #else 1835 DEALLOCATE( array_3d_soil ) 1836 #endif 1837 1876 1838 ELSE 1877 1839 … … 2042 2004 2043 2005 IF ( header_array_index == max_nr_arrays ) THEN 2044 STOP '+++ maximum number of 2d/3d-array entries in restart file header exceeded' 2006 message_string = 'maximum number of 2d/3d-array entries in restart file header exceeded' 2007 CALL message( 'wrd_mpi_io_real_2d', 'PA0585', 1, 2, 0, 6, 0 ) 2045 2008 ENDIF 2046 2009 … … 2107 2070 2108 2071 IF ( header_array_index == max_nr_arrays ) THEN 2109 STOP '+++ maximum number of 2d/3d-array entries in restart file header exceeded' 2072 message_string = 'maximum number of 2d/3d-array entries in restart file header exceeded' 2073 CALL message( 'wrd_mpi_io_int_2d', 'PA0585', 1, 2, 0, 6, 0 ) 2110 2074 ENDIF 2111 2075 … … 2182 2146 2183 2147 IF ( header_array_index == max_nr_arrays ) THEN 2184 STOP '+++ maximum number of 2d/3d-array entries in restart file header exceeded' 2148 message_string = 'maximum number of 2d/3d-array entries in restart file header exceeded' 2149 CALL message( 'wrd_mpi_io_int4_3d', 'PA0585', 1, 2, 0, 6, 0 ) 2185 2150 ENDIF 2186 2151 … … 2225 2190 INT( (iog%nx+1), KIND = rd_offset_kind ) * isp 2226 2191 2227 write(9,*) 'array_position int4_3d ',trim(name),' ',array_position2228 2229 2192 END SUBROUTINE wrd_mpi_io_int4_3d 2230 2193 … … 2250 2213 2251 2214 IF ( header_array_index == max_nr_arrays ) THEN 2252 STOP '+++ maximum number of 2d/3d-array entries in restart file header exceeded' 2215 message_string = 'maximum number of 2d/3d-array entries in restart file header exceeded' 2216 CALL 
message( 'wrd_mpi_io_int8_3d', 'PA0585', 1, 2, 0, 6, 0 ) 2253 2217 ENDIF 2254 2218 … … 2293 2257 INT( (iog%nx+1), KIND = rd_offset_kind ) * dp 2294 2258 2295 write(9,*) 'array_position int8_3d ',trim(name),' ',array_position2296 2297 2259 END SUBROUTINE wrd_mpi_io_int8_3d 2298 2260 … … 2318 2280 2319 2281 IF ( header_array_index == max_nr_arrays ) THEN 2320 STOP '+++ maximum number of 2d/3d-array entries in restart file header exceeded' 2282 message_string = 'maximum number of 2d/3d-array entries in restart file header exceeded' 2283 CALL message( 'wrd_mpi_io_real_3d', 'PA0585', 1, 2, 0, 6, 0 ) 2321 2284 ENDIF 2322 2285 … … 2395 2358 2396 2359 IF ( header_array_index == max_nr_arrays ) THEN 2397 STOP '+++ maximum number of 2d/3d-array entries in restart file header exceeded' 2360 message_string = 'maximum number of 2d/3d-array entries in restart file header exceeded' 2361 CALL message( 'wrd_mpi_io_real_3d_soil', 'PA0585', 1, 2, 0, 6, 0 ) 2398 2362 ENDIF 2399 2363 … … 2402 2366 header_array_index = header_array_index + 1 2403 2367 2404 #if defined( __parallel )2405 2368 CALL rd_mpi_io_create_filetypes_3dsoil( nzb_soil, nzt_soil ) 2406 #endif2407 2369 2408 2370 IF ( include_total_domain_boundaries) THEN … … 2432 2394 ENDIF 2433 2395 CALL sm_io%sm_node_barrier() 2396 2397 IF ( sm_io%is_sm_active() ) THEN 2398 CALL MPI_WIN_FREE( win_3ds, ierr ) 2399 ELSE 2400 DEALLOCATE( array_3d_soil ) 2401 ENDIF 2402 IF ( sm_io%iam_io_pe ) THEN 2403 CALL MPI_TYPE_FREE( ft_3dsoil, ierr ) 2404 ENDIF 2434 2405 #else 2435 2406 CALL posix_lseek( fh, array_position ) 2436 2407 CALL posix_write( fh, array_3d_soil, SIZE( array_3d_soil ) ) 2408 DEALLOCATE( array_3d_soil ) 2437 2409 #endif 2438 2410 ! … … 2589 2561 CALL MPI_BCAST( data, SIZE( data ), MPI_REAL, 0, comm2d, ierr ) 2590 2562 ELSE 2591 IF 2563 IF( sm_io%iam_io_pe ) THEN 2592 2564 CALL MPI_FILE_SET_VIEW( fh, offset, MPI_BYTE, MPI_BYTE, 'native', MPI_INFO_NULL, ierr ) 2565 ENDIF 2566 IF ( myid == 0 ) THEN 2593 2567 CALL MPI_FILE_SEEK( fh, array_position, MPI_SEEK_SET, ierr ) 2594 CALL MPI_FILE_READ _ALL( fh, data, SIZE( data ), MPI_REAL, status, ierr )2568 CALL MPI_FILE_READ( fh, data, SIZE( data ), MPI_REAL, status, ierr ) 2595 2569 ENDIF 2596 IF ( sm_io%is_sm_active() ) THEN 2597 CALL MPI_BCAST( data, SIZE( data ), MPI_REAL, 0, sm_io%comm_shared, ierr ) 2598 ENDIF 2570 CALL MPI_BCAST( data, SIZE( data ), MPI_REAL, 0, comm2d, ierr ) 2599 2571 ENDIF 2600 2572 #else … … 2749 2721 CALL MPI_FILE_READ_ALL( fh, data, SIZE( data), MPI_INTEGER, status, ierr ) 2750 2722 ENDIF 2751 CALL MPI_BCAST( data, SIZE( data ), MPI_ REAL, 0, comm2d, ierr )2723 CALL MPI_BCAST( data, SIZE( data ), MPI_INTEGER, 0, comm2d, ierr ) 2752 2724 ELSE 2753 IF 2725 IF( sm_io%iam_io_pe ) THEN 2754 2726 CALL MPI_FILE_SET_VIEW( fh, offset, MPI_BYTE, MPI_BYTE, 'native', MPI_INFO_NULL, ierr ) 2727 ENDIF 2728 IF ( myid == 0 ) THEN 2755 2729 CALL MPI_FILE_SEEK( fh, array_position, MPI_SEEK_SET, ierr ) 2756 CALL MPI_FILE_READ _ALL( fh, data, SIZE( data), MPI_INTEGER, status, ierr )2730 CALL MPI_FILE_READ( fh, data, SIZE( data), MPI_INTEGER, status, ierr ) 2757 2731 ENDIF 2758 IF ( sm_io%is_sm_active() ) THEN 2759 CALL MPI_BCAST( data, SIZE( data ), MPI_INTEGER, 0, sm_io%comm_shared, ierr ) 2760 ENDIF 2761 ENDIF 2732 CALL MPI_BCAST( data, SIZE( data ), MPI_INTEGER, 0, comm2d, ierr ) 2733 ENDIF 2762 2734 #else 2763 2735 CALL posix_lseek( fh, array_position ) … … 2800 2772 2801 2773 IF ( header_array_index == max_nr_arrays ) THEN 2802 STOP '+++ maximum number of 2d/3d-array entries in restart file 
header exceeded' 2774 message_string = 'maximum number of 2d/3d-array entries in restart file header exceeded' 2775 CALL message( 'wrd_mpi_io_global_array_real_1d', 'PA0585', 1, 2, 0, 6, 0 ) 2803 2776 ENDIF 2804 2777 … … 2939 2912 2940 2913 IF ( header_array_index == max_nr_arrays ) THEN 2941 STOP '+++ maximum number of 2d/3d-array entries in restart file header exceeded' 2914 message_string = 'maximum number of 2d/3d-array entries in restart file header exceeded' 2915 CALL message( 'wrd_mpi_io_global_array_int_1d', 'PA0585', 1, 2, 0, 6, 0 ) 2942 2916 ENDIF 2943 2917 … … 3030 3004 ENDDO 3031 3005 3032 write(9,*) 'particle_size_read ',particle_size,array_size,array_position,sum(prt_global_index)3033 3034 3006 ALLOCATE( prt_data(MAX(array_size,1)) ) 3035 3007 … … 3078 3050 array_position = prt_nr_bytes 3079 3051 3080 write(9,*) 'array_position after particle read ',array_position,prt_nr_bytes,rs3081 3082 3052 DEALLOCATE( prt_data ) 3083 3053 … … 3092 3062 ! ------------ 3093 3063 !> Read 1d-REAL surface data array with MPI-IO. 3094 !--------------------------------------------------------------------------------------------------! 3095 SUBROUTINE rrd_mpi_io_surface( name, data, first_index ) 3064 !> This is a recursive subroutine. In case of cyclic fill mode it may call itself for reading parts 3065 !> of the prerun grid. 3066 !--------------------------------------------------------------------------------------------------! 3067 RECURSIVE SUBROUTINE rrd_mpi_io_surface( name, data, first_index ) 3096 3068 3097 3069 IMPLICIT NONE … … 3099 3071 CHARACTER(LEN=*), INTENT(IN) :: name !< 3100 3072 3073 INTEGER(iwp), OPTIONAL :: first_index !< 3074 INTEGER(iwp) :: i !< 3075 INTEGER(iwp) :: j !< 3076 INTEGER(iwp) :: lo_first_index !< 3077 3078 #if defined( __parallel ) 3079 INTEGER(iwp) :: buf_start !< 3101 3080 INTEGER(KIND=rd_offset_kind) :: disp !< displacement of actual indices 3102 INTEGER(KIND=rd_offset_kind) :: disp_f !< displacement in file 3103 INTEGER(KIND=rd_offset_kind) :: disp_n !< displacement of next column 3104 INTEGER(iwp), OPTIONAL :: first_index !< 3105 3106 INTEGER(iwp) :: i !< 3107 INTEGER(iwp) :: i_f !< 3108 INTEGER(iwp) :: j !< 3109 INTEGER(iwp) :: j_f !< 3110 INTEGER(iwp) :: lo_first_index !< 3111 INTEGER(iwp) :: nr_bytes !< 3112 INTEGER(iwp) :: nr_bytes_f !< 3113 INTEGER(iwp) :: nr_words !< 3114 #if defined( __parallel ) 3115 INTEGER, DIMENSION(rd_status_size) :: status !< 3116 #else 3117 TYPE(C_PTR) :: buf !< 3118 #endif 3119 3120 LOGICAL :: found !< 3081 INTEGER(iwp) :: ie !< 3082 INTEGER(iwp) :: ind_gb !< 3083 INTEGER(iwp) :: ind_out !< 3084 INTEGER(iwp) :: is !< 3085 INTEGER(iwp) :: n !< 3086 INTEGER(iwp) :: n_trans !< 3087 3088 INTEGER(iwp),DIMENSION(0:numprocs-1) :: lo_index !< 3089 INTEGER, DIMENSION(rd_status_size) :: status !< 3090 #endif 3091 LOGICAL :: found !< 3121 3092 3122 3093 REAL(wp), INTENT(OUT), DIMENSION(:), TARGET :: data !< 3094 #if defined( __parallel ) 3095 REAL(wp),DIMENSION(:),ALLOCATABLE :: put_buffer !< 3096 #endif 3123 3097 3124 3098 … … 3132 3106 DO i = 1, tgh%nr_arrays 3133 3107 IF ( TRIM( array_names(i) ) == TRIM( name ) ) THEN 3108 ! 3109 !-- ATTENTION: The total_number_of_surface_values and wp MUST be INTERGER(8). 3110 !-- The compiler (at least Intel) first computes total_number_of_surface_values*wp 3111 !-- and then does the conversion to INTEGER(8). 
3112 !-- This may lead to wrong results when total_number_of_surface_values*wp is > 2*10**6 3134 3113 array_position = array_offset(i) + ( lo_first_index - 1 ) * & 3135 total_number_of_surface_values * wp3114 INT( total_number_of_surface_values, idp ) * INT( wp, idp ) 3136 3115 found = .TRUE. 3137 3116 EXIT … … 3139 3118 ENDDO 3140 3119 3141 disp = -1 3142 disp_f = -1 3143 disp_n = -1 3120 ! 3121 !-- In case of 2d-data, name is written only once 3122 IF ( lo_first_index == 1 ) THEN 3123 3124 IF ( header_array_index == max_nr_arrays ) THEN 3125 message_string = 'maximum number of 2d/3d-array entries in restart file header exceeded' 3126 CALL message( 'rrd_mpi_io_surface', 'PA0585', 1, 2, 0, 6, 0 ) 3127 ENDIF 3128 3129 array_names(header_array_index) = name 3130 array_offset(header_array_index) = array_position 3131 header_array_index = header_array_index + 1 3132 3133 ENDIF 3134 3144 3135 IF ( found ) THEN 3145 3146 3136 IF ( cyclic_fill_mode ) THEN 3147 3137 3148 3138 CALL rrd_mpi_io_surface_cyclic_fill 3139 RETURN 3149 3140 3150 3141 ELSE 3151 3152 IF ( MAXVAL( m_global_start ) == -1 ) RETURN ! Nothing to do on this PE 3142 #if defined( __parallel ) 3143 ! 3144 !-- Read data from restart file 3145 CALL sm_io%sm_node_barrier() ! has no effect if I/O on limited number of cores is inactive 3146 IF ( sm_io%iam_io_pe ) THEN 3147 CALL MPI_FILE_SET_VIEW( fh, array_position, MPI_REAL, ft_surf, 'native', & 3148 MPI_INFO_NULL, ierr ) 3149 CALL MPI_FILE_READ_ALL ( fh, array_out, SIZE(array_out), MPI_REAL, status, ierr ) 3150 ENDIF 3151 CALL sm_io%sm_node_barrier() 3152 3153 ! 3154 !-- Copy data into transfer buffer. Data is organized in a way that only one MPI_PUT to the 3155 !-- respective PE ist required. 3156 ALLOCATE( put_buffer(SUM( transfer_index(4,:) )) ) 3157 3158 ind_gb = 1 3159 DO i = 1, SIZE( local_indices, 2 ) 3160 ind_out = local_indices(1,i) 3161 DO j = 1, local_indices(2,i) 3162 put_buffer(ind_gb) = array_out(ind_out) 3163 ind_out = ind_out + 1 3164 ind_gb = ind_gb + 1 3165 ENDDO 3166 ENDDO 3167 ! 3168 !-- Transfer data from I/O PEs to the respective PEs to which they belong. 3169 CALL MPI_WIN_FENCE( 0, win_surf, ierr ) 3170 3171 buf_start = 1 3172 DO n = 0, numprocs-1 3173 n_trans = transfer_index(4,n) 3174 IF ( n_trans > 0 ) THEN 3175 disp = transfer_index(3,n) - 1 3176 CALL MPI_PUT( put_buffer(buf_start), n_trans, MPI_REAL, n, disp, n_trans, MPI_REAL,& 3177 win_surf, ierr) 3178 buf_start = buf_start + n_trans 3179 ENDIF 3180 ENDDO 3181 3182 CALL MPI_WIN_FENCE( 0, win_surf, ierr ) 3183 DEALLOCATE( put_buffer ) 3184 ! 3185 !-- Copy from RMA window into output array (data) to allow transfering data to target PEs. 3186 !-- Check, if the number of surface values per grid cell match the index setup. 3187 lo_index = thread_values 3153 3188 DO i = nxl, nxr 3154 3189 DO j = nys, nyn 3155 3156 IF ( m_global_start(j,i) > 0 ) THEN 3157 disp = array_position+(m_global_start(j,i)-1) * wp 3158 nr_words = m_end_index(j,i)-m_start_index(j,i)+1 3159 nr_bytes = nr_words * wp 3190 is = lo_index(target_thread(j,i)) + 1 3191 ie = is + m_end_index(j,i) - m_start_index(j,i) 3192 data(m_start_index(j,i):m_end_index(j,i)) = array_1d(is:ie) 3193 lo_index(target_thread(j,i)) = lo_index(target_thread(j,i)) + & 3194 e_end_index(j,i) - e_start_index(j,i) + 1 3195 ! 3196 !-- TODO: Test can be removed later. 
3197 IF ( e_end_index(j,i)-e_start_index(j,i)+1 /= NINT( array_1d(is-1) ) ) THEN 3198 WRITE( 9, '(A,6I8)' ) 'Nr surface values does not match ', j, i, & 3199 e_start_index(j,i), e_end_index(j,i), & 3200 e_end_index(j,i)-e_start_index(j,i)+1 , & 3201 NINT( array_1d(is-1) ) 3202 FLUSH( 9 ) 3203 CALL MPI_ABORT( comm2d, 1, ierr ) 3160 3204 ENDIF 3161 IF ( disp >= 0 .AND. disp_f == -1 ) THEN ! First entry3162 disp_f = disp3163 nr_bytes_f = 03164 i_f = i3165 j_f = j3166 ENDIF3167 IF ( j == nyn .AND. i == nxr ) THEN ! Last entry3168 disp_n = -13169 IF ( nr_bytes > 0 ) THEN3170 nr_bytes_f = nr_bytes_f+nr_bytes3171 ENDIF3172 ELSEIF ( j == nyn ) THEN ! Next x3173 IF ( m_global_start(nys,i+1) > 0 .AND. disp > 0 ) THEN3174 disp_n = array_position + ( m_global_start(nys,i+1) - 1 ) * wp3175 ELSE3176 CYCLE3177 ENDIF3178 ELSE3179 IF ( m_global_start(j+1,i) > 0 .AND. disp > 0 ) THEN3180 disp_n = array_position + ( m_global_start(j+1,i) - 1 ) * wp3181 ELSE3182 CYCLE3183 ENDIF3184 ENDIF3185 3186 3187 IF ( disp + nr_bytes == disp_n ) THEN ! Contiguous block3188 nr_bytes_f = nr_bytes_f + nr_bytes3189 ELSE ! Read3190 #if defined( __parallel )3191 CALL MPI_FILE_SEEK( fhs, disp_f, MPI_SEEK_SET, ierr )3192 nr_words = nr_bytes_f / wp3193 CALL MPI_FILE_READ( fhs, data(m_start_index(j_f,i_f)), nr_words, MPI_REAL, status, &3194 ierr )3195 #else3196 !3197 !-- Use C_PTR here, because posix read does not work with indexed array3198 buf = C_LOC( data(m_start_index(j_f,i_f)) )3199 CALL posix_lseek( fh, disp_f )3200 CALL posix_read( fh, buf, nr_bytes_f )3201 #endif3202 disp_f = disp3203 nr_bytes_f = nr_bytes3204 i_f = i3205 j_f = j3206 ENDIF3207 3208 3205 ENDDO 3209 3206 ENDDO 3210 ENDIF 3211 3212 3213 ELSE 3214 3215 message_string = 'surface array "' // TRIM( name ) // '" not found in restart file' 3216 CALL message( 'rrd_mpi_io_surface', 'PA0722', 3, 2, 0, 6, 0 ) 3207 3208 3209 #else 3210 CALL posix_lseek( fh, array_position ) 3211 CALL posix_read( fh, array_out, SIZE(array_out) ) 3212 3213 DO i = nxl, nxr 3214 DO j = nys, nyn 3215 data(m_start_index(j,i):m_end_index(j,i)) = & 3216 array_out(e_start_index(j,i)+1:e_end_index(j,i)) 3217 ! 3218 !-- TODO: Test can be removed later. 3219 IF ( e_end_index(j,i)-e_start_index(j,i)+1 /= NINT(array_out(e_start_index(j,i))) )& 3220 THEN 3221 WRITE( 9, '(A,6I8)' ) 'Nr surface values does not match ', j, i, & 3222 e_start_index(j,i), e_end_index(j,i), & 3223 e_end_index(j,i)-e_start_index(j,i)+1, & 3224 NINT( array_out(e_start_index(j,i)) ) 3225 FLUSH( 9 ) 3226 CALL ABORT() 3227 ENDIF 3228 ENDDO 3229 ENDDO 3230 #endif 3231 ENDIF 3217 3232 3218 3233 ENDIF … … 3226 3241 INTEGER(iwp) :: i !< 3227 3242 INTEGER(iwp) :: ie !< 3228 #if defined( __parallel )3229 INTEGER(iwp) :: ierr !<3230 #endif3231 3243 INTEGER(iwp) :: is !< 3232 3244 INTEGER(iwp) :: i_remote !< … … 3241 3253 INTEGER(KIND=MPI_ADDRESS_KIND) :: rem_offs !< 3242 3254 #else 3243 INTEGER(idp) :: rem_offs 3244 #endif 3245 3246 LOGICAL :: write_done !< 3247 3248 3249 ! 3250 !-- In the current version, there is only 1 value per grid cell allowed. 3251 !-- In this special case, the cyclical repetition can be done with the same method as for 2d-real 3252 !-- array. 3255 INTEGER(idp) :: rem_offs !< 3256 #endif 3257 3258 REAL(wp), DIMENSION(:), ALLOCATABLE :: c_data !< 3259 3260 3261 ! 3262 !-- ATTENTION: This version allows only 1 surface element per grid cell. 3263 ! 3264 !-- Activate grid of the smaller prerun, i.e. grid variables like nxl, nxr, nys, nyn and other 3265 !-- values are set according to the prerun settings. 
3253 3266 CALL prerun_grid%activate_grid_from_this_class() 3254 3267 3255 3268 IF ( pe_active_for_read ) THEN 3256 rmabuf_2d = -1.0 3269 3270 IF ( MAXVAL( m_end_index ) <= 0 ) THEN 3271 CALL mainrun_grid%activate_grid_from_this_class() 3272 IF ( debug_output ) THEN 3273 CALL debug_message( 'PE inactive for reading restart or prerun data', 'start' ) 3274 ENDIF 3275 RETURN 3276 ENDIF 3277 3278 ALLOCATE( c_data(MAXVAL( m_end_index )) ) 3279 3280 ! 3281 !-- Recursive CALL of rrd_mpi_io_surface. 3282 !-- rrd_mpi_io_surface is called with cyclic_fill_mode = .FALSE. on the smaller prerun grid. 3283 cyclic_fill_mode = .FALSE. 3284 CALL rrd_mpi_io_surface( name, c_data ) 3285 cyclic_fill_mode = .TRUE. 3286 3257 3287 DO i = nxl, nxr 3258 3288 DO j = nys, nyn 3259 3260 IF ( m_global_start(j,i) > 0 ) THEN 3261 disp = array_position+(m_global_start(j,i)-1) * wp 3262 nr_words = m_end_index(j,i)-m_start_index(j,i)+1 3263 nr_bytes = nr_words * wp 3264 ENDIF 3265 IF ( disp >= 0 .AND. disp_f == -1 ) THEN ! First entry 3266 disp_f = disp 3267 nr_bytes_f = 0 3268 write_done = .TRUE. 3269 ENDIF 3270 IF( write_done ) THEN 3271 i_f = i 3272 j_f = j 3273 write_done = .FALSE. 3274 ENDIF 3275 3276 IF ( j == nyn .AND. i == nxr ) THEN ! Last entry 3277 disp_n = -1 3278 IF ( nr_bytes > 0 ) THEN 3279 nr_bytes_f = nr_bytes_f+nr_bytes 3280 ENDIF 3281 ELSEIF ( j == nyn ) THEN ! Next x 3282 IF ( m_global_start(nys,i+1) > 0 .AND. disp > 0 ) THEN 3283 disp_n = array_position + ( m_global_start(nys,i+1) - 1 ) * wp 3284 ELSE 3285 CYCLE 3286 ENDIF 3287 ELSE 3288 IF ( m_global_start(j+1,i) > 0 .AND. disp > 0 ) THEN 3289 disp_n = array_position + ( m_global_start(j+1,i) - 1 ) * wp 3290 ELSE 3291 CYCLE 3292 ENDIF 3293 ENDIF 3294 3295 3296 IF ( disp + nr_bytes == disp_n ) THEN ! Contiguous block 3297 nr_bytes_f = nr_bytes_f + nr_bytes 3298 ELSE ! Read 3299 #if defined( __parallel ) 3300 CALL MPI_FILE_SEEK( fhs, disp_f, MPI_SEEK_SET, ierr ) 3301 nr_words = nr_bytes_f / wp 3302 CALL MPI_FILE_READ( fhs, rmabuf_2d(j_f,i_f), nr_words, MPI_REAL, status, ierr ) 3303 #else 3304 CALL posix_lseek( fh, disp_f ) 3305 CALL posix_read( fh, rmabuf_2d(j_f:,i_f:), nr_bytes_f ) 3306 #endif 3307 3308 disp_f = disp 3309 nr_bytes_f = nr_bytes 3310 write_done = .TRUE. 3311 ENDIF 3312 3289 rmabuf_2d(j,i) = c_data(c_start_index(j,i)) 3313 3290 ENDDO 3314 3291 ENDDO 3315 3292 3316 3293 ENDIF 3317 3294 ! 3295 !-- Activate grid of the mainrun, i.e. grid variables like nxl, nxr, nys, nyn and other values 3296 !-- are set according to the mainrun settings. 3318 3297 CALL mainrun_grid%activate_grid_from_this_class() 3319 3298 … … 3324 3303 #endif 3325 3304 3326 IF ( .NOT. pe_active_for_read ) THEN 3327 3328 is = nxl 3329 ie = nxr 3330 js = nys 3331 je = nyn 3332 3333 ELSE 3334 3335 is = nxl 3336 ie = nxr 3337 js = prerun_grid%nys+1 3338 je = nyn 3339 3340 DO i = is, ie 3341 DO j = js, je 3342 i_remote = MOD(i,nx_on_file+1) 3343 j_remote = MOD(j,ny_on_file+1) 3344 rem_pe = remote_pe(i_remote,j_remote) 3345 rem_offs = rma_offset(i_remote,j_remote) 3346 nval = 1 3347 3348 #if defined( __parallel ) 3349 IF ( rem_pe /= myid ) THEN 3350 CALL MPI_GET( data(m_start_index(j,i)), nval, MPI_REAL, rem_pe, rem_offs, nval, & 3351 MPI_REAL, rmawin_2d, ierr) 3352 ELSE 3353 data(m_start_index(j,i)) = rmabuf_2d(j_remote,i_remote) 3354 ENDIF 3355 #else 3356 data(m_start_index(j,i)) = array_2d(i_remote,j_remote) 3357 #endif 3358 ENDDO 3359 ENDDO 3360 is = prerun_grid%nxr+1 3361 ie = nxr 3362 js = nys 3363 je = nyn 3364 3365 ENDIF 3305 ! 
3306 !-- After reading surface data on the small grid, map these data in a cyclic way to all respective 3307 !-- grid points of the main run. 3308 is = nxl 3309 ie = nxr 3310 js = nys 3311 je = nyn 3366 3312 3367 3313 DO i = is, ie 3368 3314 DO j = js, je 3369 i_remote = MOD( i,nx_on_file+1)3370 j_remote = MOD( j,ny_on_file+1)3315 i_remote = MOD( i, nx_on_file+1 ) 3316 j_remote = MOD( j, ny_on_file+1 ) 3371 3317 rem_pe = remote_pe(i_remote,j_remote) 3372 3318 rem_offs = rma_offset(i_remote,j_remote) … … 3375 3321 #if defined( __parallel ) 3376 3322 IF ( rem_pe /= myid ) THEN 3377 CALL MPI_GET( data( m_start_index(j,i)), nval, MPI_REAL, rem_pe, rem_offs, nval, &3323 CALL MPI_GET( data(o_start_index(j,i)), nval, MPI_REAL, rem_pe, rem_offs, nval, & 3378 3324 MPI_REAL, rmawin_2d, ierr) 3379 3325 ELSE 3380 data( m_start_index(j,i)) = rmabuf_2d(j_remote,i_remote)3326 data(o_start_index(j,i)) = rmabuf_2d(j_remote,i_remote) 3381 3327 ENDIF 3382 3328 #else 3383 data( m_start_index(j,i)) = array_2d(i_remote,j_remote)3329 data(o_start_index(j,i)) = array_2d(i_remote,j_remote) 3384 3330 #endif 3385 3331 ENDDO … … 3391 3337 CALL MPI_WIN_FENCE( 0, rmawin_2d, ierr ) 3392 3338 #endif 3339 3340 IF ( ALLOCATED( c_data ) ) DEALLOCATE( c_data ) 3393 3341 3394 3342 END SUBROUTINE rrd_mpi_io_surface_cyclic_fill … … 3539 3487 array_position = prt_nr_bytes 3540 3488 3541 write(9,*) 'array_position after particle ',array_position,prt_nr_bytes,rs3542 3543 3489 DEALLOCATE( prt_data ) 3544 3490 … … 3556 3502 IMPLICIT NONE 3557 3503 3558 CHARACTER(LEN=*), INTENT(IN) :: name !< 3559 3560 #if defined( __parallel ) 3561 INTEGER(KIND=rd_offset_kind) :: disp !< 3562 #endif 3563 INTEGER(iwp), OPTIONAL :: first_index !< 3564 #if defined( __parallel ) 3565 INTEGER(iwp) :: i !< 3566 #endif 3567 INTEGER(iwp) :: lo_first_index !< 3568 INTEGER(KIND=rd_offset_kind) :: offset !< 3569 3570 #if defined( __parallel ) 3571 INTEGER, DIMENSION(rd_status_size) :: status !< 3572 #endif 3573 3574 REAL(wp), INTENT(IN), DIMENSION(:), TARGET :: data !< 3575 3576 3577 offset = 0 3504 CHARACTER(LEN=*), INTENT(IN) :: name !< 3505 3506 INTEGER(iwp), OPTIONAL :: first_index !< 3507 INTEGER(iwp) :: i !< 3508 INTEGER(iwp) :: j !< 3509 INTEGER(iwp) :: lo_first_index !< 3510 #if defined( __parallel ) 3511 INTEGER(iwp) :: buf_start !< 3512 INTEGER(iwp) :: ie !< 3513 INTEGER(iwp) :: is !< 3514 INTEGER(iwp) :: ind_gb !< 3515 INTEGER(iwp) :: ind_out !< 3516 INTEGER(iwp) :: n !< 3517 INTEGER(iwp) :: n_trans !< 3518 #endif 3519 3520 #if defined( __parallel ) 3521 INTEGER(KIND=MPI_ADDRESS_KIND) :: disp !< displacement in RMA window 3522 INTEGER(KIND=rd_offset_kind) :: offset !< 3523 3524 INTEGER(iwp), DIMENSION(0:numprocs-1) :: lo_index !< 3525 INTEGER(iwp), DIMENSION(rd_status_size) :: status !< 3526 #endif 3527 3528 REAL(wp), INTENT(IN), DIMENSION(:), TARGET :: data !< 3529 #if defined( __parallel ) 3530 REAL(wp), DIMENSION(:), ALLOCATABLE :: get_buffer !< 3531 #endif 3532 3533 3578 3534 lo_first_index = 1 3579 3535 … … 3581 3537 lo_first_index = first_index 3582 3538 ENDIF 3539 3583 3540 ! 
3584 3541 !-- In case of 2d-data, name is written only once … … 3586 3543 3587 3544 IF ( header_array_index == max_nr_arrays ) THEN 3588 STOP '+++ maximum number of 2d/3d-array entries in restart file header exceeded' 3545 message_string = 'maximum number of 2d/3d-array entries in restart file header exceeded' 3546 CALL message( 'wrd_mpi_io_surface', 'PA0585', 1, 2, 0, 6, 0 ) 3589 3547 ENDIF 3590 3548 … … 3596 3554 3597 3555 #if defined( __parallel ) 3598 IF ( sm_io%is_sm_active() ) THEN 3599 DO i = 1, nr_val 3600 array_1d(i+local_start) = data(i) 3556 offset = 0 3557 3558 ALLOCATE( get_buffer(SUM( transfer_index(4,:) )) ) 3559 ! 3560 !-- Copy from input array (data) to RMA window to allow the target PEs to get the appropiate data. 3561 !-- At this point, a dummy surface element is added. This makes sure that every x-y grid cell owns 3562 !-- at least one surface element. This way, bookkeeping becomes much easier. 3563 lo_index = thread_values 3564 DO i = nxl, nxr 3565 DO j = nys, nyn 3566 is = lo_index(target_thread(j,i)) + 1 3567 ie = is + m_end_index(j,i) - m_start_index(j,i) 3568 ! 3569 !-- Store number of surface elements in dummy additional surface element 3570 array_1d(is-1) = e_end_index(j,i) - e_start_index(j,i) + 1 3571 array_1d(is:ie) = data(m_start_index(j,i):m_end_index(j,i)) 3572 lo_index(target_thread(j,i)) = lo_index(target_thread(j,i)) + & 3573 e_end_index(j,i) - e_start_index(j,i) + 1 3601 3574 ENDDO 3602 ELSE 3603 ! array_1d => data !kk Did not work in all cases why??? 3604 ALLOCATE( array_1d( SIZE( data ) ) ) 3605 array_1d = data 3606 ENDIF 3607 3608 CALL sm_io%sm_node_barrier() ! Has no effect if I/O on limited number of cores is inactive 3575 ENDDO 3576 ! 3577 !-- On target PE, get data from source PEs which are assigned for output on this PE. 3578 CALL MPI_WIN_FENCE( 0, win_surf, ierr ) 3579 3580 buf_start = 1 3581 DO n = 0, numprocs-1 3582 n_trans = transfer_index(4,n) 3583 IF ( n_trans > 0 ) THEN 3584 disp = transfer_index(3,n) - 1 3585 CALL MPI_GET( get_buffer(buf_start), n_trans, MPI_REAL, n, disp, n_trans, MPI_REAL, & 3586 win_surf, ierr ) 3587 buf_start = buf_start + n_trans 3588 ENDIF 3589 ENDDO 3590 3591 CALL MPI_WIN_FENCE( 0, win_surf, ierr ) 3592 ! 3593 !-- Copy data to output buffer. Here, the outpuf buffer matches the indices global_start and 3594 !-- global_end. 3595 ind_gb = 1 3596 DO i = 1, SIZE( local_indices, 2 ) 3597 ind_out = local_indices(1,i) 3598 DO j = 1, local_indices(2,i) 3599 array_out(ind_out) = get_buffer(ind_gb) 3600 ind_out = ind_out+1 3601 ind_gb = ind_gb+1 3602 ENDDO 3603 ENDDO 3604 3605 DEALLOCATE( get_buffer ) 3606 3607 ! 3608 !-- Write data to disk. 3609 CALL sm_io%sm_node_barrier() ! 
has no effect if I/O on limited number of cores is inactive 3609 3610 IF ( sm_io%iam_io_pe ) THEN 3610 IF ( all_pes_write ) THEN 3611 CALL MPI_FILE_SET_VIEW( fh, array_position, MPI_REAL, ft_surf, 'native', MPI_INFO_NULL, & 3612 ierr ) 3613 CALL MPI_FILE_WRITE_ALL( fh, array_1d, nr_iope, MPI_REAL, status, ierr ) 3614 ELSE 3615 CALL MPI_FILE_SET_VIEW( fh, offset, MPI_BYTE, MPI_BYTE, 'native', MPI_INFO_NULL, ierr ) 3616 IF ( nr_val > 0 ) THEN 3617 disp = array_position + 8 * ( glo_start - 1 ) 3618 CALL MPI_FILE_SEEK( fh, disp, MPI_SEEK_SET, ierr ) 3619 CALL MPI_FILE_WRITE( fh, array_1d, nr_iope, MPI_REAL, status, ierr ) 3620 ENDIF 3621 ENDIF 3611 CALL MPI_FILE_SET_VIEW( fh, array_position, MPI_REAL, ft_surf, 'native', MPI_INFO_NULL, & 3612 ierr ) 3613 CALL MPI_FILE_WRITE_ALL( fh, array_out, SIZE( array_out ), MPI_REAL, status, ierr ) 3622 3614 ENDIF 3623 3615 CALL sm_io%sm_node_barrier() 3624 IF( .NOT. sm_io%is_sm_active() ) DEALLOCATE( array_1d ) 3625 #else 3616 #else 3617 DO i = nxl, nxr 3618 DO j = nys, nyn 3619 array_out(e_start_index(j,i)) = e_end_index(j,i) - e_start_index(j,i) + 1 3620 array_out(e_start_index(j,i)+1:e_end_index(j,i)) = & 3621 data(m_start_index(j,i):m_end_index(j,i)) 3622 ENDDO 3623 ENDDO 3624 3626 3625 CALL posix_lseek( fh, array_position ) 3627 CALL posix_write( fh, data, nr_val ) 3628 #endif 3626 CALL posix_write( fh, array_out, SIZE(array_out) ) 3627 #endif 3628 3629 3629 array_position = array_position + total_number_of_surface_values * wp 3630 3630 3631 ! IF ( lo_first_index == 1 ) THEN3632 ! IF ( debug_level >= 2 .AND. nr_val > 0 ) WRITE(9,*) 'w_surf_1 ', TRIM( name ), ' ', nr_val, SUM( data(1:nr_val) )3633 ! ELSE3634 ! IF ( debug_level >= 2 .AND. nr_val > 0 ) WRITE(9,*) 'w_surf_n ', TRIM( name ), ' ', &3635 ! lo_first_index, nr_val, SUM( data(1:nr_val) )3636 ! ENDIF3637 3638 3631 END SUBROUTINE wrd_mpi_io_surface 3639 3640 3632 3641 3633 … … 3690 3682 IF ( wr_flag .AND. sm_io%iam_io_pe ) THEN 3691 3683 3692 tgh%nr_int = header_int_index - 1 3693 tgh%nr_char = header_char_index - 1 3694 tgh%nr_real = header_real_index - 1 3695 tgh%nr_arrays = header_array_index - 1 3696 tgh%total_nx = iog%nx + 1 3697 tgh%total_ny = iog%ny + 1 3684 tgh%nr_int = header_int_index - 1 3685 tgh%nr_char = header_char_index - 1 3686 tgh%nr_real = header_real_index - 1 3687 tgh%nr_arrays = header_array_index - 1 3688 tgh%total_nx = iog%nx + 1 3689 tgh%total_ny = iog%ny + 1 3690 tgh%pes_along_x = pdims(1) 3691 tgh%pes_along_y = pdims(2) 3698 3692 IF ( include_total_domain_boundaries ) THEN ! Not sure, if LOGICAL interpretation is the same for all compilers, 3699 3693 tgh%i_outer_bound = 1 ! therefore store as INTEGER in general header … … 3802 3796 !-- Close MPI-IO files 3803 3797 #if defined( __parallel ) 3804 !3805 !-- Restart file has been opened with comm2d3806 IF ( fhs /= -1 ) THEN3807 CALL MPI_FILE_CLOSE( fhs, ierr )3808 ENDIF3809 3798 ! 3810 3799 !-- Free RMA windows … … 3816 3805 #endif 3817 3806 3818 IF (.NOT. pe_active_for_read ) RETURN 3807 IF ( ALLOCATED( e_start_index ) ) DEALLOCATE( e_start_index ) 3808 IF ( ALLOCATED( e_end_index ) ) DEALLOCATE( e_end_index ) 3809 IF ( ALLOCATED( m_start_index ) ) DEALLOCATE( m_start_index ) 3810 IF ( ALLOCATED( m_end_index ) ) DEALLOCATE( m_end_index ) 3811 IF ( ALLOCATED( target_thread ) ) DEALLOCATE( target_thread ) 3812 IF ( ALLOCATED( thread_index ) ) DEALLOCATE( thread_index ) 3813 IF ( ALLOCATED( thread_values ) ) DEALLOCATE( thread_values ) 3814 IF ( ALLOCATED( transfer_index ) ) DEALLOCATE( transfer_index ) 3815 3816 IF ( .NOT. 
pe_active_for_read ) RETURN 3819 3817 ! 3820 3818 !-- TODO: better explain the following message … … 3860 3858 !> data is not time critical (data size is comparably small), it will be read by all cores. 3861 3859 !--------------------------------------------------------------------------------------------------! 3862 SUBROUTINE rd_mpi_io_surface_filetypes( start_index, end_index, data_to_write, global_start ) 3860 RECURSIVE SUBROUTINE rd_mpi_io_surface_filetypes( start_index, end_index, data_to_write, & 3861 global_start, global_end ) 3863 3862 3864 3863 IMPLICIT NONE 3865 3864 3866 INTEGER(iwp) :: i !< loop index 3867 INTEGER(iwp) :: j !< loop index 3868 INTEGER(KIND=rd_offset_kind) :: offset !< 3869 3870 INTEGER(iwp), DIMENSION(1) :: dims1 !< 3871 INTEGER(iwp), DIMENSION(1) :: lize1 !< 3872 INTEGER(iwp), DIMENSION(1) :: start1 !< 3873 3874 INTEGER(iwp), DIMENSION(0:numprocs-1) :: all_nr_val !< number of values for all PEs 3875 INTEGER(iwp), DIMENSION(0:numprocs-1) :: lo_nr_val !< local number of values in x and y direction 3876 3877 3878 INTEGER, INTENT(INOUT), DIMENSION(nys:nyn,nxl:nxr) :: end_index !< 3879 INTEGER, INTENT(OUT), DIMENSION(nys:nyn,nxl:nxr) :: global_start !< 3880 INTEGER, INTENT(INOUT), DIMENSION(nys:nyn,nxl:nxr) :: start_index !< 3881 3882 LOGICAL, INTENT(OUT) :: data_to_write !< returns, if surface data have to be written 3883 3884 ! 3885 !-- Actions during reading 3886 IF ( rd_flag ) THEN 3887 ! 3888 !-- Set start index and end index for the mainrun grid. 3889 !-- ATTENTION: This works only for horizontal surfaces with one vale per grid cell!!! 3890 IF ( cyclic_fill_mode ) THEN 3891 DO i = nxl, nxr 3892 DO j = nys, nyn 3893 start_index (j,i) = (i-nxl) * nny + j - nys + 1 3894 end_index (j,i) = start_index(j,i) 3895 ENDDO 3865 INTEGER(iwp) :: e_lo_start !< 3866 INTEGER(iwp) :: i !< loop index 3867 INTEGER(iwp) :: j !< loop index 3868 INTEGER(iwp) :: index_offset !< 3869 INTEGER(iwp) :: last_end_index !< 3870 INTEGER(iwp) :: lo_start !< 3871 INTEGER(iwp) :: nr_surfcells_pe !< 3872 INTEGER(iwp) :: rest_cells_pe !< 3873 INTEGER(iwp) :: rest_bound !< 3874 #if defined( __parallel ) 3875 INTEGER(iwp) :: io_end_index !< 3876 INTEGER(iwp) :: io_start_index !< 3877 INTEGER(iwp) :: n !< loop index 3878 INTEGER(iwp) :: nr_previous !< 3879 #endif 3880 3881 INTEGER(iwp), DIMENSION(0:numprocs-1,2) :: nr_surfcells_all_s !< 3882 INTEGER(iwp), DIMENSION(0:numprocs-1,2) :: nr_surfcells_all_r !< 3883 #if defined( __parallel ) 3884 INTEGER(iwp), DIMENSION(1) :: dims1 !< global dimension for MPI_TYPE_CREATE_SUBARRAY 3885 INTEGER(iwp), DIMENSION(1) :: lsize1 !< local size for MPI_TYPE_CREATE_SUBARRAY 3886 INTEGER(iwp), DIMENSION(0:numprocs-1) :: nr_cells_to_thread !< 3887 INTEGER(iwp), DIMENSION(0:pdims(1)) :: nr_surf_cells_x !< 3888 INTEGER(iwp), DIMENSION(0:pdims(1)) :: nr_surf_cells_x_s !< 3889 INTEGER(iwp), DIMENSION(0:numprocs-1) :: nr_values_to_thread !< 3890 INTEGER(iwp), DIMENSION(1) :: start1 !< start index for MPI_TYPE_CREATE_SUBARRAY 3891 INTEGER(iwp), DIMENSION(nxl:nxr) :: sum_y !< 3892 #endif 3893 3894 INTEGER(iwp), INTENT(INOUT), DIMENSION(nys:nyn,nxl:nxr) :: end_index !< local end indx 3895 INTEGER(iwp), INTENT(INOUT), DIMENSION(nys:nyn,nxl:nxr) :: global_start !< global start index 3896 INTEGER(iwp), INTENT(INOUT), DIMENSION(nys:nyn,nxl:nxr) :: global_end !< global end index 3897 INTEGER(iwp), INTENT(INOUT), DIMENSION(nys:nyn,nxl:nxr) :: start_index !< local start index 3898 #if defined( __parallel ) 3899 INTEGER(iwp), DIMENSION(0:myidy,nxl:nxr) :: nr_previous_y !< 3900 
INTEGER(iwp), DIMENSION(0:pdims(2),nxl:nxr) :: nr_surf_cells_y !< 3901 INTEGER(iwp), DIMENSION(0:pdims(2),nxl:nxr) :: nr_surf_cells_y_s !< 3902 INTEGER(iwp), DIMENSION(4,0:numprocs-1) :: transfer_index_s !< 3903 #endif 3904 3905 LOGICAL, INTENT(OUT) :: data_to_write !< returns .TRUE., if surface data have been written 3906 LOGICAL :: only_dummy_values !< only dummy values, i.e. no data to write 3907 3908 3909 ! 3910 !-- Clean up previous calls. 3911 #if defined( __parallel ) 3912 IF ( win_surf /= -1 ) THEN 3913 CALL MPI_WIN_FREE( win_surf, ierr ) 3914 DEALLOCATE( array_1d ) 3915 win_surf = -1 3916 ENDIF 3917 IF ( ft_surf /= -1 .AND. sm_io%iam_io_pe ) THEN 3918 CALL MPI_TYPE_FREE( ft_surf, ierr ) 3919 ENDIF 3920 ft_surf = -1 3921 IF ( sm_io%is_sm_active() ) THEN 3922 IF ( win_out /= -1 ) THEN 3923 CALL MPI_WIN_FREE( win_out, ierr ) 3924 win_out = -1 3925 ENDIF 3926 ELSE 3927 IF ( ASSOCIATED( array_out ) ) DEALLOCATE( array_out ) 3928 ENDIF 3929 #else 3930 IF ( ASSOCIATED( array_out ) ) DEALLOCATE( array_out ) 3931 #endif 3932 3933 IF ( cyclic_fill_mode ) THEN 3934 CALL cyclic_fill_surface_filetype 3935 RETURN 3936 ELSE 3937 IF ( .NOT. ALLOCATED( e_end_index ) ) ALLOCATE( e_end_index(nys:nyn,nxl:nxr) ) 3938 IF ( .NOT. ALLOCATED( e_start_index ) ) ALLOCATE( e_start_index(nys:nyn,nxl:nxr) ) 3939 IF ( .NOT. ALLOCATED( m_end_index ) ) ALLOCATE( m_end_index(nys:nyn,nxl:nxr) ) 3940 IF ( .NOT. ALLOCATED( m_start_index ) ) ALLOCATE( m_start_index(nys:nyn,nxl:nxr) ) 3941 IF ( .NOT. ALLOCATED( target_thread ) ) ALLOCATE( target_thread(nys:nyn,nxl:nxr) ) 3942 IF ( .NOT. ALLOCATED( thread_index ) ) ALLOCATE( thread_index(0:numprocs-1) ) 3943 IF ( .NOT. ALLOCATED( thread_values ) ) ALLOCATE( thread_values(0:numprocs-1) ) 3944 IF ( .NOT. ALLOCATED( transfer_index ) ) ALLOCATE( transfer_index(4,0:numprocs-1) ) 3945 ENDIF 3946 3947 IF ( wr_flag) THEN 3948 ! 3949 !-- Add one dummy value at every grid box. 3950 !-- This allows to use MPI_FILE_WRITE_ALL and MPI_FILE_READ_ALL with subarray file type. 3951 index_offset = 0 3952 last_end_index = 0 3953 DO i = nxl, nxr 3954 DO j = nys, nyn 3955 e_start_index(j,i) = start_index (j,i) + index_offset 3956 IF ( end_index (j,i) - start_index(j,i) < 0 ) THEN 3957 e_end_index (j,i) = last_end_index+1 3958 last_end_index = last_end_index+1 3959 ELSE 3960 e_end_index (j,i) = end_index(j,i) + index_offset + 1 3961 last_end_index = e_end_index (j,i) 3962 ENDIF 3963 index_offset = index_offset + 1 3964 ENDDO 3965 ENDDO 3966 #if defined( __parallel ) 3967 ! 3968 !-- Compute indices for global, PE independent 1-d surface element array. 3969 nr_surf_cells_y_s = 0 3970 ! 3971 !-- Count number of surface elements in y-direction for every x. 3972 DO i = nxl, nxr 3973 nr_surf_cells_y_s(myidy,i) = SUM( e_end_index (:,i) - e_start_index (:,i) + 1 ) 3974 ENDDO 3975 ! 3976 !-- Distribute these values to all PEs along y. 3977 CALL MPI_ALLREDUCE( nr_surf_cells_y_s, nr_surf_cells_y, SIZE( nr_surf_cells_y ), & 3978 MPI_INTEGER, MPI_SUM, comm1dy, ierr ) 3979 ! 3980 !-- Sum all surface elements along y for individual x PEs 3981 nr_surf_cells_x_s = 0 3982 nr_surf_cells_x_s(myidx) = SUM( nr_surf_cells_y ) 3983 ! 3984 !-- Distribute to all PEs along x. 3985 CALL MPI_ALLREDUCE( nr_surf_cells_x_s, nr_surf_cells_x, SIZE( nr_surf_cells_x ), & 3986 MPI_INTEGER, MPI_SUM, comm1dx, ierr ) 3987 DO i = nxl, nxr 3988 nr_previous_y(:,i) = 0 3989 DO n = 1, myidy 3990 nr_previous_y(n,i) = nr_previous_y(n-1,i) + nr_surf_cells_y(n-1,i) 3896 3991 ENDDO 3897 ENDIF 3898 3899 IF ( .NOT. 
ALLOCATED( m_start_index ) ) ALLOCATE( m_start_index(nys:nyn,nxl:nxr) ) 3900 IF ( .NOT. ALLOCATED( m_end_index ) ) ALLOCATE( m_end_index(nys:nyn,nxl:nxr) ) 3901 IF ( .NOT. ALLOCATED( m_global_start ) ) ALLOCATE( m_global_start(nys:nyn,nxl:nxr) ) 3902 ! 3903 !-- Save arrays for later reading 3904 m_start_index = start_index 3905 m_end_index = end_index 3906 m_global_start = global_start 3907 nr_val = MAXVAL( end_index ) 3908 3909 ENDIF 3910 3911 IF ( .NOT. pe_active_for_read ) RETURN 3912 3913 IF ( cyclic_fill_mode ) CALL prerun_grid%activate_grid_from_this_class() 3914 3915 offset = 0 3916 lo_nr_val= 0 3917 lo_nr_val(myid) = MAXVAL( end_index ) 3918 #if defined( __parallel ) 3919 CALL MPI_ALLREDUCE( lo_nr_val, all_nr_val, numprocs, MPI_INTEGER, MPI_SUM, comm2d, ierr ) 3920 IF ( ft_surf /= -1 .AND. sm_io%iam_io_pe ) THEN 3921 CALL MPI_TYPE_FREE( ft_surf, ierr ) ! If set, free last surface filetype 3922 ENDIF 3923 3924 IF ( win_surf /= -1 ) THEN 3925 IF ( sm_io%is_sm_active() ) THEN 3926 CALL MPI_WIN_FREE( win_surf, ierr ) 3927 ENDIF 3928 win_surf = -1 3929 ENDIF 3930 3931 IF ( sm_io%is_sm_active() .AND. rd_flag ) THEN 3932 IF ( fhs == -1 ) THEN 3933 CALL MPI_FILE_OPEN( comm2d, TRIM( io_file_name ), MPI_MODE_RDONLY, MPI_INFO_NULL, fhs, & 3934 ierr ) 3935 ENDIF 3992 ENDDO 3993 3994 sum_y(nxl) = SUM( nr_surf_cells_y(:,nxl) ) 3995 DO i = nxl, nxr 3996 IF ( i > nxl ) THEN 3997 sum_y(i) = sum_y(i-1) + SUM( nr_surf_cells_y(:,i) ) 3998 ENDIF 3999 ENDDO 4000 4001 nr_previous = 0 4002 IF ( myidx >= 1 ) THEN 4003 nr_previous = SUM(nr_surf_cells_x(0:myidx-1)) 4004 ENDIF 4005 4006 global_start(nys,nxl) = 1 + nr_previous + nr_previous_y(myidy,nxl) 4007 DO j = nys+1, nyn 4008 global_start(j,nxl) = global_start(j-1,nxl) + e_end_index(j-1,nxl) - & 4009 e_start_index(j-1,nxl) + 1 4010 ENDDO 4011 4012 DO i = nxl+1, nxr 4013 global_start(nys,i) = 1 + nr_previous + nr_previous_y(myidy,i) + sum_y(i-1) 4014 DO j = nys+1, nyn 4015 global_start(j,i) = global_start(j-1,i) + e_end_index(j-1,i) - e_start_index(j-1,i) + 1 4016 ENDDO 4017 ENDDO 4018 #else 4019 global_start = e_start_index 4020 #endif 4021 DO i = nxl, nxr 4022 DO j = nys, nyn 4023 global_end(j,i) = global_start(j,i) + e_end_index (j,i) - e_start_index (j,i) 4024 ENDDO 4025 ENDDO 4026 3936 4027 ELSE 3937 fhs = fh 3938 ENDIF 3939 #else 3940 all_nr_val(myid) = lo_nr_val(myid) 3941 #endif 3942 nr_val = lo_nr_val(myid) 4028 ! 4029 !-- In case of read, compute e_start_index and e_end_index for current processor grid. 4030 !-- This data contains one extra value for every i and j. 4031 e_lo_start = 1 4032 lo_start = 1 4033 DO i = nxl, nxr 4034 DO j = nys, nyn 4035 e_start_index(j,i) = e_lo_start 4036 e_end_index(j,i) = e_lo_start + global_end(j,i) - global_start(j,i) 4037 e_lo_start = e_lo_start + global_end(j,i) - global_start(j,i) + 1 4038 start_index(j,i) = lo_start 4039 end_index(j,i) = lo_start + global_end(j,i) - global_start(j,i) - 1 4040 lo_start = lo_start + global_end(j,i) - global_start(j,i) 4041 ENDDO 4042 ENDDO 4043 4044 ENDIF 4045 4046 nr_surfcells_all_s = 0 4047 nr_surfcells_all_s(myid,1) = MAXVAL( e_end_index ) ! 
don't split surface elements of one gridbox 4048 nr_surfcells_all_s(myid,2) = MAXVAL( e_end_index - e_start_index ) 4049 4050 #if defined( __parallel ) 4051 CALL MPI_ALLREDUCE( nr_surfcells_all_s, nr_surfcells_all_r, SIZE( nr_surfcells_all_s ), & 4052 MPI_INTEGER, MPI_SUM, comm2d, ierr ) 4053 #else 4054 nr_surfcells_all_r = nr_surfcells_all_s 4055 #endif 3943 4056 3944 4057 total_number_of_surface_values = 0 … … 3947 4060 glo_start = total_number_of_surface_values + 1 3948 4061 ENDIF 3949 total_number_of_surface_values = total_number_of_surface_values + all_nr_val(i)4062 total_number_of_surface_values = total_number_of_surface_values + nr_surfcells_all_r(i,1) 3950 4063 ENDDO 3951 3952 ! 3953 !-- Actions during reading 3954 IF ( rd_flag ) THEN 3955 3956 #if defined( __parallel ) 3957 CALL MPI_FILE_SET_VIEW( fhs, offset, MPI_BYTE, MPI_BYTE, 'native', MPI_INFO_NULL, ierr ) 3958 #endif 3959 ENDIF 3960 3961 IF ( cyclic_fill_mode ) CALL mainrun_grid%activate_grid_from_this_class() 3962 3963 ! 3964 !-- Actions during writing 3965 IF ( wr_flag ) THEN 3966 ! 3967 !-- Create surface filetype 3968 ft_surf = -1 3969 global_start = start_index + glo_start - 1 3970 3971 WHERE ( end_index < start_index ) 3972 global_start = -1 3973 ENDWHERE 3974 3975 #if defined( __parallel ) 3976 IF ( sm_io%is_sm_active() ) THEN 3977 IF ( sm_io%iam_io_pe ) THEN 3978 ! 3979 !-- Calculate number of values of all PEs of an I/O group 3980 nr_iope = 0 3981 DO i = myid, myid+sm_io%sh_npes-1 3982 nr_iope = nr_iope + all_nr_val(i) 4064 only_dummy_values = ( MAXVAL( nr_surfcells_all_r(:,2) ) <= 0 ) 4065 4066 ! 4067 !-- Compute indices of equally distributed surface elements. 4068 !-- Number of surface values scheduled for ouput on this PE: 4069 nr_surfcells_pe = total_number_of_surface_values / numprocs 4070 rest_cells_pe = MOD( total_number_of_surface_values, numprocs ) 4071 rest_bound = rest_cells_pe * ( nr_surfcells_pe + 1 ) 4072 m_start_index = start_index 4073 m_end_index = end_index 4074 4075 ! 4076 !-- Compute number of elements on source PE, which have to be send to the corresponding target PE. 4077 #if defined( __parallel ) 4078 nr_cells_to_thread = 0 4079 nr_values_to_thread = 0 4080 DO i = nxl, nxr 4081 DO j = nys, nyn 4082 IF ( rest_cells_pe == 0 ) THEN 4083 target_thread(j,i) = ( global_start(j,i) - 1 ) / nr_surfcells_pe 4084 ELSE 4085 IF ( global_start(j,i) <= rest_bound ) THEN 4086 target_thread(j,i) = ( global_start(j,i) - 1 ) / ( nr_surfcells_pe + 1 ) 4087 ELSE 4088 target_thread(j,i) = ( global_start(j,i) - rest_bound - 1 ) / nr_surfcells_pe 4089 target_thread(j,i) = target_thread(j,i) + rest_cells_pe 4090 ENDIF 4091 ! 4092 !-- TODO: Test output, to be removed later. 4093 IF ( target_thread(j,i) >= numprocs ) THEN 4094 WRITE( 9,'(A,8I8)' ) 'target_thread ', j, i, target_thread(j,i), & 4095 global_start(j,i) , nr_surfcells_pe 4096 FLUSH( 9 ) 4097 CALL MPI_ABORT( comm2d, 1, ierr ) 4098 ENDIF 4099 ENDIF 4100 nr_cells_to_thread(target_thread(j,i)) = nr_cells_to_thread(target_thread(j,i)) + 1 4101 nr_values_to_thread(target_thread(j,i)) = nr_values_to_thread(target_thread(j,i)) + & 4102 e_end_index(j,i) - e_start_index(j,i) + 1 4103 ENDDO 4104 ENDDO 4105 4106 ! 4107 !-- Compute start index in the transfer buffer on the source side for the corresponding target PE. 4108 thread_index(0) = 1 4109 thread_values(0) = 1 4110 DO n = 1, numprocs-1 4111 thread_index(n) = thread_index(n-1) + nr_cells_to_thread(n-1) 4112 thread_values(n) = thread_values(n-1) + nr_values_to_thread(n-1) 4113 ENDDO 4114 ! 
4115 !-- Buffer distribution on the source side. 4116 DO n = 0, numprocs-1 4117 transfer_index_s(1,n) = thread_index(n) 4118 transfer_index_s(2,n) = nr_cells_to_thread(n) 4119 transfer_index_s(3,n) = thread_values(n) 4120 transfer_index_s(4,n) = nr_values_to_thread(n) 4121 ENDDO 4122 4123 CALL MPI_ALLTOALL( transfer_index_s, 4, MPI_INTEGER, transfer_index, 4, MPI_INTEGER, comm2d, & 4124 ierr) 4125 ! 4126 !-- Buffer distribution on the target side side. 4127 CALL get_remote_indices() 4128 ! 4129 !-- Create surface element file type. 4130 IF ( total_number_of_surface_values > 0 .AND. .NOT. only_dummy_values) THEN 4131 data_to_write = .TRUE. 4132 ELSE 4133 data_to_write = .FALSE. 4134 ENDIF 4135 4136 CALL MPI_ALLREDUCE( global_end(nyn,nxr), dims1(1), 1, MPI_INTEGER, MPI_MAX, comm2d, ierr ) 4137 start1(1) = MINVAL( local_indices(1,:) ) - 1 4138 IF ( sm_io%is_sm_active() ) THEN 4139 CALL MPI_ALLREDUCE( SUM( local_indices(2,:) ), lsize1(1), 1, MPI_INTEGER, MPI_SUM, & 4140 sm_io%comm_shared, ierr ) 4141 ELSE 4142 lsize1(1) = SUM( local_indices(2,:) ) 4143 ENDIF 4144 4145 IF ( sm_io%iam_io_pe ) THEN 4146 IF ( total_number_of_surface_values > 0 ) THEN 4147 CALL MPI_TYPE_CREATE_SUBARRAY( 1, dims1, lsize1, start1, MPI_ORDER_FORTRAN, MPI_REAL, & 4148 ft_surf, ierr ) 4149 CALL MPI_TYPE_COMMIT( ft_surf, ierr ) 4150 ENDIF 4151 ENDIF 4152 ! 4153 !-- Allocate rma window to supply surface data to other PEs. 4154 CALL rd_alloc_rma_mem( array_1d, SUM( nr_values_to_thread ), win_surf ) 4155 ! 4156 !-- Allocate shared array on IO-PE to supply data for MPI-IO (write or read). 4157 IF ( sm_io%is_sm_active() ) THEN 4158 IF ( sm_io%iam_io_pe ) THEN 4159 io_start_index = start1(1) + 1 4160 io_end_index = start1(1) + lsize1(1) 4161 ENDIF 4162 CALL MPI_BCAST( io_start_index, 1, MPI_INTEGER, 0, sm_io%comm_shared, ierr ) 4163 CALL MPI_BCAST( io_end_index, 1, MPI_INTEGER, 0, sm_io%comm_shared, ierr ) 4164 CALL sm_io%sm_allocate_shared( array_out, io_start_index, io_end_index, win_out ) 4165 ELSE 4166 ALLOCATE( array_out(start1(1)+1:start1(1)+lsize1(1)) ) 4167 ENDIF 4168 #else 4169 IF ( total_number_of_surface_values > 0 .AND. .NOT. only_dummy_values ) THEN 4170 data_to_write = .TRUE. 4171 ELSE 4172 data_to_write = .FALSE. 4173 ENDIF 4174 ALLOCATE( array_out(1:total_number_of_surface_values) ) 4175 #endif 4176 4177 CONTAINS 4178 4179 SUBROUTINE cyclic_fill_surface_filetype 4180 4181 INTEGER(iwp) :: i !< loop index 4182 INTEGER(iwp) :: j !< loop index 4183 4184 4185 IF ( .NOT. ALLOCATED( o_start_index ) ) ALLOCATE( o_start_index(nys:nyn,nxl:nxr) ) 4186 IF ( .NOT. ALLOCATED( o_end_index ) ) ALLOCATE( o_end_index(nys:nyn,nxl:nxr) ) 4187 4188 lo_start = 1 4189 DO i = nxl, nxr 4190 DO j = nys, nyn 4191 o_start_index(j,i) = lo_start 4192 o_end_index(j,i) = lo_start 4193 lo_start = lo_start + 1 4194 ENDDO 4195 ENDDO 4196 start_index = o_start_index 4197 end_index = o_end_index 4198 4199 IF ( MAXVAL( global_end-global_start ) > 1 ) THEN 4200 message_string = 'cylic-fill method does not allow more than one surface element ' // & 4201 'per grid box' 4202 CALL message( 'cyclic_fill_surface_filetype', 'PA0742', 1, 2, 0, 6, 0 ) 4203 ENDIF 4204 ! 4205 !-- Activate grid of the smaller prerun, i.e. grid variables like nxl, nxr, nys, nyn and others 4206 !-- are set according to the prerun layout. 4207 CALL prerun_grid%activate_grid_from_this_class() 4208 4209 IF ( pe_active_for_read ) THEN 4210 4211 IF ( .NOT. ALLOCATED( c_global_start ) ) ALLOCATE( c_global_start(nys:nyn,nxl:nxr) ) 4212 IF ( .NOT. 
ALLOCATED( c_global_end ) ) ALLOCATE( c_global_end(nys:nyn,nxl:nxr) ) 4213 IF ( .NOT. ALLOCATED( c_start_index ) ) ALLOCATE( c_start_index(nys:nyn,nxl:nxr) ) 4214 IF ( .NOT. ALLOCATED( c_end_index ) ) ALLOCATE( c_end_index(nys:nyn,nxl:nxr) ) 4215 4216 DO i = nxl, nxr 4217 DO j = nys, nyn 4218 c_global_start(j,i) = global_start(j,i) 4219 c_global_end(j,i) = global_end(j,i) 3983 4220 ENDDO 3984 ELSE 3985 local_start = 0 3986 DO i = myid-sm_io%sh_rank, myid-1 3987 local_start = local_start + all_nr_val(i) 3988 ENDDO 4221 ENDDO 4222 ! 4223 !-- Recursive call of rd_mpi_io_surface_filetypes. 4224 !-- Prerun data are read, but they are treated as if they are mainrun data, just on a smaller 4225 !-- grid. 4226 cyclic_fill_mode = .FALSE. 4227 CALL rd_mpi_io_surface_filetypes( c_start_index, c_end_index, data_to_write, & 4228 c_global_start, c_global_end ) 4229 cyclic_fill_mode = .TRUE. 4230 4231 ENDIF 4232 ! 4233 !-- Activate grid of the mainrun, i.e. grid variables like nxl, nxr, nys, nyn and others 4234 !-- are set according to the mainrun layout. 4235 CALL mainrun_grid%activate_grid_from_this_class() 4236 4237 #if defined( __parallel ) 4238 CALL MPI_BCAST( data_to_write, 1, MPI_LOGICAL, 0, comm2d, ierr ) 4239 #endif 4240 4241 END SUBROUTINE cyclic_fill_surface_filetype 4242 4243 #if defined( __parallel ) 4244 ! 4245 !-- Get the indices of the surface elements inside the RMA window on the remote PE. 4246 !-- This information is required to fetch the surface element data on remote PEs 4247 !-- in rrd_mpi_io_surface and wrd_mpi_io_surface. 4248 SUBROUTINE get_remote_indices 4249 4250 INTEGER(iwp) :: buf_start !< 4251 INTEGER(iwp) :: i !< 4252 INTEGER(iwp) :: j !< 4253 INTEGER(iwp) :: n !< 4254 INTEGER(iwp) :: n_trans !< 4255 INTEGER(iwp) :: win_ind !< 4256 4257 INTEGER(KIND=MPI_ADDRESS_KIND) :: disp !< displacement in RMA window 4258 INTEGER(KIND=MPI_ADDRESS_KIND) :: winsize !< size of RMA window 4259 4260 INTEGER(iwp), DIMENSION(0:numprocs-1) :: lo_index !< 4261 4262 INTEGER(iwp), POINTER, DIMENSION(:,:) :: surf_val_index !< 4263 4264 4265 IF ( ALLOCATED( local_indices ) ) DEALLOCATE( local_indices ) 4266 ALLOCATE( local_indices(2,MAX( SUM( transfer_index(2,:) ), 2 ))) 4267 4268 local_indices(1,:) = 0 4269 local_indices(2,:) = 0 4270 4271 winsize = MAX( 2 * SUM( nr_cells_to_thread ), 2 ) 4272 4273 ALLOCATE( surf_val_index(2,winsize) ) 4274 winsize = winsize * iwp 4275 CALL MPI_WIN_CREATE( surf_val_index, winsize, iwp, MPI_INFO_NULL, comm2d, win_ind, ierr ) 4276 CALL MPI_WIN_FENCE( 0, win_ind, ierr ) 4277 4278 lo_index = thread_index 4279 DO i = nxl, nxr 4280 DO j = nys, nyn 4281 surf_val_index(1,lo_index(target_thread(j,i))) = global_start(j,i) 4282 surf_val_index(2,lo_index(target_thread(j,i))) = global_end(j,i) - global_start(j,i) & 4283 + 1 4284 lo_index(target_thread(j,i)) = lo_index(target_thread(j,i)) + 1 4285 ENDDO 4286 ENDDO 4287 4288 CALL MPI_WIN_FENCE( 0, win_ind, ierr ) 4289 4290 buf_start = 1 4291 DO n = 0, numprocs-1 4292 n_trans = transfer_index(2,n) 4293 IF ( n_trans > 0 ) THEN 4294 disp = 2 * ( transfer_index(1,n) - 1 ) 4295 CALL MPI_GET( local_indices(1,buf_start), 2*n_trans, MPI_INTEGER, n, disp, 2*n_trans, & 4296 MPI_INTEGER, win_ind, ierr ) 4297 buf_start = buf_start + n_trans 3989 4298 ENDIF 3990 ! 
3991 !-- Get the size of shared memory window on all PEs 3992 CALL MPI_BCAST( nr_iope, 1, MPI_INTEGER, 0, sm_io%comm_shared, ierr ) 3993 CALL sm_io%sm_allocate_shared( array_1d, 1, MAX( 1, nr_iope ), win_surf ) 3994 ELSE 3995 nr_iope = nr_val 3996 ENDIF 3997 #else 3998 nr_iope = nr_val 3999 #endif 4000 4001 ! 4002 !-- Check, if surface data exist on this PE 4003 data_to_write = .TRUE. 4004 IF ( total_number_of_surface_values == 0 ) THEN 4005 data_to_write = .FALSE. 4006 RETURN 4007 ENDIF 4008 4009 IF ( sm_io%iam_io_pe ) THEN 4010 4011 all_pes_write = ( MINVAL( all_nr_val ) > 0 ) 4012 4013 IF ( all_pes_write ) THEN 4014 dims1(1) = total_number_of_surface_values 4015 lize1(1) = nr_iope 4016 start1(1) = glo_start-1 4017 4018 #if defined( __parallel ) 4019 IF ( total_number_of_surface_values > 0 ) THEN 4020 CALL MPI_TYPE_CREATE_SUBARRAY( 1, dims1, lize1, start1, MPI_ORDER_FORTRAN, & 4021 MPI_REAL, ft_surf, ierr ) 4022 CALL MPI_TYPE_COMMIT( ft_surf, ierr ) 4023 ENDIF 4024 #endif 4299 ENDDO 4300 4301 CALL MPI_WIN_FENCE( 0, win_ind, ierr ) 4302 4303 buf_start = 1 4304 DO n = 0, numprocs-1 4305 n_trans = transfer_index(2,n) 4306 IF ( n_trans > 0 ) THEN 4307 disp = transfer_index(1,n) - 1 4308 buf_start = buf_start + n_trans 4025 4309 ENDIF 4026 ENDIF 4027 4028 ENDIF 4310 ENDDO 4311 4312 CALL MPI_WIN_FREE( win_ind, ierr ) 4313 4314 DEALLOCATE( surf_val_index ) 4315 4316 END SUBROUTINE get_remote_indices 4317 4318 !--------------------------------------------------------------------------------------------------! 4319 ! Description: 4320 ! ------------ 4321 !> Allocate memory and create window for one-sided communication (1-d INTEGER array) 4322 !--------------------------------------------------------------------------------------------------! 4323 SUBROUTINE rd_alloc_rma_mem( array, idim, win ) 4324 4325 IMPLICIT NONE 4326 4327 INTEGER(iwp), INTENT(IN) :: idim !< Dimension of this 1-D array 4328 INTEGER :: ierr !< MPI error code 4329 INTEGER(iwp), INTENT(OUT) :: win !< MPI window 4330 INTEGER(KIND=MPI_ADDRESS_KIND) :: winsize !< size of RMA window 4331 4332 REAL(wp), DIMENSION(:), POINTER, INTENT(INOUT) :: array !< array to access RMA window locally 4333 4334 4335 winsize = MAX( idim, 2 ) 4336 ALLOCATE( array(winsize) ) 4337 winsize = winsize * wp 4338 CALL MPI_WIN_CREATE( array, winsize, wp, MPI_INFO_NULL, comm2d, win, ierr ) 4339 array = -1 4340 CALL MPI_WIN_FENCE( 0, win, ierr ) 4341 4342 END SUBROUTINE rd_alloc_rma_mem 4343 #endif 4029 4344 4030 4345 END SUBROUTINE rd_mpi_io_surface_filetypes 4031 4032 4346 4033 4347 … … 4079 4393 iog%nnx = iog%nnx + nbgp 4080 4394 ENDIF 4081 IF ( myidx == npex-1 .OR. npex == -1 ) THEN ! npex == 1 if -D__parallel not set4395 IF ( myidx == pdims(1)-1 ) THEN 4082 4396 iog%nxr = iog%nxr + nbgp 4083 4397 iog%nnx = iog%nnx + nbgp … … 4087 4401 iog%nny = iog%nny + nbgp 4088 4402 ENDIF 4089 IF ( myidy == npey-1 .OR. npey == -1 ) THEN ! npey == 1 if -D__parallel not set4403 IF ( myidy == pdims(2)-1 ) THEN 4090 4404 iog%nyn = iog%nyn + nbgp 4091 4405 iog%nny = iog%nny + nbgp … … 4251 4565 iog%nnx = iog%nnx + nbgp 4252 4566 ENDIF 4253 IF ( myidx == npex-1 .OR. npex == -1 ) THEN ! npex == 1 if -D__parallel not set4567 IF ( myidx == pdims(1)-1 ) THEN 4254 4568 iog%nxr = iog%nxr + nbgp 4255 4569 iog%nnx = iog%nnx + nbgp … … 4259 4573 iog%nny = iog%nny + nbgp 4260 4574 ENDIF 4261 IF ( myidy == npey-1 .OR. npey == -1 ) THEN ! 
npey == 1 if -D__parallel not set4575 IF ( myidy == pdims(2)-1 ) THEN 4262 4576 iog%nyn = iog%nyn + nbgp 4263 4577 iog%nny = iog%nny + nbgp … … 4326 4640 !> to a single file that contains the global arrays. It is not required for the serial mode. 4327 4641 !--------------------------------------------------------------------------------------------------! 4328 #if defined( __parallel )4329 4642 SUBROUTINE rd_mpi_io_create_filetypes_3dsoil( nzb_soil, nzt_soil ) 4330 4643 … … 4334 4647 INTEGER, INTENT(IN) :: nzt_soil !< 4335 4648 4649 #if defined( __parallel ) 4336 4650 INTEGER, DIMENSION(3) :: dims3 !< 4337 4651 INTEGER, DIMENSION(3) :: lize3 !< … … 4367 4681 CALL MPI_TYPE_COMMIT( ft_3dsoil, ierr ) 4368 4682 ENDIF 4683 #else 4684 ALLOCATE( array_3d_soil(nzb_soil:nzt_soil,iog%nxl:iog%nxr,iog%nys:iog%nyn) ) 4685 sm_io%io_grid = iog 4686 #endif 4369 4687 4370 4688 END SUBROUTINE rd_mpi_io_create_filetypes_3dsoil 4371 #endif4372 4373 4374 4689 4375 4690 !--------------------------------------------------------------------------------------------------! … … 4381 4696 4382 4697 IMPLICIT NONE 4383 4384 4698 4385 4699 #if defined( __parallel ) … … 4401 4715 4402 4716 ENDIF 4717 4403 4718 ! 4404 4719 !-- Free last surface filetype … … 4415 4730 IF ( sm_io%iam_io_pe .AND. ft_3di4 /= -1 ) THEN 4416 4731 CALL MPI_TYPE_FREE( ft_3di4, ierr ) 4732 ft_3di4 = -1 4733 ENDIF 4734 IF ( sm_io%iam_io_pe .AND. ft_3di8 /= -1 ) THEN 4417 4735 CALL MPI_TYPE_FREE( ft_3di8, ierr ) 4736 ft_3di8 = -1 4418 4737 ENDIF 4419 4738 4420 4739 IF ( sm_io%is_sm_active() .AND. win_3di4 /= -1 ) THEN 4421 4740 CALL sm_io%sm_free_shared( win_3di4 ) 4741 win_3di4 = -1 4742 ENDIF 4743 IF ( sm_io%is_sm_active() .AND. win_3di8 /= -1 ) THEN 4422 4744 CALL sm_io%sm_free_shared( win_3di8 ) 4745 win_3di8 = -1 4746 ENDIF 4747 4748 IF ( win_start /= -1 ) THEN 4749 CALL sm_io%sm_free_shared( win_start) 4750 CALL sm_io%sm_free_shared( win_end) 4751 CALL sm_io%sm_free_shared( win_glost) 4752 win_start = -1 4753 win_end = -1 4754 win_glost = -1 4423 4755 ENDIF 4424 4756 … … 4426 4758 win_surf = -1 4427 4759 #else 4428 IF ( ASSOCIATED( array_2d) ) DEALLOCATE( array_2d )4429 IF ( ASSOCIATED( array_2di) ) DEALLOCATE( array_2di )4430 IF ( ASSOCIATED( array_3d) ) DEALLOCATE( array_3d )4431 IF ( ASSOCIATED( array_3di4) ) DEALLOCATE( array_3di4 )4432 IF ( ASSOCIATED( array_3di8) ) DEALLOCATE( array_3di8 )4760 IF ( ASSOCIATED( array_2d ) ) DEALLOCATE( array_2d ) 4761 IF ( ASSOCIATED( array_2di ) ) DEALLOCATE( array_2di ) 4762 IF ( ASSOCIATED( array_3d ) ) DEALLOCATE( array_3d ) 4763 IF ( ASSOCIATED( array_3di4 ) ) DEALLOCATE( array_3di4 ) 4764 IF ( ASSOCIATED( array_3di8 ) ) DEALLOCATE( array_3di8 ) 4433 4765 #endif 4434 4766 -
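The load balancing introduced above (nr_surfcells_pe, rest_cells_pe, rest_bound, target_thread) assigns each grid box as a whole to one target PE so that every PE ends up with roughly the same number of surface values for I/O. The arithmetic can be exercised in isolation with the following standalone sketch; it reuses the variable names of the changeset, but the totals are made up for illustration and the program is not part of PALM (in the changeset the mapping is applied to global_start(j,i), i.e. to the first element of a grid box, so a box is never split):

   PROGRAM even_distribution_sketch
   !
   !-- Minimal sketch of the index arithmetic used in rd_mpi_io_surface_filetypes to map a
   !-- 1-based global surface-element index onto an equally loaded target PE.
      IMPLICIT NONE

      INTEGER, PARAMETER ::  numprocs = 4                          !< assumed number of PEs
      INTEGER, PARAMETER ::  total_number_of_surface_values = 10   !< assumed global element count

      INTEGER ::  global_index     !< 1-based global index of a surface element
      INTEGER ::  nr_surfcells_pe  !< minimum number of elements per PE
      INTEGER ::  rest_cells_pe    !< number of PEs that carry one extra element
      INTEGER ::  rest_bound       !< last global index handled by the "+1" PEs
      INTEGER ::  target_thread    !< PE that will hold this element for I/O

      nr_surfcells_pe = total_number_of_surface_values / numprocs
      rest_cells_pe   = MOD( total_number_of_surface_values, numprocs )
      rest_bound      = rest_cells_pe * ( nr_surfcells_pe + 1 )

      DO  global_index = 1, total_number_of_surface_values
         IF ( rest_cells_pe == 0 )  THEN
            target_thread = ( global_index - 1 ) / nr_surfcells_pe
         ELSEIF ( global_index <= rest_bound )  THEN
            target_thread = ( global_index - 1 ) / ( nr_surfcells_pe + 1 )
         ELSE
            target_thread = rest_cells_pe + ( global_index - rest_bound - 1 ) / nr_surfcells_pe
         ENDIF
         PRINT '(A,I4,A,I3)', 'global index ', global_index, '  -> target PE ', target_thread
      ENDDO

   END PROGRAM even_distribution_sketch

With 10 values on 4 PEs the sketch yields a 3/3/2/2 split, i.e. the remainder PEs carry one element more, which is exactly the case distinguished via rest_bound above.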
palm/trunk/SOURCE/shared_memory_io_mod.f90
r4828 r4893 25 25 ! ----------------- 26 26 ! $Id$ 27 ! revised output of surface data via MPI-IO for better performance 28 ! 29 ! 4828 2021-01-05 11:21:41Z Giersch 27 30 ! additions for output of particle time series 28 31 ! … … 39 42 ! unused variable removed 40 43 ! 41 !42 44 ! Additions for cyclic fill mode 43 !44 45 ! 45 46 ! File re-formatted to follow the PALM coding standard … … 47 48 ! 48 49 ! Initial version (Klaus Ketelsen) 49 !50 !51 50 ! 52 51 ! Description: … … 100 99 comm1dy, & 101 100 comm2d, & 101 comm_palm, & 102 102 ierr, & 103 103 myid, & … … 120 120 #endif 121 121 122 USE transpose_indices, & 123 ONLY: nxl_y, nxl_z, nxr_y, nxr_z, nys_x, nys_z, nyn_x, nyn_z, nzb_x, nzb_y, nzt_x, nzt_y 124 122 125 IMPLICIT NONE 123 126 … … 149 152 END TYPE domain_decomposition_grid_features 150 153 154 TYPE, PUBLIC :: sm_remote_array 155 156 TYPE(C_PTR) :: rem_ptr !< 157 INTEGER(iwp) :: d1e !< 158 INTEGER(iwp) :: d1s !< 159 INTEGER(iwp) :: d2e !< 160 INTEGER(iwp) :: d2s !< 161 INTEGER(iwp) :: d3e !< 162 INTEGER(iwp) :: d3s !< 163 INTEGER(iwp) :: d4e !< 164 INTEGER(iwp) :: d4s !< 165 166 END TYPE sm_remote_array 167 151 168 ! 152 169 !-- Class definition for shared memory instances. … … 164 181 INTEGER(iwp), PUBLIC :: sh_rank !< 165 182 166 LOGICAL, PUBLIC :: iam_io_pe = .TRUE. !< This PE is an IO-PE167 183 ! 168 184 !-- Variables for the I/O virtual grid 169 INTEGER(iwp), PUBLIC :: comm_io !< Communicator for all IO processes185 INTEGER(iwp), PUBLIC :: comm_io !< communicator for all IO processes 170 186 INTEGER(iwp), PUBLIC :: io_npes !< 171 187 INTEGER(iwp), PUBLIC :: io_rank !< 172 188 ! 173 189 !-- Variables for the node local communicator 174 INTEGER(iwp) :: comm_node !< Communicator for all processes of current node190 INTEGER(iwp) :: comm_node !< communicator for all processes of current node 175 191 INTEGER(iwp) :: io_pe_global_rank !< 176 192 INTEGER(iwp) :: n_npes !< 177 193 INTEGER(iwp) :: n_rank !< 178 194 179 TYPE(domain_decomposition_grid_features), PUBLIC :: io_grid !< io grid features, depending on reading from prerun or restart run 180 195 LOGICAL, PUBLIC :: is_root_pe !< 196 LOGICAL, PUBLIC :: iam_io_pe = .TRUE. 
!< this PE is an IO-PE 197 198 TYPE(domain_decomposition_grid_features), PUBLIC :: io_grid !< io grid features, depending on reading from prerun or main run 181 199 182 200 CONTAINS … … 191 209 PROCEDURE, PASS(this), PUBLIC :: sm_node_barrier 192 210 #if defined( __parallel ) 211 PROCEDURE, PASS(this), PUBLIC :: sm_allocate_shared_1d_32 193 212 PROCEDURE, PASS(this), PUBLIC :: sm_allocate_shared_1d_64 194 PROCEDURE, PASS(this), PUBLIC :: sm_allocate_shared_1d_32195 213 PROCEDURE, PASS(this), PUBLIC :: sm_allocate_shared_1di 214 PROCEDURE, PASS(this), PUBLIC :: sm_allocate_shared_2d_32 196 215 PROCEDURE, PASS(this), PUBLIC :: sm_allocate_shared_2d_64 197 PROCEDURE, PASS(this), PUBLIC :: sm_allocate_shared_2d_32198 216 PROCEDURE, PASS(this), PUBLIC :: sm_allocate_shared_2di 217 PROCEDURE, PASS(this), PUBLIC :: sm_allocate_shared_3d_32 199 218 PROCEDURE, PASS(this), PUBLIC :: sm_allocate_shared_3d_64 200 PROCEDURE, PASS(this), PUBLIC :: sm_allocate_shared_3d_32 219 PROCEDURE, PASS(this), PUBLIC :: sm_allocate_shared_4d_32 220 PROCEDURE, PASS(this), PUBLIC :: sm_allocate_shared_4d_64 201 221 PROCEDURE, PASS(this), PUBLIC :: sm_allocate_shared_3di_32 202 222 PROCEDURE, PASS(this), PUBLIC :: sm_allocate_shared_3di_64 223 PROCEDURE, PASS(this), PUBLIC :: sm_all_allocate_shared_3d_64 203 224 204 225 GENERIC, PUBLIC :: sm_allocate_shared => & 205 sm_allocate_shared_1d_64, sm_allocate_shared_1d_32, & 206 sm_allocate_shared_2d_64, sm_allocate_shared_2d_32, & 207 sm_allocate_shared_2di, sm_allocate_shared_3d_64, & 208 sm_allocate_shared_3d_32, sm_allocate_shared_1di, & 209 sm_allocate_shared_3di_32, sm_allocate_shared_3di_64 226 sm_allocate_shared_1d_64, sm_allocate_shared_1d_32, & 227 sm_allocate_shared_2d_64, sm_allocate_shared_2d_32, & 228 sm_allocate_shared_2di, sm_allocate_shared_3d_64, & 229 sm_allocate_shared_4d_64, sm_allocate_shared_4d_32, & 230 sm_allocate_shared_3d_32, sm_allocate_shared_1di, & 231 sm_allocate_shared_3di_32, sm_allocate_shared_3di_64 232 233 GENERIC, PUBLIC :: sm_all_allocate_shared => sm_all_allocate_shared_3d_64 210 234 #endif 211 235 END TYPE sm_class … … 226 250 227 251 CLASS(sm_class), INTENT(INOUT) :: this !< pointer to access internal variables of this call 228 INTEGER , INTENT(IN), OPTIONAL:: comm_input !< main model communicator (comm2d) can optional be set252 INTEGER(iwp), INTENT(IN), OPTIONAL :: comm_input !< main model communicator (comm2d) can optional be set 229 253 230 254 #if defined( __parallel ) 231 INTEGER :: color 232 INTEGER :: max_n _npes!< maximum number of PEs/node255 INTEGER :: color !< 256 INTEGER :: max_npes_per_node !< maximum number of PEs/node 233 257 #endif 234 258 … … 237 261 this%nr_io_pe_per_node = 2 238 262 263 #if defined( __parallel ) 239 264 IF ( PRESENT( comm_input ) ) THEN 240 265 this%comm_model = comm_input … … 248 273 IF ( this%no_shared_memory_in_this_run ) THEN 249 274 this%iam_io_pe = .TRUE. 275 this%sh_rank = 0 276 this%sh_npes = 1 250 277 RETURN 251 278 ENDIF 252 279 253 #if defined( __parallel ) 254 ! 255 !-- Determine, how many MPI threads are running on a node 280 ! 281 !-- Determine, how many PEs are running on a node. 256 282 this%iam_io_pe = .FALSE. 
257 283 CALL MPI_COMM_SPLIT_TYPE( this%comm_model, MPI_COMM_TYPE_SHARED, 0, MPI_INFO_NULL, & … … 260 286 CALL MPI_COMM_RANK( this%comm_node, this%n_rank, ierr ) 261 287 262 CALL MPI_ALLREDUCE( this%n_npes, max_n_npes, 1, MPI_INTEGER, MPI_MAX, this%comm_model, ierr ) 288 CALL MPI_ALLREDUCE( this%n_npes, max_npes_per_node, 1, MPI_INTEGER, MPI_MAX, this%comm_model, & 289 ierr ) 263 290 ! 264 291 !-- Decide, if the configuration can run with shared-memory IO 265 IF ( max_n _npes> 64 ) THEN292 IF ( max_npes_per_node > 64 ) THEN 266 293 ! 267 294 !-- Special configuration on the HLRN-IV system with 4 shared memory blocks/node 268 295 this%nr_io_pe_per_node = 4 269 296 270 ELSEIF ( max_n _npes <= 32) THEN271 ! 272 !-- No shared memory IO with less than 3 2 threads/node297 ELSEIF ( max_npes_per_node <= 3 ) THEN 298 ! 299 !-- No shared memory IO with less than 3 MPI tasks/node 273 300 this%no_shared_memory_in_this_run = .TRUE. 274 301 this%iam_io_pe = .TRUE. … … 277 304 278 305 ! 279 !-- No shared memory IO with small setups 280 IF ( nx < 24 .OR. ny < 24) THEN306 !-- No shared memory IO with small setups. 307 IF ( nx < 16 .OR. ny < 16 ) THEN 281 308 this%no_shared_memory_in_this_run = .TRUE. 282 309 this%iam_io_pe = .TRUE. … … 299 326 ! 300 327 !-- Setup the communicator across the nodes depending on the shared memory rank. 301 !-- All threads with shared memory rank 0 will be I/O threads.328 !-- All PEs with shared memory rank 0 will be I/O PEs. 302 329 color = this%sh_rank 303 330 CALL MPI_COMM_SPLIT( this%comm_model, color, 0, this%comm_io, ierr ) … … 316 343 ENDIF 317 344 CALL MPI_BCAST( this%io_pe_global_rank, 1, MPI_INTEGER, 0, this%comm_shared, ierr ) 318 319 345 #else 320 this%iam_io_pe = .TRUE. 346 this%iam_io_pe = .TRUE. 347 this%comm_model = comm2d 348 this%sh_rank = 0 349 this%sh_npes = 1 350 this%no_shared_memory_in_this_run = .TRUE. 321 351 #endif 322 323 ! write(9,'(a,8i7)') ' end of sm_init_comm ',this%sh_rank,this%sh_npes,this%io_rank,this%io_npes,this%io_pe_global_rank324 ! write(9,*) 'This process is IO Process ',this%iam_io_pe325 352 326 353 #if defined( __parallel ) … … 362 389 CALL MPI_ALLREDUCE( local_dim_s, local_dim_r, SIZE( local_dim_s ), MPI_INTEGER, MPI_SUM, & 363 390 this%comm_node, ierr ) 364 sh_group_size = ( max_n _npes+ this%nr_io_pe_per_node - 1 ) / this%nr_io_pe_per_node391 sh_group_size = ( max_npes_per_node + this%nr_io_pe_per_node - 1 ) / this%nr_io_pe_per_node 365 392 366 393 pe = 0 … … 417 444 END SUBROUTINE sm_init_comm 418 445 419 420 446 ! 421 447 !-- Initializing setup for output of particle time series. … … 428 454 429 455 #if defined( __parallel ) 430 INTEGER(iwp) :: color !<431 INTEGER(iwp) :: ierr !<432 INTEGER(iwp) :: max_n _npes!< maximum number of PEs/node456 INTEGER(iwp) :: color !< 457 INTEGER(iwp) :: ierr !< 458 INTEGER(iwp) :: max_npes_per_node !< maximum number of PEs/node 433 459 #endif 434 460 … … 451 477 #if defined( __parallel ) 452 478 ! 453 !-- Determine, how many MPI threads are running on a node479 !-- Determine, how many PEs are running on a node. 454 480 this%iam_io_pe = .FALSE. 455 481 CALL MPI_COMM_SPLIT_TYPE( this%comm_model, MPI_COMM_TYPE_SHARED, 0, MPI_INFO_NULL, & … … 458 484 CALL MPI_COMM_RANK( this%comm_node, this%n_rank, ierr ) 459 485 460 CALL MPI_ALLREDUCE( this%n_npes, max_n _npes, 1, MPI_INTEGER, MPI_MAX, this%comm_model, ierr )461 486 CALL MPI_ALLREDUCE( this%n_npes, max_npes_per_node, 1, MPI_INTEGER, MPI_MAX, this%comm_model, & 487 ierr ) 462 488 ! 
463 489 !-- TODO: better explanation … … 465 491 !-- even better to use the complete node for MPI shared memory (this%nr_io_pe_per_node = 1). 466 492 !- In the latter case, the access to the MPI shared memory buffer is slower, the number of 467 !-- particles to move between threads will be much smaller.468 IF ( max_n _npes> 64 ) THEN493 !-- particles to move between PEs will be much smaller. 494 IF ( max_npes_per_node > 64 ) THEN 469 495 ! 470 496 !-- Special configuration on the HLRN-IV system with 4 shared memory blocks/node … … 526 552 ! 527 553 !-- Setup the communicator across the nodes depending on the shared memory rank. 528 !-- All threads with shared memory rank 0 will be I/O threads.554 !-- All PEs with shared memory rank 0 will be I/O PEs. 529 555 color = this%sh_rank 530 556 CALL MPI_COMM_SPLIT( this%comm_model, color, 0, this%comm_io, ierr ) … … 573 599 ! Description: 574 600 ! ------------ 575 !> Allocate shared 1d-REAL (64 Bit) array on ALL threads601 !> Allocate shared 1d-REAL (64 bit) array on PE 0 and pass address to all PEs. 576 602 !--------------------------------------------------------------------------------------------------! 577 603 SUBROUTINE sm_allocate_shared_1d_64( this, p1, d1, d2, win ) … … 590 616 INTEGER(KIND=MPI_ADDRESS_KIND) :: wsize 591 617 592 INTEGER , DIMENSION(1):: buf_shape618 INTEGER(iwp), DIMENSION(1) :: buf_shape 593 619 594 620 REAL(dp), DIMENSION(:), POINTER :: buf … … 601 627 IF ( this%no_shared_memory_in_this_run ) RETURN 602 628 ! 603 !-- Allocate shared memory on node rank 0 threads.629 !-- Allocate shared memory on node rank 0 PEs. 604 630 IF ( this%sh_rank == pe_from ) THEN 605 631 wsize = d2 - d1 + 1 … … 629 655 ! Description: 630 656 ! ------------ 631 !> Allocate shared 1d-REAL (32 Bit) array on ALL threads657 !> Allocate shared 1d-REAL (32 bit) array on PE 0 and pass address to all PEs 632 658 !--------------------------------------------------------------------------------------------------! 633 659 SUBROUTINE sm_allocate_shared_1d_32( this, p1, d1, d2, win ) … … 646 672 INTEGER(KIND=MPI_ADDRESS_KIND) :: wsize 647 673 648 INTEGER , DIMENSION(1):: buf_shape674 INTEGER(iwp), DIMENSION(1) :: buf_shape 649 675 650 676 REAL(sp), DIMENSION(:), POINTER :: buf … … 657 683 IF ( this%no_shared_memory_in_this_run ) RETURN 658 684 ! 659 !-- Allocate shared memory on node rank 0 threads.685 !-- Allocate shared memory on node rank 0 PEs. 660 686 IF ( this%sh_rank == pe_from ) THEN 661 687 wsize = d2 - d1 + 1 … … 685 711 ! Description: 686 712 ! ------------ 687 !> Allocate shared 1d-INTEGER array on ALL threads713 !> Allocate shared 1d-INTEGER array on PE 0 and pass address to all PEs. 688 714 !--------------------------------------------------------------------------------------------------! 689 715 SUBROUTINE sm_allocate_shared_1di( this, p1, d1, d2, win ) … … 702 728 INTEGER(KIND=MPI_ADDRESS_KIND) :: wsize 703 729 704 INTEGER , DIMENSION(1):: buf_shape730 INTEGER(iwp), DIMENSION(1) :: buf_shape 705 731 706 732 INTEGER(iwp), DIMENSION(:), POINTER :: buf … … 713 739 IF ( this%no_shared_memory_in_this_run ) RETURN 714 740 ! 715 !-- Allocate shared memory on node rank 0 threads.741 !-- Allocate shared memory on node rank 0 PEs. 716 742 IF ( this%sh_rank == pe_from ) THEN 717 743 wsize = d2 - d1 + 1 … … 741 767 ! Description: 742 768 ! ------------ 743 !> Allocate shared 2d-REAL array on ALL threads (64 Bit)769 !> Allocate shared 2d-REAL array (64 bit) on PE 0 and pass address to all PEs. 
744 770 !--------------------------------------------------------------------------------------------------! 745 771 SUBROUTINE sm_allocate_shared_2d_64( this, p2, n_nxlg, n_nxrg, n_nysg, n_nyng, win ) … … 771 797 IF ( this%no_shared_memory_in_this_run ) RETURN 772 798 ! 773 !-- Allocate shared memory on node rank 0 threads.799 !-- Allocate shared memory on node rank 0 PEs. 774 800 IF ( this%sh_rank == pe_from ) THEN 775 801 wsize = ( n_nyng - n_nysg + 1 ) * ( n_nxrg - n_nxlg + 1 ) … … 801 827 ! Description: 802 828 ! ------------ 803 !> Allocate shared 2d-REAL (32 Bit) array on ALL threads829 !> Allocate shared 2d-REAL (32 Bit) array on PE 0 and pass address to all PEs. 804 830 !--------------------------------------------------------------------------------------------------! 805 831 SUBROUTINE sm_allocate_shared_2d_32( this, p2, n_nxlg, n_nxrg, n_nysg, n_nyng, win ) … … 831 857 IF ( this%no_shared_memory_in_this_run ) RETURN 832 858 ! 833 !-- Allocate shared memory on node rank 0 threads.859 !-- Allocate shared memory on node rank 0 PEs. 834 860 IF ( this%sh_rank == pe_from ) THEN 835 861 wsize = ( n_nyng - n_nysg + 1 ) * ( n_nxrg - n_nxlg + 1 ) … … 861 887 ! Description: 862 888 ! ------------ 863 !> Allocate shared 2d-INTEGER array on ALL threads889 !> Allocate shared 2d-INTEGER array on PE 0 and pass address to all PEs. 864 890 !--------------------------------------------------------------------------------------------------! 865 891 SUBROUTINE sm_allocate_shared_2di( this, p2i, n_nxlg, n_nxrg, n_nysg, n_nyng, win ) … … 891 917 IF ( this%no_shared_memory_in_this_run ) RETURN 892 918 ! 893 !-- Allocate shared memory on node rank 0 threads.919 !-- Allocate shared memory on node rank 0 PEs. 894 920 IF ( this%sh_rank == pe_from ) THEN 895 921 wsize = ( n_nyng - n_nysg + 1 ) * ( n_nxrg - n_nxlg + 1 ) … … 921 947 ! Description: 922 948 ! ------------ 923 !> Allocate shared 3d-REAL (64 Bit) array on ALL threads949 !> Allocate shared 3d-REAL (64 bit) array on PE 0 and pass address to all PEs. 924 950 !--------------------------------------------------------------------------------------------------! 925 951 SUBROUTINE sm_allocate_shared_3d_64( this, p3, d1s, d1e, d2s, d2e, d3s, d3e, win ) … … 929 955 CLASS(sm_class), INTENT(inout) :: this !< 930 956 931 INTEGER 932 INTEGER , INTENT(IN):: d1e !<933 INTEGER , INTENT(IN):: d1s !<934 INTEGER , INTENT(IN):: d2e !<935 INTEGER , INTENT(IN):: d2s !<936 INTEGER , INTENT(IN):: d3e !<937 INTEGER , INTENT(IN):: d3s !<938 INTEGER , SAVE:: pe_from = 0 !<939 INTEGER , INTENT(OUT):: win !<957 INTEGER(iwp) :: disp_unit !< 958 INTEGER(iwp), INTENT(IN) :: d1e !< 959 INTEGER(iwp), INTENT(IN) :: d1s !< 960 INTEGER(iwp), INTENT(IN) :: d2e !< 961 INTEGER(iwp), INTENT(IN) :: d2s !< 962 INTEGER(iwp), INTENT(IN) :: d3e !< 963 INTEGER(iwp), INTENT(IN) :: d3s !< 964 INTEGER(iwp), SAVE :: pe_from = 0 !< 965 INTEGER(iwp), INTENT(OUT) :: win !< 940 966 941 967 INTEGER(KIND=MPI_ADDRESS_KIND) :: rem_size !< 942 968 INTEGER(KIND=MPI_ADDRESS_KIND) :: wsize !< 943 969 944 INTEGER , DIMENSION(3):: buf_shape !<970 INTEGER(iwp), DIMENSION(3) :: buf_shape !< 945 971 946 972 REAL(dp), DIMENSION(:,:,:), POINTER :: buf !< … … 953 979 IF ( this%no_shared_memory_in_this_run ) RETURN 954 980 ! 955 !-- Allocate shared memory on node rank 0 threads.981 !-- Allocate shared memory on node rank 0 PEs. 956 982 IF ( this%sh_rank == pe_from ) THEN 957 983 wsize = ( d3e - d3s + 1 ) * ( d2e - d2s + 1 ) * ( d1e - d1s + 1 ) … … 984 1010 ! Description: 985 1011 ! 
------------ 986 !> Allocate shared 3d-REAL (32 Bit) array on ALL threads1012 !> Allocate shared 3d-REAL (32 bit) array on PE 0 and pass address to all PEs. 987 1013 !--------------------------------------------------------------------------------------------------! 988 1014 SUBROUTINE sm_allocate_shared_3d_32( this, p3, d1s, d1e, d2s, d2e, d3s, d3e, win ) … … 992 1018 CLASS(sm_class), INTENT(inout) :: this 993 1019 994 INTEGER 995 INTEGER , INTENT(IN):: d1e996 INTEGER , INTENT(IN):: d1s997 INTEGER , INTENT(IN):: d2e998 INTEGER , INTENT(IN):: d2s999 INTEGER , INTENT(IN):: d3e1000 INTEGER , INTENT(IN):: d3s1001 INTEGER , SAVE:: pe_from = 01002 INTEGER , INTENT(OUT):: win1020 INTEGER(iwp) :: disp_unit 1021 INTEGER(iwp), INTENT(IN) :: d1e 1022 INTEGER(iwp), INTENT(IN) :: d1s 1023 INTEGER(iwp), INTENT(IN) :: d2e 1024 INTEGER(iwp), INTENT(IN) :: d2s 1025 INTEGER(iwp), INTENT(IN) :: d3e 1026 INTEGER(iwp), INTENT(IN) :: d3s 1027 INTEGER(iwp), SAVE :: pe_from = 0 1028 INTEGER(iwp), INTENT(OUT) :: win 1003 1029 1004 1030 INTEGER(KIND=MPI_ADDRESS_KIND) :: rem_size 1005 1031 INTEGER(KIND=MPI_ADDRESS_KIND) :: wsize 1006 1032 1007 INTEGER , DIMENSION(3):: buf_shape1033 INTEGER(iwp), DIMENSION(3) :: buf_shape 1008 1034 1009 1035 REAL(sp), DIMENSION(:,:,:), POINTER :: buf … … 1016 1042 IF ( this%no_shared_memory_in_this_run ) RETURN 1017 1043 ! 1018 !-- Allocate shared memory on node rank 0 threads.1044 !-- Allocate shared memory on node rank 0 PEs. 1019 1045 IF ( this%sh_rank == pe_from ) THEN 1020 1046 wsize = ( d3e - d3s + 1 ) * ( d2e - d2s + 1 ) * ( d1e - d1s + 1 ) … … 1047 1073 ! Description: 1048 1074 ! ------------ 1049 !> Allocate shared 3d-REAL (32 bit) array on ALL threads 1075 !> Allocate shared 4d-REAL (64 bit) array on PE 0 and pass address to all PEs. 1076 !--------------------------------------------------------------------------------------------------! 1077 SUBROUTINE sm_allocate_shared_4d_64( this, p3, d1s, d1e, d2s, d2e, d3s, d3e, d4s, d4e, win ) 1078 1079 IMPLICIT NONE 1080 1081 CLASS(sm_class), INTENT(inout) :: this !< 1082 1083 INTEGER :: disp_unit !< 1084 INTEGER(iwp), INTENT(IN) :: d1e !< 1085 INTEGER(iwp), INTENT(IN) :: d1s !< 1086 INTEGER(iwp), INTENT(IN) :: d2e !< 1087 INTEGER(iwp), INTENT(IN) :: d2s !< 1088 INTEGER(iwp), INTENT(IN) :: d3e !< 1089 INTEGER(iwp), INTENT(IN) :: d3s !< 1090 INTEGER(iwp), INTENT(IN) :: d4e !< 1091 INTEGER(iwp), INTENT(IN) :: d4s !< 1092 INTEGER(iwp), SAVE :: pe_from = 0 !< 1093 INTEGER(iwp), INTENT(OUT) :: win !< 1094 1095 INTEGER(KIND=MPI_ADDRESS_KIND) :: rem_size !< 1096 INTEGER(KIND=MPI_ADDRESS_KIND) :: wsize !< 1097 1098 INTEGER(iwp), DIMENSION(4) :: buf_shape !< 1099 1100 REAL(dp), DIMENSION(:,:,:,:), POINTER :: buf !< 1101 REAL(dp), DIMENSION(:,:,:,:), POINTER :: p3 !< 1102 1103 TYPE(C_PTR), SAVE :: base_ptr !< 1104 TYPE(C_PTR), SAVE :: rem_ptr !< 1105 1106 1107 IF ( this%no_shared_memory_in_this_run ) RETURN 1108 ! 1109 !-- Allocate shared memory on node rank 0 PEs. 1110 IF ( this%sh_rank == pe_from ) THEN 1111 wsize = (d4e - d4s +1) * ( d3e - d3s + 1 ) * ( d2e - d2s + 1 ) * ( d1e - d1s + 1 ) 1112 ELSE 1113 wsize = 1 1114 ENDIF 1115 1116 wsize = wsize * dp ! Please note, size is always in bytes, independently of the displacement 1117 ! unit 1118 1119 CALL MPI_WIN_ALLOCATE_SHARED( wsize, dp, MPI_INFO_NULL, this%comm_shared, base_ptr, win, ierr ) 1120 ! 1121 !-- Get C-pointer of the memory located on node-rank pe_from (sh_rank == pe_from) 1122 CALL MPI_WIN_SHARED_QUERY( win, pe_from, rem_size, disp_unit, rem_ptr, ierr ) 1123 ! 
1124 !-- Convert C- to Fortran-pointer 1125 buf_shape(4) = d4e - d4s + 1 1126 buf_shape(3) = d3e - d3s + 1 1127 buf_shape(2) = d2e - d2s + 1 1128 buf_shape(1) = d1e - d1s + 1 1129 CALL C_F_POINTER( rem_ptr, buf, buf_shape ) 1130 p3(d1s:,d2s:,d3s:,d4s:) => buf 1131 ! 1132 !-- Allocate shared memory in round robin on all PEs of a node. 1133 pe_from = MOD( pe_from, this%sh_npes ) 1134 1135 END SUBROUTINE sm_allocate_shared_4d_64 1136 1137 1138 !--------------------------------------------------------------------------------------------------! 1139 ! Description: 1140 ! ------------ 1141 !> Allocate shared 4d-REAL (32 bit) array on PE 0 and pass address to all PEs. 1142 !--------------------------------------------------------------------------------------------------! 1143 SUBROUTINE sm_allocate_shared_4d_32( this, p3, d1s, d1e, d2s, d2e, d3s, d3e, d4s, d4e, win ) 1144 1145 IMPLICIT NONE 1146 1147 CLASS(sm_class), INTENT(inout) :: this !< 1148 1149 INTEGER :: disp_unit !< 1150 INTEGER(iwp), INTENT(IN) :: d1e !< 1151 INTEGER(iwp), INTENT(IN) :: d1s !< 1152 INTEGER(iwp), INTENT(IN) :: d2e !< 1153 INTEGER(iwp), INTENT(IN) :: d2s !< 1154 INTEGER(iwp), INTENT(IN) :: d3e !< 1155 INTEGER(iwp), INTENT(IN) :: d3s !< 1156 INTEGER(iwp), INTENT(IN) :: d4e !< 1157 INTEGER(iwp), INTENT(IN) :: d4s !< 1158 INTEGER(iwp), SAVE :: pe_from = 0 !< 1159 INTEGER(iwp), INTENT(OUT) :: win !< 1160 1161 INTEGER(KIND=MPI_ADDRESS_KIND) :: rem_size !< 1162 INTEGER(KIND=MPI_ADDRESS_KIND) :: wsize !< 1163 1164 INTEGER(iwp), DIMENSION(4) :: buf_shape !< 1165 1166 REAL(sp), DIMENSION(:,:,:,:), POINTER :: buf !< 1167 REAL(sp), DIMENSION(:,:,:,:), POINTER :: p3 !< 1168 1169 TYPE(C_PTR), SAVE :: base_ptr !< 1170 TYPE(C_PTR), SAVE :: rem_ptr !< 1171 1172 1173 IF ( this%no_shared_memory_in_this_run ) RETURN 1174 ! 1175 !-- Allocate shared memory on node rank 0 PEs. 1176 IF ( this%sh_rank == pe_from ) THEN 1177 wsize = (d4e - d4s +1) * ( d3e - d3s + 1 ) * ( d2e - d2s + 1 ) * ( d1e - d1s + 1 ) 1178 ELSE 1179 wsize = 1 1180 ENDIF 1181 1182 wsize = wsize * sp ! Please note, size is always in bytes, independently of the displacement 1183 ! unit 1184 1185 CALL MPI_WIN_ALLOCATE_SHARED( wsize, dp, MPI_INFO_NULL, this%comm_shared, base_ptr, win, ierr ) 1186 ! 1187 !-- Get C-pointer of the memory located on node-rank pe_from (sh_rank == pe_from) 1188 CALL MPI_WIN_SHARED_QUERY( win, pe_from, rem_size, disp_unit, rem_ptr, ierr ) 1189 ! 1190 !-- Convert C- to Fortran-pointer 1191 buf_shape(4) = d4e - d4s + 1 1192 buf_shape(3) = d3e - d3s + 1 1193 buf_shape(2) = d2e - d2s + 1 1194 buf_shape(1) = d1e - d1s + 1 1195 CALL C_F_POINTER( rem_ptr, buf, buf_shape ) 1196 p3(d1s:,d2s:,d3s:,d4s:) => buf 1197 ! 1198 !-- Allocate shared memory in round robin on all PEs of a node. 1199 pe_from = MOD( pe_from, this%sh_npes ) 1200 1201 END SUBROUTINE sm_allocate_shared_4d_32 1202 1203 1204 !--------------------------------------------------------------------------------------------------! 1205 ! Description: 1206 ! ------------ 1207 !> Allocate shared 3d-INTEGER (32 bit) array on PE 0 and pass address to all PEs. 1050 1208 !--------------------------------------------------------------------------------------------------! 
1051 1209 SUBROUTINE sm_allocate_shared_3di_32( this, p3, d1s, d1e, d2s, d2e, d3s, d3e, win ) … … 1053 1211 IMPLICIT NONE 1054 1212 1055 CLASS(sm_class), INTENT(inout) :: this1056 1057 INTEGER :: disp_unit1058 INTEGER , INTENT(IN):: d1e1059 INTEGER , INTENT(IN):: d1s1060 INTEGER , INTENT(IN):: d2e1061 INTEGER , INTENT(IN):: d2s1062 INTEGER , INTENT(IN):: d3e1063 INTEGER , INTENT(IN):: d3s1064 INTEGER , SAVE:: pe_from = 01065 INTEGER , INTENT(OUT):: win1066 1067 INTEGER(KIND=MPI_ADDRESS_KIND) :: rem_size1068 INTEGER(KIND=MPI_ADDRESS_KIND) :: wsize1069 1070 INTEGER , DIMENSION(3):: buf_shape1213 CLASS(sm_class), INTENT(inout) :: this 1214 1215 INTEGER :: disp_unit 1216 INTEGER(iwp), INTENT(IN) :: d1e 1217 INTEGER(iwp), INTENT(IN) :: d1s 1218 INTEGER(iwp), INTENT(IN) :: d2e 1219 INTEGER(iwp), INTENT(IN) :: d2s 1220 INTEGER(iwp), INTENT(IN) :: d3e 1221 INTEGER(iwp), INTENT(IN) :: d3s 1222 INTEGER(iwp), SAVE :: pe_from = 0 1223 INTEGER(iwp), INTENT(OUT) :: win 1224 1225 INTEGER(KIND=MPI_ADDRESS_KIND) :: rem_size 1226 INTEGER(KIND=MPI_ADDRESS_KIND) :: wsize 1227 1228 INTEGER(iwp), DIMENSION(3) :: buf_shape 1071 1229 1072 1230 INTEGER(isp), DIMENSION(:,:,:), POINTER :: buf 1073 1231 INTEGER(isp), DIMENSION(:,:,:), POINTER :: p3 1074 1232 1075 TYPE(C_PTR), SAVE :: base_ptr1076 TYPE(C_PTR), SAVE :: rem_ptr1077 1078 1079 IF ( this%no_shared_memory_in_this_run ) RETURN 1080 ! 1081 !-- Allocate shared memory on node rank 0 threads.1233 TYPE(C_PTR), SAVE :: base_ptr 1234 TYPE(C_PTR), SAVE :: rem_ptr 1235 1236 1237 IF ( this%no_shared_memory_in_this_run ) RETURN 1238 ! 1239 !-- Allocate shared memory on node rank 0 PEs. 1082 1240 IF ( this%sh_rank == pe_from ) THEN 1083 1241 wsize = ( d3e - d3s + 1 ) * ( d2e - d2s + 1 ) * ( d1e - d1s + 1 ) … … 1110 1268 ! Description: 1111 1269 ! ------------ 1112 !> Allocate shared 3d- REAL (64 bit) array on ALL threads1270 !> Allocate shared 3d-INTEGER (64 bit) array on PE 0 and pass address to all PEs. 1113 1271 !--------------------------------------------------------------------------------------------------! 1114 1272 SUBROUTINE sm_allocate_shared_3di_64( this, p3, d1s, d1e, d2s, d2e, d3s, d3e, win ) … … 1116 1274 IMPLICIT NONE 1117 1275 1118 CLASS(sm_class), INTENT(inout) :: this !<1119 1120 INTEGER :: disp_unit !<1121 INTEGER , INTENT(IN):: d1e !<1122 INTEGER , INTENT(IN):: d1s !<1123 INTEGER , INTENT(IN):: d2e !<1124 INTEGER , INTENT(IN):: d2s !<1125 INTEGER , INTENT(IN):: d3e !<1126 INTEGER , INTENT(IN):: d3s !<1127 INTEGER , SAVE:: pe_from = 0 !<1128 INTEGER , INTENT(OUT):: win !<1129 1130 INTEGER(KIND=MPI_ADDRESS_KIND) :: rem_size !<1131 INTEGER(KIND=MPI_ADDRESS_KIND) :: wsize !<1132 1133 INTEGER , DIMENSION(3):: buf_shape !<1276 CLASS(sm_class), INTENT(inout) :: this !< 1277 1278 INTEGER :: disp_unit !< 1279 INTEGER(iwp), INTENT(IN) :: d1e !< 1280 INTEGER(iwp), INTENT(IN) :: d1s !< 1281 INTEGER(iwp), INTENT(IN) :: d2e !< 1282 INTEGER(iwp), INTENT(IN) :: d2s !< 1283 INTEGER(iwp), INTENT(IN) :: d3e !< 1284 INTEGER(iwp), INTENT(IN) :: d3s !< 1285 INTEGER(iwp), SAVE :: pe_from = 0 !< 1286 INTEGER(iwp), INTENT(OUT) :: win !< 1287 1288 INTEGER(KIND=MPI_ADDRESS_KIND) :: rem_size !< 1289 INTEGER(KIND=MPI_ADDRESS_KIND) :: wsize !< 1290 1291 INTEGER(iwp), DIMENSION(3) :: buf_shape !< 1134 1292 1135 1293 INTEGER(idp), DIMENSION(:,:,:), POINTER :: buf !< 1136 1294 INTEGER(idp), DIMENSION(:,:,:), POINTER :: p3 !< 1137 1295 1138 TYPE(C_PTR), SAVE :: base_ptr !<1139 TYPE(C_PTR), SAVE :: rem_ptr !<1140 1141 1142 IF ( this%no_shared_memory_in_this_run ) RETURN 1143 ! 
1144 !-- Allocate shared memory on node rank 0 threads.1296 TYPE(C_PTR), SAVE :: base_ptr !< 1297 TYPE(C_PTR), SAVE :: rem_ptr !< 1298 1299 1300 IF ( this%no_shared_memory_in_this_run ) RETURN 1301 ! 1302 !-- Allocate shared memory on node rank 0 PEs. 1145 1303 IF ( this%sh_rank == pe_from ) THEN 1146 1304 wsize = ( d3e - d3s + 1 ) * ( d2e - d2s + 1 ) * ( d1e - d1s + 1 ) … … 1169 1327 END SUBROUTINE sm_allocate_shared_3di_64 1170 1328 1329 1330 !--------------------------------------------------------------------------------------------------! 1331 ! Description: 1332 ! ------------ 1333 !> Allocate shared 3d-REAL (64 Bit) array on ALL PEs. 1334 !> 1335 !> Every PE allocates the local part of a node-shared array. 1336 !> The C-Pointer of this array and the local limits are broadcasted to all PEs of the node 1337 !> The information is store in an array of type sm_remote_array and can be retrieved 1338 !> by sm_remote_array to access remote data. 1339 !--------------------------------------------------------------------------------------------------! 1340 SUBROUTINE sm_all_allocate_shared_3d_64( this, p3, d1s, d1e, d2s, d2e, d3s, d3e, remote_arrays, win ) 1341 1342 IMPLICIT NONE 1343 1344 CLASS(sm_class), INTENT(inout) :: this !< class pointer 1345 REAL(dp), DIMENSION(:,:,:), POINTER :: p3 !< return local array pointer 1346 1347 INTEGER(iwp), INTENT(IN) :: d1e !< end index dimension 1 1348 INTEGER(iwp), INTENT(IN) :: d1s !< start index dimension 1 1349 INTEGER(iwp), INTENT(IN) :: d2e !< end index dimension 2 1350 INTEGER(iwp), INTENT(IN) :: d2s !< start index dimension 2 1351 INTEGER(iwp), INTENT(IN) :: d3e !< end index dimension 3 1352 INTEGER(iwp), INTENT(IN) :: d3s !< start index dimension 3 1353 INTEGER(iwp), INTENT(OUT) :: win !< MPI Window 1354 1355 INTEGER(iwp), DIMENSION(3) :: buf_shape !< 1356 INTEGER(iwp) :: disp_unit !< 1357 INTEGER(iwp) :: i !< 1358 INTEGER(iwp), SAVE :: pe_from = 0 !< 1359 1360 INTEGER(KIND=MPI_ADDRESS_KIND) :: rem_size !< 1361 INTEGER(KIND=MPI_ADDRESS_KIND) :: wsize !< 1362 1363 REAL(dp), DIMENSION(:,:,:), POINTER :: buf !< 1364 1365 TYPE(sm_remote_array),INTENT(INOUT), DIMENSION(0:this%sh_npes-1) :: remote_arrays !< info about all remote arrays 1366 1367 TYPE(C_PTR), SAVE :: base_ptr !< 1368 1369 INTEGER(iwp),DIMENSION(6,0:this%sh_npes-1) :: all_indices_s 1370 INTEGER(iwp),DIMENSION(6,0:this%sh_npes-1) :: all_indices 1371 1372 1373 IF ( this%no_shared_memory_in_this_run ) RETURN 1374 1375 all_indices_s = 0 1376 1377 1378 wsize = ( d3e - d3s + 1 ) * ( d2e - d2s + 1 ) * ( d1e - d1s + 1 ) 1379 1380 wsize = wsize * dp ! Please note, size is always in bytes, independently of the displacement unit 1381 1382 CALL MPI_WIN_ALLOCATE_SHARED( wsize, dp, MPI_INFO_NULL, this%comm_shared, base_ptr, win, ierr ) 1383 ! 
1384 !-- Get C-pointer of the memory located on node-rank pe_from (sh_rank == pe_from) 1385 1386 all_indices_s(1,this%sh_rank) = d1s 1387 all_indices_s(2,this%sh_rank) = d1e 1388 all_indices_s(3,this%sh_rank) = d2s 1389 all_indices_s(4,this%sh_rank) = d2e 1390 all_indices_s(5,this%sh_rank) = d3s 1391 all_indices_s(6,this%sh_rank) = d3e 1392 1393 CALL MPI_ALLREDUCE (all_indices_s ,all_indices, SIZE(all_indices_s), MPI_INTEGER, MPI_SUM, this%comm_shared, ierr) 1394 1395 DO i=0,this%sh_npes-1 1396 CALL MPI_WIN_SHARED_QUERY( win, i, rem_size, disp_unit, remote_arrays(i)%rem_ptr, ierr ) 1397 remote_arrays(i)%d1s = all_indices(1,i) 1398 remote_arrays(i)%d1e = all_indices(2,i) 1399 remote_arrays(i)%d2s = all_indices(3,i) 1400 remote_arrays(i)%d2e = all_indices(4,i) 1401 remote_arrays(i)%d3s = all_indices(5,i) 1402 remote_arrays(i)%d3e = all_indices(6,i) 1403 END DO 1404 1405 ! 1406 !-- Convert C- to Fortran-pointer 1407 buf_shape(3) = d3e - d3s + 1 1408 buf_shape(2) = d2e - d2s + 1 1409 buf_shape(1) = d1e - d1s + 1 1410 CALL C_F_POINTER( remote_arrays(this%sh_rank)%rem_ptr, buf, buf_shape ) 1411 p3(d1s:,d2s:,d3s:) => buf 1412 ! 1413 !-- Allocate shared memory in round robin on all PEs of a node. 1414 pe_from = MOD( pe_from, this%sh_npes ) 1415 1416 END SUBROUTINE sm_all_allocate_shared_3d_64 1171 1417 #endif 1172 1418 … … 1243 1489 !> ... 1244 1490 !--------------------------------------------------------------------------------------------------! 1245 SUBROUTINE sm_node_barrier( this ) 1246 1247 IMPLICIT NONE 1491 SUBROUTINE sm_node_barrier( this, win ) 1492 1493 IMPLICIT NONE 1494 1495 INTEGER(iwp), OPTIONAL :: win !< 1248 1496 1249 1497 CLASS(sm_class), INTENT(inout) :: this !< … … 1254 1502 #if defined( __parallel ) 1255 1503 CALL MPI_BARRIER( this%comm_shared, ierr ) 1504 IF ( PRESENT(win) ) THEN 1505 CALL MPI_WIN_FENCE(0, win, ierr ) 1506 ENDIF 1256 1507 #endif 1257 1508 -
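All sm_allocate_shared_* variants above, including the new 4d routines and sm_all_allocate_shared_3d_64, follow the same MPI-3 pattern: one PE of the node communicator allocates the window, every PE queries the base address and attaches a Fortran pointer to it. The following self-contained sketch shows only that pattern; it mirrors the calls used in the module, but it assumes an MPI library that provides the TYPE(C_PTR) interfaces of MPI_WIN_ALLOCATE_SHARED and MPI_WIN_SHARED_QUERY in the mpi module and that a REAL of kind 8 occupies 8 bytes:

   PROGRAM shared_window_sketch

      USE ISO_C_BINDING,  ONLY:  C_F_POINTER, C_PTR
      USE MPI

      IMPLICIT NONE

      INTEGER, PARAMETER ::  dp = 8   !< kind of a double precision REAL, assumed to equal its size in bytes
      INTEGER, PARAMETER ::  n  = 10  !< number of shared elements (illustration only)

      INTEGER ::  comm_node  !< node-local (shared-memory capable) communicator
      INTEGER ::  disp_unit  !< displacement unit returned by MPI_WIN_SHARED_QUERY
      INTEGER ::  ierr       !< MPI error code
      INTEGER ::  node_rank  !< rank within comm_node
      INTEGER ::  win        !< shared-memory window handle

      INTEGER(KIND=MPI_ADDRESS_KIND) ::  rem_size  !< size of the queried segment
      INTEGER(KIND=MPI_ADDRESS_KIND) ::  wsize     !< local window size in bytes

      REAL(dp), DIMENSION(:), POINTER ::  buf  !< Fortran view of the node-shared buffer

      TYPE(C_PTR) ::  base_ptr  !< C address of the locally allocated segment
      TYPE(C_PTR) ::  rem_ptr   !< C address of node rank 0's segment


      CALL MPI_INIT( ierr )
   !
   !-- All ranks that can share memory end up in the same comm_node.
      CALL MPI_COMM_SPLIT_TYPE( MPI_COMM_WORLD, MPI_COMM_TYPE_SHARED, 0, MPI_INFO_NULL,        &
                                comm_node, ierr )
      CALL MPI_COMM_RANK( comm_node, node_rank, ierr )
   !
   !-- Only node rank 0 provides memory; the other ranks allocate a zero-size segment.
      wsize = 0_MPI_ADDRESS_KIND
      IF ( node_rank == 0 )  wsize = INT( n, MPI_ADDRESS_KIND ) * dp
      CALL MPI_WIN_ALLOCATE_SHARED( wsize, dp, MPI_INFO_NULL, comm_node, base_ptr, win, ierr )
   !
   !-- Map node rank 0's segment into the local address space and attach a Fortran pointer.
      CALL MPI_WIN_SHARED_QUERY( win, 0, rem_size, disp_unit, rem_ptr, ierr )
      CALL C_F_POINTER( rem_ptr, buf, (/ n /) )

      CALL MPI_WIN_FENCE( 0, win, ierr )
      IF ( node_rank == 0 )  buf = 42.0_dp
      CALL MPI_WIN_FENCE( 0, win, ierr )
      PRINT '(A,I3,A,F6.1)', 'node rank ', node_rank, ' sees buf(1) = ', buf(1)

      CALL MPI_WIN_FREE( win, ierr )
      CALL MPI_FINALIZE( ierr )

   END PROGRAM shared_window_sketch

Run with several ranks on one node, every rank should print the value stored by node rank 0, which is the property the shared-memory I/O buffers of this module rely on.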
palm/trunk/SOURCE/surface_data_output_mod.f90
r4892 r4893 25 25 ! ----------------- 26 26 ! $Id$ 27 ! revised output of surface data via MPI-IO for better performance 28 ! 29 ! 4892 2021-03-02 11:53:58Z suehring 27 30 ! Remove outdated error message. 28 31 ! … … 4578 4581 4579 4582 INTEGER(iwp), DIMENSION(:,:), ALLOCATABLE :: end_index !< end index of surface data at (j,i) 4580 INTEGER(iwp), DIMENSION(:,:), ALLOCATABLE :: global_start_index !< index array for surface data (MPI-IO) 4583 INTEGER(iwp), DIMENSION(:,:), ALLOCATABLE :: global_end_index !< end index array for surface data (MPI-IO) 4584 INTEGER(iwp), DIMENSION(:,:), ALLOCATABLE :: global_start_index !< start index array for surface data (MPI-IO) 4581 4585 INTEGER(iwp), DIMENSION(:,:), ALLOCATABLE :: num_surf !< number of surface data at (j,i) 4582 4586 INTEGER(iwp), DIMENSION(:,:), ALLOCATABLE :: start_index !< start index of surface data at (j,i) … … 4600 4604 ALLOCATE( surf_in(1:surfaces%ns) ) 4601 4605 4602 CALL rd_mpi_io_check_array( 'surfaces%start_index', found = array_found )4603 IF ( array_found ) CALL rrd_mpi_io( 'surfaces%start_index', start_index )4604 4605 CALL rd_mpi_io_check_array( 'surfaces%end_index', found = array_found )4606 IF ( array_found ) CALL rrd_mpi_io( 'surfaces%end_index', end_index )4607 4608 4606 CALL rd_mpi_io_check_array( 'surfaces%global_start_index', found = array_found ) 4609 4607 IF ( array_found ) CALL rrd_mpi_io( 'surfaces%global_start_index', global_start_index ) 4610 4608 4611 CALL rd_mpi_io_surface_filetypes( start_index, end_index, ldum, global_start_index ) 4609 CALL rd_mpi_io_check_array( 'surfaces%global_end_index', found = array_found ) 4610 IF ( array_found ) CALL rrd_mpi_io( 'surfaces%global_end_index', global_end_index ) 4611 4612 CALL rd_mpi_io_surface_filetypes( start_index, end_index, ldum, global_start_index, & 4613 global_end_index ) 4612 4614 4613 4615 DO nv = 1, dosurf_no(1) … … 4728 4730 4729 4731 INTEGER(iwp), DIMENSION(:,:), ALLOCATABLE :: end_index !< end index of surface data at (j,i) 4730 INTEGER(iwp), DIMENSION(:,:), ALLOCATABLE :: global_start_index !< index array for surface data (MPI-IO) 4732 INTEGER(iwp), DIMENSION(:,:), ALLOCATABLE :: global_end_index !< end index array for surface data (MPI-IO) 4733 INTEGER(iwp), DIMENSION(:,:), ALLOCATABLE :: global_start_index !< start index array for surface data (MPI-IO) 4731 4734 INTEGER(iwp), DIMENSION(:,:), ALLOCATABLE :: num_surf !< number of surface data at (j,i) 4732 4735 INTEGER(iwp), DIMENSION(:,:), ALLOCATABLE :: start_index !< start index of surface data at (j,i) … … 4811 4814 4812 4815 CALL rd_mpi_io_surface_filetypes( start_index, end_index, surface_data_to_write, & 4813 global_start_index ) 4814 CALL wrd_mpi_io( 'surfaces%start_index', start_index ) 4815 CALL wrd_mpi_io( 'surfaces%end_index', end_index ) 4816 global_start_index, global_end_index ) 4817 4816 4818 CALL wrd_mpi_io( 'surfaces%global_start_index', global_start_index ) 4819 CALL wrd_mpi_io( 'surfaces%global_end_index', global_end_index ) 4817 4820 4818 4821 DO nv = 1, dosurf_no(1) -
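The module above (like land_surface_model_mod and urban_surface_mod) no longer writes the PE-local start/end index arrays; only global_start_index and global_end_index go into the restart file. That is sufficient because the local indices can be rebuilt from the per-gridbox element counts on read, as done in the read branch of rd_mpi_io_surface_filetypes further above. The following standalone sketch repeats that arithmetic for four grid boxes with made-up counts (including the one dummy element per box introduced by the changeset); it is an illustration only, not PALM code:

   PROGRAM local_index_recovery_sketch

      IMPLICIT NONE

      INTEGER, PARAMETER ::  nbox = 4  !< grid boxes handled by this PE (illustration only)

      INTEGER ::  k         !< running index over grid boxes
      INTEGER ::  lo_start  !< next free position in the local surface-data array

      INTEGER, DIMENSION(nbox) ::  global_start = (/ 1, 4, 5, 7 /)   !< first element of each box (incl. dummy)
      INTEGER, DIMENSION(nbox) ::  global_end   = (/ 3, 4, 6, 10 /)  !< last element of each box (incl. dummy)
      INTEGER, DIMENSION(nbox) ::  end_index    !< recovered local end index
      INTEGER, DIMENSION(nbox) ::  start_index  !< recovered local start index


      lo_start = 1
      DO  k = 1, nbox
   !
   !--    The "- 1" removes the dummy element that was added to every grid box on write.
         start_index(k) = lo_start
         end_index(k)   = lo_start + global_end(k) - global_start(k) - 1
         lo_start       = lo_start + global_end(k) - global_start(k)
         PRINT '(A,I2,A,I3,A,I3)', 'box ', k, ':  start = ', start_index(k),                   &
                                   ',  end = ', end_index(k)
      ENDDO

   END PROGRAM local_index_recovery_sketch

A box without real surface elements comes out with end_index < start_index, so the usual start/end loops over surface elements are simply not entered for it.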
palm/trunk/SOURCE/surface_mod.f90
r4882 r4893 25 25 ! ----------------- 26 26 ! $Id$ 27 ! revised output of surface data via MPI-IO for better performance 28 ! 29 ! 4882 2021-02-19 22:49:44Z forkel 27 30 ! removed lsp in subroutine nitialize_top 28 !29 31 ! 30 32 ! 4881 2021-02-19 22:05:08Z forkel 31 33 ! removed constant_top_csflux option 32 !33 34 ! 34 35 ! 4877 2021-02-17 16:17:35Z suehring … … 1484 1485 !> Allocating memory for upward and downward-facing horizontal surface types, except for top fluxes. 1485 1486 !--------------------------------------------------------------------------------------------------! 1486 SUBROUTINE allocate_surface_attributes_h( surfaces, nys_l, nyn_l, nxl_l, nxr_l ) 1487 SUBROUTINE allocate_surface_attributes_h( surfaces, nys_l, nyn_l, nxl_l, nxr_l, & 1488 no_allocate_index_arrays ) 1487 1489 1488 1490 IMPLICIT NONE … … 1493 1495 INTEGER(iwp) :: nxr_l !< east bound of local 2d array start/end_index, is equal to nyn, except for restart-array 1494 1496 1497 LOGICAL :: allocate_index_arrays 1498 LOGICAL, INTENT(IN), OPTIONAL :: no_allocate_index_arrays 1499 1495 1500 TYPE(surf_type) :: surfaces !< respective surface type 1496 1501 1502 1503 IF ( PRESENT( no_allocate_index_arrays ) ) THEN 1504 allocate_index_arrays = .NOT. no_allocate_index_arrays 1505 ELSE 1506 allocate_index_arrays = .TRUE. 1507 ENDIF 1497 1508 ! 1498 1509 !-- Allocate arrays for start and end index of horizontal surface type for each (j,i)-grid point. 1499 !-- This is required e.g. in diff ion_x, which is called for each (j,i). In order to find the1510 !-- This is required e.g. in diffusion_x, which is called for each (j,i). In order to find the 1500 1511 !-- location where the respective flux is store within the surface-type, start- and end-index are 1501 1512 !-- stored for each (j,i). For example, each (j,i) can have several entries where fluxes for … … 1503 1514 !-- surfaces might exist for given (j,i). If no surface of respective type exist at current (j,i), 1504 1515 !-- set indicies such that loop in diffusion routines will not be entered. 1505 ALLOCATE ( surfaces%start_index(nys_l:nyn_l,nxl_l:nxr_l) ) 1506 ALLOCATE ( surfaces%end_index(nys_l:nyn_l,nxl_l:nxr_l) ) 1507 surfaces%start_index = 0 1508 surfaces%end_index = -1 1516 IF ( allocate_index_arrays ) THEN 1517 ALLOCATE( surfaces%start_index(nys_l:nyn_l,nxl_l:nxr_l) ) 1518 ALLOCATE( surfaces%end_index(nys_l:nyn_l,nxl_l:nxr_l) ) 1519 surfaces%start_index = 0 1520 surfaces%end_index = -1 1521 ENDIF 1509 1522 ! 1510 1523 !-- Indices to locate surface element … … 2051 2064 !> Allocating memory for vertical surface types. 2052 2065 !--------------------------------------------------------------------------------------------------! 2053 SUBROUTINE allocate_surface_attributes_v( surfaces, nys_l, nyn_l, nxl_l, nxr_l ) 2066 SUBROUTINE allocate_surface_attributes_v( surfaces, nys_l, nyn_l, nxl_l, nxr_l, & 2067 no_allocate_index_arrays ) 2054 2068 2055 2069 IMPLICIT NONE … … 2060 2074 INTEGER(iwp) :: nxr_l !< east bound of local 2d array start/end_index, is equal to nyn, except for restart-array 2061 2075 2076 LOGICAL :: allocate_index_arrays 2077 LOGICAL, INTENT(IN), OPTIONAL :: no_allocate_index_arrays 2078 2062 2079 TYPE(surf_type) :: surfaces !< respective surface type 2063 2080 2081 2082 IF ( PRESENT( no_allocate_index_arrays ) ) THEN 2083 allocate_index_arrays = .NOT. no_allocate_index_arrays 2084 ELSE 2085 allocate_index_arrays = .TRUE. 2086 ENDIF 2087 2064 2088 ! 2065 2089 !-- Allocate arrays for start and end index of vertical surface type for each (j,i)-grid point. 
This 2066 !-- is required in diff ion_x, which is called for each (j,i). In order to find the location where2090 !-- is required in diffusion_x, which is called for each (j,i). In order to find the location where 2067 2091 !-- the respective flux is store within the surface-type, start- and end-index are stored for each 2068 2092 !-- (j,i). For example, each (j,i) can have several entries where fluxes for vertical surfaces might 2069 2093 !-- be stored. In the flat case, where no vertical walls exit, set indicies such that loop in 2070 2094 !-- diffusion routines will not be entered. 2071 ALLOCATE ( surfaces%start_index(nys_l:nyn_l,nxl_l:nxr_l) ) 2072 ALLOCATE ( surfaces%end_index(nys_l:nyn_l,nxl_l:nxr_l) ) 2073 surfaces%start_index = 0 2074 surfaces%end_index = -1 2095 IF ( allocate_index_arrays ) THEN 2096 ALLOCATE( surfaces%start_index(nys_l:nyn_l,nxl_l:nxr_l) ) 2097 ALLOCATE( surfaces%end_index(nys_l:nyn_l,nxl_l:nxr_l) ) 2098 surfaces%start_index = 0 2099 surfaces%end_index = -1 2100 ENDIF 2075 2101 ! 2076 2102 !-- Indices to locate surface element. … … 3141 3167 INTEGER(iwp), DIMENSION(0:3) :: start_index_v !< start index for vertical surface elements on gathered surface array 3142 3168 3143 INTEGER(iwp),DIMENSION(nys:nyn,nxl:nxr) :: global_start_index !< index for surface data (MPI-IO) 3169 INTEGER(iwp),DIMENSION(nys:nyn,nxl:nxr) :: global_end_index !< end index for surface data (MPI-IO) 3170 INTEGER(iwp),DIMENSION(nys:nyn,nxl:nxr) :: global_start_index !< start index for surface data (MPI-IO) 3144 3171 3145 3172 LOGICAL :: surface_data_to_write !< switch for MPI-I/O if PE has surface data to write … … 3988 4015 3989 4016 CALL rd_mpi_io_surface_filetypes( surf_h(l)%start_index, surf_h(l)%end_index, & 3990 surface_data_to_write, global_start_index ) 4017 surface_data_to_write, global_start_index, & 4018 global_end_index ) 3991 4019 IF ( .NOT. surface_data_to_write ) CYCLE 3992 4020 3993 4021 ns_h_on_file(l) = total_number_of_surface_values 3994 4022 3995 CALL wrd_mpi_io( 'surf_h(' // dum // ')%start_index', surf_h(l)%start_index )3996 CALL wrd_mpi_io( 'surf_h(' // dum // ')%end_index', surf_h(l)%end_index )3997 4023 CALL wrd_mpi_io( 'global_start_index_h_' // dum, global_start_index ) 4024 CALL wrd_mpi_io( 'global_end_index_h_' // dum, global_end_index ) 3998 4025 3999 4026 IF ( ALLOCATED ( surf_h(l)%us ) ) THEN … … 4121 4148 4122 4149 CALL rd_mpi_io_surface_filetypes( surf_v(l)%start_index, surf_v(l)%end_index, & 4123 surface_data_to_write, global_start_index ) 4150 surface_data_to_write, global_start_index, & 4151 global_end_index ) 4152 4153 IF ( .NOT. surface_data_to_write ) CYCLE 4124 4154 4125 4155 ns_v_on_file(l) = total_number_of_surface_values 4126 4156 4127 CALL wrd_mpi_io( 'surf_v(' // dum // ')%start_index', surf_v(l)%start_index )4128 CALL wrd_mpi_io( 'surf_v(' // dum // ')%end_index', surf_v(l)%end_index )4129 4157 CALL wrd_mpi_io( 'global_start_index_v_' // dum, global_start_index ) 4158 CALL wrd_mpi_io( 'global_end_index_v_' // dum, global_end_index ) 4130 4159 4131 4160 IF ( .NOT. 
surface_data_to_write ) CYCLE … … 5381 5410 INTEGER(iwp) :: mm !< loop index for surface types - file array 5382 5411 5383 INTEGER(iwp), DIMENSION(nys:nyn,nxl:nxr) :: global_start_index !< index for surface data (MPI-IO) 5384 5385 LOGICAL :: ldum !< dummy variable 5412 INTEGER(iwp), DIMENSION(nys:nyn,nxl:nxr) :: global_end_index !< end index for surface data (MPI-IO) 5413 INTEGER(iwp), DIMENSION(nys:nyn,nxl:nxr) :: global_start_index !< start index for surface data (MPI-IO) 5414 5415 LOGICAL :: data_to_read !< cycle in l loop, if no values to read 5386 5416 LOGICAL :: surf_match_def !< flag indicating that surface element is of default type 5387 5417 LOGICAL :: surf_match_lsm !< flag indicating that surface element is of natural type … … 5401 5431 IF ( ns_h_on_file(l) == 0 ) CYCLE !< No data of this surface type on file 5402 5432 5433 WRITE( dum, '(I1)') l 5434 5403 5435 IF ( ALLOCATED( surf_h(l)%start_index ) ) CALL deallocate_surface_attributes_h( surf_h(l) ) 5404 surf_h(l)%ns = ns_h_on_file(l) 5405 CALL allocate_surface_attributes_h( surf_h(l), nys, nyn, nxl, nxr ) 5406 5407 WRITE( dum, '(I1)') l 5408 5409 CALL rrd_mpi_io( 'surf_h(' // dum // ')%start_index', surf_h(l)%start_index ) 5410 CALL rrd_mpi_io( 'surf_h(' // dum // ')%end_index', surf_h(l)%end_index ) 5436 5437 ALLOCATE( surf_h(l)%start_index(nys:nyn,nxl:nxr) ) 5438 ALLOCATE( surf_h(l)%end_index(nys:nyn,nxl:nxr) ) 5439 surf_h(l)%start_index = 0 5440 surf_h(l)%end_index = -1 5441 5411 5442 CALL rrd_mpi_io( 'global_start_index_h_' // dum , global_start_index ) 5412 5413 CALL rd_mpi_io_surface_filetypes( surf_h(l)%start_index, surf_h(l)%end_index, ldum, & 5414 global_start_index ) 5443 CALL rrd_mpi_io( 'global_end_index_h_' // dum , global_end_index ) 5444 5445 CALL rd_mpi_io_surface_filetypes( surf_h(l)%start_index, surf_h(l)%end_index, data_to_read, & 5446 global_start_index, global_end_index ) 5447 5448 surf_h(l)%ns = MAX( 2, MAXVAL( surf_h(l)%end_index ) ) 5449 5450 CALL allocate_surface_attributes_h( surf_h(l), nys, nyn, nxl, nxr, & 5451 no_allocate_index_arrays = .TRUE. ) 5452 IF ( .NOT. data_to_read ) CYCLE 5415 5453 5416 5454 IF ( ALLOCATED ( surf_h(l)%us ) ) THEN … … 5539 5577 5540 5578 IF ( ALLOCATED( surf_v(l)%start_index ) ) CALL deallocate_surface_attributes_v( surf_v(l) ) 5541 surf_v(l)%ns = ns_v_on_file(l) 5542 CALL allocate_surface_attributes_v( surf_v(l), nys, nyn, nxl, nxr ) 5543 5544 WRITE( dum, '(I1)' ) l 5545 5546 CALL rrd_mpi_io( 'surf_v(' // dum // ')%start_index', surf_v(l)%start_index ) 5547 CALL rrd_mpi_io( 'surf_v(' // dum // ')%end_index', surf_v(l)%end_index ) 5579 5580 ALLOCATE( surf_v(l)%start_index(nys:nyn,nxl:nxr) ) 5581 ALLOCATE( surf_v(l)%end_index(nys:nyn,nxl:nxr) ) 5582 surf_v(l)%start_index = 0 5583 surf_v(l)%end_index = -1 5584 5585 WRITE( dum, '(I1)' ) l 5586 5548 5587 CALL rrd_mpi_io( 'global_start_index_v_' // dum , global_start_index ) 5549 5550 CALL rd_mpi_io_surface_filetypes( surf_v(l)%start_index, surf_v(l)%end_index, ldum, & 5551 global_start_index ) 5588 CALL rrd_mpi_io( 'global_end_index_v_' // dum , global_end_index ) 5589 5590 CALL rd_mpi_io_surface_filetypes( surf_v(l)%start_index, surf_v(l)%end_index, data_to_read, & 5591 global_start_index, global_end_index ) 5592 IF ( .NOT. data_to_read ) CYCLE 5593 5594 surf_v(l)%ns = MAX( 2, MAXVAL( surf_v(l)%end_index ) ) 5595 CALL allocate_surface_attributes_v( surf_v(l), nys, nyn, nxl, nxr, & 5596 no_allocate_index_arrays = .TRUE. ) 5552 5597 5553 5598 IF ( ALLOCATED ( surf_v(l)%us ) ) THEN -
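The new optional argument no_allocate_index_arrays introduced above lets the MPI-IO restart reader allocate and fill the start/end index arrays itself before requesting the remaining surface attributes. The following standalone Fortran sketch is illustrative only: the program, the subroutine name allocate_demo, and the bounds passed to it are invented for this example; only the handling of the optional switch mirrors the pattern used in the changeset.

PROGRAM optional_flag_demo

   IMPLICIT NONE

   INTEGER, ALLOCATABLE :: start_index(:,:)  !< stand-in for surfaces%start_index

!
!-- Restart path: the caller sets up the index arrays itself, so allocation is suppressed.
   CALL allocate_demo( 0, 3, 0, 3, no_allocate_index_arrays = .TRUE. )
!
!-- Default path: the routine allocates and initializes the index arrays as before.
   CALL allocate_demo( 0, 3, 0, 3 )

CONTAINS

   SUBROUTINE allocate_demo( nys_l, nyn_l, nxl_l, nxr_l, no_allocate_index_arrays )

      INTEGER, INTENT(IN)           :: nys_l  !< south bound of local index range
      INTEGER, INTENT(IN)           :: nyn_l  !< north bound of local index range
      INTEGER, INTENT(IN)           :: nxl_l  !< west bound of local index range
      INTEGER, INTENT(IN)           :: nxr_l  !< east bound of local index range

      LOGICAL, INTENT(IN), OPTIONAL :: no_allocate_index_arrays  !< suppress allocation if .TRUE.

      LOGICAL :: allocate_index_arrays  !< local flag derived from the optional argument

!
!--   Absence of the optional argument means "allocate as before".
      IF ( PRESENT( no_allocate_index_arrays ) )  THEN
         allocate_index_arrays = .NOT. no_allocate_index_arrays
      ELSE
         allocate_index_arrays = .TRUE.
      ENDIF

      IF ( allocate_index_arrays  .AND.  .NOT. ALLOCATED( start_index ) )  THEN
         ALLOCATE( start_index(nys_l:nyn_l,nxl_l:nxr_l) )
         start_index = 0
      ENDIF

      PRINT*, 'index arrays allocated: ', ALLOCATED( start_index )

   END SUBROUTINE allocate_demo

END PROGRAM optional_flag_demo

The first call prints F, the second prints T, which is the behaviour the revised allocate_surface_attributes_h/v routines rely on when called from the restart reader.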
palm/trunk/SOURCE/urban_surface_mod.f90
r4872 r4893 27 27 ! ----------------- 28 28 ! $Id$ 29 ! revised output of surface data via MPI-IO for better performance 30 ! 31 ! 4872 2021-02-12 15:49:02Z raasch 29 32 ! internal switch removed from namelist 30 33 ! … … 6608 6611 SUBROUTINE usm_rrd_local_mpi 6609 6612 6610 6611 6613 CHARACTER(LEN=1) :: dum !< dummy string to create input-variable name 6612 6614 6613 6615 INTEGER(iwp) :: l !< loop index for surface types 6614 6616 6615 INTEGER(iwp), DIMENSION(nys:nyn,nxl:nxr) :: global_start 6616 6617 LOGICAL :: ldum !< dummy variable 6617 INTEGER(iwp), DIMENSION(nys:nyn,nxl:nxr) :: global_end_index 6618 INTEGER(iwp), DIMENSION(nys:nyn,nxl:nxr) :: global_start_index 6619 6620 LOGICAL :: data_to_read !< dummy variable 6621 6618 6622 6619 6623 DO l = 0, 1 … … 6621 6625 WRITE( dum, '(I1)' ) l 6622 6626 6623 CALL rrd_mpi_io( 'usm_start_index_h_' //dum, surf_usm_h(l)%start_index ) 6624 CALL rrd_mpi_io( 'usm_end_index_h_' //dum, surf_usm_h(l)%end_index ) 6625 CALL rrd_mpi_io( 'usm_global_start_h_' //dum, global_start ) 6626 6627 CALL rd_mpi_io_surface_filetypes( surf_usm_h(l)%start_index, surf_usm_h(l)%end_index, ldum, & 6628 global_start ) 6629 6630 IF ( MAXVAL( surf_usm_h(l)%end_index ) <= 0 ) CYCLE 6631 6632 IF ( .NOT. ALLOCATED( t_surf_wall_h_1(l)%val ) ) & 6627 CALL rrd_mpi_io( 'usm_global_start_h_' //dum, global_start_index ) 6628 CALL rrd_mpi_io( 'usm_global_end_h_' //dum, global_end_index ) 6629 6630 CALL rd_mpi_io_surface_filetypes( surf_usm_h(l)%start_index, surf_usm_h(l)%end_index, & 6631 data_to_read, global_start_index, global_end_index ) 6632 IF ( .NOT. data_to_read ) CYCLE 6633 6634 IF ( .NOT. ALLOCATED( t_surf_wall_h_1(l)%val ) ) & 6633 6635 ALLOCATE( t_surf_wall_h_1(l)%val(1:surf_usm_h(l)%ns) ) 6634 6636 CALL rrd_mpi_io_surface( 't_surf_wall_h(' // dum // ')', t_surf_wall_h_1(l)%val ) 6635 6637 6636 IF ( .NOT. ALLOCATED( t_surf_window_h_1(l)%val ) )&6638 IF ( .NOT. ALLOCATED( t_surf_window_h_1(l)%val ) ) & 6637 6639 ALLOCATE( t_surf_window_h_1(l)%val(1:surf_usm_h(l)%ns) ) 6638 6640 CALL rrd_mpi_io_surface( 't_surf_window_h(' // dum // ')', t_surf_window_h_1(l)%val ) 6639 6641 6640 IF ( .NOT. ALLOCATED( t_surf_green_h_1(l)%val ) )&6642 IF ( .NOT. ALLOCATED( t_surf_green_h_1(l)%val ) ) & 6641 6643 ALLOCATE( t_surf_green_h_1(l)%val(1:surf_usm_h(l)%ns) ) 6642 6644 CALL rrd_mpi_io_surface( 't_surf_green_h(' // dum // ')', t_surf_green_h_1(l)%val ) 6643 6645 6644 IF ( .NOT. ALLOCATED( m_liq_usm_h_1(l)%val ) )&6646 IF ( .NOT. ALLOCATED( m_liq_usm_h_1(l)%val ) ) & 6645 6647 ALLOCATE( m_liq_usm_h_1(l)%val(1:surf_usm_h(l)%ns) ) 6646 6648 CALL rrd_mpi_io_surface( 'm_liq_usm_h(' // dum // ')', m_liq_usm_h_1(l)%val ) 6647 6649 6648 6650 IF ( indoor_model ) THEN 6649 IF ( .NOT. ALLOCATED( surf_usm_h(l)%waste_heat ) )&6651 IF ( .NOT. ALLOCATED( surf_usm_h(l)%waste_heat ) ) & 6650 6652 ALLOCATE( surf_usm_h(l)%waste_heat(1:surf_usm_h(l)%ns) ) 6651 6653 CALL rrd_mpi_io_surface( 'waste_heat_h(' // dum // ')', surf_usm_h(l)%waste_heat ) 6652 IF ( .NOT. ALLOCATED( surf_usm_h(l)%t_prev ) )&6654 IF ( .NOT. 
ALLOCATED( surf_usm_h(l)%t_prev ) ) & 6653 6655 ALLOCATE( surf_usm_h(l)%t_prev(1:surf_usm_h(l)%ns) ) 6654 6656 CALL rrd_mpi_io_surface( 't_prev_h(' // dum // ')', surf_usm_h(l)%t_prev ) … … 6660 6662 WRITE( dum, '(I1)' ) l 6661 6663 6662 CALL rrd_mpi_io( 'usm_start_index_v_' //dum, surf_usm_v(l)%start_index ) 6663 CALL rrd_mpi_io( 'usm_end_index_v_' // dum, surf_usm_v(l)%end_index ) 6664 CALL rrd_mpi_io( 'usm_global_start_v_' // dum, global_start ) 6665 6666 CALL rd_mpi_io_surface_filetypes( surf_usm_v(l)%start_index, surf_usm_v(l)%end_index, ldum, & 6667 global_start ) 6668 6669 IF ( MAXVAL( surf_usm_v(l)%end_index ) <= 0 ) CYCLE 6670 6671 IF ( .NOT. ALLOCATED( t_surf_wall_v_1(l)%val ) ) & 6664 CALL rrd_mpi_io( 'usm_global_start_v_' // dum, global_start_index ) 6665 CALL rrd_mpi_io( 'usm_global_end_v_' // dum, global_end_index ) 6666 6667 CALL rd_mpi_io_surface_filetypes( surf_usm_v(l)%start_index, surf_usm_v(l)%end_index, & 6668 data_to_read, global_start_index, global_end_index ) 6669 IF ( .NOT. data_to_read ) CYCLE 6670 6671 IF ( .NOT. ALLOCATED( t_surf_wall_v_1(l)%val ) ) & 6672 6672 ALLOCATE( t_surf_wall_v_1(l)%val(1:surf_usm_v(l)%ns) ) 6673 6673 CALL rrd_mpi_io_surface( 't_surf_wall_v(' // dum // ')', t_surf_wall_v_1(l)%val ) 6674 6674 6675 IF ( .NOT. ALLOCATED( t_surf_window_v_1(l)%val ) )&6675 IF ( .NOT. ALLOCATED( t_surf_window_v_1(l)%val ) ) & 6676 6676 ALLOCATE( t_surf_window_v_1(l)%val(1:surf_usm_v(l)%ns) ) 6677 6677 CALL rrd_mpi_io_surface( 't_surf_window_v(' // dum // ')', t_surf_window_v_1(l)%val ) 6678 6678 6679 IF ( .NOT. ALLOCATED( t_surf_green_v_1(l)%val ) )&6679 IF ( .NOT. ALLOCATED( t_surf_green_v_1(l)%val ) ) & 6680 6680 ALLOCATE( t_surf_green_v_1(l)%val(1:surf_usm_v(l)%ns) ) 6681 6681 CALL rrd_mpi_io_surface( 't_surf_green_v(' // dum // ')', t_surf_green_v_1(l)%val) 6682 6682 6683 6683 IF ( indoor_model ) THEN 6684 IF ( .NOT. ALLOCATED( surf_usm_v(l)%waste_heat ) )&6684 IF ( .NOT. ALLOCATED( surf_usm_v(l)%waste_heat ) ) & 6685 6685 ALLOCATE( surf_usm_v(l)%waste_heat(1:surf_usm_v(l)%ns) ) 6686 6686 CALL rrd_mpi_io_surface( 'waste_heat_v(' // dum // ')', surf_usm_v(l)%waste_heat ) 6687 IF ( .NOT. ALLOCATED( surf_usm_v(l)%t_prev ) )&6687 IF ( .NOT. ALLOCATED( surf_usm_v(l)%t_prev ) ) & 6688 6688 ALLOCATE( surf_usm_v(l)%t_prev(1:surf_usm_v(l)%ns) ) 6689 6689 CALL rrd_mpi_io_surface( 't_prev_v(' // dum // ')', surf_usm_v(l)%t_prev ) … … 6696 6696 WRITE( dum, '(I1)' ) l 6697 6697 6698 CALL rrd_mpi_io( 'usm_start_index_h_2_' //dum, surf_usm_h(l)%start_index ) 6699 CALL rrd_mpi_io( 'usm_end_index_h_2_' //dum, surf_usm_h(l)%end_index ) 6700 CALL rrd_mpi_io( 'usm_global_start_h_2_' //dum, global_start ) 6701 6702 CALL rd_mpi_io_surface_filetypes( surf_usm_h(l)%start_index, surf_usm_h(l)%end_index, ldum, & 6703 global_start ) 6704 6705 IF ( MAXVAL( surf_usm_h(l)%end_index ) <= 0 ) CYCLE 6706 6707 IF ( .NOT. ALLOCATED( t_wall_h_1(l)%val ) ) & 6698 CALL rrd_mpi_io( 'usm_global_start_h_2_' //dum, global_start_index ) 6699 CALL rrd_mpi_io( 'usm_global_end_h_2_' //dum, global_end_index ) 6700 6701 CALL rd_mpi_io_surface_filetypes( surf_usm_h(l)%start_index, surf_usm_h(l)%end_index, & 6702 data_to_read, global_start_index, global_end_index ) 6703 IF ( .NOT. data_to_read ) CYCLE 6704 6705 IF ( .NOT. ALLOCATED( t_wall_h_1(l)%val ) ) & 6708 6706 ALLOCATE( t_wall_h_1(l)%val(nzb_wall:nzt_wall+1,1:surf_usm_h(l)%ns) ) 6709 6707 CALL rrd_mpi_io_surface( 't_wall_h(' // dum // ')', t_wall_h_1(l)%val ) 6710 6708 6711 IF ( .NOT. ALLOCATED( t_window_h_1(l)%val ) )&6709 IF ( .NOT. 
ALLOCATED( t_window_h_1(l)%val ) ) & 6712 6710 ALLOCATE( t_window_h_1(l)%val(nzb_wall:nzt_wall+1,1:surf_usm_h(l)%ns) ) 6713 6711 CALL rrd_mpi_io_surface( 't_window_h(' // dum // ')', t_window_h_1(l)%val ) 6714 6712 6715 IF ( .NOT. ALLOCATED( t_green_h_1(l)%val ) )&6713 IF ( .NOT. ALLOCATED( t_green_h_1(l)%val ) ) & 6716 6714 ALLOCATE( t_green_h_1(l)%val(nzb_wall:nzt_wall+1,1:surf_usm_h(l)%ns) ) 6717 6715 CALL rrd_mpi_io_surface( 't_green_h(' // dum // ')', t_green_h_1(l)%val ) … … 6723 6721 WRITE( dum, '(I1)' ) l 6724 6722 6725 CALL rrd_mpi_io( 'usm_start_index_v_2_' //dum, surf_usm_v(l)%start_index ) 6726 CALL rrd_mpi_io( 'usm_end_index_v_2_' // dum, surf_usm_v(l)%end_index ) 6727 CALL rrd_mpi_io( 'usm_global_start_v_2_' // dum, global_start ) 6728 6729 CALL rd_mpi_io_surface_filetypes( surf_usm_v(l)%start_index, surf_usm_v(l)%end_index, ldum, & 6730 global_start ) 6731 6732 IF ( MAXVAL( surf_usm_v(l)%end_index ) <= 0 ) CYCLE 6733 6734 IF ( .NOT. ALLOCATED( t_wall_v_1(l)%val ) ) & 6723 CALL rrd_mpi_io( 'usm_global_start_v_2_' // dum, global_start_index ) 6724 CALL rrd_mpi_io( 'usm_global_end_v_2_' // dum, global_end_index ) 6725 6726 CALL rd_mpi_io_surface_filetypes( surf_usm_v(l)%start_index, surf_usm_v(l)%end_index, & 6727 data_to_read, global_start_index, global_end_index ) 6728 IF ( .NOT. data_to_read ) CYCLE 6729 6730 IF ( .NOT. ALLOCATED( t_wall_v_1(l)%val ) ) & 6735 6731 ALLOCATE ( t_wall_v_1(l)%val(nzb_wall:nzt_wall+1,1:surf_usm_v(l)%ns) ) 6736 6732 CALL rrd_mpi_io_surface( 't_wall_v(' // dum // ')', t_wall_v_1(l)%val ) 6737 6733 6738 IF ( .NOT. ALLOCATED( t_window_v_1(l)%val ) ) 6734 IF ( .NOT. ALLOCATED( t_window_v_1(l)%val ) ) & 6739 6735 ALLOCATE ( t_window_v_1(l)%val(nzb_wall:nzt_wall+1,1:surf_usm_v(l)%ns) ) 6740 6736 CALL rrd_mpi_io_surface( 't_window_v(' // dum // ')', t_window_v_1(l)%val ) 6741 6737 6742 IF ( .NOT. ALLOCATED( t_green_v_1(l)%val ) ) 6738 IF ( .NOT. ALLOCATED( t_green_v_1(l)%val ) ) & 6743 6739 ALLOCATE ( t_green_v_1(l)%val(nzb_wall:nzt_wall+1,1:surf_usm_v(l)%ns) ) 6744 6740 CALL rrd_mpi_io_surface( 't_green_v(' // dum // ')', t_green_v_1(l)%val ) … … 6747 6743 6748 6744 END SUBROUTINE usm_rrd_local_mpi 6745 6749 6746 6750 6747 !--------------------------------------------------------------------------------------------------! … … 7422 7419 INTEGER(iwp) :: l !< index surface type orientation 7423 7420 7424 INTEGER(iwp), DIMENSION(nys:nyn,nxl:nxr) :: global_start_index !< index for surface data (MPI-IO) 7421 INTEGER(iwp), DIMENSION(nys:nyn,nxl:nxr) :: global_end_index !< end index for surface data (MPI-IO) 7422 INTEGER(iwp), DIMENSION(nys:nyn,nxl:nxr) :: global_start_index !< start index for surface data (MPI-IO) 7425 7423 7426 7424 LOGICAL :: surface_data_to_write !< switch for MPI-I/O if PE has surface data to write … … 7548 7546 WRITE( dum, '(I1)') l 7549 7547 7550 CALL rd_mpi_io_surface_filetypes( surf_usm_h(l)%start_index, surf_usm_h(l)%end_index, & 7551 surface_data_to_write, global_start_index ) 7552 7553 CALL wrd_mpi_io( 'usm_start_index_h_' // dum, surf_usm_h(l)%start_index ) 7554 CALL wrd_mpi_io( 'usm_end_index_h_' // dum, surf_usm_h(l)%end_index ) 7548 CALL rd_mpi_io_surface_filetypes( surf_usm_h(l)%start_index, surf_usm_h(l)%end_index, & 7549 surface_data_to_write, global_start_index, & 7550 global_end_index ) 7551 7555 7552 CALL wrd_mpi_io( 'usm_global_start_h_' // dum, global_start_index ) 7553 CALL wrd_mpi_io( 'usm_global_end_h_' // dum, global_end_index ) 7556 7554 7557 7555 IF ( .NOT. 
surface_data_to_write ) CYCLE … … 7574 7572 7575 7573 CALL rd_mpi_io_surface_filetypes( surf_usm_v(l)%start_index, surf_usm_v(l)%end_index, & 7576 surface_data_to_write, global_start_index ) 7577 7578 CALL wrd_mpi_io( 'usm_start_index_v_' // dum, surf_usm_v(l)%start_index ) 7579 CALL wrd_mpi_io( 'usm_end_index_v_' // dum, surf_usm_v(l)%end_index ) 7574 surface_data_to_write, global_start_index, & 7575 global_end_index ) 7576 7580 7577 CALL wrd_mpi_io( 'usm_global_start_v_' // dum, global_start_index ) 7578 CALL wrd_mpi_io( 'usm_global_end_v_' // dum, global_end_index ) 7581 7579 7582 7580 IF ( .NOT. surface_data_to_write ) CYCLE … … 7596 7594 WRITE( dum, '(I1)') l 7597 7595 7598 CALL rd_mpi_io_surface_filetypes( surf_usm_h(l)%start_index, surf_usm_h(l)%end_index, & 7599 surface_data_to_write, global_start_index ) 7600 7601 CALL wrd_mpi_io( 'usm_start_index_h_2_' // dum, surf_usm_h(l)%start_index ) 7602 CALL wrd_mpi_io( 'usm_end_index_h_2_' // dum, surf_usm_h(l)%end_index ) 7596 CALL rd_mpi_io_surface_filetypes( surf_usm_h(l)%start_index, surf_usm_h(l)%end_index, & 7597 surface_data_to_write, global_start_index, & 7598 global_end_index ) 7599 7603 7600 CALL wrd_mpi_io( 'usm_global_start_h_2_' // dum, global_start_index ) 7601 CALL wrd_mpi_io( 'usm_global_end_h_2_' // dum, global_end_index ) 7604 7602 7605 7603 IF ( .NOT. surface_data_to_write ) CYCLE … … 7616 7614 7617 7615 CALL rd_mpi_io_surface_filetypes( surf_usm_v(l)%start_index, surf_usm_v(l)%end_index, & 7618 surface_data_to_write, global_start_index ) 7619 7620 CALL wrd_mpi_io( 'usm_start_index_v_2_' //dum, surf_usm_v(l)%start_index ) 7621 CALL wrd_mpi_io( 'usm_end_index_v_2_' // dum, surf_usm_v(l)%end_index ) 7616 surface_data_to_write, global_start_index, & 7617 global_end_index ) 7618 7622 7619 CALL wrd_mpi_io( 'usm_global_start_v_2_' // dum, global_start_index ) 7620 CALL wrd_mpi_io( 'usm_global_end_v_2_' // dum, global_end_index ) 7623 7621 7624 7622 IF ( .NOT. surface_data_to_write ) CYCLE -
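All read blocks of usm_rrd_local_mpi above now share one pattern: build the orientation-dependent variable name, read only the global start/end index arrays, let rd_mpi_io_surface_filetypes report via data_to_read whether the PE holds surface elements of that orientation, and CYCLE before the per-surface reads otherwise. The standalone sketch below is a simplification with invented stand-ins (stub_filetypes replaces the real MPI-IO query, the element counts are made up); it only illustrates that control flow.

PROGRAM usm_read_pattern_demo

   IMPLICIT NONE

   CHARACTER(LEN=1)  :: dum            !< orientation digit appended to variable names
   INTEGER           :: l              !< loop index over surface orientations
   INTEGER           :: ns             !< number of surface elements of this orientation
   LOGICAL           :: data_to_read   !< switch returned by the (stubbed) filetype query
   REAL, ALLOCATABLE :: val(:)         !< stand-in for e.g. t_surf_wall_h_1(l)%val

   DO  l = 0, 1

      WRITE( dum, '(I1)' )  l

      CALL stub_filetypes( l, ns, data_to_read )
      IF ( .NOT. data_to_read )  CYCLE
!
!--   Only reached when this PE actually holds data of orientation l.
      IF ( .NOT. ALLOCATED( val ) )  ALLOCATE( val(1:ns) )
      PRINT*, 'would read t_surf_wall_h(' // dum // ') with ', ns, ' elements'

   ENDDO

CONTAINS

   SUBROUTINE stub_filetypes( l, ns, data_to_read )
!
!--   Stand-in for rd_mpi_io_surface_filetypes: pretend only orientation 0 holds data.
      INTEGER, INTENT(IN)  :: l
      INTEGER, INTENT(OUT) :: ns
      LOGICAL, INTENT(OUT) :: data_to_read

      ns           = MERGE( 8, 0, l == 0 )
      data_to_read = ( l == 0 )

   END SUBROUTINE stub_filetypes

END PROGRAM usm_read_pattern_demo

Compared with the removed code, the explicit MAXVAL test on the end-index array is no longer needed because the filetype query itself returns the switch used for the CYCLE.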
palm/trunk/SOURCE/write_restart_data_mod.f90
r4848 r4893 24 24 ! ----------------- 25 25 ! $Id$ 26 ! version number update because of revised output of surface data via MPI-IO for better performance 27 ! 28 ! 4848 2021-01-21 15:51:51Z gronemeier 26 29 ! bugfix: removed syn_turb_gen from restart files 27 30 ! … … 201 204 INTEGER :: i !< loop index 202 205 203 binary_version_global = '5. 2'206 binary_version_global = '5.3' 204 207 205 208 IF ( restart_data_format_output == 'fortran_binary' ) THEN
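binary_version_global is raised from '5.2' to '5.3' because the surface-data layout of the restart file changes with this revision. The sketch below is not the PALM version check itself, only a generic illustration with hypothetical variable names of why the bump matters: a reader that compares the version string recorded in the file with the one it expects can reject restart files written with the old layout.

PROGRAM version_check_demo

   IMPLICIT NONE

   CHARACTER(LEN=10) :: version_expected = '5.3'  !< layout this reader understands
   CHARACTER(LEN=10) :: version_on_file  = '5.2'  !< pretend value read from an old restart file

   IF ( TRIM( version_on_file ) /= TRIM( version_expected ) )  THEN
      PRINT*, 'restart file version ', TRIM( version_on_file ),                                    &
              ' does not match expected version ', TRIM( version_expected )
   ELSE
      PRINT*, 'restart file version matches'
   ENDIF

END PROGRAM version_check_demo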