Changeset 3657 for palm/trunk/SOURCE


Ignore:
Timestamp:
Jan 7, 2019 8:14:18 PM (3 years ago)
Author:
knoop
Message:

OpenACC: added CUDA-aware MPI support in transpose and asynchronous ACC UPDATE directives in exchange_horiz

Location:
palm/trunk/SOURCE
Files:
2 edited

Legend:

Unmodified
Added
Removed
  • palm/trunk/SOURCE/exchange_horiz.f90

    r3655 r3657  
    132132
    133133#ifdef _OPENACC
    134     !$ACC UPDATE IF_PRESENT &
     134    !$ACC UPDATE IF_PRESENT ASYNC(1) &
    135135    !$ACC HOST(ar(:,:,nxr-nbgp_local+1:nxr)) &
    136136    !$ACC HOST(ar(:,:,nxl:nxl+nbgp_local-1))
    137     DO i = nxl-nbgp_local, nxr+nbgp_local
    138        !$ACC UPDATE IF_PRESENT &
     137
     138!
     139!-- Wait for first UPDATE to complete before starting the others.
     140    !$ACC WAIT(1) ASYNC(2)
     141    ! ar(:,:,nxl-nbgp_local:nxl-1) is overwritten by first part below
     142    ! ar(:,:,nxl:nxl+nbgp_local-1) has been transferred above
     143    DO i = nxl+nbgp_local, nxr-nbgp_local
     144       !$ACC UPDATE IF_PRESENT ASYNC(2) &
    139145       !$ACC HOST(ar(:,nyn-nbgp_local+1:nyn,i)) &
    140146       !$ACC HOST(ar(:,nys:nys+nbgp_local-1,i))
    141147    ENDDO
     148    ! ar(:,:,nxr-nbgp_local+1:nxr) has been transferred above
     149    ! ar(:,:,nxr+1:nxr+nbgp_local) is overwritten by first part below
     150
     151!
     152!-- Wait for first UPDATE to complete before starting MPI.
     153    !$ACC WAIT(1)
    142154#endif
    143155
     
    203215    ENDIF
    204216
     217    !$ACC UPDATE IF_PRESENT ASYNC(1) &
     218    !$ACC DEVICE(ar(:,:,nxl-nbgp_local:nxl-1)) &
     219    !$ACC DEVICE(ar(:,:,nxr+1:nxr+nbgp_local))
     220
     221!
     222!-- Wait for UPDATES above to complete before starting MPI.
     223    !$ACC WAIT(2)
    205224
    206225    IF ( pdims(2) == 1  .OR.  mg_switch_to_pe0 )  THEN
     
    274293    ENDIF
    275294
     295    !$ACC UPDATE IF_PRESENT ASYNC(1) &
     296    !$ACC DEVICE(ar(:,:,nxl-nbgp_local:nxl-1)) &
     297    !$ACC DEVICE(ar(:,:,nxr+1:nxr+nbgp_local))
     298
     299!
     300!-- Wait for UPDATES above to complete before starting MPI.
     301    !$ACC WAIT(2)
     302
    276303    IF ( bc_ns_cyc )  THEN
    277304       ar(:,nys-nbgp_local:nys-1,:) = ar(:,nyn-nbgp_local+1:nyn,:)
     
    282309
    283310#ifdef _OPENACC
    284     !$ACC UPDATE IF_PRESENT &
    285     !$ACC DEVICE(ar(:,:,nxl-nbgp_local:nxl-1)) &
    286     !$ACC DEVICE(ar(:,:,nxr+1:nxr+nbgp_local))
    287311    DO i = nxl-nbgp_local, nxr+nbgp_local
    288        !$ACC UPDATE IF_PRESENT &
     312       !$ACC UPDATE IF_PRESENT ASYNC(2) &
    289313       !$ACC DEVICE(ar(:,nys-nbgp_local:nys-1,i)) &
    290314       !$ACC DEVICE(ar(:,nyn+1:nyn+nbgp_local,i))
    291315    ENDDO
     316
     317!
     318!-- Wait for all UPDATEs to finish.
     319    !$ACC WAIT
    292320#endif
    293321
  • palm/trunk/SOURCE/transpose.f90

    r3655 r3657  
    180180!--    Transpose array
    181181       CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start', cpu_log_nowait )
     182#ifndef __cuda_aware_mpi
    182183       !$ACC UPDATE HOST(f_inv)
     184#else
     185       !$ACC HOST_DATA USE_DEVICE(work, f_inv)
     186#endif
    183187       IF ( collective_wait )  CALL MPI_BARRIER( comm2d, ierr )
    184188       CALL MPI_ALLTOALL( f_inv(nys_x,nzb_x,0),  sendrecvcount_xy, MPI_REAL, &
    185189                          work(1,nzb_y,nxl_y,0), sendrecvcount_xy, MPI_REAL, &
    186190                          comm1dy, ierr )
     191#ifndef __cuda_aware_mpi
    187192       !$ACC UPDATE DEVICE(work)
     193#else
     194       !$ACC END HOST_DATA
     195#endif
    188196       CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
    189197
     
    337345!--    Transpose array
    338346       CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start', cpu_log_nowait )
     347#ifndef __cuda_aware_mpi
    339348       !$ACC UPDATE HOST(work)
     349#else
     350       !$ACC HOST_DATA USE_DEVICE(work, f_inv)
     351#endif
    340352       IF ( collective_wait )  CALL MPI_BARRIER( comm2d, ierr )
    341353       CALL MPI_ALLTOALL( work(nys_x,1,nzb_x,0), sendrecvcount_zx, MPI_REAL, &
    342354                          f_inv(nys,nxl,1),      sendrecvcount_zx, MPI_REAL, &
    343355                          comm1dx, ierr )
     356#ifndef __cuda_aware_mpi
    344357       !$ACC UPDATE DEVICE(f_inv)
     358#else
     359       !$ACC END HOST_DATA
     360#endif
    345361       CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
    346362#endif
     
    475491!--    Transpose array
    476492       CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start', cpu_log_nowait )
     493#ifndef __cuda_aware_mpi
    477494       !$ACC UPDATE HOST(work)
     495#else
     496       !$ACC HOST_DATA USE_DEVICE(work, f_inv)
     497#endif
    478498       IF ( collective_wait )  CALL MPI_BARRIER( comm2d, ierr )
    479499       CALL MPI_ALLTOALL( work(1,nzb_y,nxl_y,0), sendrecvcount_xy, MPI_REAL, &
    480500                          f_inv(nys_x,nzb_x,0),  sendrecvcount_xy, MPI_REAL, &
    481501                          comm1dy, ierr )
     502#ifndef __cuda_aware_mpi
    482503       !$ACC UPDATE DEVICE(f_inv)
     504#else
     505       !$ACC END HOST_DATA
     506#endif
    483507       CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
    484508#endif
     
    693717!--    Transpose array
    694718       CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start', cpu_log_nowait )
     719#ifndef __cuda_aware_mpi
    695720       !$ACC UPDATE HOST(f_inv)
     721#else
     722       !$ACC HOST_DATA USE_DEVICE(work, f_inv)
     723#endif
    696724       IF ( collective_wait )  CALL MPI_BARRIER( comm2d, ierr )
    697725       CALL MPI_ALLTOALL( f_inv(nxl_y,nzb_y,0),  sendrecvcount_yz, MPI_REAL, &
    698726                          work(nxl_z,1,nys_z,0), sendrecvcount_yz, MPI_REAL, &
    699727                          comm1dx, ierr )
     728#ifndef __cuda_aware_mpi
    700729       !$ACC UPDATE DEVICE(work)
     730#else
     731       !$ACC END HOST_DATA
     732#endif
    701733       CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
    702734
     
    829861!--    Transpose array
    830862       CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start', cpu_log_nowait )
     863#ifndef __cuda_aware_mpi
    831864       !$ACC UPDATE HOST(f_inv)
     865#else
     866       !$ACC HOST_DATA USE_DEVICE(work, f_inv)
     867#endif
    832868       IF ( collective_wait )  CALL MPI_BARRIER( comm2d, ierr )
    833869       CALL MPI_ALLTOALL( f_inv(nys,nxl,1),      sendrecvcount_zx, MPI_REAL, &
    834870                          work(nys_x,1,nzb_x,0), sendrecvcount_zx, MPI_REAL, &
    835871                          comm1dx, ierr )
     872#ifndef __cuda_aware_mpi
    836873       !$ACC UPDATE DEVICE(work)
     874#else
     875       !$ACC END HOST_DATA
     876#endif
    837877       CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
    838878
     
    9711011!--    Transpose array
    9721012       CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start', cpu_log_nowait )
     1013#ifndef __cuda_aware_mpi
    9731014       !$ACC UPDATE HOST(work)
     1015#else
     1016       !$ACC HOST_DATA USE_DEVICE(work, f_inv)
     1017#endif
    9741018       IF ( collective_wait )  CALL MPI_BARRIER( comm2d, ierr )
    9751019       CALL MPI_ALLTOALL( work(nxl_z,1,nys_z,0), sendrecvcount_yz, MPI_REAL, &
    9761020                          f_inv(nxl_y,nzb_y,0),  sendrecvcount_yz, MPI_REAL, &
    9771021                          comm1dx, ierr )
     1022#ifndef __cuda_aware_mpi
    9781023       !$ACC UPDATE DEVICE(f_inv)
     1024#else
     1025       !$ACC END HOST_DATA
     1026#endif
    9791027       CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
    9801028#endif
Note: See TracChangeset for help on using the changeset viewer.