Changeset 1482


Ignore:
Timestamp:
Oct 18, 2014 12:34:45 PM (9 years ago)
Author:
raasch
Message:

adjustments for using CUDA-aware MPI

Location:
palm/trunk/SOURCE
Files:
5 edited

Legend:

Unmodified
Added
Removed
  • palm/trunk/SOURCE/fft_xy.f90

    r1403 r1482  
    2020! Current revisions:
    2121! -----------------
    22 !
     22! cudafft workaround for data declaration of ar_tmp because of PGI 14.1 bug
    2323!
    2424! Former revisions:
     
    404404       COMPLEX(dp), DIMENSION(0:(nx+1)/2,nys_x:nyn_x,nzb_x:nzt_x) ::           &
    405405          ar_tmp  !:
    406        !$acc declare create( ar_tmp )
     406       ! following does not work for PGI 14.1 -> to be removed later
     407       ! !$acc declare create( ar_tmp )
    407408#endif
    408409
     
    711712#elif defined( __cuda_fft )
    712713
     714          !$acc data create( ar_tmp )
    713715          IF ( forward_fft )  THEN
    714716
     
    757759
    758760          ENDIF
     761          !$acc end data
    759762
    760763#else
     
    10541057       COMPLEX(dp), DIMENSION(0:(ny+1)/2,nxl_y:nxr_y,nzb_y:nzt_y) ::           &
    10551058          ar_tmp  !:
     1059       ! following does not work for PGI 14.1 -> to be removed later
    10561060       !$acc declare create( ar_tmp )
    10571061#endif
     
    13341338#elif defined( __cuda_fft )
    13351339
     1340          !$acc data create( ar_tmp )
    13361341          IF ( forward_fft )  THEN
    13371342
     
    13801385
    13811386          ENDIF
     1387          !$acc end data
    13821388
    13831389#else
  • palm/trunk/SOURCE/flow_statistics.f90

    r1451 r1482  
    2121! Current revisions:
    2222! -----------------
    23 !
     23! missing ngp_sums_ls added in accelerator version
    2424!
    2525! Former revisions:
     
    14331433       
    14341434    USE indices,                                                               &
    1435         ONLY:  ngp_2dh, ngp_2dh_s_inner, ngp_3d, ngp_3d_inner, ngp_sums, nxl,  &
    1436                nxr, nyn, nys, nzb, nzb_diff_s_inner, nzb_s_inner, nzt,         &
    1437                nzt_diff, rflags_invers
     1435        ONLY:  ngp_2dh, ngp_2dh_s_inner, ngp_3d, ngp_3d_inner, ngp_sums,       &
     1436               ngp_sums_ls, nxl, nxr, nyn, nys, nzb, nzb_diff_s_inner,         &
     1437               nzb_s_inner, nzt, nzt_diff, rflags_invers
    14381438       
    14391439    USE kinds
  • palm/trunk/SOURCE/header.f90

    r1469 r1482  
    2020! Current revisions:
    2121! -----------------
    22 !
     22! information about calculated or predefined virtual processor topology adjusted
    2323!
    2424! Former revisions:
     
    341341    ENDIF
    342342#if defined( __parallel )
    343     IF ( npex == -1  .AND.  pdims(2) /= 1 )  THEN
     343    IF ( npex == -1  .AND.  npey == -1 )  THEN
    344344       char1 = 'calculated'
    345     ELSEIF ( ( host(1:3) == 'ibm'  .OR.  host(1:3) == 'nec'  .OR.  &
    346                host(1:2) == 'lc' )  .AND.                          &
    347              npex == -1  .AND.  pdims(2) == 1 )  THEN
    348        char1 = 'forced'
    349345    ELSE
    350346       char1 = 'predefined'
  • palm/trunk/SOURCE/palm.f90

    r1469 r1482  
    2020! Current revisions:
    2121! -----------------
    22 !
     22! adjustments for using CUDA-aware OpenMPI
    2323!
    2424! Former revisions:
     
    136136!
    137137!-- Local variables
    138     CHARACTER(LEN=9) ::  time_to_string  !:
    139     INTEGER(iwp)     ::  i               !:
     138    CHARACTER(LEN=9)  ::  time_to_string  !:
     139    CHARACTER(LEN=10) ::  env_string      !: to store string of environment var
     140    INTEGER(iwp)      ::  env_stat        !: to hold status of GET_ENV
     141    INTEGER(iwp)      ::  i               !:
     142    INTEGER(iwp)      ::  myid_openmpi    !: OpenMPI local rank for CUDA aware MPI
    140143#if defined( __openacc )
    141144    REAL(wp), DIMENSION(100) ::  acc_dum     !:
     
    161164#if defined( __openacc )
    162165!
     166!-- Get the local MPI rank in case of CUDA aware OpenMPI. Important, if there
     167!-- is more than one accelerator board on the node
     168    CALL GET_ENVIRONMENT_VARIABLE('OMPI_COMM_WORLD_LOCAL_RANK',                &
     169         VALUE=env_string, STATUS=env_stat )
     170    READ( env_string, '(I1)' )  myid_openmpi
     171    PRINT*, '### local_rank = ', myid_openmpi, '  status=',env_stat
     172!
    163173!-- Get the number of accelerator boards per node and assign the MPI processes
    164174!-- to these boards
     
    166176    num_acc_per_node  = ACC_GET_NUM_DEVICES( ACC_DEVICE_NVIDIA )
    167177    IF ( numprocs == 1  .AND.  num_acc_per_node > 0 )  num_acc_per_node = 1
    168     PRINT*, '*** myid = ', myid, ' num_acc_per_node = ', num_acc_per_node
    169     acc_rank = MOD( myid, num_acc_per_node )
    170 !    STOP '****'
     178    PRINT*, '*** myid = ', myid_openmpi, ' num_acc_per_node = ', num_acc_per_node
     179    acc_rank = MOD( myid_openmpi, num_acc_per_node )
    171180    CALL ACC_SET_DEVICE_NUM ( acc_rank, ACC_DEVICE_NVIDIA )
    172181!
    173182!-- Test output (to be removed later)
    174     WRITE (*,'(A,I6,A,I3,A,I3,A,I3)') '*** Connect MPI-Task ', myid,' to CPU ',&
    175                                       acc_rank, ' Devices: ', num_acc_per_node,&
    176                                       ' connected to:',                        &
     183    WRITE (*,'(A,I6,A,I3,A,I3,A,I3)') '*** Connect MPI-Task ', myid_openmpi,   &
     184                                      ' to CPU ', acc_rank, ' Devices: ',      &
     185                                      num_acc_per_node, ' connected to:',      &
    177186                                      ACC_GET_DEVICE_NUM( ACC_DEVICE_NVIDIA )
    178187#endif
  • palm/trunk/SOURCE/poisfft.f90

    r1407 r1482  
    2020! Current revisions:
    2121! -----------------
    22 !
     22! use 2d-decomposition, if accelerator boards are used
    2323!
    2424! Former revisions:
     
    239239!
    240240!--    Two-dimensional Fourier Transformation in x- and y-direction.
    241        IF ( pdims(2) == 1  .AND.  pdims(1) > 1 )  THEN
     241       IF ( pdims(2) == 1  .AND.  pdims(1) > 1  .AND.  num_acc_per_node == 0 ) &
     242       THEN
    242243
    243244!
     
    254255          CALL tr_xy_ffty( ar, ar )
    255256
    256        ELSEIF ( pdims(1) == 1  .AND.  pdims(2) > 1 )  THEN
     257       ELSEIF ( pdims(1) == 1 .AND. pdims(2) > 1 .AND. num_acc_per_node == 0 ) &
     258       THEN
    257259
    258260!
Note: See TracChangeset for help on using the changeset viewer.