Changeset 1482
- Timestamp:
- Oct 18, 2014 12:34:45 PM (10 years ago)
- Location:
- palm/trunk/SOURCE
- Files:
-
- 5 edited
Legend:
- Unmodified
- Added
- Removed
-
palm/trunk/SOURCE/fft_xy.f90
r1403 r1482
    20   20  ! Current revisions:
    21   21  ! -----------------
-   22       !
+        22  ! cudafft workaround for data declaration of ar_tmp because of PGI 14.1 bug
    23   23  !
    24   24  ! Former revisions:
     …    …
   404  404  COMPLEX(dp), DIMENSION(0:(nx+1)/2,nys_x:nyn_x,nzb_x:nzt_x) :: &
   405  405  ar_tmp !:
-  406       !$acc declare create( ar_tmp )
+       406  ! following does not work for PGI 14.1 -> to be removed later
+       407  ! !$acc declare create( ar_tmp )
   407  408  #endif
   408  409
     …    …
   711  712  #elif defined( __cuda_fft )
   712  713
+       714  !$acc data create( ar_tmp )
   713  715  IF ( forward_fft ) THEN
   714  716
     …    …
   757  759
   758  760  ENDIF
+       761  !$acc end data
   759  762
   760  763  #else
     …    …
  1054 1057  COMPLEX(dp), DIMENSION(0:(ny+1)/2,nxl_y:nxr_y,nzb_y:nzt_y) :: &
  1055 1058  ar_tmp !:
+      1059  ! following does not work for PGI 14.1 -> to be removed later
  1056 1060  !$acc declare create( ar_tmp )
  1057 1061  #endif
     …    …
  1334 1338  #elif defined( __cuda_fft )
  1335 1339
+      1340  !$acc data create( ar_tmp )
  1336 1341  IF ( forward_fft ) THEN
  1337 1342
     …    …
  1380 1385
  1381 1386  ENDIF
+      1387  !$acc end data
  1382 1388
  1383 1389  #else
palm/trunk/SOURCE/flow_statistics.f90
r1451 r1482
    21   21  ! Current revisions:
    22   22  ! -----------------
-   23       !
+        23  ! missing ngp_sums_ls added in accelerator version
    24   24  !
    25   25  ! Former revisions:
     …    …
  1433 1433
  1434 1434  USE indices, &
- 1435       ONLY: ngp_2dh, ngp_2dh_s_inner, ngp_3d, ngp_3d_inner, ngp_sums, nxl, &
- 1436       nxr, nyn, nys, nzb, nzb_diff_s_inner, nzb_s_inner, nzt, &
- 1437       nzt_diff, rflags_invers
+      1435  ONLY: ngp_2dh, ngp_2dh_s_inner, ngp_3d, ngp_3d_inner, ngp_sums, &
+      1436  ngp_sums_ls, nxl, nxr, nyn, nys, nzb, nzb_diff_s_inner, &
+      1437  nzb_s_inner, nzt, nzt_diff, rflags_invers
  1438 1438
  1439 1439  USE kinds
palm/trunk/SOURCE/header.f90
r1469 r1482
    20   20  ! Current revisions:
    21   21  ! -----------------
-   22       !
+        22  ! information about calculated or predefined virtual processor topology adjusted
    23   23  !
    24   24  ! Former revisions:
     …    …
   341  341  ENDIF
   342  342  #if defined( __parallel )
-  343       IF ( npex == -1 .AND. pdims(2) /= 1 ) THEN
+       343  IF ( npex == -1 .AND. npey == -1 ) THEN
   344  344  char1 = 'calculated'
-  345       ELSEIF ( ( host(1:3) == 'ibm' .OR. host(1:3) == 'nec' .OR. &
-  346       host(1:2) == 'lc' ) .AND. &
-  347       npex == -1 .AND. pdims(2) == 1 ) THEN
-  348       char1 = 'forced'
   349  345  ELSE
   350  346  char1 = 'predefined'
palm/trunk/SOURCE/palm.f90
r1469 r1482
    20   20  ! Current revisions:
    21   21  ! -----------------
-   22       !
+        22  ! adjustments for using CUDA-aware OpenMPI
    23   23  !
    24   24  ! Former revisions:
     …    …
   136  136  !
   137  137  !-- Local variables
-  138       CHARACTER(LEN=9) :: time_to_string !:
-  139       INTEGER(iwp) :: i !:
+       138  CHARACTER(LEN=9) :: time_to_string !:
+       139  CHARACTER(LEN=10) :: env_string !: to store string of environment var
+       140  INTEGER(iwp) :: env_stat !: to hold status of GET_ENV
+       141  INTEGER(iwp) :: i !:
+       142  INTEGER(iwp) :: myid_openmpi !: OpenMPI local rank for CUDA aware MPI
   140  143  #if defined( __openacc )
   141  144  REAL(wp), DIMENSION(100) :: acc_dum !:
     …    …
   161  164  #if defined( __openacc )
   162  165  !
+       166  !-- Get the local MPI rank in case of CUDA aware OpenMPI. Important, if there
+       167  !-- is more than one accelerator board on the node
+       168  CALL GET_ENVIRONMENT_VARIABLE('OMPI_COMM_WORLD_LOCAL_RANK', &
+       169  VALUE=env_string, STATUS=env_stat )
+       170  READ( env_string, '(I1)' ) myid_openmpi
+       171  PRINT*, '### local_rank = ', myid_openmpi, ' status=',env_stat
+       172  !
   163  173  !-- Get the number of accelerator boards per node and assign the MPI processes
   164  174  !-- to these boards
     …    …
   166  176  num_acc_per_node = ACC_GET_NUM_DEVICES( ACC_DEVICE_NVIDIA )
   167  177  IF ( numprocs == 1 .AND. num_acc_per_node > 0 ) num_acc_per_node = 1
-  168       PRINT*, '*** myid = ', myid, ' num_acc_per_node = ', num_acc_per_node
-  169       acc_rank = MOD( myid, num_acc_per_node )
-  170       ! STOP '****'
+       178  PRINT*, '*** myid = ', myid_openmpi, ' num_acc_per_node = ', num_acc_per_node
+       179  acc_rank = MOD( myid_openmpi, num_acc_per_node )
   171  180  CALL ACC_SET_DEVICE_NUM ( acc_rank, ACC_DEVICE_NVIDIA )
   172  181  !
   173  182  !-- Test output (to be removed later)
-  174       WRITE (*,'(A,I6,A,I3,A,I3,A,I3)') '*** Connect MPI-Task ', myid ,' to CPU ',&
-  175       acc_rank, ' Devices: ', num_acc_per_node,&
-  176       ' connected to:',&
+       183  WRITE (*,'(A,I6,A,I3,A,I3,A,I3)') '*** Connect MPI-Task ', myid_openmpi, &
+       184  ' to CPU ', acc_rank, ' Devices: ', &
+       185  num_acc_per_node, ' connected to:', &
   177  186  ACC_GET_DEVICE_NUM( ACC_DEVICE_NVIDIA )
   178  187  #endif
palm/trunk/SOURCE/poisfft.f90
r1407 r1482
    20   20  ! Current revisions:
    21   21  ! -----------------
-   22       !
+        22  ! use 2d-decomposition, if accelerator boards are used
    23   23  !
    24   24  ! Former revisions:
     …    …
   239  239  !
   240  240  !-- Two-dimensional Fourier Transformation in x- and y-direction.
-  241       IF ( pdims(2) == 1 .AND. pdims(1) > 1 ) THEN
+       241  IF ( pdims(2) == 1 .AND. pdims(1) > 1 .AND. num_acc_per_node == 0 ) &
+       242  THEN
   242  243
   243  244  !
     …    …
   254  255  CALL tr_xy_ffty( ar, ar )
   255  256
-  256       ELSEIF ( pdims(1) == 1 .AND. pdims(2) > 1 ) THEN
+       257  ELSEIF ( pdims(1) == 1 .AND. pdims(2) > 1 .AND. num_acc_per_node == 0 ) &
+       258  THEN
   257  259
   258  260  !
Note: See TracChangeset
for help on using the changeset viewer.