- Timestamp:
- Mar 8, 2013 11:54:10 PM (12 years ago)
- Location:
- palm/trunk
- Files:
-
- 20 edited
Legend:
- Unmodified
- Added
- Removed
-
palm/trunk/SCRIPTS/.mrun.config.imuk_gpu
- Property svn:keywords set to Id
r1016 r1111 1 #$Id$ 1 2 #column 1 column 2 column 3 2 3 #name of variable value of variable (~ must not be used) scope … … 8 9 %add_source_path $base_directory/USER_CODE/$fname 9 10 %depository_path $base_directory/MAKE_DEPOSITORY 10 #%use_makefile true11 #12 # Enter your own host below by adding another line containing in the second13 # column your hostname (as provided by the unix command "hostname") and in the14 # third column the host identifier. Depending on your operating system, the15 # first characters of the host identifier should be "lc" (Linux cluster), "ibm"16 # (IBM-AIX), or "nec" (NEC-SX), respectively.17 11 # 18 12 %host_identifier inferno lcmuk 19 13 # 20 # version 27/09/2012 21 # 14 # pure MPI version 22 15 %remote_username <replace by your IMUK username> lcmuk parallel pgi 23 16 %tmp_user_catalog /localdata lcmuk parallel pgi … … 29 22 %lopts -Mcray=pointer:-fastsse:-r8 lcmuk parallel pgi 30 23 # 24 # pure MPI version with debug options 25 %remote_username <replace by your IMUK username> lcmuk parallel pgidbg 26 %tmp_user_catalog /localdata lcmuk parallel pgidbg 27 %compiler_name mpif90 lcmuk parallel pgidbg 28 %compiler_name_ser pgf90 lcmuk parallel pgidbg 29 %cpp_options -Mpreprocess:-DMPI_REAL=MPI_DOUBLE_PRECISION:-DMPI_2REAL=MPI_2DOUBLE_PRECISION:-D__nopointer lcmuk parallel pgidbg 30 %mopts -j:4 lcmuk parallel pgidbg 31 %fopts -Mcray=pointer:-O0:-C:-g:-Mbounds:-Mchkstk:-traceback:-r8 lcmuk parallel pgidbg 32 %lopts -Mcray=pointer:-O0:-C:-g:-Mbounds:-Mchkstk:-traceback:-r8 lcmuk parallel pgidbg 33 # 34 # pure GPU version 35 %remote_username <replace by your IMUK username> lcmuk pgigpu 36 %tmp_user_catalog /localdata lcmuk pgigpu 37 %compiler_name pgf90 lcmuk pgigpu 38 %compiler_name_ser pgf90 lcmuk pgigpu 39 %cpp_options -Mpreprocess:-D__nopointer:-D__openacc:-D__cuda_fft lcmuk pgigpu 40 %mopts -j:4 lcmuk pgigpu 41 %fopts -acc:-ta=nvidia,4.1:-Minfo=acc:-Mcray=pointer:-fastsse:-r8:-Mcuda lcmuk pgigpu 42 %lopts -acc:-ta=nvidia,4.1:-Minfo=acc:-Mcray=pointer:-fastsse:-r8:-Mcuda:-L/localdata/opt/pgi/linux86-64/2012/cuda/4.1/lib64:-lcufft lcmuk pgigpu 43 # 44 # MPI+GPU 31 45 %remote_username <replace by your IMUK username> lcmuk parallel pgigpu 32 46 %tmp_user_catalog /localdata lcmuk parallel pgigpu 33 47 %compiler_name mpif90 lcmuk parallel pgigpu 34 48 %compiler_name_ser pgf90 lcmuk parallel pgigpu 35 %cpp_options -Mpreprocess:-DMPI_REAL=MPI_DOUBLE_PRECISION:-DMPI_2REAL=MPI_2DOUBLE_PRECISION:-D__nopointer:-D__openacc lcmuk parallel pgigpu49 %cpp_options -Mpreprocess:-DMPI_REAL=MPI_DOUBLE_PRECISION:-DMPI_2REAL=MPI_2DOUBLE_PRECISION:-D__nopointer:-D__openacc:-D__cuda_fft lcmuk parallel pgigpu 36 50 %mopts -j:4 lcmuk parallel pgigpu 37 %fopts -acc:-ta=nvidia,4.1:-Minfo=acc:-Mcray=pointer:-fastsse:-r8 38 %lopts -acc:-ta=nvidia,4.1:-Minfo=acc:-Mcray=pointer:-fastsse:-r8 39 51 %fopts -acc:-ta=nvidia,4.1:-Minfo=acc:-Mcray=pointer:-fastsse:-r8:-Mcuda lcmuk parallel pgigpu 52 %lopts -acc:-ta=nvidia,4.1:-Minfo=acc:-Mcray=pointer:-fastsse:-r8:-Mcuda:-L/localdata/opt/pgi/linux86-64/2012/cuda/4.1/lib64:-lcufft lcmuk parallel pgigpu 53 # 40 54 %write_binary true restart 41 55 # -
palm/trunk/SOURCE/Makefile
r1107 r1111 20 20 # Current revisions: 21 21 # ------------------ 22 # 22 # dependencies removed from init_pegrid 23 # bugfix: dependency added for cuda_fft_interfaces 23 24 # 24 25 # Former revisions: … … 267 268 cpu_log.o: modules.o 268 269 cpu_statistics.o: modules.o 269 cuda_fft_interfaces.o: cuda_fft_interfaces.f90 270 cuda_fft_interfaces.o: cuda_fft_interfaces.f90 modules.o 270 271 data_log.o: modules.o 271 272 data_output_dvrp.o: modules.o … … 303 304 init_masks.o: modules.o 304 305 init_ocean.o: modules.o eqn_state_seawater.o 305 init_pegrid.o: modules.o fft_xy.o poisfft.o poisfft_hybrid.o306 init_pegrid.o: modules.o 306 307 init_pt_anomaly.o: modules.o 307 308 init_rankine.o: modules.o -
palm/trunk/SOURCE/Makefile_check
r1107 r1111 20 20 # Current revisions: 21 21 # ------------------ 22 # 22 # dependencies removed from init_pegrid 23 23 # 24 24 # Former revisions: … … 134 134 init_grid.o: modules.o 135 135 init_masks.o: modules.o 136 init_pegrid.o: modules.o fft_xy.o poisfft.o poisfft_hybrid.o136 init_pegrid.o: modules.o 137 137 local_stop.o: modules.o 138 138 message.o: modules.o -
palm/trunk/SOURCE/check_parameters.f90
r1104 r1111 20 20 ! Current revisions: 21 21 ! ----------------- 22 ! 22 ! ibc_p_b = 2 removed 23 23 ! 24 24 ! Former revisions: … … 1601 1601 ELSEIF ( bc_p_b == 'neumann' ) THEN 1602 1602 ibc_p_b = 1 1603 ELSEIF ( bc_p_b == 'neumann+inhomo' ) THEN1604 ibc_p_b = 21605 1603 ELSE 1606 1604 message_string = 'unknown boundary condition: bc_p_b = "' // & … … 1608 1606 CALL message( 'check_parameters', 'PA0059', 1, 2, 0, 6, 0 ) 1609 1607 ENDIF 1610 IF ( ibc_p_b == 2 .AND. .NOT. prandtl_layer ) THEN 1611 message_string = 'boundary condition: bc_p_b = "' // TRIM( bc_p_b ) // & 1612 '" not allowed with prandtl_layer = .FALSE.' 1613 CALL message( 'check_parameters', 'PA0060', 1, 2, 0, 6, 0 ) 1614 ENDIF 1608 1615 1609 IF ( bc_p_t == 'dirichlet' ) THEN 1616 1610 ibc_p_t = 0 -
palm/trunk/SOURCE/cpu_statistics.f90
r1093 r1111 20 20 ! Current revisions: 21 21 ! ----------------- 22 ! 22 ! output of grid point numbers and average CPU time per grid point and timestep 23 23 ! 24 24 ! Former revisions: … … 69 69 !------------------------------------------------------------------------------! 70 70 71 USE control_parameters 71 72 USE cpulog 73 USE indices, ONLY: nx, ny, nz 72 74 USE pegrid 73 USE control_parameters74 75 75 76 IMPLICIT NONE 76 77 77 78 INTEGER :: i, ii(1), iii, sender 79 REAL :: average_cputime 78 80 REAL, SAVE :: norm = 1.0 79 81 REAL, DIMENSION(:), ALLOCATABLE :: pe_max, pe_min, pe_rms, sum … … 157 159 158 160 ! 161 !-- Get total time in order to calculate CPU-time per gridpoint and timestep 162 IF ( nr_timesteps_this_run /= 0 ) THEN 163 average_cputime = log_point(1)%sum / REAL( (nx+1) * (ny+1) * nz ) / & 164 REAL( nr_timesteps_this_run ) * 1E6 ! in micro-sec 165 ELSE 166 average_cputime = -1.0 167 ENDIF 168 169 ! 159 170 !-- Write cpu-times sorted by size 160 171 CALL check_open( 18 ) 161 172 #if defined( __parallel ) 162 WRITE ( 18, 100 ) TRIM( run_description_header ), & 163 numprocs * threads_per_task, pdims(1), pdims(2), & 164 threads_per_task 173 WRITE ( 18, 100 ) TRIM( run_description_header ), & 174 numprocs * threads_per_task, pdims(1), pdims(2), & 175 threads_per_task, nx+1, ny+1, nz, nr_timesteps_this_run, & 176 average_cputime 177 165 178 IF ( num_acc_per_node /= 0 ) WRITE ( 18, 108 ) num_acc_per_node 166 179 WRITE ( 18, 110 ) 167 180 #else 168 WRITE ( 18, 100 ) TRIM( run_description_header ), & 169 numprocs * threads_per_task, 1, 1, & 170 threads_per_task 181 WRITE ( 18, 100 ) TRIM( run_description_header ), & 182 numprocs * threads_per_task, 1, 1, & 183 threads_per_task, nx+1, ny+1, nz, nr_timesteps_this_run, & 184 average_cputime 185 171 186 IF ( num_acc_per_node /= 0 ) WRITE ( 18, 109 ) num_acc_per_node 172 187 WRITE ( 18, 110 ) … … 308 323 309 324 100 FORMAT (A/11('-')//'CPU measures for ',I5,' PEs (',I5,'(x) * ',I5,'(y', & 310 &') tasks *',I5,' threads):') 325 &') tasks *',I5,' threads):'// & 326 'gridpoints (x/y/z): ',20X,I5,' * ',I5,' * ',I5/ & 327 'nr of timesteps: ',22X,I6/ & 328 'cpu time per grid point and timestep: ',5X,F8.5,' * 10**-6 s') 311 329 312 330 101 FORMAT (/'special measures:'/ & … … 320 338 106 FORMAT (/'Exchange of ghostpoints via MPI_ISEND/MPI_IRECV') 321 339 107 FORMAT (//) 322 108 FORMAT ('Accelerator boards per node: ', I2)323 109 FORMAT ('Accelerator boards: ', I2)340 108 FORMAT ('Accelerator boards per node: ',14X,I2) 341 109 FORMAT ('Accelerator boards: ',23X,I2) 324 342 110 FORMAT ('----------------------------------------------------------', & 325 343 &'------------'//& -
palm/trunk/SOURCE/cuda_fft_interfaces.f90
r1107 r1111 20 20 ! Current revisions: 21 21 ! ----------------- 22 ! 22 ! idata and odata changed from 1d- to 3d-arrays 23 23 ! 24 24 ! Former revisions: … … 87 87 88 88 INTEGER(C_INT), value :: plan 89 COMPLEX(dpk), device :: idata( *)90 REAL(dpk), device :: odata( *)89 COMPLEX(dpk), device :: idata(:,:,:) 90 REAL(dpk), device :: odata(:,:,:) 91 91 92 92 END SUBROUTINE CUFFTEXECZ2D … … 103 103 104 104 INTEGER(C_INT), value :: plan 105 REAL(dpk), device :: idata( *)106 COMPLEX(dpk), device :: odata( *)105 REAL(dpk), device :: idata(:,:,:) 106 COMPLEX(dpk), device :: odata(:,:,:) 107 107 108 108 END SUBROUTINE CUFFTEXECD2Z -
palm/trunk/SOURCE/fft_xy.f90
r1107 r1111 20 20 ! Current revisions: 21 21 ! ----------------- 22 ! 22 ! further openACC statements added, CUDA branch completely runs on GPU 23 ! bugfix: CUDA fft plans adjusted for domain decomposition (before they always 24 ! used total domain) 23 25 ! 24 26 ! Former revisions: … … 213 215 total_points_x_transpo = (nx+1) * (nyn_x-nys_x+1) * (nzt_x-nzb_x+1) 214 216 total_points_y_transpo = (ny+1) * (nxr_y-nxl_y+1) * (nzt_y-nzb_y+1) 215 CALL CUFFTPLAN1D( plan_xf, nx+1, CUFFT_D2Z, (ny +1)*nz)216 CALL CUFFTPLAN1D( plan_xi, nx+1, CUFFT_Z2D, (ny +1)*nz)217 CALL CUFFTPLAN1D( plan_yf, ny+1, CUFFT_D2Z, (nx +1)*nz)218 CALL CUFFTPLAN1D( plan_yi, ny+1, CUFFT_Z2D, (nx +1)*nz)217 CALL CUFFTPLAN1D( plan_xf, nx+1, CUFFT_D2Z, (nyn_x-nys_x+1) * (nzt_x-nzb_x+1) ) 218 CALL CUFFTPLAN1D( plan_xi, nx+1, CUFFT_Z2D, (nyn_x-nys_x+1) * (nzt_x-nzb_x+1) ) 219 CALL CUFFTPLAN1D( plan_yf, ny+1, CUFFT_D2Z, (nxr_y-nxl_y+1) * (nzt_y-nzb_y+1) ) 220 CALL CUFFTPLAN1D( plan_yi, ny+1, CUFFT_Z2D, (nxr_y-nxl_y+1) * (nzt_y-nzb_y+1) ) 219 221 #else 220 222 message_string = 'no system-specific fft-call available' … … 261 263 262 264 CHARACTER (LEN=*) :: direction 263 INTEGER :: i, ishape(1), j, k , m265 INTEGER :: i, ishape(1), j, k 264 266 265 267 LOGICAL :: forward_fft … … 273 275 REAL, DIMENSION(6*(nx+1)) :: work2 274 276 #elif defined( __cuda_fft ) 275 REAL(dpk), DEVICE, DIMENSION(:), ALLOCATABLE :: cuda_a_device 276 COMPLEX(dpk), DEVICE, DIMENSION(:), ALLOCATABLE :: cuda_b_device 277 COMPLEX(dpk), DIMENSION(:), ALLOCATABLE :: cuda_host 277 !$acc declare create( ar_tmp ) 278 COMPLEX(dpk), DIMENSION(0:(nx+1)/2,nys_x:nyn_x,nzb_x:nzt_x) :: ar_tmp 278 279 #endif 279 280 REAL, DIMENSION(0:nx,nys_x:nyn_x,nzb_x:nzt_x) :: ar … … 502 503 #elif defined( __cuda_fft ) 503 504 504 ALLOCATE( cuda_a_device(0:total_points_x_transpo-1) )505 ALLOCATE( cuda_b_device(0:((nx+1)/2+1) * (nyn_x-nys_x+1) * (nzt_x-nzb_x+1) - 1) )506 ALLOCATE( cuda_host(0:((nx+1)/2+1) * (nyn_x-nys_x+1) * (nzt_x-nzb_x+1) - 1) )507 508 m = 0509 510 505 IF ( forward_fft ) THEN 511 506 512 cuda_a_device = ar(0:total_points_x_transpo-1,nys_x,nzb_x)513 514 CALL CUFFTEXECD2Z( plan_xf, cuda_a_device, cuda_b_device ) 515 cuda_host = cuda_b_device516 507 !$acc data present( ar ) 508 CALL CUFFTEXECD2Z( plan_xf, ar, ar_tmp ) 509 510 !$acc kernels 511 !$acc loop 517 512 DO k = nzb_x, nzt_x 518 513 DO j = nys_x, nyn_x 519 514 515 !$acc loop vector( 32 ) 520 516 DO i = 0, (nx+1)/2 521 ar(i,j,k) = REAL( cuda_host(m+i) ) * dnx 522 ENDDO 523 517 ar(i,j,k) = REAL( ar_tmp(i,j,k) ) * dnx 518 ENDDO 519 520 !$acc loop vector( 32 ) 524 521 DO i = 1, (nx+1)/2 - 1 525 ar(nx+1-i,j,k) = AIMAG( cuda_host(m+i) ) * dnx 526 ENDDO 527 528 m = m + (nx+1)/2 + 1 529 530 ENDDO 531 ENDDO 532 533 ELSE 534 522 ar(nx+1-i,j,k) = AIMAG( ar_tmp(i,j,k) ) * dnx 523 ENDDO 524 525 ENDDO 526 ENDDO 527 !$acc end kernels 528 !$acc end data 529 530 ELSE 531 532 !$acc data present( ar ) 533 !$acc kernels 534 !$acc loop 535 535 DO k = nzb_x, nzt_x 536 536 DO j = nys_x, nyn_x 537 537 538 cuda_host(m) = CMPLX( ar(0,j,k), 0.0 ) 539 538 ar_tmp(0,j,k) = CMPLX( ar(0,j,k), 0.0 ) 539 540 !$acc loop vector( 32 ) 540 541 DO i = 1, (nx+1)/2 - 1 541 cuda_host(m+i) = CMPLX( ar(i,j,k), ar(nx+1-i,j,k) ) 542 ENDDO 543 cuda_host(m+(nx+1)/2) = CMPLX( ar((nx+1)/2,j,k), 0.0 ) 544 545 m = m + (nx+1)/2 + 1 546 547 ENDDO 548 ENDDO 549 550 cuda_b_device = cuda_host 551 CALL CUFFTEXECZ2D( plan_xi, cuda_b_device, cuda_a_device ) 552 553 ar(0:total_points_x_transpo-1,nys_x,nzb_x) = cuda_a_device 554 555 ENDIF 556 557 DEALLOCATE( cuda_a_device, cuda_b_device, cuda_host ) 542 ar_tmp(i,j,k) = CMPLX( ar(i,j,k), ar(nx+1-i,j,k) ) 543 ENDDO 544 ar_tmp((nx+1)/2,j,k) = CMPLX( ar((nx+1)/2,j,k), 0.0 ) 545 546 ENDDO 547 ENDDO 548 !$acc end kernels 549 550 CALL CUFFTEXECZ2D( plan_xi, ar_tmp, ar ) 551 !$acc end data 552 553 ENDIF 558 554 559 555 #else … … 775 771 776 772 CHARACTER (LEN=*) :: direction 777 INTEGER :: i, j, jshape(1), k , m773 INTEGER :: i, j, jshape(1), k 778 774 779 775 LOGICAL :: forward_fft … … 787 783 REAL, DIMENSION(6*(ny+1)) :: work2 788 784 #elif defined( __cuda_fft ) 789 REAL(dpk), DEVICE, DIMENSION(:), ALLOCATABLE :: cuda_a_device 790 COMPLEX(dpk), DEVICE, DIMENSION(:), ALLOCATABLE :: cuda_b_device 791 COMPLEX(dpk), DIMENSION(:), ALLOCATABLE :: cuda_host 785 !$acc declare create( ar_tmp ) 786 COMPLEX(dpk), DIMENSION(0:(ny+1)/2,nxl_y:nxr_y,nzb_y:nzt_y) :: ar_tmp 792 787 #endif 793 788 REAL, DIMENSION(0:ny,nxl_y:nxr_y,nzb_y:nzt_y) :: ar … … 1013 1008 #elif defined( __cuda_fft ) 1014 1009 1015 ALLOCATE( cuda_a_device(0:total_points_y_transpo-1) )1016 ALLOCATE( cuda_b_device(0:((ny+1)/2+1) * (nxr_y-nxl_y+1) * (nzt_y-nzb_y+1) - 1) )1017 ALLOCATE( cuda_host(0:((ny+1)/2+1) * (nxr_y-nxl_y+1) * (nzt_y-nzb_y+1) - 1) )1018 1019 m = 01020 1021 1010 IF ( forward_fft ) THEN 1022 1011 1023 cuda_a_device = ar(0:total_points_y_transpo-1,nxl_y,nzb_y)1024 1025 CALL CUFFTEXECD2Z( plan_yf, cuda_a_device, cuda_b_device ) 1026 cuda_host = cuda_b_device1027 1012 !$acc data present( ar ) 1013 CALL CUFFTEXECD2Z( plan_yf, ar, ar_tmp ) 1014 1015 !$acc kernels 1016 !$acc loop 1028 1017 DO k = nzb_y, nzt_y 1029 1018 DO i = nxl_y, nxr_y 1030 1019 1020 !$acc loop vector( 32 ) 1031 1021 DO j = 0, (ny+1)/2 1032 ar(j,i,k) = REAL( cuda_host(m+j) ) * dny 1033 ENDDO 1034 1022 ar(j,i,k) = REAL( ar_tmp(j,i,k) ) * dny 1023 ENDDO 1024 1025 !$acc loop vector( 32 ) 1035 1026 DO j = 1, (ny+1)/2 - 1 1036 ar(ny+1-j,i,k) = AIMAG( cuda_host(m+j) ) * dny 1037 ENDDO 1038 1039 m = m + (ny+1)/2 + 1 1040 1041 ENDDO 1042 ENDDO 1043 1044 ELSE 1045 1027 ar(ny+1-j,i,k) = AIMAG( ar_tmp(j,i,k) ) * dny 1028 ENDDO 1029 1030 ENDDO 1031 ENDDO 1032 !$acc end kernels 1033 !$acc end data 1034 1035 ELSE 1036 1037 !$acc data present( ar ) 1038 !$acc kernels 1039 !$acc loop 1046 1040 DO k = nzb_y, nzt_y 1047 1041 DO i = nxl_y, nxr_y 1048 1042 1049 cuda_host(m) = CMPLX( ar(0,i,k), 0.0 ) 1050 1043 ar_tmp(0,i,k) = CMPLX( ar(0,i,k), 0.0 ) 1044 1045 !$acc loop vector( 32 ) 1051 1046 DO j = 1, (ny+1)/2 - 1 1052 cuda_host(m+j) = CMPLX( ar(j,i,k), ar(ny+1-j,i,k) ) 1053 ENDDO 1054 cuda_host(m+(ny+1)/2) = CMPLX( ar((ny+1)/2,i,k), 0.0 ) 1055 1056 m = m + (ny+1)/2 + 1 1057 1058 ENDDO 1059 ENDDO 1060 1061 cuda_b_device = cuda_host 1062 CALL CUFFTEXECZ2D( plan_yi, cuda_b_device, cuda_a_device ) 1063 1064 ar(0:total_points_y_transpo-1,nxl_y,nzb_y) = cuda_a_device 1065 1066 ENDIF 1067 1068 DEALLOCATE( cuda_a_device, cuda_b_device, cuda_host ) 1047 ar_tmp(j,i,k) = CMPLX( ar(j,i,k), ar(ny+1-j,i,k) ) 1048 ENDDO 1049 ar_tmp((ny+1)/2,i,k) = CMPLX( ar((ny+1)/2,i,k), 0.0 ) 1050 1051 ENDDO 1052 ENDDO 1053 !$acc end kernels 1054 1055 CALL CUFFTEXECZ2D( plan_yi, ar_tmp, ar ) 1056 !$acc end data 1057 1058 ENDIF 1069 1059 1070 1060 #else -
palm/trunk/SOURCE/flow_statistics.f90
r1054 r1111 20 20 ! Current revisions: 21 21 ! ----------------- 22 ! openACC directive added 22 23 ! 23 24 ! Former revisions: … … 182 183 183 184 ENDIF 185 186 !$acc update host( km, kh, e, pt, qs, qsws, rif, shf, ts, u, v, w ) 184 187 185 188 ! -
palm/trunk/SOURCE/header.f90
r1109 r1111 20 20 ! Current revisions: 21 21 ! ----------------- 22 ! 22 ! output of accelerator board information 23 ! ibc_p_b = 2 removed 23 24 ! 24 25 ! Former revisions: … … 291 292 threads_per_task, pdims(1), pdims(2), TRIM( char1 ) 292 293 ENDIF 294 IF ( num_acc_per_node /= 0 ) WRITE ( io, 117 ) num_acc_per_node 293 295 IF ( ( host(1:3) == 'ibm' .OR. host(1:3) == 'nec' .OR. & 294 296 host(1:2) == 'lc' .OR. host(1:3) == 'dec' ) .AND. & … … 305 307 WRITE ( io, 108 ) maximum_parallel_io_streams 306 308 ENDIF 309 #else 310 IF ( num_acc_per_node /= 0 ) WRITE ( io, 120 ) num_acc_per_node 307 311 #endif 308 312 WRITE ( io, 99 ) … … 593 597 ELSEIF ( ibc_p_b == 1 ) THEN 594 598 runten = 'p(0) = p(1) |' 595 ELSE596 runten = 'p(0) = p(1) +R|'597 599 ENDIF 598 600 IF ( ibc_p_t == 0 ) THEN … … 1613 1615 37X,'independent precursor runs'/ & 1614 1616 37X,42('-')) 1617 117 FORMAT (' Accelerator boards / node: ',I2) 1615 1618 #endif 1616 1619 110 FORMAT (/' Numerical Schemes:'/ & … … 1627 1630 ' translation velocity = ',A/ & 1628 1631 ' distance advected ',A,': ',F8.3,' km(x) ',F8.3,' km(y)') 1632 120 FORMAT (' Accelerator boards: ',8X,I2) 1629 1633 122 FORMAT (' --> Time differencing scheme: ',A) 1630 1634 123 FORMAT (' --> Rayleigh-Damping active, starts ',A,' z = ',F8.2,' m'/ & … … 1680 1684 ' CPU-time used: ',F9.3,' s per timestep: ', & 1681 1685 ' ',F9.3,' s'/ & 1682 ' per second of simulated tim',&1686 ' per second of simulated tim', & 1683 1687 'e: ',F9.3,' s') 1684 1688 207 FORMAT ( ' Coupling start time: ',F9.3,' s') -
palm/trunk/SOURCE/init_3d_model.f90
r1093 r1111 23 23 ! Current revisions: 24 24 ! ------------------ 25 ! 25 ! openACC directives added for pres 26 ! array diss allocated only if required 26 27 ! 27 28 ! Former revisions: … … 234 235 USE random_function_mod 235 236 USE statistics 237 USE transpose_indices 236 238 237 239 IMPLICIT NONE … … 334 336 ENDIF 335 337 338 ! 339 !-- Array for storing constant coeffficients of the tridiagonal solver 340 IF ( psolver == 'poisfft' ) THEN 341 ALLOCATE( tric(nxl_z:nxr_z,nys_z:nyn_z,0:nz-1) ) 342 ENDIF 343 336 344 IF ( humidity .OR. passive_scalar ) THEN 337 345 ! … … 474 482 IF ( use_sgs_for_particles .OR. wang_kernel .OR. turbulence ) THEN 475 483 ALLOCATE ( diss(nzb:nzt+1,nysg:nyng,nxlg:nxrg) ) 476 ELSE477 ALLOCATE ( diss(2,2,2) ) ! required because diss is used as a478 ! formal parameter479 484 ENDIF 480 485 … … 1427 1432 CALL disturb_field( nzb_v_inner, tend, v ) 1428 1433 n_sor = nsor_ini 1434 !$acc data copy( d, ddzw, nzb_s_inner, tric, u, v, w, tend ) 1429 1435 CALL pres 1436 !$acc end data 1430 1437 n_sor = nsor 1431 1438 ENDIF -
palm/trunk/SOURCE/init_pegrid.f90
r1093 r1111 20 20 ! Current revisions: 21 21 ! ----------------- 22 ! 22 ! initialization of poisfft moved to poisfft 23 23 ! 24 24 ! Former revisions: … … 150 150 151 151 USE control_parameters 152 USE fft_xy153 152 USE grid_variables 154 153 USE indices 155 154 USE pegrid 156 USE poisfft_mod157 USE poisfft_hybrid_mod158 155 USE statistics 159 156 USE transpose_indices … … 1135 1132 ENDIF 1136 1133 1137 IF ( psolver == 'poisfft_hybrid' ) THEN1138 CALL poisfft_hybrid_ini1139 ELSEIF ( psolver == 'poisfft' ) THEN1140 CALL poisfft_init1141 ENDIF1142 1143 1134 ! 1144 1135 !-- Allocate wall flag arrays used in the multigrid solver -
palm/trunk/SOURCE/modules.f90
r1107 r1111 20 20 ! Current revisions: 21 21 ! ------------------ 22 ! 22 ! +tric, nr_timesteps_this_run 23 23 ! 24 24 ! Former revisions: … … 407 407 408 408 REAL, DIMENSION(:,:), ALLOCATABLE :: & 409 c_u, c_v, c_w, diss_s_e, diss_s_nr, diss_s_pt, diss_s_q, diss_s_qr,&410 diss_s_ sa, diss_s_u, diss_s_v, diss_s_w, dzu_mg, dzw_mg, flux_s_e,&411 flux_s_ nr, flux_s_pt, flux_s_q, flux_s_qr, flux_s_sa, flux_s_u, &412 flux_s_ v, flux_s_w, f1_mg, f2_mg, f3_mg, mean_inflow_profiles, nrs,&413 nrsws, nrswst, pt_slope_ref, qs, qsws, qswst, qswst_remote, qrs,&414 q rsws, qrswst, rif, saswsb, saswst, shf, total_2d_a, total_2d_o, ts,&415 t swst, us, usws, uswst, vsws, vswst, z0, z0h416 409 c_u, c_v, c_w, diss_s_e, diss_s_nr, diss_s_pt, diss_s_q, & 410 diss_s_qr, diss_s_sa, diss_s_u, diss_s_v, diss_s_w, dzu_mg, dzw_mg, & 411 flux_s_e, flux_s_nr, flux_s_pt, flux_s_q, flux_s_qr, flux_s_sa, & 412 flux_s_u, flux_s_v, flux_s_w, f1_mg, f2_mg, f3_mg, & 413 mean_inflow_profiles, nrs, nrsws, nrswst, pt_slope_ref, qs, qsws, & 414 qswst, qswst_remote, qrs, qrsws, qrswst, rif, saswsb, saswst, shf, & 415 total_2d_a, total_2d_o, ts, tswst, us, usws, uswst, vsws, vswst, z0, & 416 z0h 417 417 418 418 REAL, DIMENSION(:,:,:), ALLOCATABLE :: & … … 422 422 flux_l_qr, flux_l_sa, flux_l_u, flux_l_v, flux_l_w, kh, km, lad_s, & 423 423 lad_u, lad_v, lad_w, lai, l_wall, p_loc, sec, sls, tend, tend_pt, & 424 tend_nr, tend_q, tend_qr, u_m_l, u_m_n, u_m_r, u_m_s, v_m_l, v_m_n,&425 v_m_ r, v_m_s, w_m_l, w_m_n, w_m_r, w_m_s424 tend_nr, tend_q, tend_qr, tric, u_m_l, u_m_n, u_m_r, u_m_s, v_m_l, & 425 v_m_n, v_m_r, v_m_s, w_m_l, w_m_n, w_m_r, w_m_s 426 426 427 427 … … 684 684 maximum_parallel_io_streams = -1, max_pr_user = 0, & 685 685 mgcycles = 0, mg_cycles = -1, mg_switch_to_pe0_level = 0, mid, & 686 netcdf_data_format = 2, ngsrb = 2, n sor = 20, &687 nsor _ini = 100, n_sor, normalizing_region = 0, &686 netcdf_data_format = 2, ngsrb = 2, nr_timesteps_this_run = 0, & 687 nsor = 20, nsor_ini = 100, n_sor, normalizing_region = 0, & 688 688 nz_do3d = -9999, pch_index = 0, prt_time_count = 0, & 689 689 recycling_plane, runnr = 0, & -
palm/trunk/SOURCE/palm.f90
r1093 r1111 20 20 ! Current revisions: 21 21 ! ----------------- 22 ! 22 ! openACC statements updated 23 23 ! 24 24 ! Former revisions: … … 234 234 !-- Declare and initialize variables in the accelerator memory with their 235 235 !-- host values 236 !$acc data copyin( d iss, e, e_p, kh, km, pt, pt_p, q, ql, tend, te_m, tpt_m, tu_m, tv_m, tw_m, u, u_p, v, vpt, v_p, w, w_p ) &237 !$acc copyin( ddzu, ddzw, dd2zu, l_grid, l_wall, ptdf_x, ptdf_y, pt_init, rdf, rdf_sc, ug, vg, zu, zw ) &236 !$acc data copyin( d, diss, e, e_p, kh, km, pt, pt_p, q, ql, tend, te_m, tpt_m, tu_m, tv_m, tw_m, u, u_p, v, vpt, v_p, w, w_p ) & 237 !$acc copyin( tric, ddzu, ddzw, dd2zu, l_grid, l_wall, ptdf_x, ptdf_y, pt_init, rdf, rdf_sc, ug, vg, zu, zw ) & 238 238 !$acc copyin( hom, qs, qsws, qswst, rif, rif_wall, shf, ts, tswst, us, usws, uswst, vsws, vswst, z0, z0h ) & 239 239 !$acc copyin( fxm, fxp, fym, fyp, fwxm, fwxp, fwym, fwyp, nzb_diff_s_inner, nzb_diff_s_outer, nzb_diff_u ) & -
palm/trunk/SOURCE/poisfft.f90
r1107 r1111 20 20 ! Current revisions: 21 21 ! ----------------- 22 ! 22 ! further openACC porting of non-parallel (MPI) branch: 23 ! tridiagonal routines split into extermal subroutines (instead using CONTAINS), 24 ! no distinction between parallel/non-parallel in poisfft and tridia any more, 25 ! tridia routines moved to end of file because of probable bug in PGI compiler 26 ! (otherwise "invalid device function" is indicated during runtime), 27 ! optimization of tridia routines: constant elements and coefficients of tri are 28 ! stored in seperate arrays ddzuw and tric, last dimension of tri reduced from 5 29 ! to 2, 30 ! poisfft_init is now called internally from poisfft, maketri is called from 31 ! poisfft_init, 32 ! ibc_p_b = 2 removed 23 33 ! 24 34 ! Former revisions: … … 157 167 IMPLICIT NONE 158 168 169 LOGICAL, SAVE :: poisfft_initialized = .FALSE. 170 171 REAL, DIMENSION(:,:), ALLOCATABLE :: ddzuw 172 159 173 PRIVATE 160 174 … … 181 195 SUBROUTINE poisfft_init 182 196 197 USE arrays_3d, ONLY: ddzu_pres, ddzw 198 199 IMPLICIT NONE 200 201 INTEGER :: k 202 203 183 204 CALL fft_init 184 205 206 ALLOCATE( ddzuw(0:nz-1,3) ) 207 208 DO k = 0, nz-1 209 ddzuw(k,1) = ddzu_pres(k+1) * ddzw(k+1) 210 ddzuw(k,2) = ddzu_pres(k+2) * ddzw(k+1) 211 ddzuw(k,3) = -1.0 * & 212 ( ddzu_pres(k+2) * ddzw(k+1) + ddzu_pres(k+1) * ddzw(k+1) ) 213 ENDDO 214 ! 215 !-- Calculate constant coefficients of the tridiagonal matrix 216 #if ! defined ( __check ) 217 CALL maketri 218 #endif 219 220 poisfft_initialized = .TRUE. 221 185 222 END SUBROUTINE poisfft_init 223 186 224 187 225 #if ! defined ( __check ) … … 199 237 CALL cpu_log( log_point_s(3), 'poisfft', 'start' ) 200 238 239 IF ( .NOT. poisfft_initialized ) CALL poisfft_init 240 201 241 ! 202 242 !-- Two-dimensional Fourier Transformation in x- and y-direction. 203 #if defined( __parallel ) 204 IF ( pdims(2) == 1 ) THEN 243 IF ( pdims(2) == 1 .AND. pdims(1) > 1 ) THEN 205 244 206 245 ! … … 217 256 CALL tr_xy_ffty( ar, work, ar ) 218 257 219 ELSEIF ( pdims(1) == 1 ) THEN258 ELSEIF ( pdims(1) == 1 .AND. pdims(2) > 1 ) THEN 220 259 221 260 ! … … 235 274 236 275 ! 237 !-- 2d-domain-decomposition 276 !-- 2d-domain-decomposition or no decomposition (1 PE run) 238 277 !-- Transposition z --> x 239 278 CALL cpu_log( log_point_s(5), 'transpo forward', 'start' ) … … 296 335 ENDIF 297 336 298 #else299 300 !301 !-- Two-dimensional Fourier Transformation along x- and y-direction.302 CALL cpu_log( log_point_s(5), 'transpo forward', 'start' )303 !$acc data copyin( ar, work )304 CALL transpose_zx( ar, work, ar )305 !$acc update host( ar )306 CALL cpu_log( log_point_s(5), 'transpo forward', 'pause' )307 308 309 CALL cpu_log( log_point_s(4), 'fft_x', 'start' )310 CALL fft_x( ar, 'forward' )311 CALL cpu_log( log_point_s(4), 'fft_x', 'pause' )312 313 CALL cpu_log( log_point_s(5), 'transpo forward', 'continue' )314 CALL transpose_xy( ar, work, ar )315 CALL cpu_log( log_point_s(5), 'transpo forward', 'pause' )316 317 CALL cpu_log( log_point_s(7), 'fft_y', 'start' )318 CALL fft_y( ar, 'forward' )319 CALL cpu_log( log_point_s(7), 'fft_y', 'pause' )320 321 !322 !-- Solve the tridiagonal equation system along z323 CALL cpu_log( log_point_s(5), 'transpo forward', 'continue' )324 CALL transpose_yz( ar, work, ar )325 CALL cpu_log( log_point_s(5), 'transpo forward', 'stop' )326 327 CALL cpu_log( log_point_s(6), 'tridia', 'start' )328 CALL tridia( ar )329 CALL cpu_log( log_point_s(6), 'tridia', 'stop' )330 331 CALL cpu_log( log_point_s(8), 'transpo invers', 'start' )332 CALL transpose_zy( ar, work, ar )333 CALL cpu_log( log_point_s(8), 'transpo invers', 'pause' )334 335 !336 !-- Inverse Fourier Transformation.337 CALL cpu_log( log_point_s(7), 'fft_y', 'continue' )338 CALL fft_y( ar, 'backward' )339 CALL cpu_log( log_point_s(7), 'fft_y', 'stop' )340 341 CALL cpu_log( log_point_s(8), 'transpo invers', 'continue' )342 CALL transpose_yx( ar, work, ar )343 CALL cpu_log( log_point_s(8), 'transpo invers', 'pause' )344 345 CALL cpu_log( log_point_s(4), 'fft_x', 'continue' )346 CALL fft_x( ar, 'backward' )347 CALL cpu_log( log_point_s(4), 'fft_x', 'stop' )348 349 CALL cpu_log( log_point_s(8), 'transpo invers', 'continue' )350 CALL transpose_xz( ar, work, ar )351 CALL cpu_log( log_point_s(8), 'transpo invers', 'stop' )352 353 !$acc end data354 355 #endif356 357 337 CALL cpu_log( log_point_s(3), 'poisfft', 'stop' ) 358 338 … … 361 341 362 342 363 SUBROUTINE tridia( ar )364 365 !------------------------------------------------------------------------------!366 ! solves the linear system of equations:367 !368 ! -(4 pi^2(i^2/(dx^2*nnx^2)+j^2/(dy^2*nny^2))+369 ! 1/(dzu(k)*dzw(k))+1/(dzu(k-1)*dzw(k)))*p(i,j,k)+370 ! 1/(dzu(k)*dzw(k))*p(i,j,k+1)+1/(dzu(k-1)*dzw(k))*p(i,j,k-1)=d(i,j,k)371 !372 ! by using the Thomas algorithm373 !------------------------------------------------------------------------------!374 375 USE arrays_3d376 377 IMPLICIT NONE378 379 INTEGER :: i, j, k, nnyh380 381 REAL, DIMENSION(nxl_z:nxr_z,0:nz-1) :: ar1382 REAL, DIMENSION(5,nxl_z:nxr_z,0:nz-1) :: tri383 384 REAL :: ar(nxl_z:nxr_z,nys_z:nyn_z,1:nz)385 386 387 nnyh = (ny+1) / 2388 389 !390 !-- Define constant elements of the tridiagonal matrix.391 !$OMP PARALLEL PRIVATE ( k, i )392 !$OMP DO393 DO k = 0, nz-1394 DO i = nxl_z, nxr_z395 tri(2,i,k) = ddzu_pres(k+1) * ddzw(k+1)396 tri(3,i,k) = ddzu_pres(k+2) * ddzw(k+1)397 ENDDO398 ENDDO399 !$OMP END PARALLEL400 401 #if defined( __parallel )402 !403 !-- Repeat for all y-levels.404 !$OMP PARALLEL FIRSTPRIVATE( tri ) PRIVATE ( ar1, j )405 !$OMP DO406 DO j = nys_z, nyn_z407 IF ( j <= nnyh ) THEN408 CALL maketri( j )409 ELSE410 CALL maketri( ny+1-j )411 ENDIF412 CALL split413 CALL substi( j )414 ENDDO415 !$OMP END PARALLEL416 #else417 !418 !-- First y-level.419 CALL maketri( nys_z )420 CALL split421 CALL substi( 0 )422 423 !424 !-- Further y-levels.425 DO j = 1, nnyh - 1426 CALL maketri( j )427 CALL split428 CALL substi( j )429 CALL substi( ny+1-j )430 ENDDO431 CALL maketri( nnyh )432 CALL split433 CALL substi( nnyh+nys )434 #endif435 436 CONTAINS437 438 SUBROUTINE maketri( j )439 440 !------------------------------------------------------------------------------!441 ! Computes the i- and j-dependent component of the matrix442 !------------------------------------------------------------------------------!443 444 USE arrays_3d445 USE constants446 USE control_parameters447 USE grid_variables448 449 IMPLICIT NONE450 451 INTEGER :: i, j, k, nnxh452 REAL :: a, c453 REAL :: ll(nxl_z:nxr_z)454 455 456 nnxh = ( nx + 1 ) / 2457 458 !459 !-- Provide the tridiagonal matrix for solution of the Poisson equation in460 !-- Fourier space. The coefficients are computed following the method of461 !-- Schmidt et al. (DFVLR-Mitteilung 84-15), which departs from Stephan462 !-- Siano's original version by discretizing the Poisson equation,463 !-- before it is Fourier-transformed464 #if defined( __parallel )465 DO i = nxl_z, nxr_z466 IF ( i >= 0 .AND. i <= nnxh ) THEN467 ll(i) = 2.0 * ( 1.0 - COS( ( 2.0 * pi * i ) / &468 REAL( nx+1 ) ) ) / ( dx * dx ) + &469 2.0 * ( 1.0 - COS( ( 2.0 * pi * j ) / &470 REAL( ny+1 ) ) ) / ( dy * dy )471 ELSE472 ll(i) = 2.0 * ( 1.0 - COS( ( 2.0 * pi * ( nx+1-i ) ) / &473 REAL( nx+1 ) ) ) / ( dx * dx ) + &474 2.0 * ( 1.0 - COS( ( 2.0 * pi * j ) / &475 REAL( ny+1 ) ) ) / ( dy * dy )476 ENDIF477 DO k = 0,nz-1478 a = -1.0 * ddzu_pres(k+2) * ddzw(k+1)479 c = -1.0 * ddzu_pres(k+1) * ddzw(k+1)480 tri(1,i,k) = a + c - ll(i)481 ENDDO482 ENDDO483 #else484 DO i = 0, nnxh485 ll(i) = 2.0 * ( 1.0 - COS( ( 2.0 * pi * i ) / REAL( nx+1 ) ) ) / &486 ( dx * dx ) + &487 2.0 * ( 1.0 - COS( ( 2.0 * pi * j ) / REAL( ny+1 ) ) ) / &488 ( dy * dy )489 DO k = 0, nz-1490 a = -1.0 * ddzu_pres(k+2) * ddzw(k+1)491 c = -1.0 * ddzu_pres(k+1) * ddzw(k+1)492 tri(1,i,k) = a + c - ll(i)493 IF ( i >= 1 .and. i < nnxh ) THEN494 tri(1,nx+1-i,k) = tri(1,i,k)495 ENDIF496 ENDDO497 ENDDO498 #endif499 IF ( ibc_p_b == 1 .OR. ibc_p_b == 2 ) THEN500 DO i = nxl_z, nxr_z501 tri(1,i,0) = tri(1,i,0) + tri(2,i,0)502 ENDDO503 ENDIF504 IF ( ibc_p_t == 1 ) THEN505 DO i = nxl_z, nxr_z506 tri(1,i,nz-1) = tri(1,i,nz-1) + tri(3,i,nz-1)507 ENDDO508 ENDIF509 510 END SUBROUTINE maketri511 512 513 SUBROUTINE substi( j )514 515 !------------------------------------------------------------------------------!516 ! Substitution (Forward and Backward) (Thomas algorithm)517 !------------------------------------------------------------------------------!518 519 USE control_parameters520 521 IMPLICIT NONE522 523 INTEGER :: i, j, k524 525 !526 !-- Forward substitution.527 DO i = nxl_z, nxr_z528 ar1(i,0) = ar(i,j,1)529 ENDDO530 DO k = 1, nz - 1531 DO i = nxl_z, nxr_z532 ar1(i,k) = ar(i,j,k+1) - tri(5,i,k) * ar1(i,k-1)533 ENDDO534 ENDDO535 536 !537 !-- Backward substitution538 !-- Note, the 1.0E-20 in the denominator is due to avoid divisions539 !-- by zero appearing if the pressure bc is set to neumann at the top of540 !-- the model domain.541 DO i = nxl_z, nxr_z542 ar(i,j,nz) = ar1(i,nz-1) / ( tri(4,i,nz-1) + 1.0E-20 )543 ENDDO544 DO k = nz-2, 0, -1545 DO i = nxl_z, nxr_z546 ar(i,j,k+1) = ( ar1(i,k) - tri(3,i,k) * ar(i,j,k+2) ) &547 / tri(4,i,k)548 ENDDO549 ENDDO550 551 !552 !-- Indices i=0, j=0 correspond to horizontally averaged pressure.553 !-- The respective values of ar should be zero at all k-levels if554 !-- acceleration of horizontally averaged vertical velocity is zero.555 IF ( ibc_p_b == 1 .AND. ibc_p_t == 1 ) THEN556 IF ( j == 0 .AND. nxl_z == 0 ) THEN557 DO k = 1, nz558 ar(nxl_z,j,k) = 0.0559 ENDDO560 ENDIF561 ENDIF562 563 END SUBROUTINE substi564 565 566 SUBROUTINE split567 568 !------------------------------------------------------------------------------!569 ! Splitting of the tridiagonal matrix (Thomas algorithm)570 !------------------------------------------------------------------------------!571 572 IMPLICIT NONE573 574 INTEGER :: i, k575 576 !577 !-- Splitting.578 DO i = nxl_z, nxr_z579 tri(4,i,0) = tri(1,i,0)580 ENDDO581 DO k = 1, nz-1582 DO i = nxl_z, nxr_z583 tri(5,i,k) = tri(2,i,k) / tri(4,i,k-1)584 tri(4,i,k) = tri(1,i,k) - tri(3,i,k-1) * tri(5,i,k)585 ENDDO586 ENDDO587 588 END SUBROUTINE split589 590 END SUBROUTINE tridia591 592 593 #if defined( __parallel )594 343 SUBROUTINE ffty_tr_yx( f_in, work, f_out ) 595 344 … … 706 455 ! 707 456 !-- Transpose array 457 #if defined( __parallel ) 708 458 CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' ) 709 459 IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr ) … … 712 462 comm1dx, ierr ) 713 463 CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' ) 464 #endif 714 465 715 466 END SUBROUTINE ffty_tr_yx … … 745 496 ! 746 497 !-- Transpose array 498 #if defined( __parallel ) 747 499 CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' ) 748 500 IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr ) … … 751 503 comm1dx, ierr ) 752 504 CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' ) 505 #endif 753 506 754 507 ! … … 1061 814 ! 1062 815 !-- Transpose array 816 #if defined( __parallel ) 1063 817 CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' ) 1064 818 IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr ) … … 1067 821 comm1dy, ierr ) 1068 822 CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' ) 823 #endif 1069 824 1070 825 END SUBROUTINE fftx_tr_xy … … 1096 851 ! 1097 852 !-- Transpose array 853 #if defined( __parallel ) 1098 854 CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' ) 1099 855 IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr ) … … 1102 858 comm1dy, ierr ) 1103 859 CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' ) 860 #endif 1104 861 1105 862 ! … … 1426 1183 ENDDO 1427 1184 ENDDO 1428 IF ( ibc_p_b == 1 .OR. ibc_p_b == 2) THEN1185 IF ( ibc_p_b == 1 ) THEN 1429 1186 DO i = 0, nx 1430 1187 tri(1,i,0) = tri(1,i,0) + tri(2,i,0) … … 1530 1287 END SUBROUTINE tridia_1dd 1531 1288 1532 #endif 1533 #endif 1289 1290 SUBROUTINE tridia( ar ) 1291 1292 !------------------------------------------------------------------------------! 1293 ! solves the linear system of equations: 1294 ! 1295 ! -(4 pi^2(i^2/(dx^2*nnx^2)+j^2/(dy^2*nny^2))+ 1296 ! 1/(dzu(k)*dzw(k))+1/(dzu(k-1)*dzw(k)))*p(i,j,k)+ 1297 ! 1/(dzu(k)*dzw(k))*p(i,j,k+1)+1/(dzu(k-1)*dzw(k))*p(i,j,k-1)=d(i,j,k) 1298 ! 1299 ! by using the Thomas algorithm 1300 !------------------------------------------------------------------------------! 1301 1302 USE arrays_3d 1303 1304 IMPLICIT NONE 1305 1306 INTEGER :: i, j, k 1307 1308 !$acc declare create( tri ) 1309 REAL, DIMENSION(nxl_z:nxr_z,nys_z:nyn_z,0:nz-1,2) :: tri 1310 1311 REAL :: ar(nxl_z:nxr_z,nys_z:nyn_z,1:nz) 1312 1313 1314 CALL split( tri ) 1315 CALL substi( ar, tri ) 1316 1317 END SUBROUTINE tridia 1318 1319 1320 SUBROUTINE maketri 1321 1322 !------------------------------------------------------------------------------! 1323 ! Computes the i- and j-dependent component of the matrix 1324 !------------------------------------------------------------------------------! 1325 1326 USE arrays_3d, ONLY: tric 1327 USE constants 1328 USE control_parameters 1329 USE grid_variables 1330 1331 IMPLICIT NONE 1332 1333 INTEGER :: i, j, k, nnxh, nnyh 1334 1335 !$acc declare create( ll ) 1336 REAL :: ll(nxl_z:nxr_z,nys_z:nyn_z) 1337 1338 1339 nnxh = ( nx + 1 ) / 2 1340 nnyh = ( ny + 1 ) / 2 1341 1342 ! 1343 !-- Provide the constant coefficients of the tridiagonal matrix for solution 1344 !-- of the Poisson equation in Fourier space. 1345 !-- The coefficients are computed following the method of 1346 !-- Schmidt et al. (DFVLR-Mitteilung 84-15), which departs from Stephan 1347 !-- Siano's original version by discretizing the Poisson equation, 1348 !-- before it is Fourier-transformed. 1349 1350 !$acc kernels present( tric ) 1351 !$acc loop vector( 32 ) 1352 DO j = nys_z, nyn_z 1353 DO i = nxl_z, nxr_z 1354 IF ( j >= 0 .AND. j <= nnyh ) THEN 1355 IF ( i >= 0 .AND. i <= nnxh ) THEN 1356 ll(i,j) = 2.0 * ( 1.0 - COS( ( 2.0 * pi * i ) / & 1357 REAL( nx+1 ) ) ) / ( dx * dx ) + & 1358 2.0 * ( 1.0 - COS( ( 2.0 * pi * j ) / & 1359 REAL( ny+1 ) ) ) / ( dy * dy ) 1360 ELSE 1361 ll(i,j) = 2.0 * ( 1.0 - COS( ( 2.0 * pi * ( nx+1-i ) ) / & 1362 REAL( nx+1 ) ) ) / ( dx * dx ) + & 1363 2.0 * ( 1.0 - COS( ( 2.0 * pi * j ) / & 1364 REAL( ny+1 ) ) ) / ( dy * dy ) 1365 ENDIF 1366 ELSE 1367 IF ( i >= 0 .AND. i <= nnxh ) THEN 1368 ll(i,j) = 2.0 * ( 1.0 - COS( ( 2.0 * pi * i ) / & 1369 REAL( nx+1 ) ) ) / ( dx * dx ) + & 1370 2.0 * ( 1.0 - COS( ( 2.0 * pi * ( ny+1-j ) ) / & 1371 REAL( ny+1 ) ) ) / ( dy * dy ) 1372 ELSE 1373 ll(i,j) = 2.0 * ( 1.0 - COS( ( 2.0 * pi * ( nx+1-i ) ) / & 1374 REAL( nx+1 ) ) ) / ( dx * dx ) + & 1375 2.0 * ( 1.0 - COS( ( 2.0 * pi * ( ny+1-j ) ) / & 1376 REAL( ny+1 ) ) ) / ( dy * dy ) 1377 ENDIF 1378 ENDIF 1379 ENDDO 1380 ENDDO 1381 1382 !$acc loop 1383 DO k = 0, nz-1 1384 DO j = nys_z, nyn_z 1385 !$acc loop vector( 32 ) 1386 DO i = nxl_z, nxr_z 1387 tric(i,j,k) = ddzuw(k,3) - ll(i,j) 1388 ENDDO 1389 ENDDO 1390 ENDDO 1391 !$acc end kernels 1392 1393 IF ( ibc_p_b == 1 ) THEN 1394 !$acc kernels present( tric ) 1395 !$acc loop 1396 DO j = nys_z, nyn_z 1397 DO i = nxl_z, nxr_z 1398 tric(i,j,0) = tric(i,j,0) + ddzuw(0,1) 1399 ENDDO 1400 ENDDO 1401 !$acc end kernels 1402 ENDIF 1403 IF ( ibc_p_t == 1 ) THEN 1404 !$acc kernels present( tric ) 1405 !$acc loop 1406 DO j = nys_z, nyn_z 1407 DO i = nxl_z, nxr_z 1408 tric(i,j,nz-1) = tric(i,j,nz-1) + ddzuw(nz-1,2) 1409 ENDDO 1410 ENDDO 1411 !$acc end kernels 1412 ENDIF 1413 1414 END SUBROUTINE maketri 1415 1416 1417 SUBROUTINE substi( ar, tri ) 1418 1419 !------------------------------------------------------------------------------! 1420 ! Substitution (Forward and Backward) (Thomas algorithm) 1421 !------------------------------------------------------------------------------! 1422 1423 USE control_parameters 1424 1425 IMPLICIT NONE 1426 1427 INTEGER :: i, j, k 1428 1429 REAL :: ar(nxl_z:nxr_z,nys_z:nyn_z,1:nz) 1430 REAL, DIMENSION(nxl_z:nxr_z,nys_z:nyn_z,0:nz-1,2) :: tri 1431 1432 !$acc declare create( ar1 ) 1433 REAL, DIMENSION(nxl_z:nxr_z,nys_z:nyn_z,0:nz-1) :: ar1 1434 1435 ! 1436 !-- Forward substitution 1437 DO k = 0, nz - 1 1438 !$acc kernels present( ar, tri ) 1439 !$acc loop 1440 DO j = nys_z, nyn_z 1441 DO i = nxl_z, nxr_z 1442 1443 IF ( k == 0 ) THEN 1444 ar1(i,j,k) = ar(i,j,k+1) 1445 ELSE 1446 ar1(i,j,k) = ar(i,j,k+1) - tri(i,j,k,2) * ar1(i,j,k-1) 1447 ENDIF 1448 1449 ENDDO 1450 ENDDO 1451 !$acc end kernels 1452 ENDDO 1453 1454 ! 1455 !-- Backward substitution 1456 !-- Note, the 1.0E-20 in the denominator is due to avoid divisions 1457 !-- by zero appearing if the pressure bc is set to neumann at the top of 1458 !-- the model domain. 1459 DO k = nz-1, 0, -1 1460 !$acc kernels present( ar, tri ) 1461 !$acc loop 1462 DO j = nys_z, nyn_z 1463 DO i = nxl_z, nxr_z 1464 1465 IF ( k == nz-1 ) THEN 1466 ar(i,j,k+1) = ar1(i,j,k) / ( tri(i,j,k,1) + 1.0E-20 ) 1467 ELSE 1468 ar(i,j,k+1) = ( ar1(i,j,k) - ddzuw(k,2) * ar(i,j,k+2) ) & 1469 / tri(i,j,k,1) 1470 ENDIF 1471 ENDDO 1472 ENDDO 1473 !$acc end kernels 1474 ENDDO 1475 1476 ! 1477 !-- Indices i=0, j=0 correspond to horizontally averaged pressure. 1478 !-- The respective values of ar should be zero at all k-levels if 1479 !-- acceleration of horizontally averaged vertical velocity is zero. 1480 IF ( ibc_p_b == 1 .AND. ibc_p_t == 1 ) THEN 1481 IF ( nys_z == 0 .AND. nxl_z == 0 ) THEN 1482 !$acc kernels loop present( ar ) 1483 DO k = 1, nz 1484 ar(nxl_z,nys_z,k) = 0.0 1485 ENDDO 1486 ENDIF 1487 ENDIF 1488 1489 END SUBROUTINE substi 1490 1491 1492 SUBROUTINE split( tri ) 1493 1494 !------------------------------------------------------------------------------! 1495 ! Splitting of the tridiagonal matrix (Thomas algorithm) 1496 !------------------------------------------------------------------------------! 1497 1498 USE arrays_3d, ONLY: tric 1499 1500 IMPLICIT NONE 1501 1502 INTEGER :: i, j, k 1503 1504 REAL, DIMENSION(nxl_z:nxr_z,nys_z:nyn_z,0:nz-1,2) :: tri 1505 1506 ! 1507 !-- Splitting 1508 !$acc kernels present( tri, tric ) 1509 !$acc loop 1510 DO j = nys_z, nyn_z 1511 !$acc loop vector( 32 ) 1512 DO i = nxl_z, nxr_z 1513 tri(i,j,0,1) = tric(i,j,0) 1514 ENDDO 1515 ENDDO 1516 !$acc end kernels 1517 1518 DO k = 1, nz-1 1519 !$acc kernels present( tri, tric ) 1520 !$acc loop 1521 DO j = nys_z, nyn_z 1522 !$acc loop vector( 32 ) 1523 DO i = nxl_z, nxr_z 1524 tri(i,j,k,2) = ddzuw(k,1) / tri(i,j,k-1,1) 1525 tri(i,j,k,1) = tric(i,j,k) - ddzuw(k-1,2) * tri(i,j,k,2) 1526 ENDDO 1527 ENDDO 1528 !$acc end kernels 1529 ENDDO 1530 1531 END SUBROUTINE split 1532 1533 #endif 1534 1534 1535 END MODULE poisfft_mod -
palm/trunk/SOURCE/poisfft_hybrid.f90
r1107 r1111 20 20 ! Current revisions: 21 21 ! ----------------- 22 ! 22 ! poisfft_hybrid_ini is now called internally from poisfft_hybrid, 23 ! ibc_p_b = 2 removed 23 24 ! 24 25 ! Former revisions: … … 115 116 tasks_per_logical_node = -1 ! default no cluster 116 117 118 LOGICAL, SAVE :: poisfft_initialized = .FALSE. 119 117 120 118 121 PRIVATE … … 283 286 ENDIF 284 287 288 poisfft_initialized = .TRUE. 289 285 290 END SUBROUTINE poisfft_hybrid_ini 286 291 … … 294 299 295 300 REAL, DIMENSION(1:nz,nys:nyn,nxl:nxr) :: ar 301 302 IF ( .NOT. poisfft_initialized ) CALL poisfft_hybrid_ini 296 303 297 304 IF ( host(1:3) == 'nec' ) THEN … … 961 968 ENDDO 962 969 ENDDO 963 IF ( ibc_p_b == 1 .OR. ibc_p_b == 2) THEN970 IF ( ibc_p_b == 1 ) THEN 964 971 DO i = 0,nx 965 972 tri(1,i,0) = tri(1,i,0) + tri(2,i,0) -
palm/trunk/SOURCE/pres.f90
r1093 r1111 20 20 ! Current revisions: 21 21 ! ----------------- 22 ! 22 ! openACC statements added, 23 ! ibc_p_b = 2 removed 23 24 ! 24 25 ! Former revisions: … … 304 305 ELSE 305 306 !$OMP PARALLEL DO SCHEDULE( STATIC ) 307 !$acc kernels present( d ) 308 !$acc loop 306 309 DO i = nxl, nxr 307 310 DO j = nys, nyn 311 !$acc loop vector(32) 308 312 DO k = nzb+1, nzt 309 313 d(k,j,i) = 0.0 … … 311 315 ENDDO 312 316 ENDDO 317 !$acc end kernels 313 318 ENDIF 314 319 … … 328 333 ENDDO 329 334 ! 330 !-- Additional pressure boundary condition at the bottom boundary for331 !-- inhomogeneous Prandtl layer heat fluxes and temperatures, respectively332 !-- dp/dz = -(dtau13/dx + dtau23/dy) + g*pt'/pt0.333 !-- This condition must not be applied at the start of a run, because then334 !-- flow_statistics has not yet been called and thus sums = 0.335 IF ( ibc_p_b == 2 .AND. sums(nzb+1,4) /= 0.0 ) THEN336 k = nzb_s_inner(j,i)337 d(k+1,j,i) = d(k+1,j,i) + ( &338 ( usws(j,i+1) - usws(j,i) ) * ddx &339 + ( vsws(j+1,i) - vsws(j,i) ) * ddy &340 - g * ( pt(k+1,j,i) - sums(k+1,4) ) / &341 sums(k+1,4) &342 ) * ddzw(k+1) * ddt_3d * d_weight_pres343 ENDIF344 345 !346 335 !-- Compute possible PE-sum of divergences for flow_statistics 347 336 DO k = nzb_s_inner(j,i)+1, nzt … … 357 346 !$OMP END PARALLEL 358 347 #else 359 IF ( ibc_p_b == 2 .AND. sums(nzb+1,4) /= 0.0 ) THEN 360 !$OMP PARALLEL PRIVATE (i,j,k) 361 !$OMP DO SCHEDULE( STATIC ) 362 DO i = nxl, nxr 363 DO j = nys, nyn 364 DO k = nzb_s_inner(j,i)+1, nzt 365 d(k,j,i) = ( ( u(k,j,i+1) - u(k,j,i) ) * ddx + & 366 ( v(k,j+1,i) - v(k,j,i) ) * ddy + & 367 ( w(k,j,i) - w(k-1,j,i) ) * ddzw(k) ) * ddt_3d & 368 * d_weight_pres 369 ENDDO 370 ENDDO 371 ! 372 !-- Additional pressure boundary condition at the bottom boundary for 373 !-- inhomogeneous Prandtl layer heat fluxes and temperatures, respectively 374 !-- dp/dz = -(dtau13/dx + dtau23/dy) + g*pt'/pt0. 375 !-- This condition must not be applied at the start of a run, because then 376 !-- flow_statistics has not yet been called and thus sums = 0. 377 DO j = nys, nyn 378 k = nzb_s_inner(j,i) 379 d(k+1,j,i) = d(k+1,j,i) + ( & 380 ( usws(j,i+1) - usws(j,i) ) * ddx & 381 + ( vsws(j+1,i) - vsws(j,i) ) * ddy & 382 - g * ( pt(k+1,j,i) - sums(k+1,4) ) / & 383 sums(k+1,4) & 384 ) * ddzw(k+1) * ddt_3d & 385 * d_weight_pres 386 ENDDO 387 ENDDO 388 !$OMP END PARALLEL 389 390 ELSE 391 392 !$OMP PARALLEL PRIVATE (i,j,k) 393 !$OMP DO SCHEDULE( STATIC ) 394 DO i = nxl, nxr 395 DO j = nys, nyn 396 DO k = nzb_s_inner(j,i)+1, nzt 348 349 !$OMP PARALLEL PRIVATE (i,j,k) 350 !$OMP DO SCHEDULE( STATIC ) 351 !$acc kernels present( d, ddzw, nzb_s_inner, u, v, w ) 352 !$acc loop 353 DO i = nxl, nxr 354 DO j = nys, nyn 355 !$acc loop vector(32) 356 DO k = 1, nzt 357 IF ( k > nzb_s_inner(j,i) ) THEN 397 358 d(k,j,i) = ( ( u(k,j,i+1) - u(k,j,i) ) * ddx + & 398 ( v(k,j+1,i) - v(k,j,i) ) * ddy + &399 ( w(k,j,i) - w(k-1,j,i) ) * ddzw(k) ) * ddt_3d &400 * d_weight_pres401 END DO402 ENDDO 403 ENDDO 404 !$OMP END PARALLEL405 406 ENDIF359 ( v(k,j+1,i) - v(k,j,i) ) * ddy + & 360 ( w(k,j,i) - w(k-1,j,i) ) * ddzw(k) ) * ddt_3d & 361 * d_weight_pres 362 ENDIF 363 ENDDO 364 ENDDO 365 ENDDO 366 !$acc end kernels 367 !$OMP END PARALLEL 407 368 408 369 ! … … 439 400 !-- Solver for 2d-decomposition 440 401 CALL poisfft( d, tend ) 402 !$acc update host( d ) 441 403 ELSEIF ( psolver == 'poisfft_hybrid' ) THEN 442 404 ! … … 466 428 ! 467 429 !-- Neumann (dp/dz = 0) 468 !$OMP PARALLEL DO469 DO i = nxlg, nxrg470 DO j = nysg, nyng471 tend(nzb_s_inner(j,i),j,i) = tend(nzb_s_inner(j,i)+1,j,i)472 ENDDO473 ENDDO474 475 ELSEIF ( ibc_p_b == 2 ) THEN476 !477 !-- Neumann condition for inhomogeneous surfaces,478 !-- here currently still in the form of a zero gradient. Actually479 !-- dp/dz = -(dtau13/dx + dtau23/dy) + g*pt'/pt0 would have to be used for480 !-- the computation (cf. above: computation of divergences).481 430 !$OMP PARALLEL DO 482 431 DO i = nxlg, nxrg … … 611 560 !-- Correction of the provisional velocities with the current perturbation 612 561 !-- pressure just computed 562 !$acc update host( u, v, w ) 613 563 IF ( conserve_volume_flow .AND. ( bc_lr_cyc .OR. bc_ns_cyc ) ) THEN 614 564 volume_flow_l(1) = 0.0 … … 748 698 CALL cpu_log( log_point_s(1), 'divergence', 'stop' ) 749 699 700 !$acc update device( u, v, w ) 701 750 702 CALL cpu_log( log_point(8), 'pres', 'stop' ) 751 703 -
palm/trunk/SOURCE/prognostic_equations.f90
r1107 r1111 20 20 ! Current revisions: 21 21 ! ------------------ 22 ! 22 ! update directives for prognostic quantities removed 23 23 ! 24 24 ! Former revisions: … … 1574 1574 1575 1575 CALL cpu_log( log_point(5), 'u-equation', 'stop' ) 1576 !$acc update host( u_p )1577 1576 1578 1577 ! … … 1637 1636 1638 1637 CALL cpu_log( log_point(6), 'v-equation', 'stop' ) 1639 !$acc update host( v_p )1640 1638 1641 1639 ! … … 1700 1698 1701 1699 CALL cpu_log( log_point(7), 'w-equation', 'stop' ) 1702 !$acc update host( w_p )1703 1700 1704 1701 … … 1796 1793 1797 1794 CALL cpu_log( log_point(13), 'pt-equation', 'stop' ) 1798 !$acc update host( pt_p )1799 1795 1800 1796 ENDIF … … 2046 2042 2047 2043 CALL cpu_log( log_point(16), 'tke-equation', 'stop' ) 2048 !$acc update host( e_p )2049 2044 2050 2045 ENDIF -
palm/trunk/SOURCE/swap_timelevel.f90
r1054 r1111 20 20 ! Current revisions: 21 21 ! ----------------- 22 ! openACC directives added 22 23 ! 23 24 ! Former revisions: … … 82 83 CALL cpu_log( log_point(28), 'swap_timelevel (nop)', 'start' ) 83 84 85 !$acc kernels present( pt, pt_p, u, u_p, v, v_p, w, w_p ) 84 86 u = u_p 85 87 v = v_p 86 88 w = w_p 87 89 pt = pt_p 90 !$acc end kernels 88 91 IF ( .NOT. constant_diffusion ) THEN 92 !$acc kernels present( e, e_p ) 89 93 e = e_p 94 !$acc end kernels 90 95 ENDIF 91 96 IF ( ocean ) THEN -
palm/trunk/SOURCE/time_integration.f90
r1093 r1111 20 20 ! Current revisions: 21 21 ! ------------------ 22 ! 22 ! +internal timestep counter for cpu statistics added, 23 ! openACC directives updated 23 24 ! 24 25 ! Former revisions: … … 238 239 !-- Exchange of ghost points (lateral boundary conditions) 239 240 CALL cpu_log( log_point(26), 'exchange-horiz-progn', 'start' ) 241 !$acc update host( e_p, pt_p, u_p, v_p, w_p ) 240 242 CALL exchange_horiz( u_p, nbgp ) 241 243 CALL exchange_horiz( v_p, nbgp ) … … 272 274 ! 273 275 !-- Swap the time levels in preparation for the next time step. 276 !$acc update device( e_p, pt_p, u_p, v_p, w_p ) 274 277 CALL swap_timelevel 275 278 … … 298 301 time_disturb = time_disturb + dt_3d 299 302 IF ( time_disturb >= dt_disturb ) THEN 303 !$acc update host( u, v ) 300 304 IF ( hom(nzb+5,1,pr_palm,0) < disturbance_energy_limit ) THEN 301 305 CALL disturb_field( nzb_u_inner, tend, u ) … … 310 314 dist_range = 0 311 315 ENDIF 316 !$acc update device( u, v ) 312 317 time_disturb = time_disturb - dt_disturb 313 318 ENDIF … … 321 326 CALL pres 322 327 ENDIF 323 !324 !-- Update device memory for calculating diffusion quantities and for next325 !-- timestep326 !$acc update device( e, pt, u, v, w )327 !$acc update device( q ) if ( allocated( q ) )328 328 329 329 ! … … 351 351 CALL prandtl_fluxes 352 352 CALL cpu_log( log_point(19), 'prandtl_fluxes', 'stop' ) 353 !354 !++ Statistics still require updates on host355 !$acc update host( qs, qsws, rif, shf, ts )356 353 ENDIF 357 354 … … 369 366 ENDIF 370 367 CALL cpu_log( log_point(17), 'diffusivities', 'stop' ) 371 !372 !++ Statistics still require update of diffusivities on host373 !$acc update host( kh, km )374 368 375 369 ENDIF … … 379 373 ! 380 374 !-- Increase simulation time and output times 375 nr_timesteps_this_run = nr_timesteps_this_run + 1 381 376 current_timestep_number = current_timestep_number + 1 382 377 simulated_time = simulated_time + dt_3d -
palm/trunk/SOURCE/transpose.f90
r1107 r1111 20 20 ! Current revisions: 21 21 ! ----------------- 22 ! 22 ! openACC directives added, 23 ! resorting data from/to work changed, work got 4 dimensions instead of 1 23 24 ! 24 25 ! Former revisions: … … 81 82 IMPLICIT NONE 82 83 83 INTEGER :: i, j, k, l, m,ys84 INTEGER :: i, j, k, l, ys 84 85 85 REAL :: f_in(0:nx,nys_x:nyn_x,nzb_x:nzt_x), & 86 f_inv(nys_x:nyn_x,nzb_x:nzt_x,0:nx), & 87 f_out(0:ny,nxl_y:nxr_y,nzb_y:nzt_y), & 88 work(nnx*nny*nnz) 86 REAL :: f_in(0:nx,nys_x:nyn_x,nzb_x:nzt_x), f_out(0:ny,nxl_y:nxr_y,nzb_y:nzt_y) 87 88 REAL, DIMENSION(nyn_x-nys_x+1,nzb_y:nzt_y,nxl_y:nxr_y,0:pdims(2)-1) :: work 89 90 !$acc declare create( f_inv ) 91 REAL :: f_inv(nys_x:nyn_x,nzb_x:nzt_x,0:nx) 92 89 93 90 94 ! … … 93 97 !$OMP PARALLEL PRIVATE ( i, j, k ) 94 98 !$OMP DO 99 !$acc kernels present( f_in ) 100 !$acc loop 95 101 DO i = 0, nx 96 102 DO k = nzb_x, nzt_x 103 !$acc loop vector( 32 ) 97 104 DO j = nys_x, nyn_x 98 105 f_inv(j,k,i) = f_in(i,j,k) … … 100 107 ENDDO 101 108 ENDDO 109 !$acc end kernels 102 110 !$OMP END PARALLEL 103 111 … … 109 117 CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' ) 110 118 IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr ) 111 CALL MPI_ALLTOALL( f_inv(nys_x,nzb_x,0), sendrecvcount_xy, MPI_REAL, & 112 work(1), sendrecvcount_xy, MPI_REAL, & 119 !$acc update host( f_inv ) 120 CALL MPI_ALLTOALL( f_inv(nys_x,nzb_x,0), sendrecvcount_xy, MPI_REAL, & 121 work(1,nzb_y,nxl_y,0), sendrecvcount_xy, MPI_REAL, & 113 122 comm1dy, ierr ) 123 !$acc update device( work ) 114 124 CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' ) 115 125 116 126 ! 117 127 !-- Reorder transposed array 118 !$OMP PARALLEL PRIVATE ( i, j, k, l, m,ys )128 !$OMP PARALLEL PRIVATE ( i, j, k, l, ys ) 119 129 !$OMP DO 120 130 DO l = 0, pdims(2) - 1 121 m = l * ( nxr_y - nxl_y + 1 ) * ( nzt_y - nzb_y + 1 ) * &122 ( nyn_x - nys_x + 1 )123 131 ys = 0 + l * ( nyn_x - nys_x + 1 ) 132 !$acc kernels present( f_out, work ) 133 !$acc loop 124 134 DO i = nxl_y, nxr_y 125 135 DO k = nzb_y, nzt_y 136 !$acc loop vector( 32 ) 126 137 DO j = ys, ys + nyn_x - nys_x 127 m = m + 1 128 f_out(j,i,k) = work(m) 138 f_out(j,i,k) = work(j-ys+1,k,i,l) 129 139 ENDDO 130 140 ENDDO 131 141 ENDDO 142 !$acc end kernels 132 143 ENDDO 133 144 !$OMP END PARALLEL … … 140 151 !$OMP PARALLEL PRIVATE ( i, j, k ) 141 152 !$OMP DO 153 !$acc kernels present( f_out ) 154 !$acc loop 142 155 DO k = nzb_y, nzt_y 143 156 DO i = nxl_y, nxr_y 157 !$acc loop vector( 32 ) 144 158 DO j = 0, ny 145 159 f_out(j,i,k) = f_inv(j,k,i) … … 147 161 ENDDO 148 162 ENDDO 163 !$acc end kernels 149 164 !$OMP END PARALLEL 150 165 … … 172 187 IMPLICIT NONE 173 188 174 INTEGER :: i, j, k, l, m,xs189 INTEGER :: i, j, k, l, xs 175 190 176 REAL :: f_in(0:nx,nys_x:nyn_x,nzb_x:nzt_x), & 177 f_inv(nys:nyn,nxl:nxr,1:nz), & 178 f_out(1:nz,nys:nyn,nxl:nxr), & 179 work(nnx*nny*nnz) 191 REAL :: f_in(0:nx,nys_x:nyn_x,nzb_x:nzt_x), f_out(1:nz,nys:nyn,nxl:nxr) 192 193 REAL, DIMENSION(nys_x:nyn_x,nnx,nzb_x:nzt_x,0:pdims(1)-1) :: work 194 195 !$acc declare create( f_inv ) 196 REAL :: f_inv(nys:nyn,nxl:nxr,1:nz) 180 197 181 198 … … 188 205 ! 189 206 !-- Reorder input array for transposition 190 !$OMP PARALLEL PRIVATE ( i, j, k, l, m,xs )207 !$OMP PARALLEL PRIVATE ( i, j, k, l, xs ) 191 208 !$OMP DO 192 209 DO l = 0, pdims(1) - 1 193 m = l * ( nzt_x - nzb_x + 1 ) * nnx * ( nyn_x - nys_x + 1 )194 210 xs = 0 + l * nnx 211 !$acc kernels present( f_in, work ) 212 !$acc loop 195 213 DO k = nzb_x, nzt_x 196 214 DO i = xs, xs + nnx - 1 215 !$acc loop vector( 32 ) 197 216 DO j = nys_x, nyn_x 198 m = m + 1 199 work(m) = f_in(i,j,k) 217 work(j,i-xs+1,k,l) = f_in(i,j,k) 200 218 ENDDO 201 219 ENDDO 202 220 ENDDO 221 !$acc end kernels 203 222 ENDDO 204 223 !$OMP END PARALLEL … … 208 227 CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' ) 209 228 IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr ) 210 CALL MPI_ALLTOALL( work(1), sendrecvcount_zx, MPI_REAL, & 211 f_inv(nys,nxl,1), sendrecvcount_zx, MPI_REAL, & 229 !$acc update host( work ) 230 CALL MPI_ALLTOALL( work(nys_x,1,nzb_x,0), sendrecvcount_zx, MPI_REAL, & 231 f_inv(nys,nxl,1), sendrecvcount_zx, MPI_REAL, & 212 232 comm1dx, ierr ) 233 !$acc update device( f_inv ) 213 234 CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' ) 214 235 … … 217 238 !$OMP PARALLEL PRIVATE ( i, j, k ) 218 239 !$OMP DO 240 !$acc kernels present( f_out ) 241 !$acc loop 219 242 DO k = 1, nz 220 243 DO i = nxl, nxr 244 !$acc loop vector( 32 ) 221 245 DO j = nys, nyn 222 246 f_out(k,j,i) = f_inv(j,i,k) … … 224 248 ENDDO 225 249 ENDDO 250 !$acc end kernels 226 251 !$OMP END PARALLEL 227 252 #endif … … 233 258 !$OMP PARALLEL PRIVATE ( i, j, k ) 234 259 !$OMP DO 260 !$acc kernels present( f_in ) 261 !$acc loop 235 262 DO i = nxl, nxr 236 263 DO j = nys, nyn 264 !$acc loop vector( 32 ) 237 265 DO k = 1, nz 238 266 f_inv(j,i,k) = f_in(i,j,k) … … 240 268 ENDDO 241 269 ENDDO 242 !$OMP END PARALLEL 243 244 !$OMP PARALLEL PRIVATE ( i, j, k ) 245 !$OMP DO 270 !$acc end kernels 271 !$OMP END PARALLEL 272 273 !$OMP PARALLEL PRIVATE ( i, j, k ) 274 !$OMP DO 275 !$acc kernels present( f_out ) 276 !$acc loop 246 277 DO k = 1, nz 247 278 DO i = nxl, nxr 279 !$acc loop vector( 32 ) 248 280 DO j = nys, nyn 249 281 f_out(k,j,i) = f_inv(j,i,k) … … 251 283 ENDDO 252 284 ENDDO 285 !$acc end kernels 253 286 !$OMP END PARALLEL 254 287 … … 276 309 IMPLICIT NONE 277 310 278 INTEGER :: i, j, k, l, m,ys311 INTEGER :: i, j, k, l, ys 279 312 280 REAL :: f_in(0:ny,nxl_y:nxr_y,nzb_y:nzt_y), & 281 f_inv(nys_x:nyn_x,nzb_x:nzt_x,0:nx), & 282 f_out(0:nx,nys_x:nyn_x,nzb_x:nzt_x), & 283 work(nnx*nny*nnz) 313 REAL :: f_in(0:ny,nxl_y:nxr_y,nzb_y:nzt_y), f_out(0:nx,nys_x:nyn_x,nzb_x:nzt_x) 314 315 REAL, DIMENSION(nyn_x-nys_x+1,nzb_y:nzt_y,nxl_y:nxr_y,0:pdims(2)-1) :: work 316 317 !$acc declare create( f_inv ) 318 REAL :: f_inv(nys_x:nyn_x,nzb_x:nzt_x,0:nx) 319 284 320 285 321 IF ( numprocs /= 1 ) THEN … … 288 324 ! 289 325 !-- Reorder input array for transposition 290 !$OMP PARALLEL PRIVATE ( i, j, k, l, m,ys )326 !$OMP PARALLEL PRIVATE ( i, j, k, l, ys ) 291 327 !$OMP DO 292 328 DO l = 0, pdims(2) - 1 293 m = l * ( nxr_y - nxl_y + 1 ) * ( nzt_y - nzb_y + 1 ) * &294 ( nyn_x - nys_x + 1 )295 329 ys = 0 + l * ( nyn_x - nys_x + 1 ) 330 !$acc kernels present( f_in, work ) 331 !$acc loop 296 332 DO i = nxl_y, nxr_y 297 333 DO k = nzb_y, nzt_y 334 !$acc loop vector( 32 ) 298 335 DO j = ys, ys + nyn_x - nys_x 299 m = m + 1 300 work(m) = f_in(j,i,k) 336 work(j-ys+1,k,i,l) = f_in(j,i,k) 301 337 ENDDO 302 338 ENDDO 303 339 ENDDO 340 !$acc end kernels 304 341 ENDDO 305 342 !$OMP END PARALLEL … … 309 346 CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' ) 310 347 IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr ) 311 CALL MPI_ALLTOALL( work(1), sendrecvcount_xy, MPI_REAL, & 312 f_inv(nys_x,nzb_x,0), sendrecvcount_xy, MPI_REAL, & 348 !$acc update host( work ) 349 CALL MPI_ALLTOALL( work(1,nzb_y,nxl_y,0), sendrecvcount_xy, MPI_REAL, & 350 f_inv(nys_x,nzb_x,0), sendrecvcount_xy, MPI_REAL, & 313 351 comm1dy, ierr ) 352 !$acc update device( f_inv ) 314 353 CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' ) 315 354 #endif … … 321 360 !$OMP PARALLEL PRIVATE ( i, j, k ) 322 361 !$OMP DO 362 !$acc kernels present( f_in ) 363 !$acc loop 323 364 DO i = nxl_y, nxr_y 324 365 DO k = nzb_y, nzt_y 366 !$acc loop vector( 32 ) 325 367 DO j = 0, ny 326 368 f_inv(j,k,i) = f_in(j,i,k) … … 328 370 ENDDO 329 371 ENDDO 372 !$acc end kernels 330 373 !$OMP END PARALLEL 331 374 … … 336 379 !$OMP PARALLEL PRIVATE ( i, j, k ) 337 380 !$OMP DO 381 !$acc kernels present( f_out ) 382 !$acc loop 338 383 DO i = 0, nx 339 384 DO k = nzb_x, nzt_x 385 !$acc loop vector( 32 ) 340 386 DO j = nys_x, nyn_x 341 387 f_out(i,j,k) = f_inv(j,k,i) … … 343 389 ENDDO 344 390 ENDDO 391 !$acc end kernels 345 392 !$OMP END PARALLEL 346 393 … … 434 481 IMPLICIT NONE 435 482 436 INTEGER :: i, j, k, l, m,zs483 INTEGER :: i, j, k, l, zs 437 484 438 REAL :: f_in(0:ny,nxl_y:nxr_y,nzb_y:nzt_y), & 439 f_inv(nxl_y:nxr_y,nzb_y:nzt_y,0:ny), & 440 f_out(nxl_z:nxr_z,nys_z:nyn_z,1:nz), & 441 work(nnx*nny*nnz) 485 REAL :: f_in(0:ny,nxl_y:nxr_y,nzb_y:nzt_y), f_out(nxl_z:nxr_z,nys_z:nyn_z,1:nz) 486 487 REAL, DIMENSION(nxl_z:nxr_z,nzt_y-nzb_y+1,nys_z:nyn_z,0:pdims(1)-1) :: work 488 489 !$acc declare create( f_inv ) 490 REAL :: f_inv(nxl_y:nxr_y,nzb_y:nzt_y,0:ny) 491 442 492 443 493 ! … … 446 496 !$OMP PARALLEL PRIVATE ( i, j, k ) 447 497 !$OMP DO 498 !$acc kernels present( f_in ) 499 !$acc loop 448 500 DO j = 0, ny 449 501 DO k = nzb_y, nzt_y 502 !$acc loop vector( 32 ) 450 503 DO i = nxl_y, nxr_y 451 504 f_inv(i,k,j) = f_in(j,i,k) … … 453 506 ENDDO 454 507 ENDDO 508 !$acc end kernels 455 509 !$OMP END PARALLEL 456 510 … … 464 518 !$OMP PARALLEL PRIVATE ( i, j, k ) 465 519 !$OMP DO 520 !$acc kernels present( f_out ) 521 !$acc loop 466 522 DO j = 0, ny 467 523 DO k = nzb_y, nzt_y 524 !$acc loop vector( 32 ) 468 525 DO i = nxl_y, nxr_y 469 526 f_out(i,j,k) = f_inv(i,k,j) … … 471 528 ENDDO 472 529 ENDDO 530 !$acc end kernels 473 531 !$OMP END PARALLEL 474 532 … … 480 538 CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' ) 481 539 IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr ) 482 CALL MPI_ALLTOALL( f_inv(nxl_y,nzb_y,0), sendrecvcount_yz, MPI_REAL, & 483 work(1), sendrecvcount_yz, MPI_REAL, & 540 !$acc update host( f_inv ) 541 CALL MPI_ALLTOALL( f_inv(nxl_y,nzb_y,0), sendrecvcount_yz, MPI_REAL, & 542 work(nxl_z,1,nys_z,0), sendrecvcount_yz, MPI_REAL, & 484 543 comm1dx, ierr ) 544 !$acc update device( work ) 485 545 CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' ) 486 546 487 547 ! 488 548 !-- Reorder transposed array 489 !$OMP PARALLEL PRIVATE ( i, j, k, l, m,zs )549 !$OMP PARALLEL PRIVATE ( i, j, k, l, zs ) 490 550 !$OMP DO 491 551 DO l = 0, pdims(1) - 1 492 m = l * ( nyn_z - nys_z + 1 ) * ( nzt_y - nzb_y + 1 ) * &493 ( nxr_z - nxl_z + 1 )494 552 zs = 1 + l * ( nzt_y - nzb_y + 1 ) 553 !$acc kernels present( f_out, work ) 554 !$acc loop 495 555 DO j = nys_z, nyn_z 496 556 DO k = zs, zs + nzt_y - nzb_y 557 !$acc loop vector( 32 ) 497 558 DO i = nxl_z, nxr_z 498 m = m + 1 499 f_out(i,j,k) = work(m) 559 f_out(i,j,k) = work(i,k-zs+1,j,l) 500 560 ENDDO 501 561 ENDDO 502 562 ENDDO 563 !$acc end kernels 503 564 ENDDO 504 565 !$OMP END PARALLEL … … 528 589 IMPLICIT NONE 529 590 530 INTEGER :: i, j, k, l, m,xs591 INTEGER :: i, j, k, l, xs 531 592 532 REAL :: f_in(1:nz,nys:nyn,nxl:nxr), f_out(0:nx,nys_x:nyn_x,nzb_x:nzt_x), & 533 work(nnx*nny*nnz) 534 535 !$acc declare create ( f_inv ) 593 REAL :: f_in(1:nz,nys:nyn,nxl:nxr), f_out(0:nx,nys_x:nyn_x,nzb_x:nzt_x) 594 595 REAL, DIMENSION(nys_x:nyn_x,nnx,nzb_x:nzt_x,0:pdims(1)-1) :: work 596 597 !$acc declare create( f_inv ) 536 598 REAL :: f_inv(nys:nyn,nxl:nxr,1:nz) 537 599 … … 552 614 ENDDO 553 615 ENDDO 616 !$acc end kernels 554 617 !$OMP END PARALLEL 555 618 … … 573 636 ENDDO 574 637 ENDDO 638 !$acc end kernels 575 639 !$OMP END PARALLEL 576 640 … … 582 646 CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' ) 583 647 IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr ) 584 CALL MPI_ALLTOALL( f_inv(nys,nxl,1), sendrecvcount_zx, MPI_REAL, & 585 work(1), sendrecvcount_zx, MPI_REAL, & 648 !$acc update host( f_inv ) 649 CALL MPI_ALLTOALL( f_inv(nys,nxl,1), sendrecvcount_zx, MPI_REAL, & 650 work(nys_x,1,nzb_x,0), sendrecvcount_zx, MPI_REAL, & 586 651 comm1dx, ierr ) 652 !$acc update device( work ) 587 653 CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' ) 588 654 589 655 ! 590 656 !-- Reorder transposed array 591 !$OMP PARALLEL PRIVATE ( i, j, k, l, m,xs )657 !$OMP PARALLEL PRIVATE ( i, j, k, l, xs ) 592 658 !$OMP DO 593 659 DO l = 0, pdims(1) - 1 594 m = l * ( nzt_x - nzb_x + 1 ) * nnx * ( nyn_x - nys_x + 1 )595 660 xs = 0 + l * nnx 661 !$acc kernels present( f_out, work ) 662 !$acc loop 596 663 DO k = nzb_x, nzt_x 597 664 DO i = xs, xs + nnx - 1 665 !$acc loop vector( 32 ) 598 666 DO j = nys_x, nyn_x 599 m = m + 1 600 f_out(i,j,k) = work(m) 667 f_out(i,j,k) = work(j,i-xs+1,k,l) 601 668 ENDDO 602 669 ENDDO 603 670 ENDDO 671 !$acc end kernels 604 672 ENDDO 605 673 !$OMP END PARALLEL … … 629 697 IMPLICIT NONE 630 698 631 INTEGER :: i, j, k, l, m,zs699 INTEGER :: i, j, k, l, zs 632 700 633 REAL :: f_in(nxl_z:nxr_z,nys_z:nyn_z,1:nz), & 634 f_inv(nxl_y:nxr_y,nzb_y:nzt_y,0:ny), & 635 f_out(0:ny,nxl_y:nxr_y,nzb_y:nzt_y), & 636 work(nnx*nny*nnz) 701 REAL :: f_in(nxl_z:nxr_z,nys_z:nyn_z,1:nz), f_out(0:ny,nxl_y:nxr_y,nzb_y:nzt_y) 702 703 REAL, DIMENSION(nxl_z:nxr_z,nzt_y-nzb_y+1,nys_z:nyn_z,0:pdims(1)-1) :: work 704 705 !$acc declare create( f_inv ) 706 REAL :: f_inv(nxl_y:nxr_y,nzb_y:nzt_y,0:ny) 707 637 708 638 709 ! … … 644 715 ! 645 716 !-- Reorder input array for transposition 646 !$OMP PARALLEL PRIVATE ( i, j, k, l, m,zs )717 !$OMP PARALLEL PRIVATE ( i, j, k, l, zs ) 647 718 !$OMP DO 648 719 DO l = 0, pdims(1) - 1 649 m = l * ( nyn_z - nys_z + 1 ) * ( nzt_y - nzb_y + 1 ) * &650 ( nxr_z - nxl_z + 1 )651 720 zs = 1 + l * ( nzt_y - nzb_y + 1 ) 721 !$acc kernels present( f_in, work ) 722 !$acc loop 652 723 DO j = nys_z, nyn_z 653 724 DO k = zs, zs + nzt_y - nzb_y 725 !$acc loop vector( 32 ) 654 726 DO i = nxl_z, nxr_z 655 m = m + 1 656 work(m) = f_in(i,j,k) 727 work(i,k-zs+1,j,l) = f_in(i,j,k) 657 728 ENDDO 658 729 ENDDO 659 730 ENDDO 731 !$acc end kernels 660 732 ENDDO 661 733 !$OMP END PARALLEL … … 665 737 CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' ) 666 738 IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr ) 667 CALL MPI_ALLTOALL( work(1), sendrecvcount_yz, MPI_REAL, & 668 f_inv(nxl_y,nzb_y,0), sendrecvcount_yz, MPI_REAL, & 739 !$acc update host( work ) 740 CALL MPI_ALLTOALL( work(nxl_z,1,nys_z,0), sendrecvcount_yz, MPI_REAL, & 741 f_inv(nxl_y,nzb_y,0), sendrecvcount_yz, MPI_REAL, & 669 742 comm1dx, ierr ) 743 !$acc update device( f_inv ) 670 744 CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' ) 671 745 #endif … … 676 750 !$OMP PARALLEL PRIVATE ( i, j, k ) 677 751 !$OMP DO 752 !$acc kernels present( f_in ) 753 !$acc loop 678 754 DO k = nzb_y, nzt_y 679 755 DO j = 0, ny 756 !$acc loop vector( 32 ) 680 757 DO i = nxl_y, nxr_y 681 758 f_inv(i,k,j) = f_in(i,j,k) … … 683 760 ENDDO 684 761 ENDDO 762 !$acc end kernels 685 763 !$OMP END PARALLEL 686 764 … … 691 769 !$OMP PARALLEL PRIVATE ( i, j, k ) 692 770 !$OMP DO 771 !$acc kernels present( f_out ) 772 !$acc loop 693 773 DO k = nzb_y, nzt_y 694 774 DO i = nxl_y, nxr_y 775 !$acc loop vector( 32 ) 695 776 DO j = 0, ny 696 777 f_out(j,i,k) = f_inv(i,k,j) … … 698 779 ENDDO 699 780 ENDDO 781 !$acc end kernels 700 782 !$OMP END PARALLEL 701 783
Note: See TracChangeset
for help on using the changeset viewer.