Changeset 1111 for palm/trunk/SOURCE/fft_xy.f90
- Timestamp: Mar 8, 2013 11:54:10 PM (11 years ago)
- File: 1 edited
Legend:
- Unmodified
- Added
- Removed
palm/trunk/SOURCE/fft_xy.f90
r1107 r1111 20 20 ! Current revisions: 21 21 ! ----------------- 22 ! 22 ! further openACC statements added, CUDA branch completely runs on GPU 23 ! bugfix: CUDA fft plans adjusted for domain decomposition (before they always 24 ! used total domain) 23 25 ! 24 26 ! Former revisions: … … 213 215 total_points_x_transpo = (nx+1) * (nyn_x-nys_x+1) * (nzt_x-nzb_x+1) 214 216 total_points_y_transpo = (ny+1) * (nxr_y-nxl_y+1) * (nzt_y-nzb_y+1) 215 CALL CUFFTPLAN1D( plan_xf, nx+1, CUFFT_D2Z, (ny +1)*nz)216 CALL CUFFTPLAN1D( plan_xi, nx+1, CUFFT_Z2D, (ny +1)*nz)217 CALL CUFFTPLAN1D( plan_yf, ny+1, CUFFT_D2Z, (nx +1)*nz)218 CALL CUFFTPLAN1D( plan_yi, ny+1, CUFFT_Z2D, (nx +1)*nz)217 CALL CUFFTPLAN1D( plan_xf, nx+1, CUFFT_D2Z, (nyn_x-nys_x+1) * (nzt_x-nzb_x+1) ) 218 CALL CUFFTPLAN1D( plan_xi, nx+1, CUFFT_Z2D, (nyn_x-nys_x+1) * (nzt_x-nzb_x+1) ) 219 CALL CUFFTPLAN1D( plan_yf, ny+1, CUFFT_D2Z, (nxr_y-nxl_y+1) * (nzt_y-nzb_y+1) ) 220 CALL CUFFTPLAN1D( plan_yi, ny+1, CUFFT_Z2D, (nxr_y-nxl_y+1) * (nzt_y-nzb_y+1) ) 219 221 #else 220 222 message_string = 'no system-specific fft-call available' … … 261 263 262 264 CHARACTER (LEN=*) :: direction 263 INTEGER :: i, ishape(1), j, k , m265 INTEGER :: i, ishape(1), j, k 264 266 265 267 LOGICAL :: forward_fft … … 273 275 REAL, DIMENSION(6*(nx+1)) :: work2 274 276 #elif defined( __cuda_fft ) 275 REAL(dpk), DEVICE, DIMENSION(:), ALLOCATABLE :: cuda_a_device 276 COMPLEX(dpk), DEVICE, DIMENSION(:), ALLOCATABLE :: cuda_b_device 277 COMPLEX(dpk), DIMENSION(:), ALLOCATABLE :: cuda_host 277 !$acc declare create( ar_tmp ) 278 COMPLEX(dpk), DIMENSION(0:(nx+1)/2,nys_x:nyn_x,nzb_x:nzt_x) :: ar_tmp 278 279 #endif 279 280 REAL, DIMENSION(0:nx,nys_x:nyn_x,nzb_x:nzt_x) :: ar … … 502 503 #elif defined( __cuda_fft ) 503 504 504 ALLOCATE( cuda_a_device(0:total_points_x_transpo-1) )505 ALLOCATE( cuda_b_device(0:((nx+1)/2+1) * (nyn_x-nys_x+1) * (nzt_x-nzb_x+1) - 1) )506 ALLOCATE( cuda_host(0:((nx+1)/2+1) * (nyn_x-nys_x+1) * (nzt_x-nzb_x+1) - 1) )507 508 m = 0509 
510 505 IF ( forward_fft ) THEN 511 506 512 cuda_a_device = ar(0:total_points_x_transpo-1,nys_x,nzb_x)513 514 CALL CUFFTEXECD2Z( plan_xf, cuda_a_device, cuda_b_device ) 515 cuda_host = cuda_b_device516 507 !$acc data present( ar ) 508 CALL CUFFTEXECD2Z( plan_xf, ar, ar_tmp ) 509 510 !$acc kernels 511 !$acc loop 517 512 DO k = nzb_x, nzt_x 518 513 DO j = nys_x, nyn_x 519 514 515 !$acc loop vector( 32 ) 520 516 DO i = 0, (nx+1)/2 521 ar(i,j,k) = REAL( cuda_host(m+i) ) * dnx 522 ENDDO 523 517 ar(i,j,k) = REAL( ar_tmp(i,j,k) ) * dnx 518 ENDDO 519 520 !$acc loop vector( 32 ) 524 521 DO i = 1, (nx+1)/2 - 1 525 ar(nx+1-i,j,k) = AIMAG( cuda_host(m+i) ) * dnx 526 ENDDO 527 528 m = m + (nx+1)/2 + 1 529 530 ENDDO 531 ENDDO 532 533 ELSE 534 522 ar(nx+1-i,j,k) = AIMAG( ar_tmp(i,j,k) ) * dnx 523 ENDDO 524 525 ENDDO 526 ENDDO 527 !$acc end kernels 528 !$acc end data 529 530 ELSE 531 532 !$acc data present( ar ) 533 !$acc kernels 534 !$acc loop 535 535 DO k = nzb_x, nzt_x 536 536 DO j = nys_x, nyn_x 537 537 538 cuda_host(m) = CMPLX( ar(0,j,k), 0.0 ) 539 538 ar_tmp(0,j,k) = CMPLX( ar(0,j,k), 0.0 ) 539 540 !$acc loop vector( 32 ) 540 541 DO i = 1, (nx+1)/2 - 1 541 cuda_host(m+i) = CMPLX( ar(i,j,k), ar(nx+1-i,j,k) ) 542 ENDDO 543 cuda_host(m+(nx+1)/2) = CMPLX( ar((nx+1)/2,j,k), 0.0 ) 544 545 m = m + (nx+1)/2 + 1 546 547 ENDDO 548 ENDDO 549 550 cuda_b_device = cuda_host 551 CALL CUFFTEXECZ2D( plan_xi, cuda_b_device, cuda_a_device ) 552 553 ar(0:total_points_x_transpo-1,nys_x,nzb_x) = cuda_a_device 554 555 ENDIF 556 557 DEALLOCATE( cuda_a_device, cuda_b_device, cuda_host ) 542 ar_tmp(i,j,k) = CMPLX( ar(i,j,k), ar(nx+1-i,j,k) ) 543 ENDDO 544 ar_tmp((nx+1)/2,j,k) = CMPLX( ar((nx+1)/2,j,k), 0.0 ) 545 546 ENDDO 547 ENDDO 548 !$acc end kernels 549 550 CALL CUFFTEXECZ2D( plan_xi, ar_tmp, ar ) 551 !$acc end data 552 553 ENDIF 558 554 559 555 #else … … 775 771 776 772 CHARACTER (LEN=*) :: direction 777 INTEGER :: i, j, jshape(1), k , m773 INTEGER :: i, j, jshape(1), k 778 774 779 775 LOGICAL 
:: forward_fft … … 787 783 REAL, DIMENSION(6*(ny+1)) :: work2 788 784 #elif defined( __cuda_fft ) 789 REAL(dpk), DEVICE, DIMENSION(:), ALLOCATABLE :: cuda_a_device 790 COMPLEX(dpk), DEVICE, DIMENSION(:), ALLOCATABLE :: cuda_b_device 791 COMPLEX(dpk), DIMENSION(:), ALLOCATABLE :: cuda_host 785 !$acc declare create( ar_tmp ) 786 COMPLEX(dpk), DIMENSION(0:(ny+1)/2,nxl_y:nxr_y,nzb_y:nzt_y) :: ar_tmp 792 787 #endif 793 788 REAL, DIMENSION(0:ny,nxl_y:nxr_y,nzb_y:nzt_y) :: ar … … 1013 1008 #elif defined( __cuda_fft ) 1014 1009 1015 ALLOCATE( cuda_a_device(0:total_points_y_transpo-1) )1016 ALLOCATE( cuda_b_device(0:((ny+1)/2+1) * (nxr_y-nxl_y+1) * (nzt_y-nzb_y+1) - 1) )1017 ALLOCATE( cuda_host(0:((ny+1)/2+1) * (nxr_y-nxl_y+1) * (nzt_y-nzb_y+1) - 1) )1018 1019 m = 01020 1021 1010 IF ( forward_fft ) THEN 1022 1011 1023 cuda_a_device = ar(0:total_points_y_transpo-1,nxl_y,nzb_y)1024 1025 CALL CUFFTEXECD2Z( plan_yf, cuda_a_device, cuda_b_device ) 1026 cuda_host = cuda_b_device1027 1012 !$acc data present( ar ) 1013 CALL CUFFTEXECD2Z( plan_yf, ar, ar_tmp ) 1014 1015 !$acc kernels 1016 !$acc loop 1028 1017 DO k = nzb_y, nzt_y 1029 1018 DO i = nxl_y, nxr_y 1030 1019 1020 !$acc loop vector( 32 ) 1031 1021 DO j = 0, (ny+1)/2 1032 ar(j,i,k) = REAL( cuda_host(m+j) ) * dny 1033 ENDDO 1034 1022 ar(j,i,k) = REAL( ar_tmp(j,i,k) ) * dny 1023 ENDDO 1024 1025 !$acc loop vector( 32 ) 1035 1026 DO j = 1, (ny+1)/2 - 1 1036 ar(ny+1-j,i,k) = AIMAG( cuda_host(m+j) ) * dny 1037 ENDDO 1038 1039 m = m + (ny+1)/2 + 1 1040 1041 ENDDO 1042 ENDDO 1043 1044 ELSE 1045 1027 ar(ny+1-j,i,k) = AIMAG( ar_tmp(j,i,k) ) * dny 1028 ENDDO 1029 1030 ENDDO 1031 ENDDO 1032 !$acc end kernels 1033 !$acc end data 1034 1035 ELSE 1036 1037 !$acc data present( ar ) 1038 !$acc kernels 1039 !$acc loop 1046 1040 DO k = nzb_y, nzt_y 1047 1041 DO i = nxl_y, nxr_y 1048 1042 1049 cuda_host(m) = CMPLX( ar(0,i,k), 0.0 ) 1050 1043 ar_tmp(0,i,k) = CMPLX( ar(0,i,k), 0.0 ) 1044 1045 !$acc loop vector( 32 ) 1051 1046 DO j = 1, (ny+1)/2 - 
1 1052 cuda_host(m+j) = CMPLX( ar(j,i,k), ar(ny+1-j,i,k) ) 1053 ENDDO 1054 cuda_host(m+(ny+1)/2) = CMPLX( ar((ny+1)/2,i,k), 0.0 ) 1055 1056 m = m + (ny+1)/2 + 1 1057 1058 ENDDO 1059 ENDDO 1060 1061 cuda_b_device = cuda_host 1062 CALL CUFFTEXECZ2D( plan_yi, cuda_b_device, cuda_a_device ) 1063 1064 ar(0:total_points_y_transpo-1,nxl_y,nzb_y) = cuda_a_device 1065 1066 ENDIF 1067 1068 DEALLOCATE( cuda_a_device, cuda_b_device, cuda_host ) 1047 ar_tmp(j,i,k) = CMPLX( ar(j,i,k), ar(ny+1-j,i,k) ) 1048 ENDDO 1049 ar_tmp((ny+1)/2,i,k) = CMPLX( ar((ny+1)/2,i,k), 0.0 ) 1050 1051 ENDDO 1052 ENDDO 1053 !$acc end kernels 1054 1055 CALL CUFFTEXECZ2D( plan_yi, ar_tmp, ar ) 1056 !$acc end data 1057 1058 ENDIF 1069 1059 1070 1060 #else
Note: See TracChangeset for help on using the changeset viewer.