Home

Context Navigation

← Previous Change
Next Change →

fft_xy.f90

Timestamp:

Mar 8, 2013 11:54:10 PM (11 years ago)

Author:

raasch

Message:

New:
---

GPU porting of pres, swap_timelevel. Adjustments of openACC directives.
Further porting of poisfft, which now runs completely on GPU without any
host/device data transfer for serial and parallel runs (but parallel runs
require data transfer before and after the MPI transpositions).
GPU-porting of tridiagonal solver:
tridiagonal routines split into external subroutines (instead of using CONTAINS),
no distinction between parallel/non-parallel in poisfft and tridia any more,
tridia routines moved to end of file because of probable bug in PGI compiler
(otherwise "invalid device function" is indicated during runtime).
(cuda_fft_interfaces, fft_xy, flow_statistics, init_3d_model, palm, poisfft, pres, prognostic_equations, swap_timelevel, time_integration, transpose)
output of accelerator board information. (header)

optimization of tridia routines: constant elements and coefficients of tri are
stored in separate arrays ddzuw and tric, last dimension of tri reduced from 5 to 2,
(init_grid, init_3d_model, modules, palm, poisfft)

poisfft_init is now called internally from poisfft,
(Makefile, Makefile_check, init_pegrid, poisfft, poisfft_hybrid)

CPU-time per grid point and timestep is output to CPU_MEASURES file
(cpu_statistics, modules, time_integration)

Changed:

resorting from/to array work changed, work now has 4 dimensions instead of 1 (transpose)
array diss allocated only if required (init_3d_model)

pressure boundary condition "Neumann+inhomo" removed from the code
(check_parameters, header, poisfft, poisfft_hybrid, pres)

Errors:

bugfix: dependency added for cuda_fft_interfaces (Makefile)
bugfix: CUDA fft plans adjusted for domain decomposition (before they always
used total domain) (fft_xy)

File:

: 1 edited

palm/trunk/SOURCE/fft_xy.f90 (modified) (8 diffs)

Legend:

: Unmodified
: Added
: Removed

palm/trunk/SOURCE/fft_xy.f90

-                      r1107
+                      r1111
 ! Current revisions:
 ! -----------------
+!
+! further openACC statements added, CUDA branch completely runs on GPU
+! bugfix: CUDA fft plans adjusted for domain decomposition (before they always
+! used total domain)
+!
 ! Former revisions:
 …
           total_points_x_transpo = (nx+1) * (nyn_x-nys_x+1) * (nzt_x-nzb_x+1)
           total_points_y_transpo = (ny+1) * (nxr_y-nxl_y+1) * (nzt_y-nzb_y+1)
           CALL CUFFTPLAN1D( plan_xf, nx+1, CUFFT_D2Z, (ny+1)*nz )
           CALL CUFFTPLAN1D( plan_xi, nx+1, CUFFT_Z2D, (ny+1)*nz )
           CALL CUFFTPLAN1D( plan_yf, ny+1, CUFFT_D2Z, (nx+1)*nz )
           CALL CUFFTPLAN1D( plan_yi, ny+1, CUFFT_Z2D, (nx+1)*nz )
+          CALL CUFFTPLAN1D( plan_xf, nx+1, CUFFT_D2Z, (nyn_x-nys_x+1) * (nzt_x-nzb_x+1) )
+          CALL CUFFTPLAN1D( plan_xi, nx+1, CUFFT_Z2D, (nyn_x-nys_x+1) * (nzt_x-nzb_x+1) )
+          CALL CUFFTPLAN1D( plan_yf, ny+1, CUFFT_D2Z, (nxr_y-nxl_y+1) * (nzt_y-nzb_y+1) )
+          CALL CUFFTPLAN1D( plan_yi, ny+1, CUFFT_Z2D, (nxr_y-nxl_y+1) * (nzt_y-nzb_y+1) )
 #else
           message_string = 'no system-specific fft-call available'
 …
        CHARACTER (LEN=*) ::  direction
        INTEGER ::  i, ishape(1), j, k, m
+       INTEGER ::  i, ishape(1), j, k
        LOGICAL ::  forward_fft
 …
        REAL, DIMENSION(6*(nx+1)) ::  work2
 #elif defined( __cuda_fft )
+       REAL(dpk), DEVICE, DIMENSION(:), ALLOCATABLE    ::  cuda_a_device
+       COMPLEX(dpk), DEVICE, DIMENSION(:), ALLOCATABLE ::  cuda_b_device
+       COMPLEX(dpk), DIMENSION(:), ALLOCATABLE         ::  cuda_host
+       !$acc declare create( ar_tmp )
+       COMPLEX(dpk), DIMENSION(0:(nx+1)/2,nys_x:nyn_x,nzb_x:nzt_x) ::  ar_tmp
 #endif
        REAL, DIMENSION(0:nx,nys_x:nyn_x,nzb_x:nzt_x) ::  ar
 …
 #elif defined( __cuda_fft )
-          ALLOCATE( cuda_a_device(0:total_points_x_transpo-1) )
-          ALLOCATE( cuda_b_device(0:((nx+1)/2+1) * (nyn_x-nys_x+1) * (nzt_x-nzb_x+1) - 1) )
-          ALLOCATE( cuda_host(0:((nx+1)/2+1) * (nyn_x-nys_x+1) * (nzt_x-nzb_x+1) - 1) )
-          m = 0
           IF ( forward_fft )  THEN
              cuda_a_device = ar(0:total_points_x_transpo-1,nys_x,nzb_x)
+             CALL CUFFTEXECD2Z( plan_xf, cuda_a_device, cuda_b_device )
              cuda_host = cuda_b_device
+             !$acc data present( ar )
+             CALL CUFFTEXECD2Z( plan_xf, ar, ar_tmp )
+             !$acc kernels
+             !$acc loop
              DO  k = nzb_x, nzt_x
                 DO  j = nys_x, nyn_x
+                   !$acc loop vector( 32 )
                    DO  i = 0, (nx+1)/2
+                      ar(i,j,k)      = REAL( cuda_host(m+i) )  * dnx
+                   ENDDO
+                      ar(i,j,k)      = REAL( ar_tmp(i,j,k) )  * dnx
+                   ENDDO
+                   !$acc loop vector( 32 )
                    DO  i = 1, (nx+1)/2 - 1
+                      ar(nx+1-i,j,k) = AIMAG( cuda_host(m+i) ) * dnx
+                   ENDDO
+                   m = m + (nx+1)/2 + 1
+                ENDDO
+             ENDDO
+          ELSE
+                      ar(nx+1-i,j,k) = AIMAG( ar_tmp(i,j,k) ) * dnx
+                   ENDDO
+                ENDDO
+             ENDDO
+             !$acc end kernels
+             !$acc end data
+          ELSE
+             !$acc data present( ar )
+             !$acc kernels
+             !$acc loop
              DO  k = nzb_x, nzt_x
                 DO  j = nys_x, nyn_x
+                   cuda_host(m) = CMPLX( ar(0,j,k), 0.0 )
+                   ar_tmp(0,j,k) = CMPLX( ar(0,j,k), 0.0 )
+                   !$acc loop vector( 32 )
                    DO  i = 1, (nx+1)/2 - 1
+                      cuda_host(m+i) = CMPLX( ar(i,j,k), ar(nx+1-i,j,k) )
+                   ENDDO
+                   cuda_host(m+(nx+1)/2) = CMPLX( ar((nx+1)/2,j,k), 0.0 )
+                   m = m + (nx+1)/2 + 1
+                ENDDO
+             ENDDO
+             cuda_b_device = cuda_host
+             CALL CUFFTEXECZ2D( plan_xi, cuda_b_device, cuda_a_device )
+             ar(0:total_points_x_transpo-1,nys_x,nzb_x) = cuda_a_device
+          ENDIF
+          DEALLOCATE( cuda_a_device, cuda_b_device, cuda_host )
+                      ar_tmp(i,j,k) = CMPLX( ar(i,j,k), ar(nx+1-i,j,k) )
+                   ENDDO
+                   ar_tmp((nx+1)/2,j,k) = CMPLX( ar((nx+1)/2,j,k), 0.0 )
+                ENDDO
+             ENDDO
+             !$acc end kernels
+             CALL CUFFTEXECZ2D( plan_xi, ar_tmp, ar )
+             !$acc end data
+          ENDIF
 #else
 …
        CHARACTER (LEN=*) ::  direction
        INTEGER ::  i, j, jshape(1), k, m
+       INTEGER ::  i, j, jshape(1), k
        LOGICAL ::  forward_fft
 …
        REAL, DIMENSION(6*(ny+1)) ::  work2
 #elif defined( __cuda_fft )
+       REAL(dpk), DEVICE, DIMENSION(:), ALLOCATABLE    ::  cuda_a_device
+       COMPLEX(dpk), DEVICE, DIMENSION(:), ALLOCATABLE ::  cuda_b_device
+       COMPLEX(dpk), DIMENSION(:), ALLOCATABLE         ::  cuda_host
+       !$acc declare create( ar_tmp )
+       COMPLEX(dpk), DIMENSION(0:(ny+1)/2,nxl_y:nxr_y,nzb_y:nzt_y) ::  ar_tmp
 #endif
        REAL, DIMENSION(0:ny,nxl_y:nxr_y,nzb_y:nzt_y) ::  ar
 …
 #elif defined( __cuda_fft )
-          ALLOCATE( cuda_a_device(0:total_points_y_transpo-1) )
-          ALLOCATE( cuda_b_device(0:((ny+1)/2+1) * (nxr_y-nxl_y+1) * (nzt_y-nzb_y+1) - 1) )
-          ALLOCATE( cuda_host(0:((ny+1)/2+1) * (nxr_y-nxl_y+1) * (nzt_y-nzb_y+1) - 1) )
-          m = 0
           IF ( forward_fft )  THEN
              cuda_a_device = ar(0:total_points_y_transpo-1,nxl_y,nzb_y)
+             CALL CUFFTEXECD2Z( plan_yf, cuda_a_device, cuda_b_device )
              cuda_host = cuda_b_device
+             !$acc data present( ar )
+             CALL CUFFTEXECD2Z( plan_yf, ar, ar_tmp )
+             !$acc kernels
+             !$acc loop
              DO  k = nzb_y, nzt_y
                 DO  i = nxl_y, nxr_y
+                   !$acc loop vector( 32 )
                    DO  j = 0, (ny+1)/2
+                      ar(j,i,k)      = REAL( cuda_host(m+j) )  * dny
+                   ENDDO
+                      ar(j,i,k)      = REAL( ar_tmp(j,i,k) )  * dny
+                   ENDDO
+                   !$acc loop vector( 32 )
                    DO  j = 1, (ny+1)/2 - 1
+                      ar(ny+1-j,i,k) = AIMAG( cuda_host(m+j) ) * dny
+                   ENDDO
+                   m = m + (ny+1)/2 + 1
+                ENDDO
+             ENDDO
+          ELSE
+                      ar(ny+1-j,i,k) = AIMAG( ar_tmp(j,i,k) ) * dny
+                   ENDDO
+                ENDDO
+             ENDDO
+             !$acc end kernels
+             !$acc end data
+          ELSE
+             !$acc data present( ar )
+             !$acc kernels
+             !$acc loop
              DO  k = nzb_y, nzt_y
                 DO  i = nxl_y, nxr_y
+                   cuda_host(m) = CMPLX( ar(0,i,k), 0.0 )
+                   ar_tmp(0,i,k) = CMPLX( ar(0,i,k), 0.0 )
+                   !$acc loop vector( 32 )
                    DO  j = 1, (ny+1)/2 - 1
+                      cuda_host(m+j) = CMPLX( ar(j,i,k), ar(ny+1-j,i,k) )
+                   ENDDO
+                   cuda_host(m+(ny+1)/2) = CMPLX( ar((ny+1)/2,i,k), 0.0 )
+                   m = m + (ny+1)/2 + 1
+                ENDDO
+             ENDDO
+             cuda_b_device = cuda_host
+             CALL CUFFTEXECZ2D( plan_yi, cuda_b_device, cuda_a_device )
+             ar(0:total_points_y_transpo-1,nxl_y,nzb_y) = cuda_a_device
+          ENDIF
+          DEALLOCATE( cuda_a_device, cuda_b_device, cuda_host )
+                      ar_tmp(j,i,k) = CMPLX( ar(j,i,k), ar(ny+1-j,i,k) )
+                   ENDDO
+                   ar_tmp((ny+1)/2,i,k) = CMPLX( ar((ny+1)/2,i,k), 0.0 )
+                ENDDO
+             ENDDO
+             !$acc end kernels
+             CALL CUFFTEXECZ2D( plan_yi, ar_tmp, ar )
+             !$acc end data
+          ENDIF
 #else

Note: See TracChangeset for help on using the changeset viewer.

Context Navigation

Changeset 1111 for palm/trunk/SOURCE/fft_xy.f90

Legend:

palm/trunk/SOURCE/fft_xy.f90

Download in other formats: