Ignore:
Timestamp:
Mar 8, 2013 11:54:10 PM (9 years ago)
Author:
raasch
Message:

New:
---

GPU porting of pres, swap_timelevel. Adjustments of openACC directives.
Further porting of poisfft, which now runs completely on GPU without any
host/device data transfer for serial an parallel runs (but parallel runs
require data transfer before and after the MPI transpositions).
GPU-porting of tridiagonal solver:
tridiagonal routines split into extermal subroutines (instead using CONTAINS),
no distinction between parallel/non-parallel in poisfft and tridia any more,
tridia routines moved to end of file because of probable bug in PGI compiler
(otherwise "invalid device function" is indicated during runtime).
(cuda_fft_interfaces, fft_xy, flow_statistics, init_3d_model, palm, poisfft, pres, prognostic_equations, swap_timelevel, time_integration, transpose)
output of accelerator board information. (header)

optimization of tridia routines: constant elements and coefficients of tri are
stored in seperate arrays ddzuw and tric, last dimension of tri reduced from 5 to 2,
(init_grid, init_3d_model, modules, palm, poisfft)

poisfft_init is now called internally from poisfft,
(Makefile, Makefile_check, init_pegrid, poisfft, poisfft_hybrid)

CPU-time per grid point and timestep is output to CPU_MEASURES file
(cpu_statistics, modules, time_integration)

Changed:


resorting from/to array work changed, work now has 4 dimensions instead of 1 (transpose)
array diss allocated only if required (init_3d_model)

pressure boundary condition "Neumann+inhomo" removed from the code
(check_parameters, header, poisfft, poisfft_hybrid, pres)

Errors:


bugfix: dependency added for cuda_fft_interfaces (Makefile)
bugfix: CUDA fft plans adjusted for domain decomposition (before they always
used total domain) (fft_xy)

File:
1 edited

Legend:

Unmodified
Added
Removed
  • palm/trunk/SOURCE/pres.f90

    r1093 r1111  
    2020! Current revisions:
    2121! -----------------
    22 !
     22! openACC statements added,
     23! ibc_p_b = 2 removed
    2324!
    2425! Former revisions:
     
    304305    ELSE
    305306       !$OMP PARALLEL DO SCHEDULE( STATIC )
     307       !$acc kernels present( d )
     308       !$acc loop
    306309       DO  i = nxl, nxr
    307310          DO  j = nys, nyn
     311             !$acc loop vector(32)
    308312             DO  k = nzb+1, nzt
    309313                d(k,j,i) = 0.0
     
    311315          ENDDO
    312316       ENDDO
     317       !$acc end kernels
    313318    ENDIF
    314319
     
    328333          ENDDO
    329334!
    330 !--       Additional pressure boundary condition at the bottom boundary for
    331 !--       inhomogeneous Prandtl layer heat fluxes and temperatures, respectively
    332 !--       dp/dz = -(dtau13/dx + dtau23/dy) + g*pt'/pt0.
    333 !--       This condition must not be applied at the start of a run, because then
    334 !--       flow_statistics has not yet been called and thus sums = 0.
    335           IF ( ibc_p_b == 2  .AND.  sums(nzb+1,4) /= 0.0 )  THEN
    336              k = nzb_s_inner(j,i)
    337              d(k+1,j,i) = d(k+1,j,i) + (                                     &
    338                                          ( usws(j,i+1) - usws(j,i) ) * ddx   &
    339                                        + ( vsws(j+1,i) - vsws(j,i) ) * ddy   &
    340                                        - g * ( pt(k+1,j,i) - sums(k+1,4) ) / &
    341                                          sums(k+1,4)                         &
    342                                        ) * ddzw(k+1) * ddt_3d * d_weight_pres
    343           ENDIF
    344 
    345 !
    346335!--       Compute possible PE-sum of divergences for flow_statistics
    347336          DO  k = nzb_s_inner(j,i)+1, nzt
     
    357346    !$OMP END PARALLEL
    358347#else
    359     IF ( ibc_p_b == 2 .AND. sums(nzb+1,4) /= 0.0 )  THEN
    360        !$OMP PARALLEL PRIVATE (i,j,k)
    361        !$OMP DO SCHEDULE( STATIC )
    362        DO  i = nxl, nxr
    363           DO  j = nys, nyn
    364              DO  k = nzb_s_inner(j,i)+1, nzt
    365              d(k,j,i) = ( ( u(k,j,i+1) - u(k,j,i) ) * ddx + &
    366                           ( v(k,j+1,i) - v(k,j,i) ) * ddy + &
    367                           ( w(k,j,i) - w(k-1,j,i) ) * ddzw(k) ) * ddt_3d      &
    368                                                                 * d_weight_pres
    369              ENDDO
    370           ENDDO
    371 !
    372 !--       Additional pressure boundary condition at the bottom boundary for
    373 !--       inhomogeneous Prandtl layer heat fluxes and temperatures, respectively
    374 !--       dp/dz = -(dtau13/dx + dtau23/dy) + g*pt'/pt0.
    375 !--       This condition must not be applied at the start of a run, because then
    376 !--       flow_statistics has not yet been called and thus sums = 0.
    377           DO  j = nys, nyn
    378               k = nzb_s_inner(j,i)
    379               d(k+1,j,i) = d(k+1,j,i) + (                        &
    380                              ( usws(j,i+1) - usws(j,i) ) * ddx   &
    381                            + ( vsws(j+1,i) - vsws(j,i) ) * ddy   &
    382                            - g * ( pt(k+1,j,i) - sums(k+1,4) ) / &
    383                              sums(k+1,4)                         &
    384                                         ) * ddzw(k+1) * ddt_3d   &
    385                                           * d_weight_pres
    386           ENDDO
    387        ENDDO
    388        !$OMP END PARALLEL
    389 
    390     ELSE
    391 
    392        !$OMP PARALLEL PRIVATE (i,j,k)
    393        !$OMP DO SCHEDULE( STATIC )
    394        DO  i = nxl, nxr
    395           DO  j = nys, nyn
    396              DO  k = nzb_s_inner(j,i)+1, nzt
     348
     349    !$OMP PARALLEL PRIVATE (i,j,k)
     350    !$OMP DO SCHEDULE( STATIC )
     351    !$acc kernels present( d, ddzw, nzb_s_inner, u, v, w )
     352    !$acc loop
     353    DO  i = nxl, nxr
     354       DO  j = nys, nyn
     355          !$acc loop vector(32)
     356          DO  k = 1, nzt
     357             IF ( k > nzb_s_inner(j,i) )  THEN
    397358                d(k,j,i) = ( ( u(k,j,i+1) - u(k,j,i) ) * ddx + &
    398                           ( v(k,j+1,i) - v(k,j,i) ) * ddy + &
    399                           ( w(k,j,i) - w(k-1,j,i) ) * ddzw(k) ) * ddt_3d      &
    400                                                                 * d_weight_pres
    401              ENDDO
    402           ENDDO
    403        ENDDO
    404        !$OMP END PARALLEL
    405 
    406     ENDIF
     359                           ( v(k,j+1,i) - v(k,j,i) ) * ddy + &
     360                           ( w(k,j,i) - w(k-1,j,i) ) * ddzw(k) ) * ddt_3d      &
     361                           * d_weight_pres
     362             ENDIF
     363          ENDDO
     364       ENDDO
     365    ENDDO
     366    !$acc end kernels
     367    !$OMP END PARALLEL
    407368
    408369!
     
    439400!--       Solver for 2d-decomposition
    440401          CALL poisfft( d, tend )
     402          !$acc update host( d )
    441403       ELSEIF ( psolver == 'poisfft_hybrid' )  THEN
    442404!
     
    466428!
    467429!--       Neumann (dp/dz = 0)
    468           !$OMP PARALLEL DO
    469           DO  i = nxlg, nxrg
    470              DO  j = nysg, nyng
    471                 tend(nzb_s_inner(j,i),j,i) = tend(nzb_s_inner(j,i)+1,j,i)
    472              ENDDO
    473           ENDDO
    474 
    475        ELSEIF ( ibc_p_b == 2 )  THEN
    476 !
    477 !--       Neumann condition for inhomogeneous surfaces,
    478 !--       here currently still in the form of a zero gradient. Actually
    479 !--       dp/dz = -(dtau13/dx + dtau23/dy) + g*pt'/pt0 would have to be used for
    480 !--       the computation (cf. above: computation of divergences).
    481430          !$OMP PARALLEL DO
    482431          DO  i = nxlg, nxrg
     
    611560!-- Correction of the provisional velocities with the current perturbation
    612561!-- pressure just computed
     562    !$acc update host( u, v, w )
    613563    IF ( conserve_volume_flow  .AND.  ( bc_lr_cyc .OR. bc_ns_cyc ) )  THEN
    614564       volume_flow_l(1) = 0.0
     
    748698    CALL cpu_log( log_point_s(1), 'divergence', 'stop' )
    749699
     700    !$acc update device( u, v, w )
     701
    750702    CALL cpu_log( log_point(8), 'pres', 'stop' )
    751703   
Note: See TracChangeset for help on using the changeset viewer.