Home

Context Navigation

← Previous Change
Next Change →

advec_ws.f90

Timestamp:

Jan 8, 2019 6:22:50 PM (6 years ago)

Author:

suehring

Message:

Split loops in advec_ws in order to reduce bit queries; Introduce new parameter to better control order degradation of advection scheme at non-cyclic boundaries; Remove setting of Neumann conditions for horizontal velocity variances; Minor bugfix in divergence correction in advection scheme (only has implications at downward-facing wall surfaces)

File:

: 1 edited

palm/trunk/SOURCE/advec_ws.f90 (modified) (91 diffs)

Legend:

: Unmodified
: Added
: Removed

palm/trunk/SOURCE/advec_ws.f90

-                      r3655
+                      r3661
 ! -----------------
 ! $Id$
+! - Minor bugfix in divergence correction (only has implications at
+!   downward-facing wall surfaces)
+! - Remove setting of Neumann condition for horizontal velocity variances
+! - Split loops for tendency calculation and divergence correction in order to
+!   reduce bit queries
+! - Introduce new parameter nzb_max_l to better control order degradation at
+!   non-cyclic boundaries
+!
+! 3655 2019-01-07 16:51:22Z knoop
 ! OpenACC port for SPEC
+!
 …
 !> degraded.
 !> A divergence correction is applied. It is necessary for topography, since
 !> the divergence is not sufficiently reduced, resulting in erroneous fluxes and
 !> partly numerical instabilities.
+!> the divergence is not sufficiently reduced, which results in erroneous
+!> fluxes and could lead to numerical instabilities.
 !-----------------------------------------------------------------------------!
  MODULE advec_ws
 …
        USE control_parameters,                                                 &
+           ONLY:  intermediate_timestep_count, u_gtrans, v_gtrans
+           ONLY:  bc_dirichlet_l, bc_dirichlet_n, bc_dirichlet_r,              &
+                  bc_dirichlet_s, bc_radiation_l, bc_radiation_n,              &
+                  bc_radiation_r, bc_radiation_s, intermediate_timestep_count, &
+                  u_gtrans, v_gtrans
        USE grid_variables,                                                     &
 …
        CHARACTER (LEN = *), INTENT(IN) ::  sk_char !< string identifier, used for assign fluxes to the correct dimension in the analysis array
+       INTEGER(iwp) ::  i     !< grid index along x-direction
+       INTEGER(iwp) ::  i_omp !< leftmost index on subdomain, or in case of OpenMP, on thread
+       INTEGER(iwp) ::  j     !< grid index along y-direction
+       INTEGER(iwp) ::  k     !< grid index along z-direction
+       INTEGER(iwp) ::  k_mm  !< k-2 index in disretization, can be modified to avoid segmentation faults
+       INTEGER(iwp) ::  k_pp  !< k+2 index in disretization, can be modified to avoid segmentation faults
+       INTEGER(iwp) ::  k_ppp !< k+3 index in disretization, can be modified to avoid segmentation faults
+       INTEGER(iwp) ::  tn    !< number of OpenMP thread
+       INTEGER(iwp) ::  i         !< grid index along x-direction
+       INTEGER(iwp) ::  i_omp     !< leftmost index on subdomain, or in case of OpenMP, on thread
+       INTEGER(iwp) ::  j         !< grid index along y-direction
+       INTEGER(iwp) ::  k         !< grid index along z-direction
+       INTEGER(iwp) ::  k_mm      !< k-2 index in disretization, can be modified to avoid segmentation faults
+       INTEGER(iwp) ::  k_pp      !< k+2 index in disretization, can be modified to avoid segmentation faults
+       INTEGER(iwp) ::  k_ppp     !< k+3 index in disretization, can be modified to avoid segmentation faults
+       INTEGER(iwp) ::  nzb_max_l !< index indicating upper bound for order degradation of horizontal advection terms
+       INTEGER(iwp) ::  tn        !< number of OpenMP thread
        REAL(wp)     ::  ibit0  !< flag indicating 1st-order scheme along x-direction
 …
        REAL(wp), DIMENSION(nzb+1:nzt,nys:nyn,0:threads_per_task-1) ::  swap_diss_x_local !< discretized artificial dissipation at leftward-side of the grid box
        REAL(wp), DIMENSION(nzb+1:nzt,nys:nyn,0:threads_per_task-1) ::  swap_flux_x_local !< discretized 6th-order flux at leftward-side of the grid box
+!
+!--    Use a locally modified copy of nzb_max (used to degrade the order of
+!--    discretization) at non-cyclic boundaries. Modify it only at relevant
+!--    points instead of the entire subdomain. This should lead to better
+!--    load balance between boundary and non-boundary PEs.
+       IF( ( bc_dirichlet_l  .OR.  bc_radiation_l )  .AND.  i <= nxl + 2  .OR. &
+           ( bc_dirichlet_r  .OR.  bc_radiation_r )  .AND.  i >= nxr - 2  .OR. &
+           ( bc_dirichlet_s  .OR.  bc_radiation_s )  .AND.  j <= nys + 2  .OR. &
+           ( bc_dirichlet_n  .OR.  bc_radiation_n )  .AND.  j >= nyn - 2 )  THEN
+          nzb_max_l = nzt
+       ELSE
+          nzb_max_l = nzb_max
+       END IF
+!
 !--    Compute southside fluxes of the respective PE bounds.
 …
+!
 !--       Up to the top of the highest topography.
           DO  k = nzb+1, nzb_max
+          DO  k = nzb+1, nzb_max_l
              ibit5 = REAL( IBITS(advc_flags_1(k,j-1,i),5,1), KIND = wp )
 …
        IF ( i == i_omp )  THEN
           DO  k = nzb+1, nzb_max
+          DO  k = nzb+1, nzb_max_l
              ibit2 = REAL( IBITS(advc_flags_1(k,j,i-1),2,1), KIND = wp )
 …
           ENDDO
           DO  k = nzb_max+1, nzt
+          DO  k = nzb_max_l+1, nzt
              u_comp                 = u(k,j,i) - u_gtrans + u_stokes_zu(k)
 …
 !--    Now compute the fluxes and tendency terms for the horizontal and
 !--    vertical parts up to the top of the highest topography.
        DO  k = nzb+1, nzb_max
+       DO  k = nzb+1, nzb_max_l
+!
 !--       Note: It is faster to conduct all multiplications explicitly, e.g.
 …
 !--    vertical parts above the top of the highest topography. No degradation
 !--    for the horizontal parts, but for the vertical it is still needed.
        DO  k = nzb_max+1, nzt
+       DO  k = nzb_max_l+1, nzt
           u_comp    = u(k,j,i+1) - u_gtrans + u_stokes_zu(k)
 …
        ENDDO
        DO  k = nzb+1, nzt
+       DO  k = nzb+1, nzb_max_l
           flux_d    = flux_t(k-1)
           diss_d    = diss_t(k-1)
+          ibit2 = REAL( IBITS(advc_flags_1(k,j,i),2,1), KIND = wp )
+          ibit1 = REAL( IBITS(advc_flags_1(k,j,i),1,1), KIND = wp )
+          ibit0 = REAL( IBITS(advc_flags_1(k,j,i),0,1), KIND = wp )
+          ibit5 = REAL( IBITS(advc_flags_1(k,j,i),5,1), KIND = wp )
+          ibit4 = REAL( IBITS(advc_flags_1(k,j,i),4,1), KIND = wp )
+          ibit3 = REAL( IBITS(advc_flags_1(k,j,i),3,1), KIND = wp )
+          ibit8 = REAL( IBITS(advc_flags_1(k,j,i),8,1), KIND = wp )
+          ibit7 = REAL( IBITS(advc_flags_1(k,j,i),7,1), KIND = wp )
+          ibit6 = REAL( IBITS(advc_flags_1(k,j,i),6,1), KIND = wp )
+!
 !--       Calculate the divergence of the velocity field. A respective
 …
                                          )                                    &
                           ) * drho_air(k) * ddzw(k)
+          tend(k,j,i) = tend(k,j,i) - (                                       &
+                        ( flux_r(k) + diss_r(k) - swap_flux_x_local(k,j,tn) - &
+                          swap_diss_x_local(k,j,tn)            ) * ddx        &
+                      + ( flux_n(k) + diss_n(k) - swap_flux_y_local(k,tn)   - &
+                          swap_diss_y_local(k,tn)              ) * ddy        &
+                      + ( ( flux_t(k) + diss_t(k) ) -                         &
+                          ( flux_d    + diss_d    )                           &
+                                                    ) * drho_air(k) * ddzw(k) &
+                                      ) + sk(k,j,i) * div
+          swap_flux_y_local(k,tn)   = flux_n(k)
+          swap_diss_y_local(k,tn)   = diss_n(k)
+          swap_flux_x_local(k,j,tn) = flux_r(k)
+          swap_diss_x_local(k,j,tn) = diss_r(k)
+       ENDDO
+       DO  k = nzb_max_l+1, nzt
+          flux_d    = flux_t(k-1)
+          diss_d    = diss_t(k-1)
+!
+!--       Calculate the divergence of the velocity field. A respective
+!--       correction is needed to overcome numerical instabilities introduced
+!--       by an insufficient reduction of divergences near topography.
+          div         =   ( u(k,j,i+1) - u(k,j,i) ) * ddx                     &
+                        + ( v(k,j+1,i) - v(k,j,i) ) * ddy                     &
+                        + ( w(k,j,i) * rho_air_zw(k)                          &
+                          - w(k-1,j,i) * rho_air_zw(k-1)                      &
+                                                  ) * drho_air(k) * ddzw(k)
           tend(k,j,i) = tend(k,j,i) - (                                       &
 …
     SUBROUTINE advec_u_ws_ij( i, j, i_omp, tn )
        USE arrays_3d,                                                         &
            ONLY:  ddzw, diss_l_u, diss_s_u, flux_l_u, flux_s_u, tend, u, v, w,&
+       USE arrays_3d,                                                          &
+           ONLY:  ddzw, diss_l_u, diss_s_u, flux_l_u, flux_s_u, tend, u, v, w, &
                   drho_air, rho_air_zw
+       USE control_parameters,                                                &
+           ONLY:  intermediate_timestep_count, u_gtrans, v_gtrans
+       USE grid_variables,                                                    &
+       USE control_parameters,                                                 &
+           ONLY:  bc_dirichlet_l, bc_dirichlet_n, bc_dirichlet_r,              &
+                  bc_dirichlet_s, bc_radiation_l, bc_radiation_n,              &
+                  bc_radiation_r, bc_radiation_s, intermediate_timestep_count, &
+                  u_gtrans, v_gtrans
+       USE grid_variables,                                                     &
            ONLY:  ddx, ddy
        USE indices,                                                           &
            ONLY:  nxlu, nys, nzb, nzb_max, nzt, advc_flags_1
+       USE indices,                                                            &
+           ONLY:  nyn, nys, nxl, nxlu, nxr, nzb, nzb_max, nzt, advc_flags_1
        USE kinds
        USE statistics,                                                        &
+       USE statistics,                                                         &
            ONLY:  hom, sums_us2_ws_l, sums_wsus_ws_l, weight_substep
        IMPLICIT NONE
+       INTEGER(iwp) ::  i      !< grid index along x-direction
+       INTEGER(iwp) ::  i_omp  !< leftmost index on subdomain, or in case of OpenMP, on thread
+       INTEGER(iwp) ::  j      !< grid index along y-direction
+       INTEGER(iwp) ::  k      !< grid index along z-direction
+       INTEGER(iwp) ::  k_mm   !< k-2 index in disretization, can be modified to avoid segmentation faults
+       INTEGER(iwp) ::  k_pp   !< k+2 index in disretization, can be modified to avoid segmentation faults
+       INTEGER(iwp) ::  k_ppp  !< k+3 index in disretization, can be modified to avoid segmentation faults
+       INTEGER(iwp) ::  tn     !< number of OpenMP thread
+       INTEGER(iwp) ::  i         !< grid index along x-direction
+       INTEGER(iwp) ::  i_omp     !< leftmost index on subdomain, or in case of OpenMP, on thread
+       INTEGER(iwp) ::  j         !< grid index along y-direction
+       INTEGER(iwp) ::  k         !< grid index along z-direction
+       INTEGER(iwp) ::  k_mm      !< k-2 index in disretization, can be modified to avoid segmentation faults
+       INTEGER(iwp) ::  k_pp      !< k+2 index in disretization, can be modified to avoid segmentation faults
+       INTEGER(iwp) ::  k_ppp     !< k+3 index in disretization, can be modified to avoid segmentation faults
+       INTEGER(iwp) ::  nzb_max_l !< index indicating upper bound for order degradation of horizontal advection terms
+       INTEGER(iwp) ::  tn        !< number of OpenMP thread
        REAL(wp)    ::  ibit9    !< flag indicating 1st-order scheme along x-direction
 …
        REAL(wp), DIMENSION(nzb:nzt+1) ::  v_comp !< advection velocity along y
        REAL(wp), DIMENSION(nzb:nzt+1) ::  w_comp !< advection velocity along z
+!
+!--    Use a locally modified copy of nzb_max (used to degrade the order of
+!--    discretization) at non-cyclic boundaries. Modify it only at relevant
+!--    points instead of the entire subdomain. This should lead to better
+!--    load balance between boundary and non-boundary PEs.
+       IF( ( bc_dirichlet_l  .OR.  bc_radiation_l )  .AND.  i <= nxl + 2  .OR. &
+           ( bc_dirichlet_r  .OR.  bc_radiation_r )  .AND.  i >= nxr - 2  .OR. &
+           ( bc_dirichlet_s  .OR.  bc_radiation_s )  .AND.  j <= nys + 2  .OR. &
+           ( bc_dirichlet_n  .OR.  bc_radiation_n )  .AND.  j >= nyn - 2 )  THEN
+          nzb_max_l = nzt
+       ELSE
+          nzb_max_l = nzb_max
+       END IF
        gu = 2.0_wp * u_gtrans
        gv = 2.0_wp * v_gtrans
 …
        IF ( j == nys  )  THEN
           DO  k = nzb+1, nzb_max
+          DO  k = nzb+1, nzb_max_l
              ibit14 = REAL( IBITS(advc_flags_1(k,j-1,i),14,1), KIND = wp )
 …
           ENDDO
           DO  k = nzb_max+1, nzt
+          DO  k = nzb_max_l+1, nzt
              v_comp(k)      = v(k,j,i) + v(k,j,i-1) - gv
 …
        IF ( i == i_omp  .OR.  i == nxlu )  THEN
           DO  k = nzb+1, nzb_max
+          DO  k = nzb+1, nzb_max_l
              ibit11 = REAL( IBITS(advc_flags_1(k,j,i-1),11,1), KIND = wp )
 …
           ENDDO
           DO  k = nzb_max+1, nzt
+          DO  k = nzb_max_l+1, nzt
              u_comp_l         = u(k,j,i) + u(k,j,i-1) - gu
 …
 !--    Now compute the fluxes and tendency terms for the horizontal and
 !--    vertical parts.
        DO  k = nzb+1, nzb_max
+       DO  k = nzb+1, nzb_max_l
           ibit11 = REAL( IBITS(advc_flags_1(k,j,i),11,1), KIND = wp )
 …
        ENDDO
        DO  k = nzb_max+1, nzt
+       DO  k = nzb_max_l+1, nzt
           u_comp(k) = u(k,j,i+1) + u(k,j,i)
 …
        ENDDO
        DO  k = nzb+1, nzt
+       DO  k = nzb+1, nzb_max_l
           flux_d    = flux_t(k-1)
           diss_d    = diss_t(k-1)
+          ibit11 = REAL( IBITS(advc_flags_1(k,j,i),11,1), KIND = wp )
+          ibit10 = REAL( IBITS(advc_flags_1(k,j,i),10,1), KIND = wp )
+          ibit9  = REAL( IBITS(advc_flags_1(k,j,i),9,1),  KIND = wp )
+          ibit14 = REAL( IBITS(advc_flags_1(k,j,i),14,1), KIND = wp )
+          ibit13 = REAL( IBITS(advc_flags_1(k,j,i),13,1), KIND = wp )
+          ibit12 = REAL( IBITS(advc_flags_1(k,j,i),12,1), KIND = wp )
+          ibit17 = REAL( IBITS(advc_flags_1(k,j,i),17,1), KIND = wp )
+          ibit16 = REAL( IBITS(advc_flags_1(k,j,i),16,1), KIND = wp )
+          ibit15 = REAL( IBITS(advc_flags_1(k,j,i),15,1), KIND = wp )
+!
 !--       Calculate the divergence of the velocity field. A respective
 …
                 ) * 0.5_wp
            tend(k,j,i) = tend(k,j,i) - (                                      &
+          tend(k,j,i) = tend(k,j,i) - (                                       &
                             ( flux_r(k) + diss_r(k)                           &
                           -   flux_l_u(k,j,tn) - diss_l_u(k,j,tn) ) * ddx     &
 …
                                        ) + div * u(k,j,i)
            flux_l_u(k,j,tn) = flux_r(k)
            diss_l_u(k,j,tn) = diss_r(k)
            flux_s_u(k,tn)   = flux_n(k)
            diss_s_u(k,tn)   = diss_n(k)
+!
 !--        Statistical Evaluation of u'u'. The factor has to be applied for
 !--        right evaluation when gallilei_trans = .T. .
            sums_us2_ws_l(k,tn) = sums_us2_ws_l(k,tn)                           &
+          flux_l_u(k,j,tn) = flux_r(k)
+          diss_l_u(k,j,tn) = diss_r(k)
+          flux_s_u(k,tn)   = flux_n(k)
+          diss_s_u(k,tn)   = diss_n(k)
+!
+!--       Statistical Evaluation of u'u'. The factor has to be applied for
+!--       right evaluation when gallilei_trans = .T. .
+          sums_us2_ws_l(k,tn) = sums_us2_ws_l(k,tn)                            &
                 + ( flux_r(k)                                                  &
                     * ( u_comp(k) - 2.0_wp * hom(k,1,1,0)                   )  &
 …
                   ) *   weight_substep(intermediate_timestep_count)
+!
 !--        Statistical Evaluation of w'u'.
            sums_wsus_ws_l(k,tn) = sums_wsus_ws_l(k,tn)                         &
+!--       Statistical Evaluation of w'u'.
+          sums_wsus_ws_l(k,tn) = sums_wsus_ws_l(k,tn)                          &
                 + ( flux_t(k)                                                  &
                     * ( w_comp(k) - 2.0_wp * hom(k,1,3,0)                   )  &
 …
                   ) *   weight_substep(intermediate_timestep_count)
        ENDDO
+       sums_us2_ws_l(nzb,tn) = sums_us2_ws_l(nzb+1,tn)
+       DO  k = nzb_max_l+1, nzt
+          flux_d    = flux_t(k-1)
+          diss_d    = diss_t(k-1)
+!
+!--       Calculate the divergence of the velocity field. A respective
+!--       correction is needed to overcome numerical instabilities introduced
+!--       by an insufficient reduction of divergences near topography.
+          div = ( ( u_comp(k)       - ( u(k,j,i)   + u(k,j,i-1) ) ) * ddx      &
+               +  ( v_comp(k) + gv  - ( v(k,j,i)   + v(k,j,i-1) ) ) * ddy      &
+               +  ( w_comp(k)   * rho_air_zw(k)                                &
+                 -  w_comp(k-1) * rho_air_zw(k-1)                              &
+                  ) * drho_air(k) * ddzw(k)                                    &
+                ) * 0.5_wp
+          tend(k,j,i) = tend(k,j,i) - (                                        &
+                            ( flux_r(k) + diss_r(k)                            &
+                          -   flux_l_u(k,j,tn) - diss_l_u(k,j,tn) ) * ddx      &
+                          + ( flux_n(k) + diss_n(k)                            &
+                          -   flux_s_u(k,tn) - diss_s_u(k,tn)     ) * ddy      &
+                          + ( ( flux_t(k) + diss_t(k) )                        &
+                          -   ( flux_d    + diss_d    )                        &
+                                                    ) * drho_air(k) * ddzw(k)  &
+                                       ) + div * u(k,j,i)
+          flux_l_u(k,j,tn) = flux_r(k)
+          diss_l_u(k,j,tn) = diss_r(k)
+          flux_s_u(k,tn)   = flux_n(k)
+          diss_s_u(k,tn)   = diss_n(k)
+!
+!--       Statistical Evaluation of u'u'. The factor has to be applied for
+!--       right evaluation when gallilei_trans = .T. .
+          sums_us2_ws_l(k,tn) = sums_us2_ws_l(k,tn)                            &
+                + ( flux_r(k)                                                  &
+                    * ( u_comp(k) - 2.0_wp * hom(k,1,1,0)                   )  &
+                    / ( u_comp(k) - gu + SIGN( 1.0E-20_wp, u_comp(k) - gu ) )  &
+                  + diss_r(k)                                                  &
+                    *   ABS( u_comp(k) - 2.0_wp * hom(k,1,1,0)              )  &
+                    / ( ABS( u_comp(k) - gu ) + 1.0E-20_wp                  )  &
+                  ) *   weight_substep(intermediate_timestep_count)
+!
+!--       Statistical Evaluation of w'u'.
+          sums_wsus_ws_l(k,tn) = sums_wsus_ws_l(k,tn)                          &
+                + ( flux_t(k)                                                  &
+                    * ( w_comp(k) - 2.0_wp * hom(k,1,3,0)                   )  &
+                    / ( w_comp(k) + SIGN( 1.0E-20_wp, w_comp(k) )           )  &
+                  + diss_t(k)                                                  &
+                    *   ABS( w_comp(k) - 2.0_wp * hom(k,1,3,0)              )  &
+                    / ( ABS( w_comp(k) ) + 1.0E-20_wp                       )  &
+                  ) *   weight_substep(intermediate_timestep_count)
+       ENDDO
 …
        USE control_parameters,                                                 &
+           ONLY:  intermediate_timestep_count, u_gtrans, v_gtrans
+           ONLY:  bc_dirichlet_l, bc_dirichlet_n, bc_dirichlet_r,              &
+                  bc_dirichlet_s, bc_radiation_l, bc_radiation_n,              &
+                  bc_radiation_r, bc_radiation_s, intermediate_timestep_count, &
+                  u_gtrans, v_gtrans
        USE grid_variables,                                                     &
 …
        USE indices,                                                            &
            ONLY:  nysv, nzb, nzb_max, nzt, advc_flags_1
+           ONLY:  nyn, nys, nysv, nxl, nxr, nzb, nzb_max, nzt, advc_flags_1
        USE kinds
 …
        IMPLICIT NONE
+       INTEGER(iwp)  ::  i      !< grid index along x-direction
+       INTEGER(iwp)  ::  i_omp  !< leftmost index on subdomain, or in case of OpenMP, on thread
+       INTEGER(iwp)  ::  j      !< grid index along y-direction
+       INTEGER(iwp)  ::  k      !< grid index along z-direction
+       INTEGER(iwp)  ::  k_mm   !< k-2 index in disretization, can be modified to avoid segmentation faults
+       INTEGER(iwp)  ::  k_pp   !< k+2 index in disretization, can be modified to avoid segmentation faults
+       INTEGER(iwp)  ::  k_ppp  !< k+3 index in disretization, can be modified to avoid segmentation faults
+       INTEGER(iwp)  ::  tn     !< number of OpenMP thread
+       INTEGER(iwp)  ::  i         !< grid index along x-direction
+       INTEGER(iwp)  ::  i_omp     !< leftmost index on subdomain, or in case of OpenMP, on thread
+       INTEGER(iwp)  ::  j         !< grid index along y-direction
+       INTEGER(iwp)  ::  k         !< grid index along z-direction
+       INTEGER(iwp)  ::  k_mm      !< k-2 index in disretization, can be modified to avoid segmentation faults
+       INTEGER(iwp)  ::  k_pp      !< k+2 index in disretization, can be modified to avoid segmentation faults
+       INTEGER(iwp)  ::  k_ppp     !< k+3 index in disretization, can be modified to avoid segmentation faults
+       INTEGER(iwp)  ::  nzb_max_l !< index indicating upper bound for order degradation of horizontal advection terms
+       INTEGER(iwp)  ::  tn        !< number of OpenMP thread
        REAL(wp)      ::  ibit18   !< flag indicating 1st-order scheme along x-direction
 …
        REAL(wp), DIMENSION(nzb:nzt+1)  ::  v_comp !< advection velocity along y
        REAL(wp), DIMENSION(nzb:nzt+1)  ::  w_comp !< advection velocity along z
+!
+!--    Use a locally modified copy of nzb_max (used to degrade the order of
+!--    discretization) at non-cyclic boundaries. Modify it only at relevant
+!--    points instead of the entire subdomain. This should lead to better
+!--    load balance between boundary and non-boundary PEs.
+       IF( ( bc_dirichlet_l  .OR.  bc_radiation_l )  .AND.  i <= nxl + 2  .OR. &
+           ( bc_dirichlet_r  .OR.  bc_radiation_r )  .AND.  i >= nxr - 2  .OR. &
+           ( bc_dirichlet_s  .OR.  bc_radiation_s )  .AND.  j <= nys + 2  .OR. &
+           ( bc_dirichlet_n  .OR.  bc_radiation_n )  .AND.  j >= nyn - 2 )  THEN
+          nzb_max_l = nzt
+       ELSE
+          nzb_max_l = nzb_max
+       END IF
        gu = 2.0_wp * u_gtrans
        gv = 2.0_wp * v_gtrans
 …
        IF ( i == i_omp )  THEN
           DO  k = nzb+1, nzb_max
+          DO  k = nzb+1, nzb_max_l
              ibit20 = REAL( IBITS(advc_flags_1(k,j,i-1),20,1), KIND = wp )
 …
           ENDDO
           DO  k = nzb_max+1, nzt
+          DO  k = nzb_max_l+1, nzt
              u_comp(k)           = u(k,j-1,i) + u(k,j,i) - gu
 …
        IF ( j == nysv )  THEN
           DO  k = nzb+1, nzb_max
+          DO  k = nzb+1, nzb_max_l
              ibit23 = REAL( IBITS(advc_flags_1(k,j-1,i),23,1), KIND = wp )
 …
           ENDDO
           DO  k = nzb_max+1, nzt
+          DO  k = nzb_max_l+1, nzt
              v_comp_l       = v(k,j,i) + v(k,j-1,i) - gv
 …
 !--    Now compute the fluxes and tendency terms for the horizontal and
 !--    vertical parts.
        DO  k = nzb+1, nzb_max
+       DO  k = nzb+1, nzb_max_l
           ibit20 = REAL( IBITS(advc_flags_1(k,j,i),20,1), KIND = wp )
 …
        ENDDO
        DO  k = nzb_max+1, nzt
+       DO  k = nzb_max_l+1, nzt
           u_comp(k) = u(k,j-1,i+1) + u(k,j,i+1) - gu
 …
        ENDDO
        DO  k = nzb+1, nzt
+       DO  k = nzb+1, nzb_max_l
           flux_d    = flux_t(k-1)
           diss_d    = diss_t(k-1)
+          ibit20 = REAL( IBITS(advc_flags_1(k,j,i),20,1), KIND = wp )
+          ibit19 = REAL( IBITS(advc_flags_1(k,j,i),19,1), KIND = wp )
+          ibit18 = REAL( IBITS(advc_flags_1(k,j,i),18,1), KIND = wp )
+          ibit23 = REAL( IBITS(advc_flags_1(k,j,i),23,1), KIND = wp )
+          ibit22 = REAL( IBITS(advc_flags_1(k,j,i),22,1), KIND = wp )
+          ibit21 = REAL( IBITS(advc_flags_1(k,j,i),21,1), KIND = wp )
+          ibit26 = REAL( IBITS(advc_flags_1(k,j,i),26,1), KIND = wp )
+          ibit25 = REAL( IBITS(advc_flags_1(k,j,i),25,1), KIND = wp )
+          ibit24 = REAL( IBITS(advc_flags_1(k,j,i),24,1), KIND = wp )
+!
 !--       Calculate the divergence of the velocity field. A respective
 …
                                       ) + v(k,j,i) * div
            flux_l_v(k,j,tn) = flux_r(k)
            diss_l_v(k,j,tn) = diss_r(k)
            flux_s_v(k,tn)   = flux_n(k)
            diss_s_v(k,tn)   = diss_n(k)
+!
 !--        Statistical Evaluation of v'v'. The factor has to be applied for
 !--        right evaluation when gallilei_trans = .T. .
            sums_vs2_ws_l(k,tn) = sums_vs2_ws_l(k,tn)                           &
+          flux_l_v(k,j,tn) = flux_r(k)
+          diss_l_v(k,j,tn) = diss_r(k)
+          flux_s_v(k,tn)   = flux_n(k)
+          diss_s_v(k,tn)   = diss_n(k)
+!
+!--       Statistical Evaluation of v'v'. The factor has to be applied for
+!--       right evaluation when gallilei_trans = .T. .
+          sums_vs2_ws_l(k,tn) = sums_vs2_ws_l(k,tn)                            &
                 + ( flux_n(k)                                                  &
                     * ( v_comp(k) - 2.0_wp * hom(k,1,2,0)                   )  &
 …
                   ) *   weight_substep(intermediate_timestep_count)
+!
 !--        Statistical Evaluation of w'u'.
            sums_wsvs_ws_l(k,tn) = sums_wsvs_ws_l(k,tn)                         &
+!--       Statistical Evaluation of w'v'.
+          sums_wsvs_ws_l(k,tn) = sums_wsvs_ws_l(k,tn)                          &
                 + ( flux_t(k)                                                  &
                     * ( w_comp(k) - 2.0_wp * hom(k,1,3,0)                   )  &
 …
        ENDDO
+       sums_vs2_ws_l(nzb,tn) = sums_vs2_ws_l(nzb+1,tn)
+       DO  k = nzb_max_l+1, nzt
+          flux_d    = flux_t(k-1)
+          diss_d    = diss_t(k-1)
+!
+!--       Calculate the divergence of the velocity field. A respective
+!--       correction is needed to overcome numerical instabilities introduced
+!--       by an insufficient reduction of divergences near topography.
+          div = ( ( u_comp(k) + gu - ( u(k,j-1,i) + u(k,j,i)   ) ) * ddx       &
+               +  ( v_comp(k)      - ( v(k,j,i)   + v(k,j-1,i) ) ) * ddy       &
+               +  ( w_comp(k)   * rho_air_zw(k)                                &
+                 -  w_comp(k-1) * rho_air_zw(k-1)                              &
+                  ) * drho_air(k) * ddzw(k)                                    &
+                ) * 0.5_wp
+          tend(k,j,i) = tend(k,j,i) - (                                        &
+                         ( flux_r(k) + diss_r(k)                               &
+                       -   flux_l_v(k,j,tn) - diss_l_v(k,j,tn)   ) * ddx       &
+                       + ( flux_n(k) + diss_n(k)                               &
+                       -   flux_s_v(k,tn) - diss_s_v(k,tn)       ) * ddy       &
+                       + ( ( flux_t(k) + diss_t(k) )                           &
+                       -   ( flux_d    + diss_d    )                           &
+                                                   ) * drho_air(k) * ddzw(k)   &
+                                      ) + v(k,j,i) * div
+          flux_l_v(k,j,tn) = flux_r(k)
+          diss_l_v(k,j,tn) = diss_r(k)
+          flux_s_v(k,tn)   = flux_n(k)
+          diss_s_v(k,tn)   = diss_n(k)
+!
+!--       Statistical Evaluation of v'v'. The factor has to be applied for
+!--       right evaluation when gallilei_trans = .T. .
+          sums_vs2_ws_l(k,tn) = sums_vs2_ws_l(k,tn)                            &
+                + ( flux_n(k)                                                  &
+                    * ( v_comp(k) - 2.0_wp * hom(k,1,2,0)                   )  &
+                    / ( v_comp(k) - gv + SIGN( 1.0E-20_wp, v_comp(k) - gv ) )  &
+                  + diss_n(k)                                                  &
+                    *   ABS( v_comp(k) - 2.0_wp * hom(k,1,2,0)              )  &
+                    / ( ABS( v_comp(k) - gv ) + 1.0E-20_wp                  )  &
+                  ) *   weight_substep(intermediate_timestep_count)
+!
+!--       Statistical Evaluation of w'v'.
+          sums_wsvs_ws_l(k,tn) = sums_wsvs_ws_l(k,tn)                          &
+                + ( flux_t(k)                                                  &
+                    * ( w_comp(k) - 2.0_wp * hom(k,1,3,0)                   )  &
+                    / ( w_comp(k) + SIGN( 1.0E-20_wp, w_comp(k) )           )  &
+                  + diss_t(k)                                                  &
+                    *   ABS( w_comp(k) - 2.0_wp * hom(k,1,3,0)              )  &
+                    / ( ABS( w_comp(k) ) + 1.0E-20_wp                       )  &
+                  ) *   weight_substep(intermediate_timestep_count)
+       ENDDO
 …
     SUBROUTINE advec_w_ws_ij( i, j, i_omp, tn )
        USE arrays_3d,                                                         &
            ONLY:  ddzu, diss_l_w, diss_s_w, flux_l_w, flux_s_w, tend, u, v, w,&
+       USE arrays_3d,                                                          &
+           ONLY:  ddzu, diss_l_w, diss_s_w, flux_l_w, flux_s_w, tend, u, v, w, &
                   drho_air_zw, rho_air
+       USE control_parameters,                                                &
+           ONLY:  intermediate_timestep_count, u_gtrans, v_gtrans
+       USE grid_variables,                                                    &
+       USE control_parameters,                                                 &
+           ONLY:  bc_dirichlet_l, bc_dirichlet_n, bc_dirichlet_r,              &
+                  bc_dirichlet_s, bc_radiation_l, bc_radiation_n,              &
+                  bc_radiation_r, bc_radiation_s,intermediate_timestep_count,  &
+                  u_gtrans, v_gtrans
+       USE grid_variables,                                                     &
            ONLY:  ddx, ddy
+       USE indices,                                                           &
+           ONLY:  nys, nzb, nzb_max, nzt, advc_flags_1, advc_flags_2
+       USE indices,                                                            &
+           ONLY:  nyn, nys, nxl, nxr, nzb, nzb_max, nzt, advc_flags_1,         &
+                  advc_flags_2
        USE kinds
        USE statistics,                                                        &
+       USE statistics,                                                         &
            ONLY:  hom, sums_ws2_ws_l, weight_substep
        IMPLICIT NONE
+       INTEGER(iwp) ::  i      !< grid index along x-direction
+       INTEGER(iwp) ::  i_omp  !< leftmost index on subdomain, or in case of OpenMP, on thread
+       INTEGER(iwp) ::  j      !< grid index along y-direction
+       INTEGER(iwp) ::  k      !< grid index along z-direction
+       INTEGER(iwp) ::  k_mm   !< k-2 index in disretization, can be modified to avoid segmentation faults
+       INTEGER(iwp) ::  k_pp   !< k+2 index in disretization, can be modified to avoid segmentation faults
+       INTEGER(iwp) ::  k_ppp  !< k+3 index in disretization, can be modified to avoid segmentation faults
+       INTEGER(iwp) ::  tn     !< number of OpenMP thread
+       INTEGER(iwp) ::  i         !< grid index along x-direction
+       INTEGER(iwp) ::  i_omp     !< leftmost index on subdomain, or in case of OpenMP, on thread
+       INTEGER(iwp) ::  j         !< grid index along y-direction
+       INTEGER(iwp) ::  k         !< grid index along z-direction
+       INTEGER(iwp) ::  k_mm      !< k-2 index in disretization, can be modified to avoid segmentation faults
+       INTEGER(iwp) ::  k_pp      !< k+2 index in disretization, can be modified to avoid segmentation faults
+       INTEGER(iwp) ::  k_ppp     !< k+3 index in disretization, can be modified to avoid segmentation faults
+       INTEGER(iwp) ::  nzb_max_l !< index indicating upper bound for order degradation of horizontal advection terms
+       INTEGER(iwp) ::  tn        !< number of OpenMP thread
        REAL(wp)    ::  ibit27  !< flag indicating 1st-order scheme along x-direction
 …
        REAL(wp), DIMENSION(nzb:nzt+1)  ::  v_comp !< advection velocity along y
        REAL(wp), DIMENSION(nzb:nzt+1)  ::  w_comp !< advection velocity along z
+!
+!--    Use a local, modified copy of nzb_max (the upper bound used to degrade
+!--    the order of discretization) at non-cyclic boundaries. It is modified
+!--    only at grid points close to such a boundary instead of on the entire
+!--    subdomain. This should lead to better load balance between boundary
+!--    and non-boundary PEs.
+       IF( ( bc_dirichlet_l  .OR.  bc_radiation_l )  .AND.  i <= nxl + 2  .OR. &
+           ( bc_dirichlet_r  .OR.  bc_radiation_r )  .AND.  i >= nxr - 2  .OR. &
+           ( bc_dirichlet_s  .OR.  bc_radiation_s )  .AND.  j <= nys + 2  .OR. &
+           ( bc_dirichlet_n  .OR.  bc_radiation_n )  .AND.  j >= nyn - 2 )  THEN
+          nzb_max_l = nzt
+       ELSE
+          nzb_max_l = nzb_max
+       END IF
        gu = 2.0_wp * u_gtrans
        gv = 2.0_wp * v_gtrans
+!
 !--    Compute southside fluxes for the respective boundary.
        IF ( j == nys )  THEN
           DO  k = nzb+1, nzb_max
+          DO  k = nzb+1, nzb_max_l
              ibit32 = REAL( IBITS(advc_flags_2(k,j-1,i),0,1),  KIND = wp )
              ibit31 = REAL( IBITS(advc_flags_1(k,j-1,i),31,1), KIND = wp )
 …
           ENDDO
           DO  k = nzb_max+1, nzt
+          DO  k = nzb_max_l+1, nzt
              v_comp(k)      = v(k+1,j,i) + v(k,j,i) - gv
 …
        IF ( i == i_omp ) THEN
           DO  k = nzb+1, nzb_max
+          DO  k = nzb+1, nzb_max_l
              ibit29 = REAL( IBITS(advc_flags_1(k,j,i-1),29,1), KIND = wp )
 …
           ENDDO
           DO  k = nzb_max+1, nzt
+          DO  k = nzb_max_l+1, nzt
              u_comp(k)        = u(k+1,j,i) + u(k,j,i) - gu
 …
 !--    Now compute the fluxes and tendency terms for the horizontal
 !--    and vertical parts.
        DO  k = nzb+1, nzb_max
+       DO  k = nzb+1, nzb_max_l
           ibit29 = REAL( IBITS(advc_flags_1(k,j,i),29,1), KIND = wp )
 …
        ENDDO
        DO  k = nzb_max+1, nzt
+       DO  k = nzb_max_l+1, nzt
           u_comp(k) = u(k+1,j,i+1) + u(k,j,i+1) - gu
 …
        ENDDO
        DO  k = nzb+1, nzt
+       DO  k = nzb+1, nzb_max_l
           flux_d    = flux_t(k-1)
           diss_d    = diss_t(k-1)
+          ibit29 = REAL( IBITS(advc_flags_1(k,j,i),29,1), KIND = wp )
+          ibit28 = REAL( IBITS(advc_flags_1(k,j,i),28,1), KIND = wp )
+          ibit27 = REAL( IBITS(advc_flags_1(k,j,i),27,1), KIND = wp )
+          ibit32 = REAL( IBITS(advc_flags_2(k,j,i),0,1),  KIND = wp )
+          ibit31 = REAL( IBITS(advc_flags_1(k,j,i),31,1), KIND = wp )
+          ibit30 = REAL( IBITS(advc_flags_1(k,j,i),30,1), KIND = wp )
+          ibit35 = REAL( IBITS(advc_flags_2(k,j,i),3,1), KIND = wp )
+          ibit34 = REAL( IBITS(advc_flags_2(k,j,i),2,1), KIND = wp )
+          ibit33 = REAL( IBITS(advc_flags_2(k,j,i),1,1), KIND = wp )
+!
 !--       Calculate the divergence of the velocity field. A respective
 …
        ENDDO
+       DO  k = nzb_max_l+1, nzt
+          flux_d    = flux_t(k-1)
+          diss_d    = diss_t(k-1)
+!
+!--       Calculate the divergence of the velocity field. A respective
+!--       correction is needed to overcome numerical instabilities introduced
+!--       by a not sufficient reduction of divergences near topography.
+          div = ( ( u_comp(k) + gu - ( u(k+1,j,i) + u(k,j,i)   ) ) * ddx       &
+              +   ( v_comp(k) + gv - ( v(k+1,j,i) + v(k,j,i)   ) ) * ddy       &
+              +   ( w_comp(k)               * rho_air(k+1)                     &
+                - ( w(k,j,i) + w(k-1,j,i) ) * rho_air(k)                       &
+                  ) * drho_air_zw(k) * ddzu(k+1)                               &
+                ) * 0.5_wp
+          tend(k,j,i) = tend(k,j,i) - (                                        &
+                      ( flux_r(k) + diss_r(k)                                  &
+                    -   flux_l_w(k,j,tn) - diss_l_w(k,j,tn)   ) * ddx          &
+                    + ( flux_n(k) + diss_n(k)                                  &
+                    -   flux_s_w(k,tn) - diss_s_w(k,tn)       ) * ddy          &
+                    + ( ( flux_t(k) + diss_t(k) )                              &
+                    -   ( flux_d    + diss_d    )                              &
+                                              ) * drho_air_zw(k) * ddzu(k+1)   &
+                                      ) + div * w(k,j,i)
+          flux_l_w(k,j,tn) = flux_r(k)
+          diss_l_w(k,j,tn) = diss_r(k)
+          flux_s_w(k,tn)   = flux_n(k)
+          diss_s_w(k,tn)   = diss_n(k)
+!
+!--       Statistical Evaluation of w'w'.
+          sums_ws2_ws_l(k,tn)  = sums_ws2_ws_l(k,tn)                           &
+                      + ( flux_t(k)                                            &
+                       * ( w_comp(k) - 2.0_wp * hom(k,1,3,0)                )  &
+                       / ( w_comp(k) + SIGN( 1.0E-20_wp, w_comp(k) )        )  &
+                        + diss_t(k)                                            &
+                       *   ABS( w_comp(k) - 2.0_wp * hom(k,1,3,0)           )  &
+                       / ( ABS( w_comp(k) ) + 1.0E-20_wp                    )  &
+                        ) *   weight_substep(intermediate_timestep_count)
+       ENDDO
     END SUBROUTINE advec_w_ws_ij
 …
        USE control_parameters,                                                 &
+           ONLY:  intermediate_timestep_count, u_gtrans, v_gtrans
+           ONLY:  bc_dirichlet_l, bc_dirichlet_n, bc_dirichlet_r,              &
+                  bc_dirichlet_s, bc_radiation_l, bc_radiation_n,              &
+                  bc_radiation_r, bc_radiation_s, intermediate_timestep_count, &
+                  u_gtrans, v_gtrans
        USE grid_variables,                                                     &
 …
        INTEGER(iwp) ::  sk_num !< integer identifier, used for assign fluxes to the correct dimension in the analysis array
+       INTEGER(iwp) ::  i      !< grid index along x-direction
+       INTEGER(iwp) ::  j      !< grid index along y-direction
+       INTEGER(iwp) ::  k      !< grid index along z-direction
+       INTEGER(iwp) ::  k_mm   !< k-2 index in disretization, can be modified to avoid segmentation faults
+       INTEGER(iwp) ::  k_pp   !< k+2 index in disretization, can be modified to avoid segmentation faults
+       INTEGER(iwp) ::  k_ppp  !< k+3 index in disretization, can be modified to avoid segmentation faults
+       INTEGER(iwp) ::  tn = 0 !< number of OpenMP thread
+       INTEGER(iwp) ::  i         !< grid index along x-direction
+       INTEGER(iwp) ::  j         !< grid index along y-direction
+       INTEGER(iwp) ::  k         !< grid index along z-direction
+       INTEGER(iwp) ::  k_mm      !< k-2 index in disretization, can be modified to avoid segmentation faults
+       INTEGER(iwp) ::  k_pp      !< k+2 index in disretization, can be modified to avoid segmentation faults
+       INTEGER(iwp) ::  k_ppp     !< k+3 index in disretization, can be modified to avoid segmentation faults
+       INTEGER(iwp) ::  nzb_max_l !< index indicating upper bound for order degradation of horizontal advection terms
+       INTEGER(iwp) ::  tn = 0    !< number of OpenMP thread
+!
 …
 #endif
+!
+!--    Set local version of nzb_max. At non-cyclic boundaries the order of the
+!--    advection scheme needs to be degraded near the boundary. Please note, in
+!--    contrast to the cache-optimized routines, nzb_max_l is set to the same
+!--    value for the entire subdomain, in order to avoid asymmetric loops which
+!--    might be an issue for GPUs.
+       IF( bc_dirichlet_l  .OR.  bc_radiation_l  .OR.                          &
+           bc_dirichlet_r  .OR.  bc_radiation_r  .OR.                          &
+           bc_dirichlet_s  .OR.  bc_radiation_s  .OR.                          &
+           bc_dirichlet_n  .OR.  bc_radiation_n )  THEN
+          nzb_max_l = nzt
+       ELSE
+          nzb_max_l = nzb_max
+       END IF
        SELECT CASE ( sk_char )
 …
        DO  j = nys, nyn
           DO  k = nzb+1, nzb_max
+          DO  k = nzb+1, nzb_max_l
              ibit2 = REAL( IBITS(advc_flags_1(k,j,i-1),2,1), KIND = wp )
 …
           ENDDO
           DO  k = nzb_max+1, nzt
+          DO  k = nzb_max_l+1, nzt
              u_comp                 = u(k,j,i) - u_gtrans + u_stokes_zu(k)
 …
        !$ACC PRESENT(drho_air, rho_air_zw, ddzw) &
        !$ACC PRESENT(tend) &
        !$ACC PRESENT(hom(nzb+1:nzb_max,1,1:3,0)) &
+       !$ACC PRESENT(hom(nzb+1:nzb_max_l,1,1:3,0)) &
        !$ACC PRESENT(weight_substep(intermediate_timestep_count)) &
        !$ACC PRESENT(sums_wspts_ws_l, sums_wssas_ws_l) &
 …
 #ifndef _OPENACC
           j = nys
           DO  k = nzb+1, nzb_max
+          DO  k = nzb+1, nzb_max_l
              ibit5 = REAL( IBITS(advc_flags_1(k,j-1,i),5,1), KIND = wp )
 …
+!
 !--       Above to the top of the highest topography. No degradation necessary.
           DO  k = nzb_max+1, nzt
+          DO  k = nzb_max_l+1, nzt
              v_comp               = v(k,j,i) - v_gtrans + v_stokes_zu(k)
 …
              diss_d    = 0.0_wp
              DO  k = nzb+1, nzb_max
+             DO  k = nzb+1, nzb_max_l
                 ibit2 = REAL( IBITS(advc_flags_1(k,j,i),2,1), KIND = wp )
 …
              ENDDO
              DO  k = nzb_max+1, nzt
+             DO  k = nzb_max_l+1, nzt
                 u_comp = u(k,j,i+1) - u_gtrans + u_stokes_zu(k)
 …
                 flux_t = w(k,j,i) * rho_air_zw(k) * (                      &
+                flux_t = w(k,j,i) * rho_air_zw(k) * (                         &
                            ( 37.0_wp * ibit8 * adv_sca_5                      &
                         +     7.0_wp * ibit7 * adv_sca_3                      &
 …
+                                       )
                 diss_t = -ABS( w(k,j,i) ) * rho_air_zw(k) * (              &
+                diss_t = -ABS( w(k,j,i) ) * rho_air_zw(k) * (                 &
                            ( 10.0_wp * ibit8 * adv_sca_5                      &
                         +     3.0_wp * ibit7 * adv_sca_3                      &
 …
        USE control_parameters,                                                 &
+           ONLY:  intermediate_timestep_count, u_gtrans, v_gtrans
+           ONLY:  bc_dirichlet_l, bc_dirichlet_n, bc_dirichlet_r,              &
+                  bc_dirichlet_s, bc_radiation_l, bc_radiation_n,              &
+                  bc_radiation_r, bc_radiation_s, intermediate_timestep_count, &
+                  u_gtrans, v_gtrans
        USE grid_variables,                                                     &
 …
        IMPLICIT NONE
+       INTEGER(iwp) ::  i      !< grid index along x-direction
+       INTEGER(iwp) ::  j      !< grid index along y-direction
+       INTEGER(iwp) ::  k      !< grid index along z-direction
+       INTEGER(iwp) ::  k_mm   !< k-2 index in disretization, can be modified to avoid segmentation faults
+       INTEGER(iwp) ::  k_pp   !< k+2 index in disretization, can be modified to avoid segmentation faults
+       INTEGER(iwp) ::  k_ppp  !< k+3 index in disretization, can be modified to avoid segmentation faults
+       INTEGER(iwp) ::  tn = 0 !< number of OpenMP thread
+       INTEGER(iwp) ::  i         !< grid index along x-direction
+       INTEGER(iwp) ::  j         !< grid index along y-direction
+       INTEGER(iwp) ::  k         !< grid index along z-direction
+       INTEGER(iwp) ::  k_mm      !< k-2 index in disretization, can be modified to avoid segmentation faults
+       INTEGER(iwp) ::  k_pp      !< k+2 index in disretization, can be modified to avoid segmentation faults
+       INTEGER(iwp) ::  k_ppp     !< k+3 index in disretization, can be modified to avoid segmentation faults
+       INTEGER(iwp) ::  nzb_max_l !< index indicating upper bound for order degradation of horizontal advection terms
+       INTEGER(iwp) ::  tn = 0    !< number of OpenMP thread
        REAL(wp)    ::  ibit9  !< flag indicating 1st-order scheme along x-direction
 …
        REAL(wp)    ::  u_comp_l !<
 #endif
+!
+!--    Set local version of nzb_max. At non-cyclic boundaries the order of the
+!--    advection scheme needs to be degraded near the boundary. Please note, in
+!--    contrast to the cache-optimized routines, nzb_max_l is set to the same
+!--    value for the entire subdomain, in order to avoid asymmetric loops which
+!--    might be an issue for GPUs.
+       IF( bc_dirichlet_l  .OR.  bc_radiation_l  .OR.                          &
+           bc_dirichlet_r  .OR.  bc_radiation_r  .OR.                          &
+           bc_dirichlet_s  .OR.  bc_radiation_s  .OR.                          &
+           bc_dirichlet_n  .OR.  bc_radiation_n )  THEN
+          nzb_max_l = nzt
+       ELSE
+          nzb_max_l = nzb_max
+       END IF
        gu = 2.0_wp * u_gtrans
 …
        i = nxlu
        DO  j = nys, nyn
           DO  k = nzb+1, nzb_max
+          DO  k = nzb+1, nzb_max_l
              ibit11 = REAL( IBITS(advc_flags_1(k,j,i-1),11,1), KIND = wp )
 …
           ENDDO
           DO  k = nzb_max+1, nzt
+          DO  k = nzb_max_l+1, nzt
              u_comp            = u(k,j,i) + u(k,j,i-1) - gu
 …
        !$ACC PRESENT(drho_air, rho_air_zw, ddzw) &
        !$ACC PRESENT(tend) &
        !$ACC PRESENT(hom(nzb+1:nzb_max,1,1:3,0)) &
+       !$ACC PRESENT(hom(nzb+1:nzb_max_l,1,1:3,0)) &
        !$ACC PRESENT(weight_substep(intermediate_timestep_count)) &
        !$ACC PRESENT(sums_us2_ws_l, sums_wsus_ws_l)
 …
 !--       The following loop computes the fluxes for the south boundary points
           j = nys
           DO  k = nzb+1, nzb_max
+          DO  k = nzb+1, nzb_max_l
              ibit14 = REAL( IBITS(advc_flags_1(k,j-1,i),14,1), KIND = wp )
 …
           ENDDO
           DO  k = nzb_max+1, nzt
+          DO  k = nzb_max_l+1, nzt
              v_comp                 = v(k,j,i) + v(k,j,i-1) - gv
 …
              diss_d    = 0.0_wp
              DO  k = nzb+1, nzb_max
+             DO  k = nzb+1, nzb_max_l
                 ibit11 = REAL( IBITS(advc_flags_1(k,j,i),11,1), KIND = wp )
 …
              ENDDO
              DO  k = nzb_max+1, nzt
+             DO  k = nzb_max_l+1, nzt
                 u_comp = u(k,j,i+1) + u(k,j,i)
 …
           ENDDO
        ENDDO
-       sums_us2_ws_l(nzb,tn) = sums_us2_ws_l(nzb+1,tn)
     END SUBROUTINE advec_u_ws
 …
        USE control_parameters,                                                 &
+           ONLY:  intermediate_timestep_count, u_gtrans, v_gtrans
+           ONLY:  bc_dirichlet_l, bc_dirichlet_n, bc_dirichlet_r,              &
+                  bc_dirichlet_s, bc_radiation_l, bc_radiation_n,              &
+                  bc_radiation_r, bc_radiation_s, intermediate_timestep_count, &
+                  u_gtrans, v_gtrans
        USE grid_variables,                                                     &
 …
+       INTEGER(iwp) ::  i      !< grid index along x-direction
+       INTEGER(iwp) ::  j      !< grid index along y-direction
+       INTEGER(iwp) ::  k      !< grid index along z-direction
+       INTEGER(iwp) ::  k_mm   !< k-2 index in disretization, can be modified to avoid segmentation faults
+       INTEGER(iwp) ::  k_pp   !< k+2 index in disretization, can be modified to avoid segmentation faults
+       INTEGER(iwp) ::  k_ppp  !< k+3 index in disretization, can be modified to avoid segmentation faults
+       INTEGER(iwp) ::  tn = 0 !< number of OpenMP thread
+       INTEGER(iwp) ::  i         !< grid index along x-direction
+       INTEGER(iwp) ::  j         !< grid index along y-direction
+       INTEGER(iwp) ::  k         !< grid index along z-direction
+       INTEGER(iwp) ::  k_mm      !< k-2 index in disretization, can be modified to avoid segmentation faults
+       INTEGER(iwp) ::  k_pp      !< k+2 index in disretization, can be modified to avoid segmentation faults
+       INTEGER(iwp) ::  k_ppp     !< k+3 index in disretization, can be modified to avoid segmentation faults
+       INTEGER(iwp) ::  nzb_max_l !< index indicating upper bound for order degradation of horizontal advection terms
+       INTEGER(iwp) ::  tn = 0    !< number of OpenMP thread
        REAL(wp)    ::  ibit18 !< flag indicating 1st-order scheme along x-direction
 …
        REAL(wp)    ::  v_comp_s !<
 #endif
+!
+!--    Set local version of nzb_max. At non-cyclic boundaries the order of the
+!--    advection scheme needs to be degraded near the boundary. Please note, in
+!--    contrast to the cache-optimized routines, nzb_max_l is set to the same
+!--    value for the entire subdomain, in order to avoid asymmetric loops which
+!--    might be an issue for GPUs.
+       IF( bc_dirichlet_l  .OR.  bc_radiation_l  .OR.                          &
+           bc_dirichlet_r  .OR.  bc_radiation_r  .OR.                          &
+           bc_dirichlet_s  .OR.  bc_radiation_s  .OR.                          &
+           bc_dirichlet_n  .OR.  bc_radiation_n )  THEN
+          nzb_max_l = nzt
+       ELSE
+          nzb_max_l = nzb_max
+       END IF
        gu = 2.0_wp * u_gtrans
        gv = 2.0_wp * v_gtrans
 …
        i = nxl
        DO  j = nysv, nyn
           DO  k = nzb+1, nzb_max
+          DO  k = nzb+1, nzb_max_l
              ibit20 = REAL( IBITS(advc_flags_1(k,j,i-1),20,1), KIND = wp )
 …
           ENDDO
           DO  k = nzb_max+1, nzt
+          DO  k = nzb_max_l+1, nzt
              u_comp                   = u(k,j-1,i) + u(k,j,i) - gu
 …
        !$ACC PRESENT(drho_air, rho_air_zw, ddzw) &
        !$ACC PRESENT(tend) &
        !$ACC PRESENT(hom(nzb+1:nzb_max,1,2:3,0)) &
+       !$ACC PRESENT(hom(nzb+1:nzb_max_l,1,2:3,0)) &
        !$ACC PRESENT(weight_substep(intermediate_timestep_count)) &
        !$ACC PRESENT(sums_vs2_ws_l, sums_wsvs_ws_l)
 …
 #ifndef _OPENACC
           j = nysv
           DO  k = nzb+1, nzb_max
+          DO  k = nzb+1, nzb_max_l
              ibit23 = REAL( IBITS(advc_flags_1(k,j-1,i),23,1), KIND = wp )
 …
           ENDDO
           DO  k = nzb_max+1, nzt
+          DO  k = nzb_max_l+1, nzt
              v_comp                 = v(k,j,i) + v(k,j-1,i) - gv
 …
              diss_d    = 0.0_wp
              DO  k = nzb+1, nzb_max
+             DO  k = nzb+1, nzb_max_l
                 ibit20 = REAL( IBITS(advc_flags_1(k,j,i),20,1), KIND = wp )
 …
              ENDDO
              DO  k = nzb_max+1, nzt
+             DO  k = nzb_max_l+1, nzt
                 u_comp = u(k,j-1,i+1) + u(k,j,i+1) - gu
 …
           ENDDO
        ENDDO
-!$ACC UPDATE HOST(sums_vs2_ws_l(nzb+1,tn))
-       sums_vs2_ws_l(nzb,tn) = sums_vs2_ws_l(nzb+1,tn)
-!$ACC UPDATE DEVICE(sums_vs2_ws_l(nzb,tn))
     END SUBROUTINE advec_v_ws
 …
        USE control_parameters,                                                 &
+           ONLY:  intermediate_timestep_count, u_gtrans, v_gtrans
+           ONLY:  bc_dirichlet_l, bc_dirichlet_n, bc_dirichlet_r,              &
+                  bc_dirichlet_s, bc_radiation_l, bc_radiation_n,              &
+                  bc_radiation_r, bc_radiation_s, intermediate_timestep_count, &
+                  u_gtrans, v_gtrans
        USE grid_variables,                                                     &
 …
        IMPLICIT NONE
+       INTEGER(iwp) ::  i      !< grid index along x-direction
+       INTEGER(iwp) ::  j      !< grid index along y-direction
+       INTEGER(iwp) ::  k      !< grid index along z-direction
+       INTEGER(iwp) ::  k_mm   !< k-2 index in disretization, can be modified to avoid segmentation faults
+       INTEGER(iwp) ::  k_pp   !< k+2 index in disretization, can be modified to avoid segmentation faults
+       INTEGER(iwp) ::  k_ppp  !< k+3 index in disretization, can be modified to avoid segmentation faults
+       INTEGER(iwp) ::  tn = 0 !< number of OpenMP thread
+       INTEGER(iwp) ::  i         !< grid index along x-direction
+       INTEGER(iwp) ::  j         !< grid index along y-direction
+       INTEGER(iwp) ::  k         !< grid index along z-direction
+       INTEGER(iwp) ::  k_mm      !< k-2 index in disretization, can be modified to avoid segmentation faults
+       INTEGER(iwp) ::  k_pp      !< k+2 index in disretization, can be modified to avoid segmentation faults
+       INTEGER(iwp) ::  k_ppp     !< k+3 index in disretization, can be modified to avoid segmentation faults
+       INTEGER(iwp) ::  nzb_max_l !< index indicating upper bound for order degradation of horizontal advection terms
+       INTEGER(iwp) ::  tn = 0    !< number of OpenMP thread
        REAL(wp)    ::  ibit27 !< flag indicating 1st-order scheme along x-direction
 …
        REAL(wp), DIMENSION(nzb+1:nzt,nys:nyn) ::  swap_flux_x_local_w !< discretized 6th-order flux at leftward-side of the grid box
 #endif
+!
+!--    Set local version of nzb_max. At non-cyclic boundaries the order of the
+!--    advection scheme needs to be degraded near the boundary. Please note, in
+!--    contrast to the cache-optimized routines, nzb_max_l is set to the same
+!--    value for the entire subdomain, in order to avoid asymmetric loops which
+!--    might be an issue for GPUs.
+       IF( bc_dirichlet_l  .OR.  bc_radiation_l  .OR.                          &
+           bc_dirichlet_r  .OR.  bc_radiation_r  .OR.                          &
+           bc_dirichlet_s  .OR.  bc_radiation_s  .OR.                          &
+           bc_dirichlet_n  .OR.  bc_radiation_n )  THEN
+          nzb_max_l = nzt
+       ELSE
+          nzb_max_l = nzb_max
+       END IF
        gu = 2.0_wp * u_gtrans
        gv = 2.0_wp * v_gtrans
 …
        i = nxl
        DO  j = nys, nyn
           DO  k = nzb+1, nzb_max
+          DO  k = nzb+1, nzb_max_l
              ibit29 = REAL( IBITS(advc_flags_1(k,j,i-1),29,1), KIND = wp )
 …
           ENDDO
           DO  k = nzb_max+1, nzt
+          DO  k = nzb_max_l+1, nzt
              u_comp                   = u(k+1,j,i) + u(k,j,i) - gu
 …
        !$ACC PRESENT(rho_air, drho_air_zw, ddzu) &
        !$ACC PRESENT(tend) &
        !$ACC PRESENT(hom(nzb+1:nzb_max,1,3,0)) &
+       !$ACC PRESENT(hom(nzb+1:nzb_max_l,1,3,0)) &
        !$ACC PRESENT(weight_substep(intermediate_timestep_count)) &
        !$ACC PRESENT(sums_ws2_ws_l(nzb+1:nzb_max,0))
+       !$ACC PRESENT(sums_ws2_ws_l(nzb+1:nzb_max_l,0))
        DO i = nxl, nxr
 #ifndef _OPENACC
           j = nys
           DO  k = nzb+1, nzb_max
+          DO  k = nzb+1, nzb_max_l
              ibit32 = REAL( IBITS(advc_flags_2(k,j-1,i),0,1),  KIND = wp )
 …
           ENDDO
           DO  k = nzb_max+1, nzt
+          DO  k = nzb_max_l+1, nzt
              v_comp                 = v(k+1,j,i) + v(k,j,i) - gv
 …
              diss_d = -ABS(w_comp) * ( w(k,j,i) - w(k-1,j,i) ) * adv_mom_1
              DO  k = nzb+1, nzb_max
+             DO  k = nzb+1, nzb_max_l
                 ibit29 = REAL( IBITS(advc_flags_1(k,j,i),29,1), KIND = wp )
 …
              ENDDO
              DO  k = nzb_max+1, nzt
+             DO  k = nzb_max_l+1, nzt
                 u_comp = u(k+1,j,i+1) + u(k,j,i+1) - gu

Note: See TracChangeset for help on using the changeset viewer.

Context Navigation

Changeset 3661 for palm/trunk/SOURCE/advec_ws.f90

Legend:

palm/trunk/SOURCE/advec_ws.f90

Download in other formats: