MODULE tridia_solver

!--------------------------------------------------------------------------------!
! This file is part of PALM.
!
! PALM is free software: you can redistribute it and/or modify it under the terms
! of the GNU General Public License as published by the Free Software Foundation,
! either version 3 of the License, or (at your option) any later version.
!
! PALM is distributed in the hope that it will be useful, but WITHOUT ANY
! WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
! A PARTICULAR PURPOSE.  See the GNU General Public License for more details.
!
! You should have received a copy of the GNU General Public License along with
! PALM. If not, see <http://www.gnu.org/licenses/>.
!
! Copyright 1997-2012  Leibniz University Hannover
!--------------------------------------------------------------------------------!
!
! Current revisions:
! ------------------
!
!
! Former revisions:
! -----------------
! $Id: tridia_solver.f90 1217 2013-08-26 11:30:05Z heinze $
!
! 1216 2013-08-26 09:31:42Z raasch
! +tridia_substi_overlap for handling overlapping fft / transposition
!
! 1212 2013-08-15 08:46:27Z raasch
! Initial revision.
! Routines have been moved to seperate module from former file poisfft to here.
! The tridiagonal matrix coefficients of array tri are calculated only once at
! the beginning, i.e. routine split is called within tridia_init.
!
!
! Description:
! ------------
! solves the linear system of equations:
!
! -(4 pi^2(i^2/(dx^2*nnx^2)+j^2/(dy^2*nny^2))+
!   1/(dzu(k)*dzw(k))+1/(dzu(k-1)*dzw(k)))*p(i,j,k)+
! 1/(dzu(k)*dzw(k))*p(i,j,k+1)+1/(dzu(k-1)*dzw(k))*p(i,j,k-1)=d(i,j,k)
!
! by using the Thomas algorithm
!------------------------------------------------------------------------------!

    USE indices
    USE transpose_indices

    IMPLICIT NONE

    REAL, DIMENSION(:,:), ALLOCATABLE ::  ddzuw

    PRIVATE

    INTERFACE tridia_substi
       MODULE PROCEDURE tridia_substi
    END INTERFACE tridia_substi

    INTERFACE tridia_substi_overlap
       MODULE PROCEDURE tridia_substi_overlap
    END INTERFACE tridia_substi_overlap

    PUBLIC  tridia_substi, tridia_substi_overlap, tridia_init, tridia_1dd

 CONTAINS


    SUBROUTINE tridia_init

       USE arrays_3d,  ONLY:  ddzu_pres, ddzw

       IMPLICIT NONE

       INTEGER ::  k

       ALLOCATE( ddzuw(0:nz-1,3) )

       DO  k = 0, nz-1
          ddzuw(k,1) = ddzu_pres(k+1) * ddzw(k+1)
          ddzuw(k,2) = ddzu_pres(k+2) * ddzw(k+1)
          ddzuw(k,3) = -1.0 * &
                       ( ddzu_pres(k+2) * ddzw(k+1) + ddzu_pres(k+1) * ddzw(k+1) )
       ENDDO
!
!--    Calculate constant coefficients of the tridiagonal matrix
#if ! defined ( __check )
       CALL maketri
       CALL split
#endif

    END SUBROUTINE tridia_init


    SUBROUTINE maketri

!------------------------------------------------------------------------------!
! Computes the i- and j-dependent component of the matrix
!------------------------------------------------------------------------------!

          USE arrays_3d,  ONLY: tric
          USE constants
          USE control_parameters
          USE grid_variables

          IMPLICIT NONE

          INTEGER ::  i, j, k, nnxh, nnyh

          !$acc declare create( ll )
          REAL    ::  ll(nxl_z:nxr_z,nys_z:nyn_z)


          nnxh = ( nx + 1 ) / 2
          nnyh = ( ny + 1 ) / 2

!
!--       Provide the constant coefficients of the tridiagonal matrix for solution
!--       of the Poisson equation in Fourier space.
!--       The coefficients are computed following the method of
!--       Schmidt et al. (DFVLR-Mitteilung 84-15), which departs from Stephan
!--       Siano's original version by discretizing the Poisson equation,
!--       before it is Fourier-transformed.

          !$acc kernels present( tric )
          !$acc loop vector( 32 )
          DO  j = nys_z, nyn_z
             DO  i = nxl_z, nxr_z
                IF ( j >= 0  .AND.  j <= nnyh )  THEN
                   IF ( i >= 0  .AND.  i <= nnxh )  THEN
                      ll(i,j) = 2.0 * ( 1.0 - COS( ( 2.0 * pi * i ) / &
                                                  REAL( nx+1 ) ) ) / ( dx * dx ) + &
                                2.0 * ( 1.0 - COS( ( 2.0 * pi * j ) / &
                                                  REAL( ny+1 ) ) ) / ( dy * dy )
                   ELSE
                      ll(i,j) = 2.0 * ( 1.0 - COS( ( 2.0 * pi * ( nx+1-i ) ) / &
                                                  REAL( nx+1 ) ) ) / ( dx * dx ) + &
                                2.0 * ( 1.0 - COS( ( 2.0 * pi * j ) / &
                                                  REAL( ny+1 ) ) ) / ( dy * dy )
                   ENDIF
                ELSE
                   IF ( i >= 0  .AND.  i <= nnxh )  THEN
                      ll(i,j) = 2.0 * ( 1.0 - COS( ( 2.0 * pi * i ) / &
                                                  REAL( nx+1 ) ) ) / ( dx * dx ) + &
                                2.0 * ( 1.0 - COS( ( 2.0 * pi * ( ny+1-j ) ) / &
                                                  REAL( ny+1 ) ) ) / ( dy * dy )
                   ELSE
                      ll(i,j) = 2.0 * ( 1.0 - COS( ( 2.0 * pi * ( nx+1-i ) ) / &
                                                  REAL( nx+1 ) ) ) / ( dx * dx ) + &
                                2.0 * ( 1.0 - COS( ( 2.0 * pi * ( ny+1-j ) ) / &
                                                  REAL( ny+1 ) ) ) / ( dy * dy )
                   ENDIF
                ENDIF
             ENDDO
          ENDDO

          !$acc loop
          DO  k = 0, nz-1
             DO  j = nys_z, nyn_z
                !$acc loop vector( 32 )
                DO  i = nxl_z, nxr_z
                   tric(i,j,k) = ddzuw(k,3) - ll(i,j)
                ENDDO
             ENDDO
          ENDDO
          !$acc end kernels

          IF ( ibc_p_b == 1 )  THEN
             !$acc kernels present( tric )
             !$acc loop
             DO  j = nys_z, nyn_z
                DO  i = nxl_z, nxr_z
                   tric(i,j,0) = tric(i,j,0) + ddzuw(0,1)
                ENDDO
             ENDDO
             !$acc end kernels
          ENDIF
          IF ( ibc_p_t == 1 )  THEN
             !$acc kernels present( tric )
             !$acc loop
             DO  j = nys_z, nyn_z
                DO  i = nxl_z, nxr_z
                   tric(i,j,nz-1) = tric(i,j,nz-1) + ddzuw(nz-1,2)
                ENDDO
             ENDDO
             !$acc end kernels
          ENDIF

    END SUBROUTINE maketri


    SUBROUTINE tridia_substi( ar )

!------------------------------------------------------------------------------!
! Substitution (Forward and Backward) (Thomas algorithm)
!------------------------------------------------------------------------------!

          USE arrays_3d,  ONLY: tri
          USE control_parameters

          IMPLICIT NONE

          INTEGER ::  i, j, k

          REAL    ::  ar(nxl_z:nxr_z,nys_z:nyn_z,1:nz)

          !$acc declare create( ar1 )
          REAL, DIMENSION(nxl_z:nxr_z,nys_z:nyn_z,0:nz-1)   ::  ar1

!
!--       Forward substitution
          DO  k = 0, nz - 1
             !$acc kernels present( ar, tri )
             !$acc loop
             DO  j = nys_z, nyn_z
                DO  i = nxl_z, nxr_z

                   IF ( k == 0 )  THEN
                      ar1(i,j,k) = ar(i,j,k+1)
                   ELSE
                      ar1(i,j,k) = ar(i,j,k+1) - tri(i,j,k,2) * ar1(i,j,k-1)
                   ENDIF

                ENDDO
             ENDDO
             !$acc end kernels
          ENDDO

!
!--       Backward substitution
!--       Note, the 1.0E-20 in the denominator is due to avoid divisions
!--       by zero appearing if the pressure bc is set to neumann at the top of
!--       the model domain.
          DO  k = nz-1, 0, -1
             !$acc kernels present( ar, tri )
             !$acc loop
             DO  j = nys_z, nyn_z
                DO  i = nxl_z, nxr_z

                   IF ( k == nz-1 )  THEN
                      ar(i,j,k+1) = ar1(i,j,k) / ( tri(i,j,k,1) + 1.0E-20 )
                   ELSE
                      ar(i,j,k+1) = ( ar1(i,j,k) - ddzuw(k,2) * ar(i,j,k+2) ) &
                              / tri(i,j,k,1)
                   ENDIF
                ENDDO
             ENDDO
             !$acc end kernels
          ENDDO

!
!--       Indices i=0, j=0 correspond to horizontally averaged pressure.
!--       The respective values of ar should be zero at all k-levels if
!--       acceleration of horizontally averaged vertical velocity is zero.
          IF ( ibc_p_b == 1  .AND.  ibc_p_t == 1 )  THEN
             IF ( nys_z == 0  .AND.  nxl_z == 0 )  THEN
                !$acc kernels loop present( ar )
                DO  k = 1, nz
                   ar(nxl_z,nys_z,k) = 0.0
                ENDDO
             ENDIF
          ENDIF

    END SUBROUTINE tridia_substi


    SUBROUTINE tridia_substi_overlap( ar, jj )

!------------------------------------------------------------------------------!
! Substitution (Forward and Backward) (Thomas algorithm)
!------------------------------------------------------------------------------!

          USE arrays_3d,  ONLY: tri
          USE control_parameters

          IMPLICIT NONE

          INTEGER ::  i, j, jj, k

          REAL    ::  ar(nxl_z:nxr_z,nys_z:nyn_z,1:nz)

          !$acc declare create( ar1 )
          REAL, DIMENSION(nxl_z:nxr_z,nys_z:nyn_z,0:nz-1)   ::  ar1

!
!--       Forward substitution
          DO  k = 0, nz - 1
             !$acc kernels present( ar, tri )
             !$acc loop
             DO  j = nys_z, nyn_z
                DO  i = nxl_z, nxr_z

                   IF ( k == 0 )  THEN
                      ar1(i,j,k) = ar(i,j,k+1)
                   ELSE
                      ar1(i,j,k) = ar(i,j,k+1) - tri(i,jj,k,2) * ar1(i,j,k-1)
                   ENDIF

                ENDDO
             ENDDO
             !$acc end kernels
          ENDDO

!
!--       Backward substitution
!--       Note, the 1.0E-20 in the denominator is due to avoid divisions
!--       by zero appearing if the pressure bc is set to neumann at the top of
!--       the model domain.
          DO  k = nz-1, 0, -1
             !$acc kernels present( ar, tri )
             !$acc loop
             DO  j = nys_z, nyn_z
                DO  i = nxl_z, nxr_z

                   IF ( k == nz-1 )  THEN
                      ar(i,j,k+1) = ar1(i,j,k) / ( tri(i,jj,k,1) + 1.0E-20 )
                   ELSE
                      ar(i,j,k+1) = ( ar1(i,j,k) - ddzuw(k,2) * ar(i,j,k+2) ) &
                              / tri(i,jj,k,1)
                   ENDIF
                ENDDO
             ENDDO
             !$acc end kernels
          ENDDO

!
!--       Indices i=0, j=0 correspond to horizontally averaged pressure.
!--       The respective values of ar should be zero at all k-levels if
!--       acceleration of horizontally averaged vertical velocity is zero.
          IF ( ibc_p_b == 1  .AND.  ibc_p_t == 1 )  THEN
             IF ( nys_z == 0  .AND.  nxl_z == 0 )  THEN
                !$acc kernels loop present( ar )
                DO  k = 1, nz
                   ar(nxl_z,nys_z,k) = 0.0
                ENDDO
             ENDIF
          ENDIF

    END SUBROUTINE tridia_substi_overlap


    SUBROUTINE split

!------------------------------------------------------------------------------!
! Splitting of the tridiagonal matrix (Thomas algorithm)
!------------------------------------------------------------------------------!

          USE arrays_3d,  ONLY: tri, tric

          IMPLICIT NONE

          INTEGER ::  i, j, k

!
!--       Splitting
          !$acc kernels present( tri, tric )
          !$acc loop
          DO  j = nys_z, nyn_z
             !$acc loop vector( 32 )
             DO  i = nxl_z, nxr_z
                tri(i,j,0,1) = tric(i,j,0)
             ENDDO
          ENDDO
          !$acc end kernels

          DO  k = 1, nz-1
             !$acc kernels present( tri, tric )
             !$acc loop
             DO  j = nys_z, nyn_z
                !$acc loop vector( 32 )
                DO  i = nxl_z, nxr_z
                   tri(i,j,k,2) = ddzuw(k,1) / tri(i,j,k-1,1)
                   tri(i,j,k,1) = tric(i,j,k) - ddzuw(k-1,2) * tri(i,j,k,2)
                ENDDO
             ENDDO
             !$acc end kernels
          ENDDO

    END SUBROUTINE split


    SUBROUTINE tridia_1dd( ddx2, ddy2, nx, ny, j, ar, tri )

!------------------------------------------------------------------------------!
! Solves the linear system of equations for a 1d-decomposition along x (see
! tridia)
!
! Attention:  when using the intel compilers older than 12.0, array tri must
!             be passed as an argument to the contained subroutines. Otherwise
!             addres faults will occur. This feature can be activated with
!             cpp-switch __intel11
!             On NEC, tri should not be passed (except for routine substi_1dd)
!             because this causes very bad performance.
!------------------------------------------------------------------------------!

       USE arrays_3d
       USE control_parameters

       USE pegrid

       IMPLICIT NONE

       INTEGER ::  i, j, k, nnyh, nx, ny, omp_get_thread_num, tn

       REAL    ::  ddx2, ddy2

       REAL, DIMENSION(0:nx,1:nz)     ::  ar
       REAL, DIMENSION(5,0:nx,0:nz-1) ::  tri


       nnyh = ( ny + 1 ) / 2

!
!--    Define constant elements of the tridiagonal matrix.
!--    The compiler on SX6 does loop exchange. If 0:nx is a high power of 2,
!--    the exchanged loops create bank conflicts. The following directive
!--    prohibits loop exchange and the loops perform much better.
!       tn = omp_get_thread_num()
!       WRITE( 120+tn, * ) '+++ id=',myid,' nx=',nx,' thread=', omp_get_thread_num()
!       CALL local_flush( 120+tn )
!CDIR NOLOOPCHG
       DO  k = 0, nz-1
          DO  i = 0,nx
             tri(2,i,k) = ddzu_pres(k+1) * ddzw(k+1)
             tri(3,i,k) = ddzu_pres(k+2) * ddzw(k+1)
          ENDDO
       ENDDO
!       WRITE( 120+tn, * ) '+++ id=',myid,' end of first tridia loop   thread=', omp_get_thread_num()
!       CALL local_flush( 120+tn )

       IF ( j <= nnyh )  THEN
#if defined( __intel11 )
          CALL maketri_1dd( j, tri )
#else
          CALL maketri_1dd( j )
#endif
       ELSE
#if defined( __intel11 )
          CALL maketri_1dd( ny+1-j, tri )
#else
          CALL maketri_1dd( ny+1-j )
#endif
       ENDIF
#if defined( __intel11 )
       CALL split_1dd( tri )
#else
       CALL split_1dd
#endif
       CALL substi_1dd( ar, tri )

    CONTAINS

#if defined( __intel11 )
       SUBROUTINE maketri_1dd( j, tri )
#else
       SUBROUTINE maketri_1dd( j )
#endif

!------------------------------------------------------------------------------!
! computes the i- and j-dependent component of the matrix
!------------------------------------------------------------------------------!

          USE constants

          IMPLICIT NONE

          INTEGER ::  i, j, k, nnxh
          REAL    ::  a, c

          REAL, DIMENSION(0:nx) ::  l

#if defined( __intel11 )
          REAL, DIMENSION(5,0:nx,0:nz-1) ::  tri
#endif


          nnxh = ( nx + 1 ) / 2
!
!--       Provide the tridiagonal matrix for solution of the Poisson equation in
!--       Fourier space. The coefficients are computed following the method of
!--       Schmidt et al. (DFVLR-Mitteilung 84-15), which departs from Stephan
!--       Siano's original version by discretizing the Poisson equation,
!--       before it is Fourier-transformed
          DO  i = 0, nx
             IF ( i >= 0 .AND. i <= nnxh ) THEN
                l(i) = 2.0 * ( 1.0 - COS( ( 2.0 * pi * i ) / &
                                         REAL( nx+1 ) ) ) * ddx2 + &
                       2.0 * ( 1.0 - COS( ( 2.0 * pi * j ) / &
                                         REAL( ny+1 ) ) ) * ddy2
             ELSE
                l(i) = 2.0 * ( 1.0 - COS( ( 2.0 * pi * ( nx+1-i ) ) / &
                                         REAL( nx+1 ) ) ) * ddx2 + &
                       2.0 * ( 1.0 - COS( ( 2.0 * pi * j ) / &
                                         REAL( ny+1 ) ) ) * ddy2
             ENDIF
          ENDDO

          DO  k = 0, nz-1
             DO  i = 0, nx
                a = -1.0 * ddzu_pres(k+2) * ddzw(k+1)
                c = -1.0 * ddzu_pres(k+1) * ddzw(k+1)
                tri(1,i,k) = a + c - l(i)
             ENDDO
          ENDDO
          IF ( ibc_p_b == 1 )  THEN
             DO  i = 0, nx
                tri(1,i,0) = tri(1,i,0) + tri(2,i,0)
             ENDDO
          ENDIF
          IF ( ibc_p_t == 1 )  THEN
             DO  i = 0, nx
                tri(1,i,nz-1) = tri(1,i,nz-1) + tri(3,i,nz-1)
             ENDDO
          ENDIF

       END SUBROUTINE maketri_1dd


#if defined( __intel11 )
       SUBROUTINE split_1dd( tri )
#else
       SUBROUTINE split_1dd
#endif

!------------------------------------------------------------------------------!
! Splitting of the tridiagonal matrix (Thomas algorithm)
!------------------------------------------------------------------------------!

          IMPLICIT NONE

          INTEGER ::  i, k

#if defined( __intel11 )
          REAL, DIMENSION(5,0:nx,0:nz-1) ::  tri
#endif


!
!--       Splitting
          DO  i = 0, nx
             tri(4,i,0) = tri(1,i,0)
          ENDDO
          DO  k = 1, nz-1
             DO  i = 0, nx
                tri(5,i,k) = tri(2,i,k) / tri(4,i,k-1)
                tri(4,i,k) = tri(1,i,k) - tri(3,i,k-1) * tri(5,i,k)
             ENDDO
          ENDDO

       END SUBROUTINE split_1dd


       SUBROUTINE substi_1dd( ar, tri )

!------------------------------------------------------------------------------!
! Substitution (Forward and Backward) (Thomas algorithm)
!------------------------------------------------------------------------------!

          IMPLICIT NONE

          INTEGER ::  i, k

          REAL, DIMENSION(0:nx,nz)       ::  ar
          REAL, DIMENSION(0:nx,0:nz-1)   ::  ar1
          REAL, DIMENSION(5,0:nx,0:nz-1) ::  tri

!
!--       Forward substitution
          DO  i = 0, nx
             ar1(i,0) = ar(i,1)
          ENDDO
          DO  k = 1, nz-1
             DO  i = 0, nx
                ar1(i,k) = ar(i,k+1) - tri(5,i,k) * ar1(i,k-1)
             ENDDO
          ENDDO

!
!--       Backward substitution
!--       Note, the add of 1.0E-20 in the denominator is due to avoid divisions
!--       by zero appearing if the pressure bc is set to neumann at the top of
!--       the model domain.
          DO  i = 0, nx
             ar(i,nz) = ar1(i,nz-1) / ( tri(4,i,nz-1) + 1.0E-20 )
          ENDDO
          DO  k = nz-2, 0, -1
             DO  i = 0, nx
                ar(i,k+1) = ( ar1(i,k) - tri(3,i,k) * ar(i,k+2) ) &
                            / tri(4,i,k)
             ENDDO
          ENDDO

!
!--       Indices i=0, j=0 correspond to horizontally averaged pressure.
!--       The respective values of ar should be zero at all k-levels if
!--       acceleration of horizontally averaged vertical velocity is zero.
          IF ( ibc_p_b == 1  .AND.  ibc_p_t == 1 )  THEN
             IF ( j == 0 )  THEN
                DO  k = 1, nz
                   ar(0,k) = 0.0
                ENDDO
             ENDIF
          ENDIF

       END SUBROUTINE substi_1dd

    END SUBROUTINE tridia_1dd


 END MODULE tridia_solver