MODULE fft_xy !--------------------------------------------------------------------------------! ! This file is part of PALM. ! ! PALM is free software: you can redistribute it and/or modify it under the terms ! of the GNU General Public License as published by the Free Software Foundation, ! either version 3 of the License, or (at your option) any later version. ! ! PALM is distributed in the hope that it will be useful, but WITHOUT ANY ! WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR ! A PARTICULAR PURPOSE. See the GNU General Public License for more details. ! ! You should have received a copy of the GNU General Public License along with ! PALM. If not, see . ! ! Copyright 1997-2012 Leibniz University Hannover !--------------------------------------------------------------------------------! ! ! Current revisions: ! ----------------- ! ! ! Former revisions: ! ----------------- ! $Id: fft_xy.f90 1107 2013-03-04 06:23:14Z raasch $ ! ! 1106 2013-03-04 05:31:38Z raasch ! CUDA fft added ! array_kind renamed precision_kind, 3D- instead of 1D-loops in fft_x and fft_y ! old fft_x, fft_y become fft_x_1d, fft_y_1d and are used for 1D-decomposition ! ! 1092 2013-02-02 11:24:22Z raasch ! variable sizw declared for NEC case only ! ! 1036 2012-10-22 13:43:42Z raasch ! code put under GPL (PALM 3.9) ! ! 274 2009-03-26 15:11:21Z heinze ! Output of messages replaced by message handling routine. ! ! Feb. 2007 ! RCS Log replace by Id keyword, revision history cleaned up ! ! Revision 1.4 2006/03/28 12:27:09 raasch ! Stop when system-specific fft is selected on NEC. For unknown reasons this ! causes a program abort during first allocation in init_grid. ! ! Revision 1.2 2004/04/30 11:44:27 raasch ! Module renamed from fft_for_1d_decomp to fft_xy, 1d-routines renamed to ! fft_x and fft_y, ! function FFT replaced by subroutine FFTN due to problems with 64-bit ! mode on ibm, ! shape of array cwork is explicitly stored in ishape/jshape and handled ! to routine FFTN instead of shape-function (due to compiler error on ! decalpha), ! non vectorized FFT for nec included ! ! Revision 1.1 2002/06/11 13:00:49 raasch ! Initial revision ! ! ! Description: ! ------------ ! Fast Fourier transformation along x and y for 1d domain decomposition along x. ! Original version: Klaus Ketelsen (May 2002) !------------------------------------------------------------------------------! USE control_parameters USE indices USE precision_kind USE singleton USE temperton_fft USE transpose_indices IMPLICIT NONE PRIVATE PUBLIC fft_x, fft_x_1d, fft_y, fft_y_1d, fft_init, fft_x_m, fft_y_m INTEGER, DIMENSION(:), ALLOCATABLE, SAVE :: ifax_x, ifax_y LOGICAL, SAVE :: init_fft = .FALSE. REAL, SAVE :: dnx, dny, sqr_dnx, sqr_dny REAL, DIMENSION(:), ALLOCATABLE, SAVE :: trigs_x, trigs_y #if defined( __ibm ) INTEGER, PARAMETER :: nau1 = 20000, nau2 = 22000 ! !-- The following working arrays contain tables and have to be "save" and !-- shared in OpenMP sense REAL, DIMENSION(nau1), SAVE :: aux1, auy1, aux3, auy3 #elif defined( __nec ) INTEGER, SAVE :: nz1 REAL, DIMENSION(:), ALLOCATABLE, SAVE :: trig_xb, trig_xf, trig_yb, & trig_yf #elif defined( __cuda_fft ) INTEGER, SAVE :: plan_xf, plan_xi, plan_yf, plan_yi, total_points_x_transpo, & total_points_y_transpo #endif ! !-- Public interfaces INTERFACE fft_init MODULE PROCEDURE fft_init END INTERFACE fft_init INTERFACE fft_x MODULE PROCEDURE fft_x END INTERFACE fft_x INTERFACE fft_x_1d MODULE PROCEDURE fft_x_1d END INTERFACE fft_x_1d INTERFACE fft_y MODULE PROCEDURE fft_y END INTERFACE fft_y INTERFACE fft_y_1d MODULE PROCEDURE fft_y_1d END INTERFACE fft_y_1d INTERFACE fft_x_m MODULE PROCEDURE fft_x_m END INTERFACE fft_x_m INTERFACE fft_y_m MODULE PROCEDURE fft_y_m END INTERFACE fft_y_m CONTAINS SUBROUTINE fft_init USE cuda_fft_interfaces IMPLICIT NONE ! !-- The following temporary working arrays have to be on stack or private !-- in OpenMP sense #if defined( __ibm ) REAL, DIMENSION(0:nx+2) :: workx REAL, DIMENSION(0:ny+2) :: worky REAL, DIMENSION(nau2) :: aux2, auy2, aux4, auy4 #elif defined( __nec ) REAL, DIMENSION(0:nx+3,nz+1) :: work_x REAL, DIMENSION(0:ny+3,nz+1) :: work_y REAL, DIMENSION(6*(nx+3),nz+1) :: workx REAL, DIMENSION(6*(ny+3),nz+1) :: worky #endif ! !-- Return, if already called IF ( init_fft ) THEN RETURN ELSE init_fft = .TRUE. ENDIF IF ( fft_method == 'system-specific' ) THEN dnx = 1.0 / ( nx + 1.0 ) dny = 1.0 / ( ny + 1.0 ) sqr_dnx = SQRT( dnx ) sqr_dny = SQRT( dny ) #if defined( __ibm ) && ! defined( __ibmy_special ) ! !-- Initialize tables for fft along x CALL DRCFT( 1, workx, 1, workx, 1, nx+1, 1, 1, sqr_dnx, aux1, nau1, & aux2, nau2 ) CALL DCRFT( 1, workx, 1, workx, 1, nx+1, 1, -1, sqr_dnx, aux3, nau1, & aux4, nau2 ) ! !-- Initialize tables for fft along y CALL DRCFT( 1, worky, 1, worky, 1, ny+1, 1, 1, sqr_dny, auy1, nau1, & auy2, nau2 ) CALL DCRFT( 1, worky, 1, worky, 1, ny+1, 1, -1, sqr_dny, auy3, nau1, & auy4, nau2 ) #elif defined( __nec ) message_string = 'fft method "' // TRIM( fft_method) // & '" currently does not work on NEC' CALL message( 'fft_init', 'PA0187', 1, 2, 0, 6, 0 ) ALLOCATE( trig_xb(2*(nx+1)), trig_xf(2*(nx+1)), & trig_yb(2*(ny+1)), trig_yf(2*(ny+1)) ) work_x = 0.0 work_y = 0.0 nz1 = nz + MOD( nz+1, 2 ) ! odd nz slows down fft significantly ! when using the NEC ffts ! !-- Initialize tables for fft along x (non-vector and vector case (M)) CALL DZFFT( 0, nx+1, sqr_dnx, work_x, work_x, trig_xf, workx, 0 ) CALL ZDFFT( 0, nx+1, sqr_dnx, work_x, work_x, trig_xb, workx, 0 ) CALL DZFFTM( 0, nx+1, nz1, sqr_dnx, work_x, nx+4, work_x, nx+4, & trig_xf, workx, 0 ) CALL ZDFFTM( 0, nx+1, nz1, sqr_dnx, work_x, nx+4, work_x, nx+4, & trig_xb, workx, 0 ) ! !-- Initialize tables for fft along y (non-vector and vector case (M)) CALL DZFFT( 0, ny+1, sqr_dny, work_y, work_y, trig_yf, worky, 0 ) CALL ZDFFT( 0, ny+1, sqr_dny, work_y, work_y, trig_yb, worky, 0 ) CALL DZFFTM( 0, ny+1, nz1, sqr_dny, work_y, ny+4, work_y, ny+4, & trig_yf, worky, 0 ) CALL ZDFFTM( 0, ny+1, nz1, sqr_dny, work_y, ny+4, work_y, ny+4, & trig_yb, worky, 0 ) #elif defined( __cuda_fft ) total_points_x_transpo = (nx+1) * (nyn_x-nys_x+1) * (nzt_x-nzb_x+1) total_points_y_transpo = (ny+1) * (nxr_y-nxl_y+1) * (nzt_y-nzb_y+1) CALL CUFFTPLAN1D( plan_xf, nx+1, CUFFT_D2Z, (ny+1)*nz ) CALL CUFFTPLAN1D( plan_xi, nx+1, CUFFT_Z2D, (ny+1)*nz ) CALL CUFFTPLAN1D( plan_yf, ny+1, CUFFT_D2Z, (nx+1)*nz ) CALL CUFFTPLAN1D( plan_yi, ny+1, CUFFT_Z2D, (nx+1)*nz ) #else message_string = 'no system-specific fft-call available' CALL message( 'fft_init', 'PA0188', 1, 2, 0, 6, 0 ) #endif ELSEIF ( fft_method == 'temperton-algorithm' ) THEN ! !-- Temperton-algorithm !-- Initialize tables for fft along x and y ALLOCATE( ifax_x(nx+1), ifax_y(ny+1), trigs_x(nx+1), trigs_y(ny+1) ) CALL set99( trigs_x, ifax_x, nx+1 ) CALL set99( trigs_y, ifax_y, ny+1 ) ELSEIF ( fft_method == 'singleton-algorithm' ) THEN CONTINUE ELSE message_string = 'fft method "' // TRIM( fft_method) // & '" not available' CALL message( 'fft_init', 'PA0189', 1, 2, 0, 6, 0 ) ENDIF END SUBROUTINE fft_init SUBROUTINE fft_x( ar, direction ) !----------------------------------------------------------------------! ! fft_x ! ! ! ! Fourier-transformation along x-direction ! ! Version for 2D-decomposition ! ! ! ! fft_x uses internal algorithms (Singleton or Temperton) or ! ! system-specific routines, if they are available ! !----------------------------------------------------------------------! USE cuda_fft_interfaces IMPLICIT NONE CHARACTER (LEN=*) :: direction INTEGER :: i, ishape(1), j, k, m LOGICAL :: forward_fft REAL, DIMENSION(0:nx+2) :: work REAL, DIMENSION(nx+2) :: work1 COMPLEX, DIMENSION(:), ALLOCATABLE :: cwork #if defined( __ibm ) REAL, DIMENSION(nau2) :: aux2, aux4 #elif defined( __nec ) REAL, DIMENSION(6*(nx+1)) :: work2 #elif defined( __cuda_fft ) REAL(dpk), DEVICE, DIMENSION(:), ALLOCATABLE :: cuda_a_device COMPLEX(dpk), DEVICE, DIMENSION(:), ALLOCATABLE :: cuda_b_device COMPLEX(dpk), DIMENSION(:), ALLOCATABLE :: cuda_host #endif REAL, DIMENSION(0:nx,nys_x:nyn_x,nzb_x:nzt_x) :: ar IF ( direction == 'forward' ) THEN forward_fft = .TRUE. ELSE forward_fft = .FALSE. ENDIF IF ( fft_method == 'singleton-algorithm' ) THEN ! !-- Performing the fft with singleton's software works on every system, !-- since it is part of the model ALLOCATE( cwork(0:nx) ) IF ( forward_fft ) then !$OMP PARALLEL PRIVATE ( cwork, i, ishape, j, k ) !$OMP DO DO k = nzb_x, nzt_x DO j = nys_x, nyn_x DO i = 0, nx cwork(i) = CMPLX( ar(i,j,k) ) ENDDO ishape = SHAPE( cwork ) CALL FFTN( cwork, ishape ) DO i = 0, (nx+1)/2 ar(i,j,k) = REAL( cwork(i) ) ENDDO DO i = 1, (nx+1)/2 - 1 ar(nx+1-i,j,k) = -AIMAG( cwork(i) ) ENDDO ENDDO ENDDO !$OMP END PARALLEL ELSE !$OMP PARALLEL PRIVATE ( cwork, i, ishape, j, k ) !$OMP DO DO k = nzb_x, nzt_x DO j = nys_x, nyn_x cwork(0) = CMPLX( ar(0,j,k), 0.0 ) DO i = 1, (nx+1)/2 - 1 cwork(i) = CMPLX( ar(i,j,k), -ar(nx+1-i,j,k) ) cwork(nx+1-i) = CMPLX( ar(i,j,k), ar(nx+1-i,j,k) ) ENDDO cwork((nx+1)/2) = CMPLX( ar((nx+1)/2,j,k), 0.0 ) ishape = SHAPE( cwork ) CALL FFTN( cwork, ishape, inv = .TRUE. ) DO i = 0, nx ar(i,j,k) = REAL( cwork(i) ) ENDDO ENDDO ENDDO !$OMP END PARALLEL ENDIF DEALLOCATE( cwork ) ELSEIF ( fft_method == 'temperton-algorithm' ) THEN ! !-- Performing the fft with Temperton's software works on every system, !-- since it is part of the model IF ( forward_fft ) THEN !$OMP PARALLEL PRIVATE ( work, i, j, k ) !$OMP DO DO k = nzb_x, nzt_x DO j = nys_x, nyn_x work(0:nx) = ar(0:nx,j,k) CALL fft991cy( work, work1, trigs_x, ifax_x, 1, nx+1, nx+1, 1, -1 ) DO i = 0, (nx+1)/2 ar(i,j,k) = work(2*i) ENDDO DO i = 1, (nx+1)/2 - 1 ar(nx+1-i,j,k) = work(2*i+1) ENDDO ENDDO ENDDO !$OMP END PARALLEL ELSE !$OMP PARALLEL PRIVATE ( work, i, j, k ) !$OMP DO DO k = nzb_x, nzt_x DO j = nys_x, nyn_x DO i = 0, (nx+1)/2 work(2*i) = ar(i,j,k) ENDDO DO i = 1, (nx+1)/2 - 1 work(2*i+1) = ar(nx+1-i,j,k) ENDDO work(1) = 0.0 work(nx+2) = 0.0 CALL fft991cy( work, work1, trigs_x, ifax_x, 1, nx+1, nx+1, 1, 1 ) ar(0:nx,j,k) = work(0:nx) ENDDO ENDDO !$OMP END PARALLEL ENDIF ELSEIF ( fft_method == 'system-specific' ) THEN #if defined( __ibm ) && ! defined( __ibmy_special ) IF ( forward_fft ) THEN !$OMP PARALLEL PRIVATE ( work, i, j, k ) !$OMP DO DO k = nzb_x, nzt_x DO j = nys_x, nyn_x CALL DRCFT( 0, ar, 1, work, 1, nx+1, 1, 1, sqr_dnx, aux1, nau1, & aux2, nau2 ) DO i = 0, (nx+1)/2 ar(i,j,k) = work(2*i) ENDDO DO i = 1, (nx+1)/2 - 1 ar(nx+1-i,j,k) = work(2*i+1) ENDDO ENDDO ENDDO !$OMP END PARALLEL ELSE !$OMP PARALLEL PRIVATE ( work, i, j, k ) !$OMP DO DO k = nzb_x, nzt_x DO j = nys_x, nyn_x DO i = 0, (nx+1)/2 work(2*i) = ar(i,j,k) ENDDO DO i = 1, (nx+1)/2 - 1 work(2*i+1) = ar(nx+1-i,j,k) ENDDO work(1) = 0.0 work(nx+2) = 0.0 CALL DCRFT( 0, work, 1, work, 1, nx+1, 1, -1, sqr_dnx, aux3, nau1, & aux4, nau2 ) DO i = 0, nx ar(i,j,k) = work(i) ENDDO ENDDO ENDDO !$OMP END PARALLEL ENDIF #elif defined( __nec ) IF ( forward_fft ) THEN !$OMP PARALLEL PRIVATE ( work, i, j, k ) !$OMP DO DO k = nzb_x, nzt_x DO j = nys_x, nyn_x work(0:nx) = ar(0:nx,j,k) CALL DZFFT( 1, nx+1, sqr_dnx, work, work, trig_xf, work2, 0 ) DO i = 0, (nx+1)/2 ar(i,j,k) = work(2*i) ENDDO DO i = 1, (nx+1)/2 - 1 ar(nx+1-i,j,k) = work(2*i+1) ENDDO ENDDO ENDDO !$END OMP PARALLEL ELSE !$OMP PARALLEL PRIVATE ( work, i, j, k ) !$OMP DO DO k = nzb_x, nzt_x DO j = nys_x, nyn_x DO i = 0, (nx+1)/2 work(2*i) = ar(i,j,k) ENDDO DO i = 1, (nx+1)/2 - 1 work(2*i+1) = ar(nx+1-i,j,k) ENDDO work(1) = 0.0 work(nx+2) = 0.0 CALL ZDFFT( -1, nx+1, sqr_dnx, work, work, trig_xb, work2, 0 ) ar(0:nx,j,k) = work(0:nx) ENDDO ENDDO !$OMP END PARALLEL ENDIF #elif defined( __cuda_fft ) ALLOCATE( cuda_a_device(0:total_points_x_transpo-1) ) ALLOCATE( cuda_b_device(0:((nx+1)/2+1) * (nyn_x-nys_x+1) * (nzt_x-nzb_x+1) - 1) ) ALLOCATE( cuda_host(0:((nx+1)/2+1) * (nyn_x-nys_x+1) * (nzt_x-nzb_x+1) - 1) ) m = 0 IF ( forward_fft ) THEN cuda_a_device = ar(0:total_points_x_transpo-1,nys_x,nzb_x) CALL CUFFTEXECD2Z( plan_xf, cuda_a_device, cuda_b_device ) cuda_host = cuda_b_device DO k = nzb_x, nzt_x DO j = nys_x, nyn_x DO i = 0, (nx+1)/2 ar(i,j,k) = REAL( cuda_host(m+i) ) * dnx ENDDO DO i = 1, (nx+1)/2 - 1 ar(nx+1-i,j,k) = AIMAG( cuda_host(m+i) ) * dnx ENDDO m = m + (nx+1)/2 + 1 ENDDO ENDDO ELSE DO k = nzb_x, nzt_x DO j = nys_x, nyn_x cuda_host(m) = CMPLX( ar(0,j,k), 0.0 ) DO i = 1, (nx+1)/2 - 1 cuda_host(m+i) = CMPLX( ar(i,j,k), ar(nx+1-i,j,k) ) ENDDO cuda_host(m+(nx+1)/2) = CMPLX( ar((nx+1)/2,j,k), 0.0 ) m = m + (nx+1)/2 + 1 ENDDO ENDDO cuda_b_device = cuda_host CALL CUFFTEXECZ2D( plan_xi, cuda_b_device, cuda_a_device ) ar(0:total_points_x_transpo-1,nys_x,nzb_x) = cuda_a_device ENDIF DEALLOCATE( cuda_a_device, cuda_b_device, cuda_host ) #else message_string = 'no system-specific fft-call available' CALL message( 'fft_x', 'PA0188', 1, 2, 0, 6, 0 ) #endif ELSE message_string = 'fft method "' // TRIM( fft_method) // & '" not available' CALL message( 'fft_x', 'PA0189', 1, 2, 0, 6, 0 ) ENDIF END SUBROUTINE fft_x SUBROUTINE fft_x_1d( ar, direction ) !----------------------------------------------------------------------! ! fft_x_1d ! ! ! ! Fourier-transformation along x-direction ! ! Version for 1D-decomposition ! ! ! ! fft_x uses internal algorithms (Singleton or Temperton) or ! ! system-specific routines, if they are available ! !----------------------------------------------------------------------! IMPLICIT NONE CHARACTER (LEN=*) :: direction INTEGER :: i, ishape(1) LOGICAL :: forward_fft REAL, DIMENSION(0:nx) :: ar REAL, DIMENSION(0:nx+2) :: work REAL, DIMENSION(nx+2) :: work1 COMPLEX, DIMENSION(:), ALLOCATABLE :: cwork #if defined( __ibm ) REAL, DIMENSION(nau2) :: aux2, aux4 #elif defined( __nec ) REAL, DIMENSION(6*(nx+1)) :: work2 #endif IF ( direction == 'forward' ) THEN forward_fft = .TRUE. ELSE forward_fft = .FALSE. ENDIF IF ( fft_method == 'singleton-algorithm' ) THEN ! !-- Performing the fft with singleton's software works on every system, !-- since it is part of the model ALLOCATE( cwork(0:nx) ) IF ( forward_fft ) then DO i = 0, nx cwork(i) = CMPLX( ar(i) ) ENDDO ishape = SHAPE( cwork ) CALL FFTN( cwork, ishape ) DO i = 0, (nx+1)/2 ar(i) = REAL( cwork(i) ) ENDDO DO i = 1, (nx+1)/2 - 1 ar(nx+1-i) = -AIMAG( cwork(i) ) ENDDO ELSE cwork(0) = CMPLX( ar(0), 0.0 ) DO i = 1, (nx+1)/2 - 1 cwork(i) = CMPLX( ar(i), -ar(nx+1-i) ) cwork(nx+1-i) = CMPLX( ar(i), ar(nx+1-i) ) ENDDO cwork((nx+1)/2) = CMPLX( ar((nx+1)/2), 0.0 ) ishape = SHAPE( cwork ) CALL FFTN( cwork, ishape, inv = .TRUE. ) DO i = 0, nx ar(i) = REAL( cwork(i) ) ENDDO ENDIF DEALLOCATE( cwork ) ELSEIF ( fft_method == 'temperton-algorithm' ) THEN ! !-- Performing the fft with Temperton's software works on every system, !-- since it is part of the model IF ( forward_fft ) THEN work(0:nx) = ar CALL fft991cy( work, work1, trigs_x, ifax_x, 1, nx+1, nx+1, 1, -1 ) DO i = 0, (nx+1)/2 ar(i) = work(2*i) ENDDO DO i = 1, (nx+1)/2 - 1 ar(nx+1-i) = work(2*i+1) ENDDO ELSE DO i = 0, (nx+1)/2 work(2*i) = ar(i) ENDDO DO i = 1, (nx+1)/2 - 1 work(2*i+1) = ar(nx+1-i) ENDDO work(1) = 0.0 work(nx+2) = 0.0 CALL fft991cy( work, work1, trigs_x, ifax_x, 1, nx+1, nx+1, 1, 1 ) ar = work(0:nx) ENDIF ELSEIF ( fft_method == 'system-specific' ) THEN #if defined( __ibm ) && ! defined( __ibmy_special ) IF ( forward_fft ) THEN CALL DRCFT( 0, ar, 1, work, 1, nx+1, 1, 1, sqr_dnx, aux1, nau1, & aux2, nau2 ) DO i = 0, (nx+1)/2 ar(i) = work(2*i) ENDDO DO i = 1, (nx+1)/2 - 1 ar(nx+1-i) = work(2*i+1) ENDDO ELSE DO i = 0, (nx+1)/2 work(2*i) = ar(i) ENDDO DO i = 1, (nx+1)/2 - 1 work(2*i+1) = ar(nx+1-i) ENDDO work(1) = 0.0 work(nx+2) = 0.0 CALL DCRFT( 0, work, 1, work, 1, nx+1, 1, -1, sqr_dnx, aux3, nau1, & aux4, nau2 ) DO i = 0, nx ar(i) = work(i) ENDDO ENDIF #elif defined( __nec ) IF ( forward_fft ) THEN work(0:nx) = ar(0:nx) CALL DZFFT( 1, nx+1, sqr_dnx, work, work, trig_xf, work2, 0 ) DO i = 0, (nx+1)/2 ar(i) = work(2*i) ENDDO DO i = 1, (nx+1)/2 - 1 ar(nx+1-i) = work(2*i+1) ENDDO ELSE DO i = 0, (nx+1)/2 work(2*i) = ar(i) ENDDO DO i = 1, (nx+1)/2 - 1 work(2*i+1) = ar(nx+1-i) ENDDO work(1) = 0.0 work(nx+2) = 0.0 CALL ZDFFT( -1, nx+1, sqr_dnx, work, work, trig_xb, work2, 0 ) ar(0:nx) = work(0:nx) ENDIF #else message_string = 'no system-specific fft-call available' CALL message( 'fft_x_1d', 'PA0188', 1, 2, 0, 6, 0 ) #endif ELSE message_string = 'fft method "' // TRIM( fft_method) // & '" not available' CALL message( 'fft_x_1d', 'PA0189', 1, 2, 0, 6, 0 ) ENDIF END SUBROUTINE fft_x_1d SUBROUTINE fft_y( ar, direction ) !----------------------------------------------------------------------! ! fft_y ! ! ! ! Fourier-transformation along y-direction ! ! Version for 2D-decomposition ! ! ! ! fft_y uses internal algorithms (Singleton or Temperton) or ! ! system-specific routines, if they are available ! !----------------------------------------------------------------------! USE cuda_fft_interfaces IMPLICIT NONE CHARACTER (LEN=*) :: direction INTEGER :: i, j, jshape(1), k, m LOGICAL :: forward_fft REAL, DIMENSION(0:ny+2) :: work REAL, DIMENSION(ny+2) :: work1 COMPLEX, DIMENSION(:), ALLOCATABLE :: cwork #if defined( __ibm ) REAL, DIMENSION(nau2) :: auy2, auy4 #elif defined( __nec ) REAL, DIMENSION(6*(ny+1)) :: work2 #elif defined( __cuda_fft ) REAL(dpk), DEVICE, DIMENSION(:), ALLOCATABLE :: cuda_a_device COMPLEX(dpk), DEVICE, DIMENSION(:), ALLOCATABLE :: cuda_b_device COMPLEX(dpk), DIMENSION(:), ALLOCATABLE :: cuda_host #endif REAL, DIMENSION(0:ny,nxl_y:nxr_y,nzb_y:nzt_y) :: ar IF ( direction == 'forward' ) THEN forward_fft = .TRUE. ELSE forward_fft = .FALSE. ENDIF IF ( fft_method == 'singleton-algorithm' ) THEN ! !-- Performing the fft with singleton's software works on every system, !-- since it is part of the model ALLOCATE( cwork(0:ny) ) IF ( forward_fft ) then !$OMP PARALLEL PRIVATE ( cwork, i, jshape, j, k ) !$OMP DO DO k = nzb_y, nzt_y DO i = nxl_y, nxr_y DO j = 0, ny cwork(j) = CMPLX( ar(j,i,k) ) ENDDO jshape = SHAPE( cwork ) CALL FFTN( cwork, jshape ) DO j = 0, (ny+1)/2 ar(j,i,k) = REAL( cwork(j) ) ENDDO DO j = 1, (ny+1)/2 - 1 ar(ny+1-j,i,k) = -AIMAG( cwork(j) ) ENDDO ENDDO ENDDO !$OMP END PARALLEL ELSE !$OMP PARALLEL PRIVATE ( cwork, i, jshape, j, k ) !$OMP DO DO k = nzb_y, nzt_y DO i = nxl_y, nxr_y cwork(0) = CMPLX( ar(0,i,k), 0.0 ) DO j = 1, (ny+1)/2 - 1 cwork(j) = CMPLX( ar(j,i,k), -ar(ny+1-j,i,k) ) cwork(ny+1-j) = CMPLX( ar(j,i,k), ar(ny+1-j,i,k) ) ENDDO cwork((ny+1)/2) = CMPLX( ar((ny+1)/2,i,k), 0.0 ) jshape = SHAPE( cwork ) CALL FFTN( cwork, jshape, inv = .TRUE. ) DO j = 0, ny ar(j,i,k) = REAL( cwork(j) ) ENDDO ENDDO ENDDO !$OMP END PARALLEL ENDIF DEALLOCATE( cwork ) ELSEIF ( fft_method == 'temperton-algorithm' ) THEN ! !-- Performing the fft with Temperton's software works on every system, !-- since it is part of the model IF ( forward_fft ) THEN !$OMP PARALLEL PRIVATE ( work, i, j, k ) !$OMP DO DO k = nzb_y, nzt_y DO i = nxl_y, nxr_y work(0:ny) = ar(0:ny,i,k) CALL fft991cy( work, work1, trigs_y, ifax_y, 1, ny+1, ny+1, 1, -1 ) DO j = 0, (ny+1)/2 ar(j,i,k) = work(2*j) ENDDO DO j = 1, (ny+1)/2 - 1 ar(ny+1-j,i,k) = work(2*j+1) ENDDO ENDDO ENDDO !$OMP END PARALLEL ELSE !$OMP PARALLEL PRIVATE ( work, i, j, k ) !$OMP DO DO k = nzb_y, nzt_y DO i = nxl_y, nxr_y DO j = 0, (ny+1)/2 work(2*j) = ar(j,i,k) ENDDO DO j = 1, (ny+1)/2 - 1 work(2*j+1) = ar(ny+1-j,i,k) ENDDO work(1) = 0.0 work(ny+2) = 0.0 CALL fft991cy( work, work1, trigs_y, ifax_y, 1, ny+1, ny+1, 1, 1 ) ar(0:ny,i,k) = work(0:ny) ENDDO ENDDO !$OMP END PARALLEL ENDIF ELSEIF ( fft_method == 'system-specific' ) THEN #if defined( __ibm ) && ! defined( __ibmy_special ) IF ( forward_fft) THEN !$OMP PARALLEL PRIVATE ( work, i, j, k ) !$OMP DO DO k = nzb_y, nzt_y DO i = nxl_y, nxr_y CALL DRCFT( 0, ar, 1, work, 1, ny+1, 1, 1, sqr_dny, auy1, nau1, & auy2, nau2 ) DO j = 0, (ny+1)/2 ar(j,i,k) = work(2*j) ENDDO DO j = 1, (ny+1)/2 - 1 ar(ny+1-j,i,k) = work(2*j+1) ENDDO ENDDO ENDDO !$OMP END PARALLEL ELSE !$OMP PARALLEL PRIVATE ( work, i, j, k ) !$OMP DO DO k = nzb_y, nzt_y DO i = nxl_y, nxr_y DO j = 0, (ny+1)/2 work(2*j) = ar(j,i,k) ENDDO DO j = 1, (ny+1)/2 - 1 work(2*j+1) = ar(ny+1-j,i,k) ENDDO work(1) = 0.0 work(ny+2) = 0.0 CALL DCRFT( 0, work, 1, work, 1, ny+1, 1, -1, sqr_dny, auy3, nau1, & auy4, nau2 ) DO j = 0, ny ar(j,i,k) = work(j) ENDDO ENDDO ENDDO !$OMP END PARALLEL ENDIF #elif defined( __nec ) IF ( forward_fft ) THEN !$OMP PARALLEL PRIVATE ( work, i, j, k ) !$OMP DO DO k = nzb_y, nzt_y DO i = nxl_y, nxr_y work(0:ny) = ar(0:ny,i,k) CALL DZFFT( 1, ny+1, sqr_dny, work, work, trig_yf, work2, 0 ) DO j = 0, (ny+1)/2 ar(j,i,k) = work(2*j) ENDDO DO j = 1, (ny+1)/2 - 1 ar(ny+1-j,i,k) = work(2*j+1) ENDDO ENDDO ENDDO !$END OMP PARALLEL ELSE !$OMP PARALLEL PRIVATE ( work, i, j, k ) !$OMP DO DO k = nzb_y, nzt_y DO i = nxl_y, nxr_y DO j = 0, (ny+1)/2 work(2*j) = ar(j,i,k) ENDDO DO j = 1, (ny+1)/2 - 1 work(2*j+1) = ar(ny+1-j,i,k) ENDDO work(1) = 0.0 work(ny+2) = 0.0 CALL ZDFFT( -1, ny+1, sqr_dny, work, work, trig_yb, work2, 0 ) ar(0:ny,i,k) = work(0:ny) ENDDO ENDDO !$OMP END PARALLEL ENDIF #elif defined( __cuda_fft ) ALLOCATE( cuda_a_device(0:total_points_y_transpo-1) ) ALLOCATE( cuda_b_device(0:((ny+1)/2+1) * (nxr_y-nxl_y+1) * (nzt_y-nzb_y+1) - 1) ) ALLOCATE( cuda_host(0:((ny+1)/2+1) * (nxr_y-nxl_y+1) * (nzt_y-nzb_y+1) - 1) ) m = 0 IF ( forward_fft ) THEN cuda_a_device = ar(0:total_points_y_transpo-1,nxl_y,nzb_y) CALL CUFFTEXECD2Z( plan_yf, cuda_a_device, cuda_b_device ) cuda_host = cuda_b_device DO k = nzb_y, nzt_y DO i = nxl_y, nxr_y DO j = 0, (ny+1)/2 ar(j,i,k) = REAL( cuda_host(m+j) ) * dny ENDDO DO j = 1, (ny+1)/2 - 1 ar(ny+1-j,i,k) = AIMAG( cuda_host(m+j) ) * dny ENDDO m = m + (ny+1)/2 + 1 ENDDO ENDDO ELSE DO k = nzb_y, nzt_y DO i = nxl_y, nxr_y cuda_host(m) = CMPLX( ar(0,i,k), 0.0 ) DO j = 1, (ny+1)/2 - 1 cuda_host(m+j) = CMPLX( ar(j,i,k), ar(ny+1-j,i,k) ) ENDDO cuda_host(m+(ny+1)/2) = CMPLX( ar((ny+1)/2,i,k), 0.0 ) m = m + (ny+1)/2 + 1 ENDDO ENDDO cuda_b_device = cuda_host CALL CUFFTEXECZ2D( plan_yi, cuda_b_device, cuda_a_device ) ar(0:total_points_y_transpo-1,nxl_y,nzb_y) = cuda_a_device ENDIF DEALLOCATE( cuda_a_device, cuda_b_device, cuda_host ) #else message_string = 'no system-specific fft-call available' CALL message( 'fft_y', 'PA0188', 1, 2, 0, 6, 0 ) #endif ELSE message_string = 'fft method "' // TRIM( fft_method) // & '" not available' CALL message( 'fft_y', 'PA0189', 1, 2, 0, 6, 0 ) ENDIF END SUBROUTINE fft_y SUBROUTINE fft_y_1d( ar, direction ) !----------------------------------------------------------------------! ! fft_y_1d ! ! ! ! Fourier-transformation along y-direction ! ! Version for 1D-decomposition ! ! ! ! fft_y uses internal algorithms (Singleton or Temperton) or ! ! system-specific routines, if they are available ! !----------------------------------------------------------------------! IMPLICIT NONE CHARACTER (LEN=*) :: direction INTEGER :: j, jshape(1) LOGICAL :: forward_fft REAL, DIMENSION(0:ny) :: ar REAL, DIMENSION(0:ny+2) :: work REAL, DIMENSION(ny+2) :: work1 COMPLEX, DIMENSION(:), ALLOCATABLE :: cwork #if defined( __ibm ) REAL, DIMENSION(nau2) :: auy2, auy4 #elif defined( __nec ) REAL, DIMENSION(6*(ny+1)) :: work2 #endif IF ( direction == 'forward' ) THEN forward_fft = .TRUE. ELSE forward_fft = .FALSE. ENDIF IF ( fft_method == 'singleton-algorithm' ) THEN ! !-- Performing the fft with singleton's software works on every system, !-- since it is part of the model ALLOCATE( cwork(0:ny) ) IF ( forward_fft ) THEN DO j = 0, ny cwork(j) = CMPLX( ar(j) ) ENDDO jshape = SHAPE( cwork ) CALL FFTN( cwork, jshape ) DO j = 0, (ny+1)/2 ar(j) = REAL( cwork(j) ) ENDDO DO j = 1, (ny+1)/2 - 1 ar(ny+1-j) = -AIMAG( cwork(j) ) ENDDO ELSE cwork(0) = CMPLX( ar(0), 0.0 ) DO j = 1, (ny+1)/2 - 1 cwork(j) = CMPLX( ar(j), -ar(ny+1-j) ) cwork(ny+1-j) = CMPLX( ar(j), ar(ny+1-j) ) ENDDO cwork((ny+1)/2) = CMPLX( ar((ny+1)/2), 0.0 ) jshape = SHAPE( cwork ) CALL FFTN( cwork, jshape, inv = .TRUE. ) DO j = 0, ny ar(j) = REAL( cwork(j) ) ENDDO ENDIF DEALLOCATE( cwork ) ELSEIF ( fft_method == 'temperton-algorithm' ) THEN ! !-- Performing the fft with Temperton's software works on every system, !-- since it is part of the model IF ( forward_fft ) THEN work(0:ny) = ar CALL fft991cy( work, work1, trigs_y, ifax_y, 1, ny+1, ny+1, 1, -1 ) DO j = 0, (ny+1)/2 ar(j) = work(2*j) ENDDO DO j = 1, (ny+1)/2 - 1 ar(ny+1-j) = work(2*j+1) ENDDO ELSE DO j = 0, (ny+1)/2 work(2*j) = ar(j) ENDDO DO j = 1, (ny+1)/2 - 1 work(2*j+1) = ar(ny+1-j) ENDDO work(1) = 0.0 work(ny+2) = 0.0 CALL fft991cy( work, work1, trigs_y, ifax_y, 1, ny+1, ny+1, 1, 1 ) ar = work(0:ny) ENDIF ELSEIF ( fft_method == 'system-specific' ) THEN #if defined( __ibm ) && ! defined( __ibmy_special ) IF ( forward_fft ) THEN CALL DRCFT( 0, ar, 1, work, 1, ny+1, 1, 1, sqr_dny, auy1, nau1, & auy2, nau2 ) DO j = 0, (ny+1)/2 ar(j) = work(2*j) ENDDO DO j = 1, (ny+1)/2 - 1 ar(ny+1-j) = work(2*j+1) ENDDO ELSE DO j = 0, (ny+1)/2 work(2*j) = ar(j) ENDDO DO j = 1, (ny+1)/2 - 1 work(2*j+1) = ar(ny+1-j) ENDDO work(1) = 0.0 work(ny+2) = 0.0 CALL DCRFT( 0, work, 1, work, 1, ny+1, 1, -1, sqr_dny, auy3, nau1, & auy4, nau2 ) DO j = 0, ny ar(j) = work(j) ENDDO ENDIF #elif defined( __nec ) IF ( forward_fft ) THEN work(0:ny) = ar(0:ny) CALL DZFFT( 1, ny+1, sqr_dny, work, work, trig_yf, work2, 0 ) DO j = 0, (ny+1)/2 ar(j) = work(2*j) ENDDO DO j = 1, (ny+1)/2 - 1 ar(ny+1-j) = work(2*j+1) ENDDO ELSE DO j = 0, (ny+1)/2 work(2*j) = ar(j) ENDDO DO j = 1, (ny+1)/2 - 1 work(2*j+1) = ar(ny+1-j) ENDDO work(1) = 0.0 work(ny+2) = 0.0 CALL ZDFFT( -1, ny+1, sqr_dny, work, work, trig_yb, work2, 0 ) ar(0:ny) = work(0:ny) ENDIF #else message_string = 'no system-specific fft-call available' CALL message( 'fft_y_1d', 'PA0188', 1, 2, 0, 6, 0 ) #endif ELSE message_string = 'fft method "' // TRIM( fft_method) // & '" not available' CALL message( 'fft_y_1d', 'PA0189', 1, 2, 0, 6, 0 ) ENDIF END SUBROUTINE fft_y_1d SUBROUTINE fft_x_m( ar, direction ) !----------------------------------------------------------------------! ! fft_x_m ! ! ! ! Fourier-transformation along x-direction ! ! Version for 1d domain decomposition ! ! using multiple 1D FFT from Math Keisan on NEC ! ! or Temperton-algorithm ! ! (no singleton-algorithm on NEC because it does not vectorize) ! ! ! !----------------------------------------------------------------------! IMPLICIT NONE CHARACTER (LEN=*) :: direction INTEGER :: i, k, siza REAL, DIMENSION(0:nx,nz) :: ar REAL, DIMENSION(0:nx+3,nz+1) :: ai REAL, DIMENSION(6*(nx+4),nz+1) :: work1 #if defined( __nec ) INTEGER :: sizw COMPLEX, DIMENSION((nx+4)/2+1,nz+1) :: work #endif IF ( fft_method == 'temperton-algorithm' ) THEN siza = SIZE( ai, 1 ) IF ( direction == 'forward') THEN ai(0:nx,1:nz) = ar(0:nx,1:nz) ai(nx+1:,:) = 0.0 CALL fft991cy( ai, work1, trigs_x, ifax_x, 1, siza, nx+1, nz, -1 ) DO k = 1, nz DO i = 0, (nx+1)/2 ar(i,k) = ai(2*i,k) ENDDO DO i = 1, (nx+1)/2 - 1 ar(nx+1-i,k) = ai(2*i+1,k) ENDDO ENDDO ELSE DO k = 1, nz DO i = 0, (nx+1)/2 ai(2*i,k) = ar(i,k) ENDDO DO i = 1, (nx+1)/2 - 1 ai(2*i+1,k) = ar(nx+1-i,k) ENDDO ai(1,k) = 0.0 ai(nx+2,k) = 0.0 ENDDO CALL fft991cy( ai, work1, trigs_x, ifax_x, 1, siza, nx+1, nz, 1 ) ar(0:nx,1:nz) = ai(0:nx,1:nz) ENDIF ELSEIF ( fft_method == 'system-specific' ) THEN #if defined( __nec ) siza = SIZE( ai, 1 ) sizw = SIZE( work, 1 ) IF ( direction == 'forward') THEN ! !-- Tables are initialized once more. This call should not be !-- necessary, but otherwise program aborts in asymmetric case CALL DZFFTM( 0, nx+1, nz1, sqr_dnx, work, nx+4, work, nx+4, & trig_xf, work1, 0 ) ai(0:nx,1:nz) = ar(0:nx,1:nz) IF ( nz1 > nz ) THEN ai(:,nz1) = 0.0 ENDIF CALL DZFFTM( 1, nx+1, nz1, sqr_dnx, ai, siza, work, sizw, & trig_xf, work1, 0 ) DO k = 1, nz DO i = 0, (nx+1)/2 ar(i,k) = REAL( work(i+1,k) ) ENDDO DO i = 1, (nx+1)/2 - 1 ar(nx+1-i,k) = AIMAG( work(i+1,k) ) ENDDO ENDDO ELSE ! !-- Tables are initialized once more. This call should not be !-- necessary, but otherwise program aborts in asymmetric case CALL ZDFFTM( 0, nx+1, nz1, sqr_dnx, work, nx+4, work, nx+4, & trig_xb, work1, 0 ) IF ( nz1 > nz ) THEN work(:,nz1) = 0.0 ENDIF DO k = 1, nz work(1,k) = CMPLX( ar(0,k), 0.0 ) DO i = 1, (nx+1)/2 - 1 work(i+1,k) = CMPLX( ar(i,k), ar(nx+1-i,k) ) ENDDO work(((nx+1)/2)+1,k) = CMPLX( ar((nx+1)/2,k), 0.0 ) ENDDO CALL ZDFFTM( -1, nx+1, nz1, sqr_dnx, work, sizw, ai, siza, & trig_xb, work1, 0 ) ar(0:nx,1:nz) = ai(0:nx,1:nz) ENDIF #else message_string = 'no system-specific fft-call available' CALL message( 'fft_x_m', 'PA0188', 1, 2, 0, 6, 0 ) #endif ELSE message_string = 'fft method "' // TRIM( fft_method) // & '" not available' CALL message( 'fft_x_m', 'PA0189', 1, 2, 0, 6, 0 ) ENDIF END SUBROUTINE fft_x_m SUBROUTINE fft_y_m( ar, ny1, direction ) !----------------------------------------------------------------------! ! fft_y_m ! ! ! ! Fourier-transformation along y-direction ! ! Version for 1d domain decomposition ! ! using multiple 1D FFT from Math Keisan on NEC ! ! or Temperton-algorithm ! ! (no singleton-algorithm on NEC because it does not vectorize) ! ! ! !----------------------------------------------------------------------! IMPLICIT NONE CHARACTER (LEN=*) :: direction INTEGER :: j, k, ny1, siza REAL, DIMENSION(0:ny1,nz) :: ar REAL, DIMENSION(0:ny+3,nz+1) :: ai REAL, DIMENSION(6*(ny+4),nz+1) :: work1 #if defined( __nec ) INTEGER :: sizw COMPLEX, DIMENSION((ny+4)/2+1,nz+1) :: work #endif IF ( fft_method == 'temperton-algorithm' ) THEN siza = SIZE( ai, 1 ) IF ( direction == 'forward') THEN ai(0:ny,1:nz) = ar(0:ny,1:nz) ai(ny+1:,:) = 0.0 CALL fft991cy( ai, work1, trigs_y, ifax_y, 1, siza, ny+1, nz, -1 ) DO k = 1, nz DO j = 0, (ny+1)/2 ar(j,k) = ai(2*j,k) ENDDO DO j = 1, (ny+1)/2 - 1 ar(ny+1-j,k) = ai(2*j+1,k) ENDDO ENDDO ELSE DO k = 1, nz DO j = 0, (ny+1)/2 ai(2*j,k) = ar(j,k) ENDDO DO j = 1, (ny+1)/2 - 1 ai(2*j+1,k) = ar(ny+1-j,k) ENDDO ai(1,k) = 0.0 ai(ny+2,k) = 0.0 ENDDO CALL fft991cy( ai, work1, trigs_y, ifax_y, 1, siza, ny+1, nz, 1 ) ar(0:ny,1:nz) = ai(0:ny,1:nz) ENDIF ELSEIF ( fft_method == 'system-specific' ) THEN #if defined( __nec ) siza = SIZE( ai, 1 ) sizw = SIZE( work, 1 ) IF ( direction == 'forward') THEN ! !-- Tables are initialized once more. This call should not be !-- necessary, but otherwise program aborts in asymmetric case CALL DZFFTM( 0, ny+1, nz1, sqr_dny, work, ny+4, work, ny+4, & trig_yf, work1, 0 ) ai(0:ny,1:nz) = ar(0:ny,1:nz) IF ( nz1 > nz ) THEN ai(:,nz1) = 0.0 ENDIF CALL DZFFTM( 1, ny+1, nz1, sqr_dny, ai, siza, work, sizw, & trig_yf, work1, 0 ) DO k = 1, nz DO j = 0, (ny+1)/2 ar(j,k) = REAL( work(j+1,k) ) ENDDO DO j = 1, (ny+1)/2 - 1 ar(ny+1-j,k) = AIMAG( work(j+1,k) ) ENDDO ENDDO ELSE ! !-- Tables are initialized once more. This call should not be !-- necessary, but otherwise program aborts in asymmetric case CALL ZDFFTM( 0, ny+1, nz1, sqr_dny, work, ny+4, work, ny+4, & trig_yb, work1, 0 ) IF ( nz1 > nz ) THEN work(:,nz1) = 0.0 ENDIF DO k = 1, nz work(1,k) = CMPLX( ar(0,k), 0.0 ) DO j = 1, (ny+1)/2 - 1 work(j+1,k) = CMPLX( ar(j,k), ar(ny+1-j,k) ) ENDDO work(((ny+1)/2)+1,k) = CMPLX( ar((ny+1)/2,k), 0.0 ) ENDDO CALL ZDFFTM( -1, ny+1, nz1, sqr_dny, work, sizw, ai, siza, & trig_yb, work1, 0 ) ar(0:ny,1:nz) = ai(0:ny,1:nz) ENDIF #else message_string = 'no system-specific fft-call available' CALL message( 'fft_y_m', 'PA0188', 1, 2, 0, 6, 0 ) #endif ELSE message_string = 'fft method "' // TRIM( fft_method) // & '" not available' CALL message( 'fft_x_m', 'PA0189', 1, 2, 0, 6, 0 ) ENDIF END SUBROUTINE fft_y_m END MODULE fft_xy