Home

Context Navigation

source: palm/trunk/SOURCE/poisfft.f90 @ 1208

Last change on this file since 1208 was 1208, checked in by raasch, 12 years ago
acc-update clauses added for "ar" so that ffts other than cufft can also be used
Property svn:keywords set to `Id`
File size: 46.8 KB

Rev	Line
[1]	1	MODULE poisfft_mod
	2
[1036]	3	!--------------------------------------------------------------------------------!
	4	! This file is part of PALM.
	5	!
	6	! PALM is free software: you can redistribute it and/or modify it under the terms
	7	! of the GNU General Public License as published by the Free Software Foundation,
	8	! either version 3 of the License, or (at your option) any later version.
	9	!
	10	! PALM is distributed in the hope that it will be useful, but WITHOUT ANY
	11	! WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
	12	! A PARTICULAR PURPOSE. See the GNU General Public License for more details.
	13	!
	14	! You should have received a copy of the GNU General Public License along with
	15	! PALM. If not, see <http://www.gnu.org/licenses/>.
	16	!
	17	! Copyright 1997-2012 Leibniz University Hannover
	18	!--------------------------------------------------------------------------------!
	19	!
[484]	20	! Current revisions:
[1]	21	! -----------------
[1208]	22	! acc-update clauses added for "ar" so that ffts other than cufft can also be
	23	! used (although they are not ported and will give a poor performance)
[1112]	24	!
	25	! Former revisions:
	26	! -----------------
	27	! $Id: poisfft.f90 1208 2013-08-13 06:41:49Z raasch $
	28	!
	29	! 1111 2013-03-08 23:54:10Z raasch
[1111]	30	! further openACC porting of non-parallel (MPI) branch:
	31	! tridiagonal routines split into external subroutines (instead of using CONTAINS),
	32	! no distinction between parallel/non-parallel in poisfft and tridia any more,
[1112]	33	! tridia routines moved to end of file because of probable bug in PGI compiler 12.5
[1111]	34	! (otherwise "invalid device function" is indicated during runtime),
	35	! optimization of tridia routines: constant elements and coefficients of tri are
	36	! stored in separate arrays ddzuw and tric, last dimension of tri reduced from 5
	37	! to 2,
	38	! poisfft_init is now called internally from poisfft, maketri is called from
	39	! poisfft_init,
	40	! ibc_p_b = 2 removed
[1]	41	!
[1107]	42	! 1106 2013-03-04 05:31:38Z raasch
	43	! routines fftx, ffty, fftxp, fftyp removed, calls replaced by fft_x, fft_y,
	44	! in the 1D-decomposition routines fft_x, ffty are replaced by fft_x_1d,
	45	! fft_y_1d
	46	!
[1104]	47	! 1103 2013-02-20 02:15:53Z raasch
	48	! tri, ar, and ar1 arguments in tridia-routines (2d) are removed because they
	49	! sometimes cause segmentation faults with intel 12.1 compiler
	50	!
[1093]	51	! 1092 2013-02-02 11:24:22Z raasch
	52	! unused variables removed
	53	!
[1037]	54	! 1036 2012-10-22 13:43:42Z raasch
	55	! code put under GPL (PALM 3.9)
	56	!
[1014]	57	! 2012-09-21 07:03:55Z raasch
	58	! FLOAT type conversion replaced by REAL
	59	!
[1004]	60	! 1003 2012-09-14 14:35:53Z raasch
	61	! indices nxa, nya, etc. replaced by nx, ny, etc.
	62	!
[941]	63	! 940 2012-07-09 14:31:00Z raasch
	64	! special handling of tri-array as an argument in tridia_1dd routines switched
	65	! off because it caused segmentation faults with intel 12.1 compiler
	66	!
[878]	67	! 877 2012-04-03 11:21:44Z suehring
	68	! Bugfix: Avoid divisions by zero in case of using a 'neumann' bc for the
	69	! pressure at the top of the model domain.
	70	!
[810]	71	! 809 2012-01-30 13:32:58Z maronga
	72	! Bugfix: replaced .AND. and .NOT. with && and ! in the preprocessor directives
	73	!
[808]	74	! 807 2012-01-25 11:53:51Z maronga
	75	! New cpp directive "__check" implemented which is used by check_namelist_files
	76	! (most of the code is unneeded by check_namelist_files).
	77	!
[764]	78	! 763 2011-10-06 09:32:09Z suehring
	79	! Comment added concerning the last change.
	80	!
[762]	81	! 761 2011-10-05 17:58:52Z suehring
	82	! Bugfix: Avoid divisions by zero in case of using a 'neumann' bc for the
	83	! pressure at the top of the model domain.
	84	!
[697]	85	! 696 2011-03-18 07:03:49Z raasch
	86	! work_fftx removed from PRIVATE clauses in fftx_tr_xy and tr_yx_fftx
	87	!
[684]	88	! 683 2011-02-09 14:25:15Z raasch
	89	! openMP parallelization for 2d-domain-decomposition
	90	!
[668]	91	! 667 2010-12-23 12:06:00Z suehring/gryschka
	92	! ddzu replaced by ddzu_pres due to changes in zu(0)
	93	!
[623]	94	! 622 2010-12-10 08:08:13Z raasch
	95	! optional barriers included in order to speed up collective operations
	96	!
[392]	97	! 377 2009-09-04 11:09:00Z raasch
	98	! __lcmuk changed to __lc to avoid problems with Intel compiler on sgi-ice
	99	!
[198]	100	! 164 2008-05-15 08:46:15Z raasch
	101	! Arguments removed from transpose routines
	102	!
[139]	103	! 128 2007-10-26 13:11:14Z raasch
	104	! Bugfix: wavenumber calculation for even nx in routines maketri
	105	!
[90]	106	! 85 2007-05-11 09:35:14Z raasch
	107	! Bugfix: work_fft*_vec removed from some PRIVATE-declarations
	108	!
[77]	109	! 76 2007-03-29 00:58:32Z raasch
	110	! Tridiagonal coefficients adjusted for Neumann boundary conditions both at
	111	! the bottom and the top.
	112	!
[3]	113	! RCS Log replaced by Id keyword, revision history cleaned up
	114	!
[1]	115	! Revision 1.24 2006/08/04 15:00:24 raasch
	116	! Default setting of the thread number tn in case of not using OpenMP
	117	!
	118	! Revision 1.23 2006/02/23 12:48:38 raasch
	119	! Additional compiler directive in routine tridia_1dd for preventing loop
	120	! exchange on NEC-SX6
	121	!
	122	! Revision 1.20 2004/04/30 12:38:09 raasch
	123	! Parts of former poisfft_hybrid moved to this subroutine,
	124	! former subroutine changed to a module, renaming of FFT-subroutines and
	125	! -module, FFTs completely substituted by calls of fft_x and fft_y,
	126	! NAG fft used in the non-parallel case completely removed, l in maketri
	127	! is now a 1d-array, variables passed by modules instead of using parameter
	128	! lists, enlarged transposition arrays introduced
	129	!
	130	! Revision 1.1 1997/07/24 11:24:14 raasch
	131	! Initial revision
	132	!
	133	!
	134	! Description:
	135	! ------------
	136	! See below.
	137	!------------------------------------------------------------------------------!
	138
	139	!--------------------------------------------------------------------------!
	140	! poisfft !
	141	! !
	142	! Original version: Stephan Siano (pois3d) !
	143	! !
	144	! Institute of Meteorology and Climatology, University of Hannover !
	145	! Germany !
	146	! !
	147	! Version as of July 23,1996 !
	148	! !
	149	! !
	150	! Version for parallel computers: Siegfried Raasch !
	151	! !
	152	! Version as of July 03,1997 !
	153	! !
	154	! Solves the Poisson equation with a 2D spectral method !
	155	! d^2 p / dx^2 + d^2 p / dy^2 + d^2 p / dz^2 = s !
	156	! !
	157	! Input: !
	158	! real ar contains in the (nnx,nny,nnz) elements, !
	159	! starting from the element (1,nys,nxl), the !
	160	! values for s !
	161	! real work Temporary array !
	162	! !
	163	! Output: !
	164	! real ar contains the solution for p !
	165	!--------------------------------------------------------------------------!
	166
	167	USE fft_xy
	168	USE indices
	169	USE transpose_indices
	170
	171	IMPLICIT NONE
	172
[1111]	173	LOGICAL, SAVE :: poisfft_initialized = .FALSE.
	174
	175	REAL, DIMENSION(:,:), ALLOCATABLE :: ddzuw
	176
[1]	177	PRIVATE
[807]	178
[809]	179	#if ! defined ( __check )
[1]	180	PUBLIC poisfft, poisfft_init
	181
	182	INTERFACE poisfft
	183	MODULE PROCEDURE poisfft
	184	END INTERFACE poisfft
	185
	186	INTERFACE poisfft_init
	187	MODULE PROCEDURE poisfft_init
	188	END INTERFACE poisfft_init
[807]	189	#else
	190	PUBLIC poisfft_init
[1]	191
[807]	192	INTERFACE poisfft_init
	193	MODULE PROCEDURE poisfft_init
	194	END INTERFACE poisfft_init
	195	#endif
	196
[1]	197	CONTAINS
	198
	199	SUBROUTINE poisfft_init
	200
[1111]	201	USE arrays_3d, ONLY: ddzu_pres, ddzw
	202
	203	IMPLICIT NONE
	204
	205	INTEGER :: k
	206
	207
[1]	208	CALL fft_init
	209
[1111]	210	ALLOCATE( ddzuw(0:nz-1,3) )
	211
	212	DO k = 0, nz-1
	213	ddzuw(k,1) = ddzu_pres(k+1) * ddzw(k+1)
	214	ddzuw(k,2) = ddzu_pres(k+2) * ddzw(k+1)
	215	ddzuw(k,3) = -1.0 * &
	216	( ddzu_pres(k+2) * ddzw(k+1) + ddzu_pres(k+1) * ddzw(k+1) )
	217	ENDDO
	218	!
	219	!-- Calculate constant coefficients of the tridiagonal matrix
	220	#if ! defined ( __check )
	221	CALL maketri
	222	#endif
	223
	224	poisfft_initialized = .TRUE.
	225
[1]	226	END SUBROUTINE poisfft_init
	227
[1111]	228
[809]	229	#if ! defined ( __check )
subroutine poisfft( ar, work )

!------------------------------------------------------------------------------!
! Entry point of the FFT-based Poisson solver.
!
! ar   : on entry the right-hand side s, on exit the solution p (see the
!        description in the module header)
! work : temporary array that is handed to the transposition routines
!
! Depending on the processor grid pdims one of three paths is taken:
!   - 1d-decomposition along x
!   - 1d-decomposition along y
!   - 2d-decomposition or no decomposition at all (1 PE run)
! In the last path the array ar is updated on the host before and on the
! device after every FFT call, unless fft_method is 'system-specific'.
!------------------------------------------------------------------------------!

   use control_parameters, only : fft_method
   use cpulog
   use interfaces
   use pegrid

   implicit none

   real, dimension(1:nz,nys:nyn,nxl:nxr) ::  ar, work

   logical ::  acc_sync_needed, split_x_only, split_y_only


   call cpu_log( log_point_s(3), 'poisfft', 'start' )

!
!-- Lazy initialization on first use
   if ( .not. poisfft_initialized )  call poisfft_init

!
!-- Classify the domain decomposition and the FFT method once
   split_x_only    = ( pdims(1) > 1  .and.  pdims(2) == 1 )
   split_y_only    = ( pdims(2) > 1  .and.  pdims(1) == 1 )
   acc_sync_needed = ( fft_method /= 'system-specific' )

   if ( split_x_only )  then
!
!--   1d-domain-decomposition along x:
!--   FFT along y and transposition y --> x,
      call ffty_tr_yx( ar, work, ar )
!
!--   FFT along x, solution of the tridiagonal system, backward FFT along x,
      call fftx_tri_fftx( ar )
!
!--   transposition x --> y and backward FFT along y
      call tr_xy_ffty( ar, work, ar )

   elseif ( split_y_only )  then
!
!--   1d-domain-decomposition along y:
!--   FFT along x and transposition x --> y,
      call fftx_tr_xy( ar, work, ar )
!
!--   FFT along y, solution of the tridiagonal system, backward FFT along y,
      call ffty_tri_ffty( ar )
!
!--   transposition y --> x and backward FFT along x
      call tr_yx_fftx( ar, work, ar )

   else
!
!--   2d-domain-decomposition or no decomposition (1 PE run).
!--   Forward part: transposition z --> x
      call cpu_log( log_point_s(5), 'transpo forward', 'start' )
      call transpose_zx( ar, work, ar )
      call cpu_log( log_point_s(5), 'transpo forward', 'pause' )

!
!--   Forward FFT along x
      call cpu_log( log_point_s(4), 'fft_x', 'start' )
      if ( acc_sync_needed )  then
         !$acc update host( ar )
      endif
      call fft_x( ar, 'forward' )
      if ( acc_sync_needed )  then
         !$acc update device( ar )
      endif
      call cpu_log( log_point_s(4), 'fft_x', 'pause' )

!
!--   Transposition x --> y
      call cpu_log( log_point_s(5), 'transpo forward', 'continue' )
      call transpose_xy( ar, work, ar )
      call cpu_log( log_point_s(5), 'transpo forward', 'pause' )

!
!--   Forward FFT along y
      call cpu_log( log_point_s(7), 'fft_y', 'start' )
      if ( acc_sync_needed )  then
         !$acc update host( ar )
      endif
      call fft_y( ar, 'forward' )
      if ( acc_sync_needed )  then
         !$acc update device( ar )
      endif
      call cpu_log( log_point_s(7), 'fft_y', 'pause' )

!
!--   Transposition y --> z
      call cpu_log( log_point_s(5), 'transpo forward', 'continue' )
      call transpose_yz( ar, work, ar )
      call cpu_log( log_point_s(5), 'transpo forward', 'stop' )

!
!--   Solution of the tridiagonal equation system along z
      call cpu_log( log_point_s(6), 'tridia', 'start' )
      call tridia( ar )
      call cpu_log( log_point_s(6), 'tridia', 'stop' )

!
!--   Inverse part: transposition z --> y
      call cpu_log( log_point_s(8), 'transpo invers', 'start' )
      call transpose_zy( ar, work, ar )
      call cpu_log( log_point_s(8), 'transpo invers', 'pause' )

!
!--   Backward FFT along y
      call cpu_log( log_point_s(7), 'fft_y', 'continue' )
      if ( acc_sync_needed )  then
         !$acc update host( ar )
      endif
      call fft_y( ar, 'backward' )
      if ( acc_sync_needed )  then
         !$acc update device( ar )
      endif
      call cpu_log( log_point_s(7), 'fft_y', 'stop' )

!
!--   Transposition y --> x
      call cpu_log( log_point_s(8), 'transpo invers', 'continue' )
      call transpose_yx( ar, work, ar )
      call cpu_log( log_point_s(8), 'transpo invers', 'pause' )

!
!--   Backward FFT along x
      call cpu_log( log_point_s(4), 'fft_x', 'continue' )
      if ( acc_sync_needed )  then
         !$acc update host( ar )
      endif
      call fft_x( ar, 'backward' )
      if ( acc_sync_needed )  then
         !$acc update device( ar )
      endif
      call cpu_log( log_point_s(4), 'fft_x', 'stop' )

!
!--   Transposition x --> z
      call cpu_log( log_point_s(8), 'transpo invers', 'continue' )
      call transpose_xz( ar, work, ar )
      call cpu_log( log_point_s(8), 'transpo invers', 'stop' )

   endif

   call cpu_log( log_point_s(3), 'poisfft', 'stop' )

end subroutine poisfft
	369
	370
	371
	372	SUBROUTINE ffty_tr_yx( f_in, work, f_out )
	373
	374	!------------------------------------------------------------------------------!
	375	! Fourier-transformation along y with subsequent transposition y --> x for
	376	! a 1d-decomposition along x
	377	!
	378	! ATTENTION: The performance of this routine is much faster on the NEC-SX6,
	379	! if the first index of work_ffty_vec is odd. Otherwise
	380	! memory bank conflicts may occur (especially if the index is a
	381	! multiple of 128). That's why work_ffty_vec is dimensioned as
	382	! 0:ny+1.
	383	! Of course, this will not work if users are using an odd number
	384	! of gridpoints along y.
	385	!------------------------------------------------------------------------------!
	386
	387	USE control_parameters
	388	USE cpulog
	389	USE indices
	390	USE interfaces
	391	USE pegrid
	392	USE transpose_indices
	393
	394	IMPLICIT NONE
	395
	396	INTEGER :: i, iend, iouter, ir, j, k
	397	INTEGER, PARAMETER :: stridex = 4
	398
	399	REAL, DIMENSION(0:ny,stridex) :: work_ffty
	400	#if defined( __nec )
	401	REAL, DIMENSION(0:ny+1,1:nz,nxl:nxr) :: work_ffty_vec
	402	#endif
[1003]	403	REAL, DIMENSION(1:nz,0:ny,nxl:nxr) :: f_in
	404	REAL, DIMENSION(nnx,1:nz,nys_x:nyn_x,pdims(1)) :: f_out
	405	REAL, DIMENSION(nxl:nxr,1:nz,0:ny) :: work
[1]	406
	407	!
	408	!-- Carry out the FFT along y, where all data are present due to the
	409	!-- 1d-decomposition along x. Resort the data in a way that x becomes
	410	!-- the first index.
[1106]	411	CALL cpu_log( log_point_s(7), 'fft_y_1d', 'start' )
[1]	412
	413	IF ( host(1:3) == 'nec' ) THEN
	414	#if defined( __nec )
	415	!
	416	!-- Code optimized for vector processors
[85]	417	!$OMP PARALLEL PRIVATE ( i, j, k )
[1]	418	!$OMP DO
	419	DO i = nxl, nxr
	420
	421	DO j = 0, ny
	422	DO k = 1, nz
	423	work_ffty_vec(j,k,i) = f_in(k,j,i)
	424	ENDDO
	425	ENDDO
	426
	427	CALL fft_y_m( work_ffty_vec(:,:,i), ny+1, 'forward' )
	428
	429	ENDDO
	430
	431	!$OMP DO
	432	DO k = 1, nz
	433	DO j = 0, ny
	434	DO i = nxl, nxr
	435	work(i,k,j) = work_ffty_vec(j,k,i)
	436	ENDDO
	437	ENDDO
	438	ENDDO
	439	!$OMP END PARALLEL
	440	#endif
	441
	442	ELSE
	443
	444	!
	445	!-- Cache optimized code.
	446	!-- The i-(x-)direction is split into a strided outer loop and an inner
	447	!-- loop for better cache performance
	448	!$OMP PARALLEL PRIVATE (i,iend,iouter,ir,j,k,work_ffty)
	449	!$OMP DO
	450	DO iouter = nxl, nxr, stridex
	451
	452	iend = MIN( iouter+stridex-1, nxr ) ! Upper bound for inner i loop
	453
	454	DO k = 1, nz
	455
	456	DO i = iouter, iend
	457
	458	ir = i-iouter+1 ! counter within a stride
	459	DO j = 0, ny
	460	work_ffty(j,ir) = f_in(k,j,i)
	461	ENDDO
	462	!
	463	!-- FFT along y
[1106]	464	CALL fft_y_1d( work_ffty(:,ir), 'forward' )
[1]	465
	466	ENDDO
	467
	468	!
	469	!-- Resort
	470	DO j = 0, ny
	471	DO i = iouter, iend
	472	work(i,k,j) = work_ffty(j,i-iouter+1)
	473	ENDDO
	474	ENDDO
	475
	476	ENDDO
	477
	478	ENDDO
	479	!$OMP END PARALLEL
	480
	481	ENDIF
[1106]	482	CALL cpu_log( log_point_s(7), 'fft_y_1d', 'pause' )
[1]	483
	484	!
	485	!-- Transpose array
[1111]	486	#if defined( __parallel )
[1]	487	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
[622]	488	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
[1]	489	CALL MPI_ALLTOALL( work(nxl,1,0), sendrecvcount_xy, MPI_REAL, &
	490	f_out(1,1,nys_x,1), sendrecvcount_xy, MPI_REAL, &
	491	comm1dx, ierr )
	492	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
[1111]	493	#endif
[1]	494
	495	END SUBROUTINE ffty_tr_yx
	496
	497
	498	SUBROUTINE tr_xy_ffty( f_in, work, f_out )
	499
	500	!------------------------------------------------------------------------------!
	501	! Transposition x --> y with a subsequent backward Fourier transformation for
	502	! a 1d-decomposition along x
	503	!------------------------------------------------------------------------------!
	504
	505	USE control_parameters
	506	USE cpulog
	507	USE indices
	508	USE interfaces
	509	USE pegrid
	510	USE transpose_indices
	511
	512	IMPLICIT NONE
	513
	514	INTEGER :: i, iend, iouter, ir, j, k
	515	INTEGER, PARAMETER :: stridex = 4
	516
	517	REAL, DIMENSION(0:ny,stridex) :: work_ffty
	518	#if defined( __nec )
	519	REAL, DIMENSION(0:ny+1,1:nz,nxl:nxr) :: work_ffty_vec
	520	#endif
[1003]	521	REAL, DIMENSION(nnx,1:nz,nys_x:nyn_x,pdims(1)) :: f_in
	522	REAL, DIMENSION(1:nz,0:ny,nxl:nxr) :: f_out
	523	REAL, DIMENSION(nxl:nxr,1:nz,0:ny) :: work
[1]	524
	525	!
	526	!-- Transpose array
[1111]	527	#if defined( __parallel )
[1]	528	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
[622]	529	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
[1]	530	CALL MPI_ALLTOALL( f_in(1,1,nys_x,1), sendrecvcount_xy, MPI_REAL, &
	531	work(nxl,1,0), sendrecvcount_xy, MPI_REAL, &
	532	comm1dx, ierr )
	533	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
[1111]	534	#endif
[1]	535
	536	!
	537	!-- Resort the data in a way that y becomes the first index and carry out the
	538	!-- backward fft along y.
[1106]	539	CALL cpu_log( log_point_s(7), 'fft_y_1d', 'continue' )
[1]	540
	541	IF ( host(1:3) == 'nec' ) THEN
	542	#if defined( __nec )
	543	!
	544	!-- Code optimized for vector processors
[85]	545	!$OMP PARALLEL PRIVATE ( i, j, k )
[1]	546	!$OMP DO
	547	DO k = 1, nz
	548	DO j = 0, ny
	549	DO i = nxl, nxr
	550	work_ffty_vec(j,k,i) = work(i,k,j)
	551	ENDDO
	552	ENDDO
	553	ENDDO
	554
	555	!$OMP DO
	556	DO i = nxl, nxr
	557
	558	CALL fft_y_m( work_ffty_vec(:,:,i), ny+1, 'backward' )
	559
	560	DO j = 0, ny
	561	DO k = 1, nz
	562	f_out(k,j,i) = work_ffty_vec(j,k,i)
	563	ENDDO
	564	ENDDO
	565
	566	ENDDO
	567	!$OMP END PARALLEL
	568	#endif
	569
	570	ELSE
	571
	572	!
	573	!-- Cache optimized code.
	574	!-- The i-(x-)direction is split into a strided outer loop and an inner
	575	!-- loop for better cache performance
	576	!$OMP PARALLEL PRIVATE ( i, iend, iouter, ir, j, k, work_ffty )
	577	!$OMP DO
	578	DO iouter = nxl, nxr, stridex
	579
	580	iend = MIN( iouter+stridex-1, nxr ) ! Upper bound for inner i loop
	581
	582	DO k = 1, nz
	583	!
	584	!-- Resort
	585	DO j = 0, ny
	586	DO i = iouter, iend
	587	work_ffty(j,i-iouter+1) = work(i,k,j)
	588	ENDDO
	589	ENDDO
	590
	591	DO i = iouter, iend
	592
	593	!
	594	!-- FFT along y
	595	ir = i-iouter+1 ! counter within a stride
[1106]	596	CALL fft_y_1d( work_ffty(:,ir), 'backward' )
[1]	597
	598	DO j = 0, ny
	599	f_out(k,j,i) = work_ffty(j,ir)
	600	ENDDO
	601	ENDDO
	602
	603	ENDDO
	604
	605	ENDDO
	606	!$OMP END PARALLEL
	607
	608	ENDIF
	609
[1106]	610	CALL cpu_log( log_point_s(7), 'fft_y_1d', 'stop' )
[1]	611
	612	END SUBROUTINE tr_xy_ffty
	613
	614
	615	SUBROUTINE fftx_tri_fftx( ar )
	616
	617	!------------------------------------------------------------------------------!
	618	! FFT along x, solution of the tridiagonal system and backward FFT for
	619	! a 1d-decomposition along x
	620	!
	621	! WARNING: this subroutine may still not work for hybrid parallelization
	622	! with OpenMP (for possible necessary changes see the original
	623	! routine poisfft_hybrid, developed by Klaus Ketelsen, May 2002)
	624	!------------------------------------------------------------------------------!
	625
	626	USE control_parameters
	627	USE cpulog
	628	USE grid_variables
	629	USE indices
	630	USE interfaces
	631	USE pegrid
	632	USE transpose_indices
	633
	634	IMPLICIT NONE
	635
	636	INTEGER :: i, j, k, m, n, omp_get_thread_num, tn
	637
[1003]	638	REAL, DIMENSION(0:nx) :: work_fftx
	639	REAL, DIMENSION(0:nx,1:nz) :: work_trix
	640	REAL, DIMENSION(nnx,1:nz,nys_x:nyn_x,pdims(1)) :: ar
	641	REAL, DIMENSION(:,:,:,:), ALLOCATABLE :: tri
[1]	642
	643
[1106]	644	CALL cpu_log( log_point_s(33), 'fft_x_1d + tridia', 'start' )
[1]	645
	646	ALLOCATE( tri(5,0:nx,0:nz-1,0:threads_per_task-1) )
	647
	648	tn = 0 ! Default thread number in case of one thread
	649	!$OMP PARALLEL DO PRIVATE ( i, j, k, m, n, tn, work_fftx, work_trix )
	650	DO j = nys_x, nyn_x
	651
	652	!$ tn = omp_get_thread_num()
	653
	654	IF ( host(1:3) == 'nec' ) THEN
	655	!
	656	!-- Code optimized for vector processors
	657	DO k = 1, nz
	658
	659	m = 0
	660	DO n = 1, pdims(1)
[1003]	661	DO i = 1, nnx
[1]	662	work_trix(m,k) = ar(i,k,j,n)
	663	m = m + 1
	664	ENDDO
	665	ENDDO
	666
	667	ENDDO
	668
	669	CALL fft_x_m( work_trix, 'forward' )
	670
	671	ELSE
	672	!
	673	!-- Cache optimized code
	674	DO k = 1, nz
	675
	676	m = 0
	677	DO n = 1, pdims(1)
[1003]	678	DO i = 1, nnx
[1]	679	work_fftx(m) = ar(i,k,j,n)
	680	m = m + 1
	681	ENDDO
	682	ENDDO
	683
[1106]	684	CALL fft_x_1d( work_fftx, 'forward' )
[1]	685
	686	DO i = 0, nx
	687	work_trix(i,k) = work_fftx(i)
	688	ENDDO
	689
	690	ENDDO
	691
	692	ENDIF
	693
	694	!
	695	!-- Solve the linear equation system
	696	CALL tridia_1dd( ddx2, ddy2, nx, ny, j, work_trix, tri(:,:,:,tn) )
	697
	698	IF ( host(1:3) == 'nec' ) THEN
	699	!
	700	!-- Code optimized for vector processors
	701	CALL fft_x_m( work_trix, 'backward' )
	702
	703	DO k = 1, nz
	704
	705	m = 0
	706	DO n = 1, pdims(1)
[1003]	707	DO i = 1, nnx
[1]	708	ar(i,k,j,n) = work_trix(m,k)
	709	m = m + 1
	710	ENDDO
	711	ENDDO
	712
	713	ENDDO
	714
	715	ELSE
	716	!
	717	!-- Cache optimized code
	718	DO k = 1, nz
	719
	720	DO i = 0, nx
	721	work_fftx(i) = work_trix(i,k)
	722	ENDDO
	723
[1106]	724	CALL fft_x_1d( work_fftx, 'backward' )
[1]	725
	726	m = 0
	727	DO n = 1, pdims(1)
[1003]	728	DO i = 1, nnx
[1]	729	ar(i,k,j,n) = work_fftx(m)
	730	m = m + 1
	731	ENDDO
	732	ENDDO
	733
	734	ENDDO
	735
	736	ENDIF
	737
	738	ENDDO
	739
	740	DEALLOCATE( tri )
	741
[1106]	742	CALL cpu_log( log_point_s(33), 'fft_x_1d + tridia', 'stop' )
[1]	743
	744	END SUBROUTINE fftx_tri_fftx
	745
	746
	747	SUBROUTINE fftx_tr_xy( f_in, work, f_out )
	748
	749	!------------------------------------------------------------------------------!
	750	! Fourier-transformation along x with subsequent transposition x --> y for
	751	! a 1d-decomposition along y
	752	!
	753	! ATTENTION: The NEC-branch of this routine may significantly profit from
	754	! further optimizations. So far, performance is much worse than
	755	! for routine ffty_tr_yx (more than three times slower).
	756	!------------------------------------------------------------------------------!
	757
	758	USE control_parameters
	759	USE cpulog
	760	USE indices
	761	USE interfaces
	762	USE pegrid
	763	USE transpose_indices
	764
	765	IMPLICIT NONE
	766
	767	INTEGER :: i, j, k
	768
[1003]	769	REAL, DIMENSION(0:nx,1:nz,nys:nyn) :: work_fftx
	770	REAL, DIMENSION(1:nz,nys:nyn,0:nx) :: f_in
	771	REAL, DIMENSION(nny,1:nz,nxl_y:nxr_y,pdims(2)) :: f_out
	772	REAL, DIMENSION(nys:nyn,1:nz,0:nx) :: work
[1]	773
	774	!
	775	!-- Carry out the FFT along x, where all data are present due to the
	776	!-- 1d-decomposition along y. Resort the data in a way that y becomes
	777	!-- the first index.
[1106]	778	CALL cpu_log( log_point_s(4), 'fft_x_1d', 'start' )
[1]	779
	780	IF ( host(1:3) == 'nec' ) THEN
	781	!
	782	!-- Code for vector processors
[85]	783	!$OMP PARALLEL PRIVATE ( i, j, k )
[1]	784	!$OMP DO
	785	DO i = 0, nx
	786
	787	DO j = nys, nyn
	788	DO k = 1, nz
	789	work_fftx(i,k,j) = f_in(k,j,i)
	790	ENDDO
	791	ENDDO
	792
	793	ENDDO
	794
	795	!$OMP DO
	796	DO j = nys, nyn
	797
	798	CALL fft_x_m( work_fftx(:,:,j), 'forward' )
	799
	800	DO k = 1, nz
	801	DO i = 0, nx
	802	work(j,k,i) = work_fftx(i,k,j)
	803	ENDDO
	804	ENDDO
	805
	806	ENDDO
	807	!$OMP END PARALLEL
	808
	809	ELSE
	810
	811	!
	812	!-- Cache optimized code (there might be still a potential for better
	813	!-- optimization).
[696]	814	!$OMP PARALLEL PRIVATE (i,j,k)
[1]	815	!$OMP DO
	816	DO i = 0, nx
	817
	818	DO j = nys, nyn
	819	DO k = 1, nz
	820	work_fftx(i,k,j) = f_in(k,j,i)
	821	ENDDO
	822	ENDDO
	823
	824	ENDDO
	825
	826	!$OMP DO
	827	DO j = nys, nyn
	828	DO k = 1, nz
	829
[1106]	830	CALL fft_x_1d( work_fftx(0:nx,k,j), 'forward' )
[1]	831
	832	DO i = 0, nx
	833	work(j,k,i) = work_fftx(i,k,j)
	834	ENDDO
	835	ENDDO
	836
	837	ENDDO
	838	!$OMP END PARALLEL
	839
	840	ENDIF
[1106]	841	CALL cpu_log( log_point_s(4), 'fft_x_1d', 'pause' )
[1]	842
	843	!
	844	!-- Transpose array
[1111]	845	#if defined( __parallel )
[1]	846	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
[622]	847	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
[1]	848	CALL MPI_ALLTOALL( work(nys,1,0), sendrecvcount_xy, MPI_REAL, &
	849	f_out(1,1,nxl_y,1), sendrecvcount_xy, MPI_REAL, &
	850	comm1dy, ierr )
	851	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
[1111]	852	#endif
[1]	853
	854	END SUBROUTINE fftx_tr_xy
	855
	856
	857	SUBROUTINE tr_yx_fftx( f_in, work, f_out )
	858
	859	!------------------------------------------------------------------------------!
	860	! Transposition y --> x with a subsequent backward Fourier transformation for
	861	! a 1d-decomposition along x
	862	!------------------------------------------------------------------------------!
	863
	864	USE control_parameters
	865	USE cpulog
	866	USE indices
	867	USE interfaces
	868	USE pegrid
	869	USE transpose_indices
	870
	871	IMPLICIT NONE
	872
	873	INTEGER :: i, j, k
	874
[1003]	875	REAL, DIMENSION(0:nx,1:nz,nys:nyn) :: work_fftx
	876	REAL, DIMENSION(nny,1:nz,nxl_y:nxr_y,pdims(2)) :: f_in
	877	REAL, DIMENSION(1:nz,nys:nyn,0:nx) :: f_out
	878	REAL, DIMENSION(nys:nyn,1:nz,0:nx) :: work
[1]	879
	880	!
	881	!-- Transpose array
[1111]	882	#if defined( __parallel )
[1]	883	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
[622]	884	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
[1]	885	CALL MPI_ALLTOALL( f_in(1,1,nxl_y,1), sendrecvcount_xy, MPI_REAL, &
	886	work(nys,1,0), sendrecvcount_xy, MPI_REAL, &
	887	comm1dy, ierr )
	888	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
[1111]	889	#endif
[1]	890
	891	!
	892	!-- Carry out the FFT along x, where all data are present due to the
	893	!-- 1d-decomposition along y. Resort the data in a way that y becomes
	894	!-- the first index.
[1106]	895	CALL cpu_log( log_point_s(4), 'fft_x_1d', 'continue' )
[1]	896
	897	IF ( host(1:3) == 'nec' ) THEN
	898	!
	899	!-- Code optimized for vector processors
[85]	900	!$OMP PARALLEL PRIVATE ( i, j, k )
[1]	901	!$OMP DO
	902	DO j = nys, nyn
	903
	904	DO k = 1, nz
	905	DO i = 0, nx
	906	work_fftx(i,k,j) = work(j,k,i)
	907	ENDDO
	908	ENDDO
	909
	910	CALL fft_x_m( work_fftx(:,:,j), 'backward' )
	911
	912	ENDDO
	913
	914	!$OMP DO
	915	DO i = 0, nx
	916	DO j = nys, nyn
	917	DO k = 1, nz
	918	f_out(k,j,i) = work_fftx(i,k,j)
	919	ENDDO
	920	ENDDO
	921	ENDDO
	922	!$OMP END PARALLEL
	923
	924	ELSE
	925
	926	!
	927	!-- Cache optimized code (there might be still a potential for better
	928	!-- optimization).
[696]	929	!$OMP PARALLEL PRIVATE (i,j,k)
[1]	930	!$OMP DO
	931	DO j = nys, nyn
	932	DO k = 1, nz
	933
	934	DO i = 0, nx
	935	work_fftx(i,k,j) = work(j,k,i)
	936	ENDDO
	937
[1106]	938	CALL fft_x_1d( work_fftx(0:nx,k,j), 'backward' )
[1]	939
	940	ENDDO
	941	ENDDO
	942
	943	!$OMP DO
	944	DO i = 0, nx
	945	DO j = nys, nyn
	946	DO k = 1, nz
	947	f_out(k,j,i) = work_fftx(i,k,j)
	948	ENDDO
	949	ENDDO
	950	ENDDO
	951	!$OMP END PARALLEL
	952
	953	ENDIF
[1106]	954	CALL cpu_log( log_point_s(4), 'fft_x_1d', 'stop' )
[1]	955
	956	END SUBROUTINE tr_yx_fftx
	957
	958
	959	SUBROUTINE ffty_tri_ffty( ar )
	960
	961	!------------------------------------------------------------------------------!
	962	! FFT along y, solution of the tridiagonal system and backward FFT for
	963	! a 1d-decomposition along y
	964	!
	965	! WARNING: this subroutine may still not work for hybrid parallelization
	966	! with OpenMP (for possible necessary changes see the original
	967	! routine poisfft_hybrid, developed by Klaus Ketelsen, May 2002)
	968	!------------------------------------------------------------------------------!
	969
	970	USE control_parameters
	971	USE cpulog
	972	USE grid_variables
	973	USE indices
	974	USE interfaces
	975	USE pegrid
	976	USE transpose_indices
	977
	978	IMPLICIT NONE
	979
	980	INTEGER :: i, j, k, m, n, omp_get_thread_num, tn
	981
[1003]	982	REAL, DIMENSION(0:ny) :: work_ffty
	983	REAL, DIMENSION(0:ny,1:nz) :: work_triy
	984	REAL, DIMENSION(nny,1:nz,nxl_y:nxr_y,pdims(2)) :: ar
	985	REAL, DIMENSION(:,:,:,:), ALLOCATABLE :: tri
[1]	986
	987
[1106]	988	CALL cpu_log( log_point_s(39), 'fft_y_1d + tridia', 'start' )
[1]	989
	990	ALLOCATE( tri(5,0:ny,0:nz-1,0:threads_per_task-1) )
	991
	992	tn = 0 ! Default thread number in case of one thread
[696]	993	!$OMP PARALLEL DO PRIVATE ( i, j, k, m, n, tn, work_ffty, work_triy )
[1]	994	DO i = nxl_y, nxr_y
	995
	996	!$ tn = omp_get_thread_num()
	997
	998	IF ( host(1:3) == 'nec' ) THEN
	999	!
	1000	!-- Code optimized for vector processors
	1001	DO k = 1, nz
	1002
	1003	m = 0
	1004	DO n = 1, pdims(2)
[1003]	1005	DO j = 1, nny
[1]	1006	work_triy(m,k) = ar(j,k,i,n)
	1007	m = m + 1
	1008	ENDDO
	1009	ENDDO
	1010
	1011	ENDDO
	1012
	1013	CALL fft_y_m( work_triy, ny, 'forward' )
	1014
	1015	ELSE
	1016	!
	1017	!-- Cache optimized code
	1018	DO k = 1, nz
	1019
	1020	m = 0
	1021	DO n = 1, pdims(2)
[1003]	1022	DO j = 1, nny
[1]	1023	work_ffty(m) = ar(j,k,i,n)
	1024	m = m + 1
	1025	ENDDO
	1026	ENDDO
	1027
[1106]	1028	CALL fft_y_1d( work_ffty, 'forward' )
[1]	1029
	1030	DO j = 0, ny
	1031	work_triy(j,k) = work_ffty(j)
	1032	ENDDO
	1033
	1034	ENDDO
	1035
	1036	ENDIF
	1037
	1038	!
	1039	!-- Solve the linear equation system
	1040	CALL tridia_1dd( ddy2, ddx2, ny, nx, i, work_triy, tri(:,:,:,tn) )
	1041
	1042	IF ( host(1:3) == 'nec' ) THEN
	1043	!
	1044	!-- Code optimized for vector processors
	1045	CALL fft_y_m( work_triy, ny, 'backward' )
	1046
	1047	DO k = 1, nz
	1048
	1049	m = 0
	1050	DO n = 1, pdims(2)
[1003]	1051	DO j = 1, nny
[1]	1052	ar(j,k,i,n) = work_triy(m,k)
	1053	m = m + 1
	1054	ENDDO
	1055	ENDDO
	1056
	1057	ENDDO
	1058
	1059	ELSE
	1060	!
	1061	!-- Cache optimized code
	1062	DO k = 1, nz
	1063
	1064	DO j = 0, ny
	1065	work_ffty(j) = work_triy(j,k)
	1066	ENDDO
	1067
[1106]	1068	CALL fft_y_1d( work_ffty, 'backward' )
[1]	1069
	1070	m = 0
	1071	DO n = 1, pdims(2)
[1003]	1072	DO j = 1, nny
[1]	1073	ar(j,k,i,n) = work_ffty(m)
	1074	m = m + 1
	1075	ENDDO
	1076	ENDDO
	1077
	1078	ENDDO
	1079
	1080	ENDIF
	1081
	1082	ENDDO
	1083
	1084	DEALLOCATE( tri )
	1085
[1106]	1086	CALL cpu_log( log_point_s(39), 'fft_y_1d + tridia', 'stop' )
[1]	1087
	1088	END SUBROUTINE ffty_tri_ffty
	1089
	1090
	SUBROUTINE tridia_1dd( ddx2, ddy2, nx, ny, j, ar, tri )

	!------------------------------------------------------------------------------!
	! Solves the tridiagonal linear system of equations (along k) for one line j
	! of a 1d-decomposition along x (see tridia), using the Thomas algorithm.
	!
	! Arguments:
	!    ddx2, ddy2  weights of the i- and j-dependent eigenvalue terms
	!                (presumably 1/dx**2 and 1/dy**2 - to be confirmed at the
	!                callers)
	!    nx, ny      upper index bounds in i- and j-direction (lower bound is 0)
	!    j           index of the line that is solved
	!    ar          right-hand side on entry, overwritten with the solution
	!    tri         work array holding the matrix:
	!                1 = main diagonal, 2 = sub-diagonal, 3 = super-diagonal,
	!                4 = main diagonal after elimination, 5 = elimination factors
	!
	! ffty_tri_ffty calls this routine with the x- and y-related arguments
	! exchanged in order to solve along y.
	!
	! Attention: when using the intel compilers older than 12.0, array tri must
	!            be passed as an argument to the contained subroutines. Otherwise
	!            address faults will occur. This feature can be activated with
	!            cpp-switch __intel11
	!            On NEC, tri should not be passed (except for routine substi_1dd)
	!            because this causes very bad performance.
	!------------------------------------------------------------------------------!

	   USE arrays_3d
	   USE control_parameters

	   USE pegrid

	   IMPLICIT NONE

	   INTEGER, INTENT(IN) ::  j, nx, ny

	   REAL, INTENT(IN)    ::  ddx2, ddy2

	   REAL, DIMENSION(0:nx,1:nz), INTENT(INOUT)     ::  ar
	   REAL, DIMENSION(5,0:nx,0:nz-1), INTENT(INOUT) ::  tri

	   INTEGER ::  i, jm, k, nnyh


	   nnyh = ( ny + 1 ) / 2

	!
	!-- Define constant elements of the tridiagonal matrix.
	!-- The compiler on SX6 does loop exchange. If 0:nx is a high power of 2,
	!-- the exchanged loops create bank conflicts. The following directive
	!-- prohibits loop exchange and the loops perform much better.
!CDIR NOLOOPCHG
	   DO  k = 0, nz-1
	      DO  i = 0, nx
	         tri(2,i,k) = ddzu_pres(k+1) * ddzw(k+1)
	         tri(3,i,k) = ddzu_pres(k+2) * ddzw(k+1)
	      ENDDO
	   ENDDO

	!
	!-- Lines above nnyh are mapped to their mirrored index before the
	!-- j-dependent part of the matrix is computed
	   IF ( j <= nnyh )  THEN
	      jm = j
	   ELSE
	      jm = ny + 1 - j
	   ENDIF

#if defined( __intel11 )
	   CALL maketri_1dd( jm, tri )
	   CALL split_1dd( tri )
#else
	   CALL maketri_1dd( jm )
	   CALL split_1dd
#endif
	   CALL substi_1dd( ar, tri )

	CONTAINS

#if defined( __intel11 )
	   SUBROUTINE maketri_1dd( j, tri )
#else
	   SUBROUTINE maketri_1dd( j )
#endif

	!------------------------------------------------------------------------------!
	! Computes the i- and j-dependent component of the matrix, i.e. the main
	! diagonal tri(1,:,:). The argument j is the (already mirrored) line index.
	!------------------------------------------------------------------------------!

	      USE constants

	      IMPLICIT NONE

	      INTEGER, INTENT(IN) ::  j

	      INTEGER ::  i, im, k, nnxh
	      REAL    ::  a, c, l_y

	      REAL, DIMENSION(0:nx) ::  l

#if defined( __intel11 )
	      REAL, DIMENSION(5,0:nx,0:nz-1), INTENT(INOUT) ::  tri
#endif


	      nnxh = ( nx + 1 ) / 2
	!
	!--   Provide the tridiagonal matrix for solution of the Poisson equation in
	!--   Fourier space. The coefficients are computed following the method of
	!--   Schmidt et al. (DFVLR-Mitteilung 84-15), which departs from Stephan
	!--   Siano's original version by discretizing the Poisson equation,
	!--   before it is Fourier-transformed.
	!--   The j-dependent term is the same for all i, so it is computed only once.
	      l_y = 2.0 * ( 1.0 - COS( ( 2.0 * pi * j ) / &
	                               REAL( ny+1 ) ) ) * ddy2

	      DO  i = 0, nx
	!
	!--      Indices above nnxh are mapped to their mirrored index
	         IF ( i <= nnxh )  THEN
	            im = i
	         ELSE
	            im = nx + 1 - i
	         ENDIF
	         l(i) = 2.0 * ( 1.0 - COS( ( 2.0 * pi * im ) / &
	                                   REAL( nx+1 ) ) ) * ddx2 + l_y
	      ENDDO

	      DO  k = 0, nz-1
	!
	!--      a and c depend on k only
	         a = -1.0 * ddzu_pres(k+2) * ddzw(k+1)
	         c = -1.0 * ddzu_pres(k+1) * ddzw(k+1)
	         DO  i = 0, nx
	            tri(1,i,k) = a + c - l(i)
	         ENDDO
	      ENDDO

	!
	!--   Boundary settings selected by ibc_p_b / ibc_p_t == 1: the off-diagonal
	!--   coefficient pointing out of the domain is added to the main diagonal
	      IF ( ibc_p_b == 1 )  THEN
	         DO  i = 0, nx
	            tri(1,i,0) = tri(1,i,0) + tri(2,i,0)
	         ENDDO
	      ENDIF
	      IF ( ibc_p_t == 1 )  THEN
	         DO  i = 0, nx
	            tri(1,i,nz-1) = tri(1,i,nz-1) + tri(3,i,nz-1)
	         ENDDO
	      ENDIF

	   END SUBROUTINE maketri_1dd


#if defined( __intel11 )
	   SUBROUTINE split_1dd( tri )
#else
	   SUBROUTINE split_1dd
#endif

	!------------------------------------------------------------------------------!
	! Splitting of the tridiagonal matrix (Thomas algorithm): fills tri(4,:,:)
	! (diagonal after elimination) and tri(5,:,:) (elimination factors)
	!------------------------------------------------------------------------------!

	      IMPLICIT NONE

	      INTEGER ::  i, k

#if defined( __intel11 )
	      REAL, DIMENSION(5,0:nx,0:nz-1), INTENT(INOUT) ::  tri
#endif


	!
	!--   Splitting
	      DO  i = 0, nx
	         tri(4,i,0) = tri(1,i,0)
	      ENDDO
	      DO  k = 1, nz-1
	         DO  i = 0, nx
	            tri(5,i,k) = tri(2,i,k) / tri(4,i,k-1)
	            tri(4,i,k) = tri(1,i,k) - tri(3,i,k-1) * tri(5,i,k)
	         ENDDO
	      ENDDO

	   END SUBROUTINE split_1dd


	   SUBROUTINE substi_1dd( ar, tri )

	!------------------------------------------------------------------------------!
	! Substitution (Forward and Backward) (Thomas algorithm).
	! ar contains the right-hand side on entry and the solution on exit.
	!------------------------------------------------------------------------------!

	      IMPLICIT NONE

	      INTEGER ::  i, k

	      REAL, DIMENSION(0:nx,nz), INTENT(INOUT)    ::  ar
	      REAL, DIMENSION(5,0:nx,0:nz-1), INTENT(IN) ::  tri

	      REAL, DIMENSION(0:nx,0:nz-1) ::  ar1

	!
	!--   Forward substitution
	      DO  i = 0, nx
	         ar1(i,0) = ar(i,1)
	      ENDDO
	      DO  k = 1, nz-1
	         DO  i = 0, nx
	            ar1(i,k) = ar(i,k+1) - tri(5,i,k) * ar1(i,k-1)
	         ENDDO
	      ENDDO

	!
	!--   Backward substitution
	!--   Note, the add of 1.0E-20 in the denominator is due to avoid divisions
	!--   by zero appearing if the pressure bc is set to neumann at the top of
	!--   the model domain.
	      DO  i = 0, nx
	         ar(i,nz) = ar1(i,nz-1) / ( tri(4,i,nz-1) + 1.0E-20 )
	      ENDDO
	      DO  k = nz-2, 0, -1
	         DO  i = 0, nx
	            ar(i,k+1) = ( ar1(i,k) - tri(3,i,k) * ar(i,k+2) ) &
	                        / tri(4,i,k)
	         ENDDO
	      ENDDO

	!
	!--   Indices i=0, j=0 correspond to horizontally averaged pressure.
	!--   The respective values of ar should be zero at all k-levels if
	!--   acceleration of horizontally averaged vertical velocity is zero.
	!--   (j is the host-associated, unmirrored line index of tridia_1dd.)
	      IF ( ibc_p_b == 1  .AND.  ibc_p_t == 1 )  THEN
	         IF ( j == 0 )  THEN
	            DO  k = 1, nz
	               ar(0,k) = 0.0
	            ENDDO
	         ENDIF
	      ENDIF

	   END SUBROUTINE substi_1dd

	END SUBROUTINE tridia_1dd
	1317
[1111]	1318
	SUBROUTINE tridia( ar )

	!------------------------------------------------------------------------------!
	! solves the linear system of equations:
	!
	! -(4 pi^2(i^2/(dx^2*nnx^2)+j^2/(dy^2*nny^2))+
	!   1/(dzu(k)*dzw(k))+1/(dzu(k-1)*dzw(k)))*p(i,j,k)+
	! 1/(dzu(k)*dzw(k))*p(i,j,k+1)+1/(dzu(k-1)*dzw(k))*p(i,j,k-1)=d(i,j,k)
	!
	! by using the Thomas algorithm
	!
	! ar is handed to substi, which reads it as right-hand side and overwrites
	! it with the solution. The local work array tri is filled by split from
	! the matrix diagonal tric, which is written by maketri; tric therefore has
	! to be set before this routine is called.
	!------------------------------------------------------------------------------!

	   USE arrays_3d

	   IMPLICIT NONE

	   !$acc declare create( tri )
	   REAL, DIMENSION(nxl_z:nxr_z,nys_z:nyn_z,0:nz-1,2) ::  tri

	   REAL, INTENT(INOUT) ::  ar(nxl_z:nxr_z,nys_z:nyn_z,1:nz)


	!
	!-- Elimination step (fills tri), followed by forward/backward substitution
	   CALL split( tri )
	   CALL substi( ar, tri )

	END SUBROUTINE tridia
	1347
	1348
	SUBROUTINE maketri

	!------------------------------------------------------------------------------!
	! Computes the i- and j-dependent component of the matrix and stores the
	! resulting main diagonal in tric
	!------------------------------------------------------------------------------!

	   USE arrays_3d, ONLY: tric
	   USE constants
	   USE control_parameters
	   USE grid_variables

	   IMPLICIT NONE

	   INTEGER ::  i, im, j, jm, k, nnxh, nnyh

	   !$acc declare create( ll )
	   REAL    ::  ll(nxl_z:nxr_z,nys_z:nyn_z)


	   nnxh = ( nx + 1 ) / 2
	   nnyh = ( ny + 1 ) / 2

	!
	!-- Provide the constant coefficients of the tridiagonal matrix for solution
	!-- of the Poisson equation in Fourier space.
	!-- The coefficients are computed following the method of
	!-- Schmidt et al. (DFVLR-Mitteilung 84-15), which departs from Stephan
	!-- Siano's original version by discretizing the Poisson equation,
	!-- before it is Fourier-transformed.
	!-- Indices outside 0..nnxh (0..nnyh) enter the formula with their mirrored
	!-- value nx+1-i (ny+1-j).

	   !$acc kernels present( tric )
	   !$acc loop vector( 32 )
	   DO  j = nys_z, nyn_z
	      DO  i = nxl_z, nxr_z
	         im = MERGE( i, nx+1-i, i >= 0  .AND.  i <= nnxh )
	         jm = MERGE( j, ny+1-j, j >= 0  .AND.  j <= nnyh )
	         ll(i,j) = 2.0 * ( 1.0 - COS( ( 2.0 * pi * im ) / &
	                                      REAL( nx+1 ) ) ) / ( dx * dx ) + &
	                   2.0 * ( 1.0 - COS( ( 2.0 * pi * jm ) / &
	                                      REAL( ny+1 ) ) ) / ( dy * dy )
	      ENDDO
	   ENDDO

	   !$acc loop
	   DO  k = 0, nz-1
	      DO  j = nys_z, nyn_z
	         !$acc loop vector( 32 )
	         DO  i = nxl_z, nxr_z
	            tric(i,j,k) = ddzuw(k,3) - ll(i,j)
	         ENDDO
	      ENDDO
	   ENDDO
	   !$acc end kernels

	!
	!-- Modification of the lowest level for ibc_p_b == 1
	   IF ( ibc_p_b == 1 )  THEN
	      !$acc kernels present( tric )
	      !$acc loop
	      DO  j = nys_z, nyn_z
	         DO  i = nxl_z, nxr_z
	            tric(i,j,0) = tric(i,j,0) + ddzuw(0,1)
	         ENDDO
	      ENDDO
	      !$acc end kernels
	   ENDIF

	!
	!-- Modification of the uppermost level for ibc_p_t == 1
	   IF ( ibc_p_t == 1 )  THEN
	      !$acc kernels present( tric )
	      !$acc loop
	      DO  j = nys_z, nyn_z
	         DO  i = nxl_z, nxr_z
	            tric(i,j,nz-1) = tric(i,j,nz-1) + ddzuw(nz-1,2)
	         ENDDO
	      ENDDO
	      !$acc end kernels
	   ENDIF

	END SUBROUTINE maketri
	1444
	1445
	SUBROUTINE substi( ar, tri )

	!------------------------------------------------------------------------------!
	! Substitution (Forward and Backward) (Thomas algorithm)
	!
	! Arguments:
	!    ar   right-hand side on entry, overwritten with the solution
	!    tri  result of the splitting step (see split):
	!         tri(:,:,:,1) = main diagonal after elimination,
	!         tri(:,:,:,2) = elimination factors
	!
	! The first level of each sweep is treated in a kernel of its own, so that
	! the kernels inside the k-loops contain no test on k.
	!------------------------------------------------------------------------------!

	   USE control_parameters

	   IMPLICIT NONE

	   INTEGER ::  i, j, k

	   REAL, INTENT(INOUT) ::  ar(nxl_z:nxr_z,nys_z:nyn_z,1:nz)
	   REAL, DIMENSION(nxl_z:nxr_z,nys_z:nyn_z,0:nz-1,2), INTENT(IN) ::  tri

	   !$acc declare create( ar1 )
	   REAL, DIMENSION(nxl_z:nxr_z,nys_z:nyn_z,0:nz-1) ::  ar1

	!
	!-- Forward substitution, first level
	   !$acc kernels present( ar )
	   !$acc loop
	   DO  j = nys_z, nyn_z
	      DO  i = nxl_z, nxr_z
	         ar1(i,j,0) = ar(i,j,1)
	      ENDDO
	   ENDDO
	   !$acc end kernels

	!
	!-- Forward substitution, remaining levels (sequential in k)
	   DO  k = 1, nz-1
	      !$acc kernels present( ar, tri )
	      !$acc loop
	      DO  j = nys_z, nyn_z
	         DO  i = nxl_z, nxr_z
	            ar1(i,j,k) = ar(i,j,k+1) - tri(i,j,k,2) * ar1(i,j,k-1)
	         ENDDO
	      ENDDO
	      !$acc end kernels
	   ENDDO

	!
	!-- Backward substitution, uppermost level
	!-- Note, the 1.0E-20 in the denominator is due to avoid divisions
	!-- by zero appearing if the pressure bc is set to neumann at the top of
	!-- the model domain.
	   !$acc kernels present( ar, tri )
	   !$acc loop
	   DO  j = nys_z, nyn_z
	      DO  i = nxl_z, nxr_z
	         ar(i,j,nz) = ar1(i,j,nz-1) / ( tri(i,j,nz-1,1) + 1.0E-20 )
	      ENDDO
	   ENDDO
	   !$acc end kernels

	!
	!-- Backward substitution, remaining levels (sequential in k)
	   DO  k = nz-2, 0, -1
	      !$acc kernels present( ar, tri )
	      !$acc loop
	      DO  j = nys_z, nyn_z
	         DO  i = nxl_z, nxr_z
	            ar(i,j,k+1) = ( ar1(i,j,k) - ddzuw(k,2) * ar(i,j,k+2) ) &
	                          / tri(i,j,k,1)
	         ENDDO
	      ENDDO
	      !$acc end kernels
	   ENDDO

	!
	!-- Indices i=0, j=0 correspond to horizontally averaged pressure.
	!-- The respective values of ar should be zero at all k-levels if
	!-- acceleration of horizontally averaged vertical velocity is zero.
	   IF ( ibc_p_b == 1  .AND.  ibc_p_t == 1 )  THEN
	      IF ( nys_z == 0  .AND.  nxl_z == 0 )  THEN
	         !$acc kernels loop present( ar )
	         DO  k = 1, nz
	            ar(nxl_z,nys_z,k) = 0.0
	         ENDDO
	         !$acc end kernels loop
	      ENDIF
	   ENDIF

	END SUBROUTINE substi
	1519
	1520
	SUBROUTINE split( tri )

	!------------------------------------------------------------------------------!
	! Splitting of the tridiagonal matrix (Thomas algorithm)
	!
	! On exit, tri(:,:,k,1) holds the main diagonal after elimination for all k
	! and tri(:,:,k,2) the elimination factors for k >= 1. tri(:,:,0,2) is not
	! touched. The main diagonal before elimination is read from tric, the
	! off-diagonal coefficients from ddzuw.
	!------------------------------------------------------------------------------!

	   USE arrays_3d, ONLY: tric

	   IMPLICIT NONE

	   INTEGER ::  i, j, k

	   REAL, DIMENSION(nxl_z:nxr_z,nys_z:nyn_z,0:nz-1,2), INTENT(INOUT) ::  tri

	!
	!-- Splitting, lowest level: the diagonal is taken over unchanged
	   !$acc kernels present( tri, tric )
	   !$acc loop
	   DO  j = nys_z, nyn_z
	      !$acc loop vector( 32 )
	      DO  i = nxl_z, nxr_z
	         tri(i,j,0,1) = tric(i,j,0)
	      ENDDO
	   ENDDO
	   !$acc end kernels

	!
	!-- Splitting, remaining levels. Level k depends on level k-1, therefore
	!-- the k-loop is outside the kernels region and runs sequentially.
	   DO  k = 1, nz-1
	      !$acc kernels present( tri, tric )
	      !$acc loop
	      DO  j = nys_z, nyn_z
	         !$acc loop vector( 32 )
	         DO  i = nxl_z, nxr_z
	            tri(i,j,k,2) = ddzuw(k,1) / tri(i,j,k-1,1)
	            tri(i,j,k,1) = tric(i,j,k) - ddzuw(k-1,2) * tri(i,j,k,2)
	         ENDDO
	      ENDDO
	      !$acc end kernels
	   ENDDO

	END SUBROUTINE split
	1561
[1]	1562	#endif
[1111]	1563
[1]	1564	END MODULE poisfft_mod

Note: See TracBrowser for help on using the repository browser.

Download in other formats:

| Impressum | ©Leibniz Universität Hannover |