Home

Context Navigation

source: palm/trunk/SOURCE/poisfft.f90 @ 1175

Last change on this file since 1175 was 1112, checked in by raasch, 12 years ago
last commit documented
Property svn:keywords set to `Id`
File size: 45.8 KB

Rev	Line
[1]	1	MODULE poisfft_mod
	2
[1036]	3	!--------------------------------------------------------------------------------!
	4	! This file is part of PALM.
	5	!
	6	! PALM is free software: you can redistribute it and/or modify it under the terms
	7	! of the GNU General Public License as published by the Free Software Foundation,
	8	! either version 3 of the License, or (at your option) any later version.
	9	!
	10	! PALM is distributed in the hope that it will be useful, but WITHOUT ANY
	11	! WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
	12	! A PARTICULAR PURPOSE. See the GNU General Public License for more details.
	13	!
	14	! You should have received a copy of the GNU General Public License along with
	15	! PALM. If not, see <http://www.gnu.org/licenses/>.
	16	!
	17	! Copyright 1997-2012 Leibniz University Hannover
	18	!--------------------------------------------------------------------------------!
	19	!
[484]	20	! Current revisions:
[1]	21	! -----------------
[1112]	22	!
	23	!
	24	! Former revisions:
	25	! -----------------
	26	! $Id: poisfft.f90 1112 2013-03-09 00:34:37Z gryschka $
	27	!
	28	! 1111 2013-03-08 23:54:10Z raasch
[1111]	29	! further openACC porting of non-parallel (MPI) branch:
	30	! tridiagonal routines split into extermal subroutines (instead using CONTAINS),
	31	! no distinction between parallel/non-parallel in poisfft and tridia any more,
[1112]	32	! tridia routines moved to end of file because of probable bug in PGI compiler 12.5
[1111]	33	! (otherwise "invalid device function" is indicated during runtime),
	34	! optimization of tridia routines: constant elements and coefficients of tri are
	35	! stored in seperate arrays ddzuw and tric, last dimension of tri reduced from 5
	36	! to 2,
	37	! poisfft_init is now called internally from poisfft, maketri is called from
	38	! poisfft_init,
	39	! ibc_p_b = 2 removed
[1]	40	!
[1107]	41	! 1106 2013-03-04 05:31:38Z raasch
	42	! routines fftx, ffty, fftxp, fftyp removed, calls replaced by fft_x, fft_y,
	43	! in the 1D-decomposition routines fft_x, ffty are replaced by fft_x_1d,
	44	! fft_y_1d
	45	!
[1104]	46	! 1103 2013-02-20 02:15:53Z raasch
	47	! tri, ar, and ar1 arguments in tridia-routines (2d) are removed because they
	48	! sometimes cause segmentation faults with intel 12.1 compiler
	49	!
[1093]	50	! 1092 2013-02-02 11:24:22Z raasch
	51	! unused variables removed
	52	!
[1037]	53	! 1036 2012-10-22 13:43:42Z raasch
	54	! code put under GPL (PALM 3.9)
	55	!
[1014]	56	! 2012-09-21 07:03:55Z raasch
	57	! FLOAT type conversion replaced by REAL
	58	!
[1004]	59	! 1003 2012-09-14 14:35:53Z raasch
	60	! indices nxa, nya, etc. replaced by nx, ny, etc.
	61	!
[941]	62	! 940 2012-07-09 14:31:00Z raasch
	63	! special handling of tri-array as an argument in tridia_1dd routines switched
	64	! off because it caused segmentation faults with intel 12.1 compiler
	65	!
[878]	66	! 877 2012-04-03 11:21:44Z suehring
	67	! Bugfix: Avoid divisions by zero in case of using a 'neumann' bc for the
	68	! pressure at the top of the model domain.
	69	!
[810]	70	! 809 2012-01-30 13:32:58Z maronga
	71	! Bugfix: replaced .AND. and .NOT. with && and ! in the preprocessor directives
	72	!
[808]	73	! 807 2012-01-25 11:53:51Z maronga
	74	! New cpp directive "__check" implemented which is used by check_namelist_files
	75	! (most of the code is unneeded by check_namelist_files).
	76	!
[764]	77	! 763 2011-10-06 09:32:09Z suehring
	78	! Comment added concerning the last change.
	79	!
[762]	80	! 761 2011-10-05 17:58:52Z suehring
	81	! Bugfix: Avoid divisions by zero in case of using a 'neumann' bc for the
	82	! pressure at the top of the model domain.
	83	!
[697]	84	! 696 2011-03-18 07:03:49Z raasch
	85	! work_fftx removed from PRIVATE clauses in fftx_tr_xy and tr_yx_fftx
	86	!
[684]	87	! 683 2011-02-09 14:25:15Z raasch
	88	! openMP parallelization for 2d-domain-decomposition
	89	!
[668]	90	! 667 2010-12-23 12:06:00Z suehring/gryschka
	91	! ddzu replaced by ddzu_pres due to changes in zu(0)
	92	!
[623]	93	! 622 2010-12-10 08:08:13Z raasch
	94	! optional barriers included in order to speed up collective operations
	95	!
[392]	96	! 377 2009-09-04 11:09:00Z raasch
	97	! __lcmuk changed to __lc to avoid problems with Intel compiler on sgi-ice
	98	!
[198]	99	! 164 2008-05-15 08:46:15Z raasch
	100	! Arguments removed from transpose routines
	101	!
[139]	102	! 128 2007-10-26 13:11:14Z raasch
	103	! Bugfix: wavenumber calculation for even nx in routines maketri
	104	!
[90]	105	! 85 2007-05-11 09:35:14Z raasch
	106	! Bugfix: work_fft*_vec removed from some PRIVATE-declarations
	107	!
[77]	108	! 76 2007-03-29 00:58:32Z raasch
	109	! Tridiagonal coefficients adjusted for Neumann boundary conditions both at
	110	! the bottom and the top.
	111	!
[3]	112	! RCS Log replace by Id keyword, revision history cleaned up
	113	!
[1]	114	! Revision 1.24 2006/08/04 15:00:24 raasch
	115	! Default setting of the thread number tn in case of not using OpenMP
	116	!
	117	! Revision 1.23 2006/02/23 12:48:38 raasch
	118	! Additional compiler directive in routine tridia_1dd for preventing loop
	119	! exchange on NEC-SX6
	120	!
	121	! Revision 1.20 2004/04/30 12:38:09 raasch
	122	! Parts of former poisfft_hybrid moved to this subroutine,
	123	! former subroutine changed to a module, renaming of FFT-subroutines and
	124	! -module, FFTs completely substituted by calls of fft_x and fft_y,
	125	! NAG fft used in the non-parallel case completely removed, l in maketri
	126	! is now a 1d-array, variables passed by modules instead of using parameter
	127	! lists, enlarged transposition arrays introduced
	128	!
	129	! Revision 1.1 1997/07/24 11:24:14 raasch
	130	! Initial revision
	131	!
	132	!
	133	! Description:
	134	! ------------
	135	! See below.
	136	!------------------------------------------------------------------------------!
	137
	138	!--------------------------------------------------------------------------!
	139	! poisfft !
	140	! !
	141	! Original version: Stephan Siano (pois3d) !
	142	! !
	143	! Institute of Meteorology and Climatology, University of Hannover !
	144	! Germany !
	145	! !
	146	! Version as of July 23,1996 !
	147	! !
	148	! !
	149	! Version for parallel computers: Siegfried Raasch !
	150	! !
	151	! Version as of July 03,1997 !
	152	! !
	153	! Solves the Poisson equation with a 2D spectral method !
	154	! d^2 p / dx^2 + d^2 p / dy^2 + d^2 p / dz^2 = s !
	155	! !
	156	! Input: !
	157	! real ar contains in the (nnx,nny,nnz) elements, !
	158	! starting from the element (1,nys,nxl), the !
	159	! values for s !
	160	! real work Temporary array !
	161	! !
	162	! Output: !
	163	! real ar contains the solution for p !
	164	!--------------------------------------------------------------------------!
	165
	166	USE fft_xy
	167	USE indices
	168	USE transpose_indices
	169
	170	IMPLICIT NONE
	171
[1111]	172	LOGICAL, SAVE :: poisfft_initialized = .FALSE.
	173
	174	REAL, DIMENSION(:,:), ALLOCATABLE :: ddzuw
	175
[1]	176	PRIVATE
[807]	177
[809]	178	#if ! defined ( __check )
[1]	179	PUBLIC poisfft, poisfft_init
	180
	181	INTERFACE poisfft
	182	MODULE PROCEDURE poisfft
	183	END INTERFACE poisfft
	184
	185	INTERFACE poisfft_init
	186	MODULE PROCEDURE poisfft_init
	187	END INTERFACE poisfft_init
[807]	188	#else
	189	PUBLIC poisfft_init
[1]	190
[807]	191	INTERFACE poisfft_init
	192	MODULE PROCEDURE poisfft_init
	193	END INTERFACE poisfft_init
	194	#endif
	195
[1]	196	CONTAINS
	197
	198	SUBROUTINE poisfft_init
	199
[1111]	200	USE arrays_3d, ONLY: ddzu_pres, ddzw
	201
	202	IMPLICIT NONE
	203
	204	INTEGER :: k
	205
	206
[1]	207	CALL fft_init
	208
[1111]	209	ALLOCATE( ddzuw(0:nz-1,3) )
	210
	211	DO k = 0, nz-1
	212	ddzuw(k,1) = ddzu_pres(k+1) * ddzw(k+1)
	213	ddzuw(k,2) = ddzu_pres(k+2) * ddzw(k+1)
	214	ddzuw(k,3) = -1.0 * &
	215	( ddzu_pres(k+2) * ddzw(k+1) + ddzu_pres(k+1) * ddzw(k+1) )
	216	ENDDO
	217	!
	218	!-- Calculate constant coefficients of the tridiagonal matrix
	219	#if ! defined ( __check )
	220	CALL maketri
	221	#endif
	222
	223	poisfft_initialized = .TRUE.
	224
[1]	225	END SUBROUTINE poisfft_init
	226
[1111]	227
[809]	228	#if ! defined ( __check )
[1]	229	SUBROUTINE poisfft( ar, work )
	230
	231	USE cpulog
	232	USE interfaces
	233	USE pegrid
	234
	235	IMPLICIT NONE
	236
[1003]	237	REAL, DIMENSION(1:nz,nys:nyn,nxl:nxr) :: ar, work
[1]	238
	239
	240	CALL cpu_log( log_point_s(3), 'poisfft', 'start' )
	241
[1111]	242	IF ( .NOT. poisfft_initialized ) CALL poisfft_init
	243
[1]	244	!
	245	!-- Two-dimensional Fourier Transformation in x- and y-direction.
[1111]	246	IF ( pdims(2) == 1 .AND. pdims(1) > 1 ) THEN
[1]	247
	248	!
	249	!-- 1d-domain-decomposition along x:
	250	!-- FFT along y and transposition y --> x
	251	CALL ffty_tr_yx( ar, work, ar )
	252
	253	!
	254	!-- FFT along x, solving the tridiagonal system and backward FFT
	255	CALL fftx_tri_fftx( ar )
	256
	257	!
	258	!-- Transposition x --> y and backward FFT along y
	259	CALL tr_xy_ffty( ar, work, ar )
	260
[1111]	261	ELSEIF ( pdims(1) == 1 .AND. pdims(2) > 1 ) THEN
[1]	262
	263	!
	264	!-- 1d-domain-decomposition along y:
	265	!-- FFT along x and transposition x --> y
	266	CALL fftx_tr_xy( ar, work, ar )
	267
	268	!
	269	!-- FFT along y, solving the tridiagonal system and backward FFT
	270	CALL ffty_tri_ffty( ar )
	271
	272	!
	273	!-- Transposition y --> x and backward FFT along x
	274	CALL tr_yx_fftx( ar, work, ar )
	275
	276	ELSE
	277
	278	!
[1111]	279	!-- 2d-domain-decomposition or no decomposition (1 PE run)
[1]	280	!-- Transposition z --> x
	281	CALL cpu_log( log_point_s(5), 'transpo forward', 'start' )
[164]	282	CALL transpose_zx( ar, work, ar )
[1]	283	CALL cpu_log( log_point_s(5), 'transpo forward', 'pause' )
	284
	285	CALL cpu_log( log_point_s(4), 'fft_x', 'start' )
[1106]	286	CALL fft_x( ar, 'forward' )
[1]	287	CALL cpu_log( log_point_s(4), 'fft_x', 'pause' )
	288
	289	!
	290	!-- Transposition x --> y
	291	CALL cpu_log( log_point_s(5), 'transpo forward', 'continue' )
[164]	292	CALL transpose_xy( ar, work, ar )
[1]	293	CALL cpu_log( log_point_s(5), 'transpo forward', 'pause' )
	294
	295	CALL cpu_log( log_point_s(7), 'fft_y', 'start' )
[1106]	296	CALL fft_y( ar, 'forward' )
[1]	297	CALL cpu_log( log_point_s(7), 'fft_y', 'pause' )
	298
	299	!
	300	!-- Transposition y --> z
	301	CALL cpu_log( log_point_s(5), 'transpo forward', 'continue' )
[164]	302	CALL transpose_yz( ar, work, ar )
[1]	303	CALL cpu_log( log_point_s(5), 'transpo forward', 'stop' )
	304
	305	!
[1106]	306	!-- Solve the tridiagonal equation system along z
[1]	307	CALL cpu_log( log_point_s(6), 'tridia', 'start' )
	308	CALL tridia( ar )
	309	CALL cpu_log( log_point_s(6), 'tridia', 'stop' )
	310
	311	!
	312	!-- Inverse Fourier Transformation
	313	!-- Transposition z --> y
	314	CALL cpu_log( log_point_s(8), 'transpo invers', 'start' )
[164]	315	CALL transpose_zy( ar, work, ar )
[1]	316	CALL cpu_log( log_point_s(8), 'transpo invers', 'pause' )
	317
	318	CALL cpu_log( log_point_s(7), 'fft_y', 'continue' )
[1106]	319	CALL fft_y( ar, 'backward' )
[1]	320	CALL cpu_log( log_point_s(7), 'fft_y', 'stop' )
	321
	322	!
	323	!-- Transposition y --> x
	324	CALL cpu_log( log_point_s(8), 'transpo invers', 'continue' )
[164]	325	CALL transpose_yx( ar, work, ar )
[1]	326	CALL cpu_log( log_point_s(8), 'transpo invers', 'pause' )
	327
	328	CALL cpu_log( log_point_s(4), 'fft_x', 'continue' )
[1106]	329	CALL fft_x( ar, 'backward' )
[1]	330	CALL cpu_log( log_point_s(4), 'fft_x', 'stop' )
	331
	332	!
	333	!-- Transposition x --> z
	334	CALL cpu_log( log_point_s(8), 'transpo invers', 'continue' )
[164]	335	CALL transpose_xz( ar, work, ar )
[1]	336	CALL cpu_log( log_point_s(8), 'transpo invers', 'stop' )
	337
	338	ENDIF
	339
	340	CALL cpu_log( log_point_s(3), 'poisfft', 'stop' )
	341
	342	END SUBROUTINE poisfft
	343
	344
	345
	346	SUBROUTINE ffty_tr_yx( f_in, work, f_out )
	347
	348	!------------------------------------------------------------------------------!
	349	! Fourier-transformation along y with subsequent transposition y --> x for
	350	! a 1d-decomposition along x
	351	!
	352	! ATTENTION: The performance of this routine is much faster on the NEC-SX6,
	353	! if the first index of work_ffty_vec is odd. Otherwise
	354	! memory bank conflicts may occur (especially if the index is a
	355	! multiple of 128). That's why work_ffty_vec is dimensioned as
	356	! 0:ny+1.
	357	! Of course, this will not work if users are using an odd number
	358	! of gridpoints along y.
	359	!------------------------------------------------------------------------------!
	360
	361	USE control_parameters
	362	USE cpulog
	363	USE indices
	364	USE interfaces
	365	USE pegrid
	366	USE transpose_indices
	367
	368	IMPLICIT NONE
	369
	370	INTEGER :: i, iend, iouter, ir, j, k
	371	INTEGER, PARAMETER :: stridex = 4
	372
	373	REAL, DIMENSION(0:ny,stridex) :: work_ffty
	374	#if defined( __nec )
	375	REAL, DIMENSION(0:ny+1,1:nz,nxl:nxr) :: work_ffty_vec
	376	#endif
[1003]	377	REAL, DIMENSION(1:nz,0:ny,nxl:nxr) :: f_in
	378	REAL, DIMENSION(nnx,1:nz,nys_x:nyn_x,pdims(1)) :: f_out
	379	REAL, DIMENSION(nxl:nxr,1:nz,0:ny) :: work
[1]	380
	381	!
	382	!-- Carry out the FFT along y, where all data are present due to the
	383	!-- 1d-decomposition along x. Resort the data in a way that x becomes
	384	!-- the first index.
[1106]	385	CALL cpu_log( log_point_s(7), 'fft_y_1d', 'start' )
[1]	386
	387	IF ( host(1:3) == 'nec' ) THEN
	388	#if defined( __nec )
	389	!
	390	!-- Code optimized for vector processors
[85]	391	!$OMP PARALLEL PRIVATE ( i, j, k )
[1]	392	!$OMP DO
	393	DO i = nxl, nxr
	394
	395	DO j = 0, ny
	396	DO k = 1, nz
	397	work_ffty_vec(j,k,i) = f_in(k,j,i)
	398	ENDDO
	399	ENDDO
	400
	401	CALL fft_y_m( work_ffty_vec(:,:,i), ny+1, 'forward' )
	402
	403	ENDDO
	404
	405	!$OMP DO
	406	DO k = 1, nz
	407	DO j = 0, ny
	408	DO i = nxl, nxr
	409	work(i,k,j) = work_ffty_vec(j,k,i)
	410	ENDDO
	411	ENDDO
	412	ENDDO
	413	!$OMP END PARALLEL
	414	#endif
	415
	416	ELSE
	417
	418	!
	419	!-- Cache optimized code.
	420	!-- The i-(x-)direction is split into a strided outer loop and an inner
	421	!-- loop for better cache performance
	422	!$OMP PARALLEL PRIVATE (i,iend,iouter,ir,j,k,work_ffty)
	423	!$OMP DO
	424	DO iouter = nxl, nxr, stridex
	425
	426	iend = MIN( iouter+stridex-1, nxr ) ! Upper bound for inner i loop
	427
	428	DO k = 1, nz
	429
	430	DO i = iouter, iend
	431
	432	ir = i-iouter+1 ! counter within a stride
	433	DO j = 0, ny
	434	work_ffty(j,ir) = f_in(k,j,i)
	435	ENDDO
	436	!
	437	!-- FFT along y
[1106]	438	CALL fft_y_1d( work_ffty(:,ir), 'forward' )
[1]	439
	440	ENDDO
	441
	442	!
	443	!-- Resort
	444	DO j = 0, ny
	445	DO i = iouter, iend
	446	work(i,k,j) = work_ffty(j,i-iouter+1)
	447	ENDDO
	448	ENDDO
	449
	450	ENDDO
	451
	452	ENDDO
	453	!$OMP END PARALLEL
	454
	455	ENDIF
[1106]	456	CALL cpu_log( log_point_s(7), 'fft_y_1d', 'pause' )
[1]	457
	458	!
	459	!-- Transpose array
[1111]	460	#if defined( __parallel )
[1]	461	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
[622]	462	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
[1]	463	CALL MPI_ALLTOALL( work(nxl,1,0), sendrecvcount_xy, MPI_REAL, &
	464	f_out(1,1,nys_x,1), sendrecvcount_xy, MPI_REAL, &
	465	comm1dx, ierr )
	466	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
[1111]	467	#endif
[1]	468
	469	END SUBROUTINE ffty_tr_yx
	470
	471
	472	SUBROUTINE tr_xy_ffty( f_in, work, f_out )
	473
	474	!------------------------------------------------------------------------------!
	475	! Transposition x --> y with a subsequent backward Fourier transformation for
	476	! a 1d-decomposition along x
	477	!------------------------------------------------------------------------------!
	478
	479	USE control_parameters
	480	USE cpulog
	481	USE indices
	482	USE interfaces
	483	USE pegrid
	484	USE transpose_indices
	485
	486	IMPLICIT NONE
	487
	488	INTEGER :: i, iend, iouter, ir, j, k
	489	INTEGER, PARAMETER :: stridex = 4
	490
	491	REAL, DIMENSION(0:ny,stridex) :: work_ffty
	492	#if defined( __nec )
	493	REAL, DIMENSION(0:ny+1,1:nz,nxl:nxr) :: work_ffty_vec
	494	#endif
[1003]	495	REAL, DIMENSION(nnx,1:nz,nys_x:nyn_x,pdims(1)) :: f_in
	496	REAL, DIMENSION(1:nz,0:ny,nxl:nxr) :: f_out
	497	REAL, DIMENSION(nxl:nxr,1:nz,0:ny) :: work
[1]	498
	499	!
	500	!-- Transpose array
[1111]	501	#if defined( __parallel )
[1]	502	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
[622]	503	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
[1]	504	CALL MPI_ALLTOALL( f_in(1,1,nys_x,1), sendrecvcount_xy, MPI_REAL, &
	505	work(nxl,1,0), sendrecvcount_xy, MPI_REAL, &
	506	comm1dx, ierr )
	507	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
[1111]	508	#endif
[1]	509
	510	!
	511	!-- Resort the data in a way that y becomes the first index and carry out the
	512	!-- backward fft along y.
[1106]	513	CALL cpu_log( log_point_s(7), 'fft_y_1d', 'continue' )
[1]	514
	515	IF ( host(1:3) == 'nec' ) THEN
	516	#if defined( __nec )
	517	!
	518	!-- Code optimized for vector processors
[85]	519	!$OMP PARALLEL PRIVATE ( i, j, k )
[1]	520	!$OMP DO
	521	DO k = 1, nz
	522	DO j = 0, ny
	523	DO i = nxl, nxr
	524	work_ffty_vec(j,k,i) = work(i,k,j)
	525	ENDDO
	526	ENDDO
	527	ENDDO
	528
	529	!$OMP DO
	530	DO i = nxl, nxr
	531
	532	CALL fft_y_m( work_ffty_vec(:,:,i), ny+1, 'backward' )
	533
	534	DO j = 0, ny
	535	DO k = 1, nz
	536	f_out(k,j,i) = work_ffty_vec(j,k,i)
	537	ENDDO
	538	ENDDO
	539
	540	ENDDO
	541	!$OMP END PARALLEL
	542	#endif
	543
	544	ELSE
	545
	546	!
	547	!-- Cache optimized code.
	548	!-- The i-(x-)direction is split into a strided outer loop and an inner
	549	!-- loop for better cache performance
	550	!$OMP PARALLEL PRIVATE ( i, iend, iouter, ir, j, k, work_ffty )
	551	!$OMP DO
	552	DO iouter = nxl, nxr, stridex
	553
	554	iend = MIN( iouter+stridex-1, nxr ) ! Upper bound for inner i loop
	555
	556	DO k = 1, nz
	557	!
	558	!-- Resort
	559	DO j = 0, ny
	560	DO i = iouter, iend
	561	work_ffty(j,i-iouter+1) = work(i,k,j)
	562	ENDDO
	563	ENDDO
	564
	565	DO i = iouter, iend
	566
	567	!
	568	!-- FFT along y
	569	ir = i-iouter+1 ! counter within a stride
[1106]	570	CALL fft_y_1d( work_ffty(:,ir), 'backward' )
[1]	571
	572	DO j = 0, ny
	573	f_out(k,j,i) = work_ffty(j,ir)
	574	ENDDO
	575	ENDDO
	576
	577	ENDDO
	578
	579	ENDDO
	580	!$OMP END PARALLEL
	581
	582	ENDIF
	583
[1106]	584	CALL cpu_log( log_point_s(7), 'fft_y_1d', 'stop' )
[1]	585
	586	END SUBROUTINE tr_xy_ffty
	587
	588
	589	SUBROUTINE fftx_tri_fftx( ar )
	590
	591	!------------------------------------------------------------------------------!
	592	! FFT along x, solution of the tridiagonal system and backward FFT for
	593	! a 1d-decomposition along x
	594	!
	595	! WARNING: this subroutine may still not work for hybrid parallelization
	596	! with OpenMP (for possible necessary changes see the original
	597	! routine poisfft_hybrid, developed by Klaus Ketelsen, May 2002)
	598	!------------------------------------------------------------------------------!
	599
	600	USE control_parameters
	601	USE cpulog
	602	USE grid_variables
	603	USE indices
	604	USE interfaces
	605	USE pegrid
	606	USE transpose_indices
	607
	608	IMPLICIT NONE
	609
	610	INTEGER :: i, j, k, m, n, omp_get_thread_num, tn
	611
[1003]	612	REAL, DIMENSION(0:nx) :: work_fftx
	613	REAL, DIMENSION(0:nx,1:nz) :: work_trix
	614	REAL, DIMENSION(nnx,1:nz,nys_x:nyn_x,pdims(1)) :: ar
	615	REAL, DIMENSION(:,:,:,:), ALLOCATABLE :: tri
[1]	616
	617
[1106]	618	CALL cpu_log( log_point_s(33), 'fft_x_1d + tridia', 'start' )
[1]	619
	620	ALLOCATE( tri(5,0:nx,0:nz-1,0:threads_per_task-1) )
	621
	622	tn = 0 ! Default thread number in case of one thread
	623	!$OMP PARALLEL DO PRIVATE ( i, j, k, m, n, tn, work_fftx, work_trix )
	624	DO j = nys_x, nyn_x
	625
	626	!$ tn = omp_get_thread_num()
	627
	628	IF ( host(1:3) == 'nec' ) THEN
	629	!
	630	!-- Code optimized for vector processors
	631	DO k = 1, nz
	632
	633	m = 0
	634	DO n = 1, pdims(1)
[1003]	635	DO i = 1, nnx
[1]	636	work_trix(m,k) = ar(i,k,j,n)
	637	m = m + 1
	638	ENDDO
	639	ENDDO
	640
	641	ENDDO
	642
	643	CALL fft_x_m( work_trix, 'forward' )
	644
	645	ELSE
	646	!
	647	!-- Cache optimized code
	648	DO k = 1, nz
	649
	650	m = 0
	651	DO n = 1, pdims(1)
[1003]	652	DO i = 1, nnx
[1]	653	work_fftx(m) = ar(i,k,j,n)
	654	m = m + 1
	655	ENDDO
	656	ENDDO
	657
[1106]	658	CALL fft_x_1d( work_fftx, 'forward' )
[1]	659
	660	DO i = 0, nx
	661	work_trix(i,k) = work_fftx(i)
	662	ENDDO
	663
	664	ENDDO
	665
	666	ENDIF
	667
	668	!
	669	!-- Solve the linear equation system
	670	CALL tridia_1dd( ddx2, ddy2, nx, ny, j, work_trix, tri(:,:,:,tn) )
	671
	672	IF ( host(1:3) == 'nec' ) THEN
	673	!
	674	!-- Code optimized for vector processors
	675	CALL fft_x_m( work_trix, 'backward' )
	676
	677	DO k = 1, nz
	678
	679	m = 0
	680	DO n = 1, pdims(1)
[1003]	681	DO i = 1, nnx
[1]	682	ar(i,k,j,n) = work_trix(m,k)
	683	m = m + 1
	684	ENDDO
	685	ENDDO
	686
	687	ENDDO
	688
	689	ELSE
	690	!
	691	!-- Cache optimized code
	692	DO k = 1, nz
	693
	694	DO i = 0, nx
	695	work_fftx(i) = work_trix(i,k)
	696	ENDDO
	697
[1106]	698	CALL fft_x_1d( work_fftx, 'backward' )
[1]	699
	700	m = 0
	701	DO n = 1, pdims(1)
[1003]	702	DO i = 1, nnx
[1]	703	ar(i,k,j,n) = work_fftx(m)
	704	m = m + 1
	705	ENDDO
	706	ENDDO
	707
	708	ENDDO
	709
	710	ENDIF
	711
	712	ENDDO
	713
	714	DEALLOCATE( tri )
	715
[1106]	716	CALL cpu_log( log_point_s(33), 'fft_x_1d + tridia', 'stop' )
[1]	717
	718	END SUBROUTINE fftx_tri_fftx
	719
	720
	721	SUBROUTINE fftx_tr_xy( f_in, work, f_out )
	722
	723	!------------------------------------------------------------------------------!
	724	! Fourier-transformation along x with subsequent transposition x --> y for
	725	! a 1d-decomposition along y
	726	!
	727	! ATTENTION: The NEC-branch of this routine may significantly profit from
	728	! further optimizations. So far, performance is much worse than
	729	! for routine ffty_tr_yx (more than three times slower).
	730	!------------------------------------------------------------------------------!
	731
	732	USE control_parameters
	733	USE cpulog
	734	USE indices
	735	USE interfaces
	736	USE pegrid
	737	USE transpose_indices
	738
	739	IMPLICIT NONE
	740
	741	INTEGER :: i, j, k
	742
[1003]	743	REAL, DIMENSION(0:nx,1:nz,nys:nyn) :: work_fftx
	744	REAL, DIMENSION(1:nz,nys:nyn,0:nx) :: f_in
	745	REAL, DIMENSION(nny,1:nz,nxl_y:nxr_y,pdims(2)) :: f_out
	746	REAL, DIMENSION(nys:nyn,1:nz,0:nx) :: work
[1]	747
	748	!
	749	!-- Carry out the FFT along x, where all data are present due to the
	750	!-- 1d-decomposition along y. Resort the data in a way that y becomes
	751	!-- the first index.
[1106]	752	CALL cpu_log( log_point_s(4), 'fft_x_1d', 'start' )
[1]	753
	754	IF ( host(1:3) == 'nec' ) THEN
	755	!
	756	!-- Code for vector processors
[85]	757	!$OMP PARALLEL PRIVATE ( i, j, k )
[1]	758	!$OMP DO
	759	DO i = 0, nx
	760
	761	DO j = nys, nyn
	762	DO k = 1, nz
	763	work_fftx(i,k,j) = f_in(k,j,i)
	764	ENDDO
	765	ENDDO
	766
	767	ENDDO
	768
	769	!$OMP DO
	770	DO j = nys, nyn
	771
	772	CALL fft_x_m( work_fftx(:,:,j), 'forward' )
	773
	774	DO k = 1, nz
	775	DO i = 0, nx
	776	work(j,k,i) = work_fftx(i,k,j)
	777	ENDDO
	778	ENDDO
	779
	780	ENDDO
	781	!$OMP END PARALLEL
	782
	783	ELSE
	784
	785	!
	786	!-- Cache optimized code (there might be still a potential for better
	787	!-- optimization).
[696]	788	!$OMP PARALLEL PRIVATE (i,j,k)
[1]	789	!$OMP DO
	790	DO i = 0, nx
	791
	792	DO j = nys, nyn
	793	DO k = 1, nz
	794	work_fftx(i,k,j) = f_in(k,j,i)
	795	ENDDO
	796	ENDDO
	797
	798	ENDDO
	799
	800	!$OMP DO
	801	DO j = nys, nyn
	802	DO k = 1, nz
	803
[1106]	804	CALL fft_x_1d( work_fftx(0:nx,k,j), 'forward' )
[1]	805
	806	DO i = 0, nx
	807	work(j,k,i) = work_fftx(i,k,j)
	808	ENDDO
	809	ENDDO
	810
	811	ENDDO
	812	!$OMP END PARALLEL
	813
	814	ENDIF
[1106]	815	CALL cpu_log( log_point_s(4), 'fft_x_1d', 'pause' )
[1]	816
	817	!
	818	!-- Transpose array
[1111]	819	#if defined( __parallel )
[1]	820	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
[622]	821	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
[1]	822	CALL MPI_ALLTOALL( work(nys,1,0), sendrecvcount_xy, MPI_REAL, &
	823	f_out(1,1,nxl_y,1), sendrecvcount_xy, MPI_REAL, &
	824	comm1dy, ierr )
	825	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
[1111]	826	#endif
[1]	827
	828	END SUBROUTINE fftx_tr_xy
	829
	830
	831	SUBROUTINE tr_yx_fftx( f_in, work, f_out )
	832
	833	!------------------------------------------------------------------------------!
	834	! Transposition y --> x with a subsequent backward Fourier transformation for
	835	! a 1d-decomposition along x
	836	!------------------------------------------------------------------------------!
	837
	838	USE control_parameters
	839	USE cpulog
	840	USE indices
	841	USE interfaces
	842	USE pegrid
	843	USE transpose_indices
	844
	845	IMPLICIT NONE
	846
	847	INTEGER :: i, j, k
	848
[1003]	849	REAL, DIMENSION(0:nx,1:nz,nys:nyn) :: work_fftx
	850	REAL, DIMENSION(nny,1:nz,nxl_y:nxr_y,pdims(2)) :: f_in
	851	REAL, DIMENSION(1:nz,nys:nyn,0:nx) :: f_out
	852	REAL, DIMENSION(nys:nyn,1:nz,0:nx) :: work
[1]	853
	854	!
	855	!-- Transpose array
[1111]	856	#if defined( __parallel )
[1]	857	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
[622]	858	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
[1]	859	CALL MPI_ALLTOALL( f_in(1,1,nxl_y,1), sendrecvcount_xy, MPI_REAL, &
	860	work(nys,1,0), sendrecvcount_xy, MPI_REAL, &
	861	comm1dy, ierr )
	862	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
[1111]	863	#endif
[1]	864
	865	!
	866	!-- Carry out the FFT along x, where all data are present due to the
	867	!-- 1d-decomposition along y. Resort the data in a way that y becomes
	868	!-- the first index.
[1106]	869	CALL cpu_log( log_point_s(4), 'fft_x_1d', 'continue' )
[1]	870
	871	IF ( host(1:3) == 'nec' ) THEN
	872	!
	873	!-- Code optimized for vector processors
[85]	874	!$OMP PARALLEL PRIVATE ( i, j, k )
[1]	875	!$OMP DO
	876	DO j = nys, nyn
	877
	878	DO k = 1, nz
	879	DO i = 0, nx
	880	work_fftx(i,k,j) = work(j,k,i)
	881	ENDDO
	882	ENDDO
	883
	884	CALL fft_x_m( work_fftx(:,:,j), 'backward' )
	885
	886	ENDDO
	887
	888	!$OMP DO
	889	DO i = 0, nx
	890	DO j = nys, nyn
	891	DO k = 1, nz
	892	f_out(k,j,i) = work_fftx(i,k,j)
	893	ENDDO
	894	ENDDO
	895	ENDDO
	896	!$OMP END PARALLEL
	897
	898	ELSE
	899
	900	!
	901	!-- Cache optimized code (there might be still a potential for better
	902	!-- optimization).
[696]	903	!$OMP PARALLEL PRIVATE (i,j,k)
[1]	904	!$OMP DO
	905	DO j = nys, nyn
	906	DO k = 1, nz
	907
	908	DO i = 0, nx
	909	work_fftx(i,k,j) = work(j,k,i)
	910	ENDDO
	911
[1106]	912	CALL fft_x_1d( work_fftx(0:nx,k,j), 'backward' )
[1]	913
	914	ENDDO
	915	ENDDO
	916
	917	!$OMP DO
	918	DO i = 0, nx
	919	DO j = nys, nyn
	920	DO k = 1, nz
	921	f_out(k,j,i) = work_fftx(i,k,j)
	922	ENDDO
	923	ENDDO
	924	ENDDO
	925	!$OMP END PARALLEL
	926
	927	ENDIF
[1106]	928	CALL cpu_log( log_point_s(4), 'fft_x_1d', 'stop' )
[1]	929
	930	END SUBROUTINE tr_yx_fftx
	931
	932
	933	SUBROUTINE ffty_tri_ffty( ar )
	934
	935	!------------------------------------------------------------------------------!
	936	! FFT along y, solution of the tridiagonal system and backward FFT for
	937	! a 1d-decomposition along y
	938	!
	939	! WARNING: this subroutine may still not work for hybrid parallelization
	940	! with OpenMP (for possible necessary changes see the original
	941	! routine poisfft_hybrid, developed by Klaus Ketelsen, May 2002)
	942	!------------------------------------------------------------------------------!
	943
	944	USE control_parameters
	945	USE cpulog
	946	USE grid_variables
	947	USE indices
	948	USE interfaces
	949	USE pegrid
	950	USE transpose_indices
	951
	952	IMPLICIT NONE
	953
	954	INTEGER :: i, j, k, m, n, omp_get_thread_num, tn
	955
[1003]	956	REAL, DIMENSION(0:ny) :: work_ffty
	957	REAL, DIMENSION(0:ny,1:nz) :: work_triy
	958	REAL, DIMENSION(nny,1:nz,nxl_y:nxr_y,pdims(2)) :: ar
	959	REAL, DIMENSION(:,:,:,:), ALLOCATABLE :: tri
[1]	960
	961
[1106]	962	CALL cpu_log( log_point_s(39), 'fft_y_1d + tridia', 'start' )
[1]	963
	964	ALLOCATE( tri(5,0:ny,0:nz-1,0:threads_per_task-1) )
	965
	966	tn = 0 ! Default thread number in case of one thread
[696]	967	!$OMP PARALLEL DO PRIVATE ( i, j, k, m, n, tn, work_ffty, work_triy )
[1]	968	DO i = nxl_y, nxr_y
	969
	970	!$ tn = omp_get_thread_num()
	971
	972	IF ( host(1:3) == 'nec' ) THEN
	973	!
	974	!-- Code optimized for vector processors
	975	DO k = 1, nz
	976
	977	m = 0
	978	DO n = 1, pdims(2)
[1003]	979	DO j = 1, nny
[1]	980	work_triy(m,k) = ar(j,k,i,n)
	981	m = m + 1
	982	ENDDO
	983	ENDDO
	984
	985	ENDDO
	986
	987	CALL fft_y_m( work_triy, ny, 'forward' )
	988
	989	ELSE
	990	!
	991	!-- Cache optimized code
	992	DO k = 1, nz
	993
	994	m = 0
	995	DO n = 1, pdims(2)
[1003]	996	DO j = 1, nny
[1]	997	work_ffty(m) = ar(j,k,i,n)
	998	m = m + 1
	999	ENDDO
	1000	ENDDO
	1001
[1106]	1002	CALL fft_y_1d( work_ffty, 'forward' )
[1]	1003
	1004	DO j = 0, ny
	1005	work_triy(j,k) = work_ffty(j)
	1006	ENDDO
	1007
	1008	ENDDO
	1009
	1010	ENDIF
	1011
	1012	!
	1013	!-- Solve the linear equation system
	1014	CALL tridia_1dd( ddy2, ddx2, ny, nx, i, work_triy, tri(:,:,:,tn) )
	1015
	1016	IF ( host(1:3) == 'nec' ) THEN
	1017	!
	1018	!-- Code optimized for vector processors
	1019	CALL fft_y_m( work_triy, ny, 'backward' )
	1020
	1021	DO k = 1, nz
	1022
	1023	m = 0
	1024	DO n = 1, pdims(2)
[1003]	1025	DO j = 1, nny
[1]	1026	ar(j,k,i,n) = work_triy(m,k)
	1027	m = m + 1
	1028	ENDDO
	1029	ENDDO
	1030
	1031	ENDDO
	1032
	1033	ELSE
	1034	!
	1035	!-- Cache optimized code
	1036	DO k = 1, nz
	1037
	1038	DO j = 0, ny
	1039	work_ffty(j) = work_triy(j,k)
	1040	ENDDO
	1041
[1106]	1042	CALL fft_y_1d( work_ffty, 'backward' )
[1]	1043
	1044	m = 0
	1045	DO n = 1, pdims(2)
[1003]	1046	DO j = 1, nny
[1]	1047	ar(j,k,i,n) = work_ffty(m)
	1048	m = m + 1
	1049	ENDDO
	1050	ENDDO
	1051
	1052	ENDDO
	1053
	1054	ENDIF
	1055
	1056	ENDDO
	1057
	1058	DEALLOCATE( tri )
	1059
[1106]	1060	CALL cpu_log( log_point_s(39), 'fft_y_1d + tridia', 'stop' )
[1]	1061
	1062	END SUBROUTINE ffty_tri_ffty
	1063
	1064
	1065	SUBROUTINE tridia_1dd( ddx2, ddy2, nx, ny, j, ar, tri )
	1066
	1067	!------------------------------------------------------------------------------!
	1068	! Solves the linear system of equations for a 1d-decomposition along x (see
	1069	! tridia)
	1070	!
[940]	1071	! Attention: when using the intel compilers older than 12.0, array tri must
	1072	! be passed as an argument to the contained subroutines. Otherwise
	1073	! addres faults will occur. This feature can be activated with
	1074	! cpp-switch __intel11
[1]	1075	! On NEC, tri should not be passed (except for routine substi_1dd)
	1076	! because this causes very bad performance.
	1077	!------------------------------------------------------------------------------!
	1078
	1079	USE arrays_3d
	1080	USE control_parameters
	1081
	1082	USE pegrid
	1083
	1084	IMPLICIT NONE
	1085
	1086	INTEGER :: i, j, k, nnyh, nx, ny, omp_get_thread_num, tn
	1087
	1088	REAL :: ddx2, ddy2
	1089
	1090	REAL, DIMENSION(0:nx,1:nz) :: ar
	1091	REAL, DIMENSION(5,0:nx,0:nz-1) :: tri
	1092
	1093
	1094	nnyh = ( ny + 1 ) / 2
	1095
	1096	!
	1097	!-- Define constant elements of the tridiagonal matrix.
	1098	!-- The compiler on SX6 does loop exchange. If 0:nx is a high power of 2,
	1099	!-- the exchanged loops create bank conflicts. The following directive
	1100	!-- prohibits loop exchange and the loops perform much better.
	1101	! tn = omp_get_thread_num()
	1102	! WRITE( 120+tn, * ) '+++ id=',myid,' nx=',nx,' thread=', omp_get_thread_num()
[82]	1103	! CALL local_flush( 120+tn )
[1]	1104	!CDIR NOLOOPCHG
	1105	DO k = 0, nz-1
	1106	DO i = 0,nx
[667]	1107	tri(2,i,k) = ddzu_pres(k+1) * ddzw(k+1)
	1108	tri(3,i,k) = ddzu_pres(k+2) * ddzw(k+1)
[1]	1109	ENDDO
	1110	ENDDO
	1111	! WRITE( 120+tn, * ) '+++ id=',myid,' end of first tridia loop thread=', omp_get_thread_num()
[82]	1112	! CALL local_flush( 120+tn )
[1]	1113
	1114	IF ( j <= nnyh ) THEN
[940]	1115	#if defined( __intel11 )
[1]	1116	CALL maketri_1dd( j, tri )
	1117	#else
	1118	CALL maketri_1dd( j )
	1119	#endif
	1120	ELSE
[940]	1121	#if defined( __intel11 )
[1]	1122	CALL maketri_1dd( ny+1-j, tri )
	1123	#else
	1124	CALL maketri_1dd( ny+1-j )
	1125	#endif
	1126	ENDIF
[940]	1127	#if defined( __intel11 )
[1]	1128	CALL split_1dd( tri )
	1129	#else
	1130	CALL split_1dd
	1131	#endif
	1132	CALL substi_1dd( ar, tri )
	1133
	1134	CONTAINS
	1135
[940]	1136	#if defined( __intel11 )
[1]	1137	SUBROUTINE maketri_1dd( j, tri )
	1138	#else
	1139	SUBROUTINE maketri_1dd( j )
	1140	#endif
	1141
	1142	!------------------------------------------------------------------------------!
	1143	! computes the i- and j-dependent component of the matrix
	1144	!------------------------------------------------------------------------------!
	1145
	1146	USE constants
	1147
	1148	IMPLICIT NONE
	1149
	1150	INTEGER :: i, j, k, nnxh
	1151	REAL :: a, c
	1152
	1153	REAL, DIMENSION(0:nx) :: l
	1154
[940]	1155	#if defined( __intel11 )
[1]	1156	REAL, DIMENSION(5,0:nx,0:nz-1) :: tri
	1157	#endif
	1158
	1159
	1160	nnxh = ( nx + 1 ) / 2
	1161	!
	1162	!-- Provide the tridiagonal matrix for solution of the Poisson equation in
	1163	!-- Fourier space. The coefficients are computed following the method of
	1164	!-- Schmidt et al. (DFVLR-Mitteilung 84-15), which departs from Stephan
	1165	!-- Siano's original version by discretizing the Poisson equation,
	1166	!-- before it is Fourier-transformed
	1167	DO i = 0, nx
[128]	1168	IF ( i >= 0 .AND. i <= nnxh ) THEN
[1]	1169	l(i) = 2.0 * ( 1.0 - COS( ( 2.0 * pi * i ) / &
[1013]	1170	REAL( nx+1 ) ) ) * ddx2 + &
[1]	1171	2.0 * ( 1.0 - COS( ( 2.0 * pi * j ) / &
[1013]	1172	REAL( ny+1 ) ) ) * ddy2
[1]	1173	ELSE
	1174	l(i) = 2.0 * ( 1.0 - COS( ( 2.0 * pi * ( nx+1-i ) ) / &
[1013]	1175	REAL( nx+1 ) ) ) * ddx2 + &
[1]	1176	2.0 * ( 1.0 - COS( ( 2.0 * pi * j ) / &
[1013]	1177	REAL( ny+1 ) ) ) * ddy2
[1]	1178	ENDIF
	1179	ENDDO
	1180
	1181	DO k = 0, nz-1
	1182	DO i = 0, nx
[667]	1183	a = -1.0 * ddzu_pres(k+2) * ddzw(k+1)
	1184	c = -1.0 * ddzu_pres(k+1) * ddzw(k+1)
[1]	1185	tri(1,i,k) = a + c - l(i)
	1186	ENDDO
	1187	ENDDO
[1111]	1188	IF ( ibc_p_b == 1 ) THEN
[1]	1189	DO i = 0, nx
	1190	tri(1,i,0) = tri(1,i,0) + tri(2,i,0)
	1191	ENDDO
	1192	ENDIF
	1193	IF ( ibc_p_t == 1 ) THEN
	1194	DO i = 0, nx
	1195	tri(1,i,nz-1) = tri(1,i,nz-1) + tri(3,i,nz-1)
	1196	ENDDO
	1197	ENDIF
	1198
	1199	END SUBROUTINE maketri_1dd
	1200
	1201
[940]	1202	#if defined( __intel11 )
[1]	1203	SUBROUTINE split_1dd( tri )
	1204	#else
	1205	SUBROUTINE split_1dd
	1206	#endif
	1207
	1208	!------------------------------------------------------------------------------!
	1209	! Splitting of the tridiagonal matrix (Thomas algorithm)
	1210	!------------------------------------------------------------------------------!
	1211
	1212	IMPLICIT NONE
	1213
	1214	INTEGER :: i, k
	1215
[940]	1216	#if defined( __intel11 )
[1]	1217	REAL, DIMENSION(5,0:nx,0:nz-1) :: tri
	1218	#endif
	1219
	1220
	1221	!
	1222	!-- Splitting
	1223	DO i = 0, nx
	1224	tri(4,i,0) = tri(1,i,0)
	1225	ENDDO
	1226	DO k = 1, nz-1
	1227	DO i = 0, nx
	1228	tri(5,i,k) = tri(2,i,k) / tri(4,i,k-1)
	1229	tri(4,i,k) = tri(1,i,k) - tri(3,i,k-1) * tri(5,i,k)
	1230	ENDDO
	1231	ENDDO
	1232
	1233	END SUBROUTINE split_1dd
	1234
	1235
	1236	SUBROUTINE substi_1dd( ar, tri )
	1237
	1238	!------------------------------------------------------------------------------!
	1239	! Substitution (Forward and Backward) (Thomas algorithm)
	1240	!------------------------------------------------------------------------------!
	1241
	1242	IMPLICIT NONE
	1243
[76]	1244	INTEGER :: i, k
[1]	1245
	1246	REAL, DIMENSION(0:nx,nz) :: ar
	1247	REAL, DIMENSION(0:nx,0:nz-1) :: ar1
	1248	REAL, DIMENSION(5,0:nx,0:nz-1) :: tri
	1249
	1250	!
	1251	!-- Forward substitution
	1252	DO i = 0, nx
	1253	ar1(i,0) = ar(i,1)
	1254	ENDDO
	1255	DO k = 1, nz-1
	1256	DO i = 0, nx
	1257	ar1(i,k) = ar(i,k+1) - tri(5,i,k) * ar1(i,k-1)
	1258	ENDDO
	1259	ENDDO
	1260
	1261	!
	1262	!-- Backward substitution
[763]	1263	!-- Note, the add of 1.0E-20 in the denominator is due to avoid divisions
	1264	!-- by zero appearing if the pressure bc is set to neumann at the top of
	1265	!-- the model domain.
[1]	1266	DO i = 0, nx
[761]	1267	ar(i,nz) = ar1(i,nz-1) / ( tri(4,i,nz-1) + 1.0E-20 )
[1]	1268	ENDDO
	1269	DO k = nz-2, 0, -1
	1270	DO i = 0, nx
	1271	ar(i,k+1) = ( ar1(i,k) - tri(3,i,k) * ar(i,k+2) ) &
	1272	/ tri(4,i,k)
	1273	ENDDO
	1274	ENDDO
	1275
[76]	1276	!
	1277	!-- Indices i=0, j=0 correspond to horizontally averaged pressure.
	1278	!-- The respective values of ar should be zero at all k-levels if
	1279	!-- acceleration of horizontally averaged vertical velocity is zero.
	1280	IF ( ibc_p_b == 1 .AND. ibc_p_t == 1 ) THEN
	1281	IF ( j == 0 ) THEN
	1282	DO k = 1, nz
	1283	ar(0,k) = 0.0
	1284	ENDDO
	1285	ENDIF
	1286	ENDIF
	1287
[1]	1288	END SUBROUTINE substi_1dd
	1289
	1290	END SUBROUTINE tridia_1dd
	1291
[1111]	1292
	1293	SUBROUTINE tridia( ar )
	1294
	1295	!------------------------------------------------------------------------------!
	1296	! solves the linear system of equations:
	1297	!
	1298	! -(4 pi^2(i^2/(dx^2nnx^2)+j^2/(dy^2nny^2))+
	1299	! 1/(dzu(k)dzw(k))+1/(dzu(k-1)dzw(k)))*p(i,j,k)+
	1300	! 1/(dzu(k)dzw(k))p(i,j,k+1)+1/(dzu(k-1)dzw(k))p(i,j,k-1)=d(i,j,k)
	1301	!
	1302	! by using the Thomas algorithm
	1303	!------------------------------------------------------------------------------!
	1304
	1305	USE arrays_3d
	1306
	1307	IMPLICIT NONE
	1308
	1309	INTEGER :: i, j, k
	1310
	1311	!$acc declare create( tri )
	1312	REAL, DIMENSION(nxl_z:nxr_z,nys_z:nyn_z,0:nz-1,2) :: tri
	1313
	1314	REAL :: ar(nxl_z:nxr_z,nys_z:nyn_z,1:nz)
	1315
	1316
	1317	CALL split( tri )
	1318	CALL substi( ar, tri )
	1319
	1320	END SUBROUTINE tridia
	1321
	1322
	1323	SUBROUTINE maketri
	1324
	1325	!------------------------------------------------------------------------------!
	1326	! Computes the i- and j-dependent component of the matrix
	1327	!------------------------------------------------------------------------------!
	1328
	1329	USE arrays_3d, ONLY: tric
	1330	USE constants
	1331	USE control_parameters
	1332	USE grid_variables
	1333
	1334	IMPLICIT NONE
	1335
	1336	INTEGER :: i, j, k, nnxh, nnyh
	1337
	1338	!$acc declare create( ll )
	1339	REAL :: ll(nxl_z:nxr_z,nys_z:nyn_z)
	1340
	1341
	1342	nnxh = ( nx + 1 ) / 2
	1343	nnyh = ( ny + 1 ) / 2
	1344
	1345	!
	1346	!-- Provide the constant coefficients of the tridiagonal matrix for solution
	1347	!-- of the Poisson equation in Fourier space.
	1348	!-- The coefficients are computed following the method of
	1349	!-- Schmidt et al. (DFVLR-Mitteilung 84-15), which departs from Stephan
	1350	!-- Siano's original version by discretizing the Poisson equation,
	1351	!-- before it is Fourier-transformed.
	1352
	1353	!$acc kernels present( tric )
	1354	!$acc loop vector( 32 )
	1355	DO j = nys_z, nyn_z
	1356	DO i = nxl_z, nxr_z
	1357	IF ( j >= 0 .AND. j <= nnyh ) THEN
	1358	IF ( i >= 0 .AND. i <= nnxh ) THEN
	1359	ll(i,j) = 2.0 * ( 1.0 - COS( ( 2.0 * pi * i ) / &
	1360	REAL( nx+1 ) ) ) / ( dx * dx ) + &
	1361	2.0 * ( 1.0 - COS( ( 2.0 * pi * j ) / &
	1362	REAL( ny+1 ) ) ) / ( dy * dy )
	1363	ELSE
	1364	ll(i,j) = 2.0 * ( 1.0 - COS( ( 2.0 * pi * ( nx+1-i ) ) / &
	1365	REAL( nx+1 ) ) ) / ( dx * dx ) + &
	1366	2.0 * ( 1.0 - COS( ( 2.0 * pi * j ) / &
	1367	REAL( ny+1 ) ) ) / ( dy * dy )
	1368	ENDIF
	1369	ELSE
	1370	IF ( i >= 0 .AND. i <= nnxh ) THEN
	1371	ll(i,j) = 2.0 * ( 1.0 - COS( ( 2.0 * pi * i ) / &
	1372	REAL( nx+1 ) ) ) / ( dx * dx ) + &
	1373	2.0 * ( 1.0 - COS( ( 2.0 * pi * ( ny+1-j ) ) / &
	1374	REAL( ny+1 ) ) ) / ( dy * dy )
	1375	ELSE
	1376	ll(i,j) = 2.0 * ( 1.0 - COS( ( 2.0 * pi * ( nx+1-i ) ) / &
	1377	REAL( nx+1 ) ) ) / ( dx * dx ) + &
	1378	2.0 * ( 1.0 - COS( ( 2.0 * pi * ( ny+1-j ) ) / &
	1379	REAL( ny+1 ) ) ) / ( dy * dy )
	1380	ENDIF
	1381	ENDIF
	1382	ENDDO
	1383	ENDDO
	1384
	1385	!$acc loop
	1386	DO k = 0, nz-1
	1387	DO j = nys_z, nyn_z
	1388	!$acc loop vector( 32 )
	1389	DO i = nxl_z, nxr_z
	1390	tric(i,j,k) = ddzuw(k,3) - ll(i,j)
	1391	ENDDO
	1392	ENDDO
	1393	ENDDO
	1394	!$acc end kernels
	1395
	1396	IF ( ibc_p_b == 1 ) THEN
	1397	!$acc kernels present( tric )
	1398	!$acc loop
	1399	DO j = nys_z, nyn_z
	1400	DO i = nxl_z, nxr_z
	1401	tric(i,j,0) = tric(i,j,0) + ddzuw(0,1)
	1402	ENDDO
	1403	ENDDO
	1404	!$acc end kernels
	1405	ENDIF
	1406	IF ( ibc_p_t == 1 ) THEN
	1407	!$acc kernels present( tric )
	1408	!$acc loop
	1409	DO j = nys_z, nyn_z
	1410	DO i = nxl_z, nxr_z
	1411	tric(i,j,nz-1) = tric(i,j,nz-1) + ddzuw(nz-1,2)
	1412	ENDDO
	1413	ENDDO
	1414	!$acc end kernels
	1415	ENDIF
	1416
	1417	END SUBROUTINE maketri
	1418
	1419
	1420	SUBROUTINE substi( ar, tri )
	1421
	1422	!------------------------------------------------------------------------------!
	1423	! Substitution (Forward and Backward) (Thomas algorithm)
	1424	!------------------------------------------------------------------------------!
	1425
	1426	USE control_parameters
	1427
	1428	IMPLICIT NONE
	1429
	1430	INTEGER :: i, j, k
	1431
	1432	REAL :: ar(nxl_z:nxr_z,nys_z:nyn_z,1:nz)
	1433	REAL, DIMENSION(nxl_z:nxr_z,nys_z:nyn_z,0:nz-1,2) :: tri
	1434
	1435	!$acc declare create( ar1 )
	1436	REAL, DIMENSION(nxl_z:nxr_z,nys_z:nyn_z,0:nz-1) :: ar1
	1437
	1438	!
	1439	!-- Forward substitution
	1440	DO k = 0, nz - 1
	1441	!$acc kernels present( ar, tri )
	1442	!$acc loop
	1443	DO j = nys_z, nyn_z
	1444	DO i = nxl_z, nxr_z
	1445
	1446	IF ( k == 0 ) THEN
	1447	ar1(i,j,k) = ar(i,j,k+1)
	1448	ELSE
	1449	ar1(i,j,k) = ar(i,j,k+1) - tri(i,j,k,2) * ar1(i,j,k-1)
	1450	ENDIF
	1451
	1452	ENDDO
	1453	ENDDO
	1454	!$acc end kernels
	1455	ENDDO
	1456
	1457	!
	1458	!-- Backward substitution
	1459	!-- Note, the 1.0E-20 in the denominator is due to avoid divisions
	1460	!-- by zero appearing if the pressure bc is set to neumann at the top of
	1461	!-- the model domain.
	1462	DO k = nz-1, 0, -1
	1463	!$acc kernels present( ar, tri )
	1464	!$acc loop
	1465	DO j = nys_z, nyn_z
	1466	DO i = nxl_z, nxr_z
	1467
	1468	IF ( k == nz-1 ) THEN
	1469	ar(i,j,k+1) = ar1(i,j,k) / ( tri(i,j,k,1) + 1.0E-20 )
	1470	ELSE
	1471	ar(i,j,k+1) = ( ar1(i,j,k) - ddzuw(k,2) * ar(i,j,k+2) ) &
	1472	/ tri(i,j,k,1)
	1473	ENDIF
	1474	ENDDO
	1475	ENDDO
	1476	!$acc end kernels
	1477	ENDDO
	1478
	1479	!
	1480	!-- Indices i=0, j=0 correspond to horizontally averaged pressure.
	1481	!-- The respective values of ar should be zero at all k-levels if
	1482	!-- acceleration of horizontally averaged vertical velocity is zero.
	1483	IF ( ibc_p_b == 1 .AND. ibc_p_t == 1 ) THEN
	1484	IF ( nys_z == 0 .AND. nxl_z == 0 ) THEN
	1485	!$acc kernels loop present( ar )
	1486	DO k = 1, nz
	1487	ar(nxl_z,nys_z,k) = 0.0
	1488	ENDDO
	1489	ENDIF
	1490	ENDIF
	1491
	1492	END SUBROUTINE substi
	1493
	1494
	1495	SUBROUTINE split( tri )
	1496
	1497	!------------------------------------------------------------------------------!
	1498	! Splitting of the tridiagonal matrix (Thomas algorithm)
	1499	!------------------------------------------------------------------------------!
	1500
	1501	USE arrays_3d, ONLY: tric
	1502
	1503	IMPLICIT NONE
	1504
	1505	INTEGER :: i, j, k
	1506
	1507	REAL, DIMENSION(nxl_z:nxr_z,nys_z:nyn_z,0:nz-1,2) :: tri
	1508
	1509	!
	1510	!-- Splitting
	1511	!$acc kernels present( tri, tric )
	1512	!$acc loop
	1513	DO j = nys_z, nyn_z
	1514	!$acc loop vector( 32 )
	1515	DO i = nxl_z, nxr_z
	1516	tri(i,j,0,1) = tric(i,j,0)
	1517	ENDDO
	1518	ENDDO
	1519	!$acc end kernels
	1520
	1521	DO k = 1, nz-1
	1522	!$acc kernels present( tri, tric )
	1523	!$acc loop
	1524	DO j = nys_z, nyn_z
	1525	!$acc loop vector( 32 )
	1526	DO i = nxl_z, nxr_z
	1527	tri(i,j,k,2) = ddzuw(k,1) / tri(i,j,k-1,1)
	1528	tri(i,j,k,1) = tric(i,j,k) - ddzuw(k-1,2) * tri(i,j,k,2)
	1529	ENDDO
	1530	ENDDO
	1531	!$acc end kernels
	1532	ENDDO
	1533
	1534	END SUBROUTINE split
	1535
[1]	1536	#endif
[1111]	1537
[1]	1538	END MODULE poisfft_mod

Note: See TracBrowser for help on using the repository browser.

Download in other formats:

| Impressum | ©Leibniz Universität Hannover |