Home

Context Navigation

source: palm/trunk/SOURCE/poisfft.f90 @ 787

Last change on this file since 787 was 764, checked in by suehring, 13 years ago
last commit documented
Property svn:keywords set to `Id`
File size: 46.0 KB

Rev	Line
[1]	1	MODULE poisfft_mod
	2
	3	!------------------------------------------------------------------------------!
[484]	4	! Current revisions:
[1]	5	! -----------------
	6	!
[761]	7	!
[1]	8	! Former revisions:
	9	! -----------------
[3]	10	! $Id: poisfft.f90 764 2011-10-06 09:34:55Z heinze $
[77]	11	!
[764]	12	! 763 2011-10-06 09:32:09Z suehring
	13	! Comment added concerning the last change.
	14	!
[762]	15	! 761 2011-10-05 17:58:52Z suehring
	16	! Bugfix: Avoid divisions by zero in case of using a 'neumann' bc for the
	17	! pressure at the top of the model domain.
	18	!
[697]	19	! 696 2011-03-18 07:03:49Z raasch
	20	! work_fftx removed from PRIVATE clauses in fftx_tr_xy and tr_yx_fftx
	21	!
[684]	22	! 683 2011-02-09 14:25:15Z raasch
	23	! openMP parallelization for 2d-domain-decomposition
	24	!
[668]	25	! 667 2010-12-23 12:06:00Z suehring/gryschka
	26	! ddzu replaced by ddzu_pres due to changes in zu(0)
	27	!
[623]	28	! 622 2010-12-10 08:08:13Z raasch
	29	! optional barriers included in order to speed up collective operations
	30	!
[392]	31	! 377 2009-09-04 11:09:00Z raasch
	32	! __lcmuk changed to __lc to avoid problems with Intel compiler on sgi-ice
	33	!
[198]	34	! 164 2008-05-15 08:46:15Z raasch
	35	! Arguments removed from transpose routines
	36	!
[139]	37	! 128 2007-10-26 13:11:14Z raasch
	38	! Bugfix: wavenumber calculation for even nx in routines maketri
	39	!
[90]	40	! 85 2007-05-11 09:35:14Z raasch
	41	! Bugfix: work_fft*_vec removed from some PRIVATE-declarations
	42	!
[77]	43	! 76 2007-03-29 00:58:32Z raasch
	44	! Tridiagonal coefficients adjusted for Neumann boundary conditions both at
	45	! the bottom and the top.
	46	!
[3]	47	! RCS Log replaced by Id keyword, revision history cleaned up
	48	!
[1]	49	! Revision 1.24 2006/08/04 15:00:24 raasch
	50	! Default setting of the thread number tn in case of not using OpenMP
	51	!
	52	! Revision 1.23 2006/02/23 12:48:38 raasch
	53	! Additional compiler directive in routine tridia_1dd for preventing loop
	54	! exchange on NEC-SX6
	55	!
	56	! Revision 1.20 2004/04/30 12:38:09 raasch
	57	! Parts of former poisfft_hybrid moved to this subroutine,
	58	! former subroutine changed to a module, renaming of FFT-subroutines and
	59	! -module, FFTs completely substituted by calls of fft_x and fft_y,
	60	! NAG fft used in the non-parallel case completely removed, l in maketri
	61	! is now a 1d-array, variables passed by modules instead of using parameter
	62	! lists, enlarged transposition arrays introduced
	63	!
	64	! Revision 1.1 1997/07/24 11:24:14 raasch
	65	! Initial revision
	66	!
	67	!
	68	! Description:
	69	! ------------
	70	! See below.
	71	!------------------------------------------------------------------------------!
	72
	73	!--------------------------------------------------------------------------!
	74	! poisfft !
	75	! !
	76	! Original version: Stephan Siano (pois3d) !
	77	! !
	78	! Institute of Meteorology and Climatology, University of Hannover !
	79	! Germany !
	80	! !
	81	! Version as of July 23,1996 !
	82	! !
	83	! !
	84	! Version for parallel computers: Siegfried Raasch !
	85	! !
	86	! Version as of July 03,1997 !
	87	! !
	88	! Solves the Poisson equation with a 2D spectral method !
	89	! d^2 p / dx^2 + d^2 p / dy^2 + d^2 p / dz^2 = s !
	90	! !
	91	! Input: !
	92	! real ar contains in the (nnx,nny,nnz) elements, !
	93	! starting from the element (1,nys,nxl), the !
	94	! values for s !
	95	! real work Temporary array !
	96	! !
	97	! Output: !
	98	! real ar contains the solution for p !
	99	!--------------------------------------------------------------------------!
	100
	101	USE fft_xy
	102	USE indices
	103	USE transpose_indices
	104
	105	IMPLICIT NONE
	106
	107	PRIVATE
	108	PUBLIC poisfft, poisfft_init
	109
	110	INTERFACE poisfft
	111	MODULE PROCEDURE poisfft
	112	END INTERFACE poisfft
	113
	114	INTERFACE poisfft_init
	115	MODULE PROCEDURE poisfft_init
	116	END INTERFACE poisfft_init
	117
	118	CONTAINS
	119
	120	SUBROUTINE poisfft_init
	121
		!
		!-- Public initialization entry point of the Poisson solver. It takes no
		!-- arguments and only forwards to fft_init, which is not defined in this
		!-- file (presumably provided by module fft_xy -- TODO confirm).
	122	CALL fft_init
	123
	124	END SUBROUTINE poisfft_init
	125
	126
	127	SUBROUTINE poisfft( ar, work )
	128
		!
		!-- Driver of the Poisson solver: forward FFTs along the horizontal
		!-- directions, solution of the tridiagonal systems along z, backward FFTs.
		!--
		!-- ar   : on input the right hand side s, on output the solution p
		!--        (see the module header above)
		!-- work : scratch array with the same declared shape as ar; it is handed
		!--        to the transposition routines in the parallel branches and is
		!--        not referenced in the non-parallel branch
		!--
		!-- With __parallel defined, one of three paths is chosen from pdims:
		!-- pdims(2) == 1, pdims(1) == 1, or the general 2d-decomposition.
		!-- Note that ar is passed both as first (input) and as third (output)
		!-- actual argument to the transposition routines.
	129	USE cpulog
	130	USE interfaces
	131	USE pegrid
	132
	133	IMPLICIT NONE
	134
	135	REAL, DIMENSION(1:nza,nys:nyna,nxl:nxra) :: ar, work
	136
	137
	138	CALL cpu_log( log_point_s(3), 'poisfft', 'start' )
	139
	140	!
	141	!-- Two-dimensional Fourier Transformation in x- and y-direction.
	142	#if defined( __parallel )
	143	IF ( pdims(2) == 1 ) THEN
	144
	145	!
	146	!-- 1d-domain-decomposition along x:
	147	!-- FFT along y and transposition y --> x
	148	CALL ffty_tr_yx( ar, work, ar )
	149
	150	!
	151	!-- FFT along x, solving the tridiagonal system and backward FFT
	152	CALL fftx_tri_fftx( ar )
	153
	154	!
	155	!-- Transposition x --> y and backward FFT along y
	156	CALL tr_xy_ffty( ar, work, ar )
	157
	158	ELSEIF ( pdims(1) == 1 ) THEN
	159
	160	!
	161	!-- 1d-domain-decomposition along y:
	162	!-- FFT along x and transposition x --> y
	163	CALL fftx_tr_xy( ar, work, ar )
	164
	165	!
	166	!-- FFT along y, solving the tridiagonal system and backward FFT
	167	CALL ffty_tri_ffty( ar )
	168
	169	!
	170	!-- Transposition y --> x and backward FFT along x
	171	CALL tr_yx_fftx( ar, work, ar )
	172
	173	ELSE
	174
	175	!
	176	!-- 2d-domain-decomposition
		!-- The timers 'transpo forward', 'fft_x', 'fft_y' and 'transpo invers'
		!-- are each started once and then paused/continued around the
		!-- individual steps, so every timer accumulates over all of its calls.
	177	!-- Transposition z --> x
	178	CALL cpu_log( log_point_s(5), 'transpo forward', 'start' )
[164]	179	CALL transpose_zx( ar, work, ar )
[1]	180	CALL cpu_log( log_point_s(5), 'transpo forward', 'pause' )
	181
	182	CALL cpu_log( log_point_s(4), 'fft_x', 'start' )
	183	CALL fftxp( ar, 'forward' )
	184	CALL cpu_log( log_point_s(4), 'fft_x', 'pause' )
	185
	186	!
	187	!-- Transposition x --> y
	188	CALL cpu_log( log_point_s(5), 'transpo forward', 'continue' )
[164]	189	CALL transpose_xy( ar, work, ar )
[1]	190	CALL cpu_log( log_point_s(5), 'transpo forward', 'pause' )
	191
	192	CALL cpu_log( log_point_s(7), 'fft_y', 'start' )
	193	CALL fftyp( ar, 'forward' )
	194	CALL cpu_log( log_point_s(7), 'fft_y', 'pause' )
	195
	196	!
	197	!-- Transposition y --> z
	198	CALL cpu_log( log_point_s(5), 'transpo forward', 'continue' )
[164]	199	CALL transpose_yz( ar, work, ar )
[1]	200	CALL cpu_log( log_point_s(5), 'transpo forward', 'stop' )
	201
	202	!
	203	!-- Solve the Poisson equation in z-direction in cartesian space.
	204	CALL cpu_log( log_point_s(6), 'tridia', 'start' )
	205	CALL tridia( ar )
	206	CALL cpu_log( log_point_s(6), 'tridia', 'stop' )
	207
	208	!
	209	!-- Inverse Fourier Transformation
	210	!-- Transposition z --> y
	211	CALL cpu_log( log_point_s(8), 'transpo invers', 'start' )
[164]	212	CALL transpose_zy( ar, work, ar )
[1]	213	CALL cpu_log( log_point_s(8), 'transpo invers', 'pause' )
	214
	215	CALL cpu_log( log_point_s(7), 'fft_y', 'continue' )
	216	CALL fftyp( ar, 'backward' )
	217	CALL cpu_log( log_point_s(7), 'fft_y', 'stop' )
	218
	219	!
	220	!-- Transposition y --> x
	221	CALL cpu_log( log_point_s(8), 'transpo invers', 'continue' )
[164]	222	CALL transpose_yx( ar, work, ar )
[1]	223	CALL cpu_log( log_point_s(8), 'transpo invers', 'pause' )
	224
	225	CALL cpu_log( log_point_s(4), 'fft_x', 'continue' )
	226	CALL fftxp( ar, 'backward' )
	227	CALL cpu_log( log_point_s(4), 'fft_x', 'stop' )
	228
	229	!
	230	!-- Transposition x --> z
	231	CALL cpu_log( log_point_s(8), 'transpo invers', 'continue' )
[164]	232	CALL transpose_xz( ar, work, ar )
[1]	233	CALL cpu_log( log_point_s(8), 'transpo invers', 'stop' )
	234
	235	ENDIF
	236
	237	#else
	238
	239	!
	240	!-- Two-dimensional Fourier Transformation along x- and y-direction.
		!-- No transposition is carried out in this (non-parallel) branch; fftx
		!-- and ffty operate directly on ar.
	241	CALL cpu_log( log_point_s(4), 'fft_x', 'start' )
	242	CALL fftx( ar, 'forward' )
	243	CALL cpu_log( log_point_s(4), 'fft_x', 'pause' )
	244	CALL cpu_log( log_point_s(7), 'fft_y', 'start' )
	245	CALL ffty( ar, 'forward' )
	246	CALL cpu_log( log_point_s(7), 'fft_y', 'pause' )
	247
	248	!
	249	!-- Solve the Poisson equation in z-direction in cartesian space.
	250	CALL cpu_log( log_point_s(6), 'tridia', 'start' )
	251	CALL tridia( ar )
	252	CALL cpu_log( log_point_s(6), 'tridia', 'stop' )
	253
	254	!
	255	!-- Inverse Fourier Transformation.
		!-- The backward transforms are applied in reverse order (y first, then x).
	256	CALL cpu_log( log_point_s(7), 'fft_y', 'continue' )
	257	CALL ffty( ar, 'backward' )
	258	CALL cpu_log( log_point_s(7), 'fft_y', 'stop' )
	259	CALL cpu_log( log_point_s(4), 'fft_x', 'continue' )
	260	CALL fftx( ar, 'backward' )
	261	CALL cpu_log( log_point_s(4), 'fft_x', 'stop' )
	262
	263	#endif
	264
	265	CALL cpu_log( log_point_s(3), 'poisfft', 'stop' )
	266
	267	END SUBROUTINE poisfft
	268
	269
	270
	271	SUBROUTINE tridia( ar )
	272
	273	!------------------------------------------------------------------------------!
	274	! solves the linear system of equations:
	275	!
	276	! -(4 pi^2(i^2/(dx^2nnx^2)+j^2/(dy^2nny^2))+
	277	! 1/(dzu(k)dzw(k))+1/(dzu(k-1)dzw(k)))*p(i,j,k)+
	278	! 1/(dzu(k)dzw(k))p(i,j,k+1)+1/(dzu(k-1)dzw(k))p(i,j,k-1)=d(i,j,k)
	279	!
	280	! by using the Thomas algorithm
		!
		! ar holds the right hand side on entry and is overwritten with the solution
		! (see substi). Its index order differs between the parallel and the
		! non-parallel build (see the two declarations below).
		!
		! Meaning of the first index of the work array tri, as used by the internal
		! routines split and substi:
		!   tri(1,:,:)  main diagonal (set in maketri)
		!   tri(2,:,:)  coefficient coupling level k to k-1 (lower diagonal)
		!   tri(3,:,:)  coefficient coupling level k to k+1 (upper diagonal)
		!   tri(4,:,:)  pivots of the eliminated system (set in split)
		!   tri(5,:,:)  elimination factors (set in split)
	281	!------------------------------------------------------------------------------!
	282
	283	USE arrays_3d
	284
	285	IMPLICIT NONE
	286
	287	INTEGER :: i, j, k, nnyh
	288
	289	REAL, DIMENSION(nxl_z:nxr_z,0:nz-1) :: ar1
	290	REAL, DIMENSION(5,nxl_z:nxr_z,0:nz-1) :: tri
	291
	292	#if defined( __parallel )
	293	REAL :: ar(nxl_z:nxr_za,nys_z:nyn_za,1:nza)
	294	#else
	295	REAL :: ar(1:nz,nys_z:nyn_z,nxl_z:nxr_z)
	296	#endif
	297
	298
	299	nnyh = (ny+1) / 2
	300
	301	!
	302	!-- Define constant elements of the tridiagonal matrix.
		!-- These two off-diagonal coefficients depend on k only; they are filled
		!-- once here and not changed by maketri or split.
[683]	303	!$OMP PARALLEL PRIVATE ( k, i )
	304	!$OMP DO
[1]	305	DO k = 0, nz-1
	306	DO i = nxl_z, nxr_z
[667]	307	tri(2,i,k) = ddzu_pres(k+1) * ddzw(k+1)
	308	tri(3,i,k) = ddzu_pres(k+2) * ddzw(k+1)
[1]	309	ENDDO
	310	ENDDO
[683]	311	!$OMP END PARALLEL
[1]	312
	313	#if defined( __parallel )
	314	!
	315	!-- Repeat for all y-levels.
		!-- For j > nnyh the mirrored index ny+1-j is handed to maketri.
		!-- tri is FIRSTPRIVATE, so every thread starts from its own copy of the
		!-- constant coefficients set above; ar1 is a per-thread work array.
[683]	316	!$OMP PARALLEL FIRSTPRIVATE( tri ) PRIVATE ( ar1, j )
	317	!$OMP DO
[1]	318	DO j = nys_z, nyn_z
	319	IF ( j <= nnyh ) THEN
	320	CALL maketri( tri, j )
	321	ELSE
	322	CALL maketri( tri, ny+1-j )
	323	ENDIF
	324	CALL split( tri )
	325	CALL substi( ar, ar1, tri, j )
	326	ENDDO
[683]	327	!$OMP END PARALLEL
[1]	328	#else
	329	!
	330	!-- First y-level.
	331	CALL maketri( tri, nys_z )
	332	CALL split( tri )
	333	CALL substi( ar, ar1, tri, 0 )
	334
	335	!
	336	!-- Further y-levels.
		!-- The matrix built for index j is reused for the mirrored level ny+1-j.
	337	DO j = 1, nnyh - 1
	338	CALL maketri( tri, j )
	339	CALL split( tri )
	340	CALL substi( ar, ar1, tri, j )
	341	CALL substi( ar, ar1, tri, ny+1-j )
	342	ENDDO
	343	CALL maketri( tri, nnyh )
	344	CALL split( tri )
	345	CALL substi( ar, ar1, tri, nnyh+nys )
	346	#endif
	347
	348	CONTAINS
	349
	350	SUBROUTINE maketri( tri, j )
	351
	352	!------------------------------------------------------------------------------!
	353	! Computes the i- and j-dependent component of the matrix
		!
		! Only tri(1,:,:) (main diagonal) is written here; tri(2,:,:) and tri(3,:,:)
		! are read for the boundary adjustments at the end of the routine.
		! j is the y-index used in the cosine term.
	354	!------------------------------------------------------------------------------!
	355
	356	USE arrays_3d
	357	USE constants
	358	USE control_parameters
	359	USE grid_variables
	360
	361	IMPLICIT NONE
	362
	363	INTEGER :: i, j, k, nnxh
	364	REAL :: a, c
	365	REAL :: ll(nxl_z:nxr_z)
	366	REAL :: tri(5,nxl_z:nxr_z,0:nz-1)
	367
	368
	369	nnxh = ( nx + 1 ) / 2
	370
	371	!
	372	!-- Provide the tridiagonal matrix for solution of the Poisson equation in
	373	!-- Fourier space. The coefficients are computed following the method of
	374	!-- Schmidt et al. (DFVLR-Mitteilung 84-15), which departs from Stephan
	375	!-- Siano's original version by discretizing the Poisson equation,
	376	!-- before it is Fourier-transformed
		!-- ll(i) is the horizontal contribution to the main diagonal; for
		!-- i > nnxh the mirrored index nx+1-i enters the cosine term.
	377	#if defined( __parallel )
	378	DO i = nxl_z, nxr_z
[128]	379	IF ( i >= 0 .AND. i <= nnxh ) THEN
[1]	380	ll(i) = 2.0 * ( 1.0 - COS( ( 2.0 * pi * i ) / &
	381	FLOAT( nx+1 ) ) ) / ( dx * dx ) + &
	382	2.0 * ( 1.0 - COS( ( 2.0 * pi * j ) / &
	383	FLOAT( ny+1 ) ) ) / ( dy * dy )
	384	ELSE
	385	ll(i) = 2.0 * ( 1.0 - COS( ( 2.0 * pi * ( nx+1-i ) ) / &
	386	FLOAT( nx+1 ) ) ) / ( dx * dx ) + &
	387	2.0 * ( 1.0 - COS( ( 2.0 * pi * j ) / &
	388	FLOAT( ny+1 ) ) ) / ( dy * dy )
	389	ENDIF
	390	DO k = 0,nz-1
[667]	391	a = -1.0 * ddzu_pres(k+2) * ddzw(k+1)
	392	c = -1.0 * ddzu_pres(k+1) * ddzw(k+1)
[1]	393	tri(1,i,k) = a + c - ll(i)
	394	ENDDO
	395	ENDDO
	396	#else
		!
		!-- Non-parallel build: only i = 0..nnxh is computed; the diagonal is
		!-- copied to the mirrored index nx+1-i for 1 <= i < nnxh.
	397	DO i = 0, nnxh
	398	ll(i) = 2.0 * ( 1.0 - COS( ( 2.0 * pi * i ) / FLOAT( nx+1 ) ) ) / &
	399	( dx * dx ) + &
	400	2.0 * ( 1.0 - COS( ( 2.0 * pi * j ) / FLOAT( ny+1 ) ) ) / &
	401	( dy * dy )
	402	DO k = 0, nz-1
[667]	403	a = -1.0 * ddzu_pres(k+2) * ddzw(k+1)
	404	c = -1.0 * ddzu_pres(k+1) * ddzw(k+1)
[1]	405	tri(1,i,k) = a + c - ll(i)
	406	IF ( i >= 1 .and. i < nnxh ) THEN
	407	tri(1,nx+1-i,k) = tri(1,i,k)
	408	ENDIF
	409	ENDDO
	410	ENDDO
	411	#endif
		!
		!-- Boundary adjustment: the off-diagonal coefficient that would couple to
		!-- a level outside 0..nz-1 is added to the main diagonal. According to
		!-- the revision notes in the file header this is the treatment of Neumann
		!-- boundary conditions (exact meaning of the ibc_p_b / ibc_p_t values is
		!-- defined outside this file -- TODO confirm).
	412	IF ( ibc_p_b == 1 .OR. ibc_p_b == 2 ) THEN
	413	DO i = nxl_z, nxr_z
	414	tri(1,i,0) = tri(1,i,0) + tri(2,i,0)
	415	ENDDO
	416	ENDIF
	417	IF ( ibc_p_t == 1 ) THEN
	418	DO i = nxl_z, nxr_z
	419	tri(1,i,nz-1) = tri(1,i,nz-1) + tri(3,i,nz-1)
	420	ENDDO
	421	ENDIF
	422
	423	END SUBROUTINE maketri
	424
	425
	426	SUBROUTINE substi( ar, ar1, tri, j )
	427
	428	!------------------------------------------------------------------------------!
	429	! Substitution (Forward and Backward) (Thomas algorithm)
		!
		! Works on the single y-index j. ar1 receives the intermediate result of
		! the forward sweep; ar(.,j,.) is overwritten with the solution.
		! tri(4,:,:) and tri(5,:,:) must have been set by split beforehand.
	430	!------------------------------------------------------------------------------!
	431
[76]	432	USE control_parameters
	433
[1]	434	IMPLICIT NONE
	435
	436	INTEGER :: i, j, k
	437	REAL :: ar1(nxl_z:nxr_z,0:nz-1)
	438	REAL :: tri(5,nxl_z:nxr_z,0:nz-1)
	439	#if defined( __parallel )
	440	REAL :: ar(nxl_z:nxr_za,nys_z:nyn_za,1:nza)
	441	#else
	442	REAL :: ar(1:nz,nys_z:nyn_z,nxl_z:nxr_z)
	443	#endif
	444
	445	!
	446	!-- Forward substitution.
		!-- Matrix level k (0..nz-1) corresponds to index k+1 of ar.
	447	DO i = nxl_z, nxr_z
	448	#if defined( __parallel )
	449	ar1(i,0) = ar(i,j,1)
	450	#else
	451	ar1(i,0) = ar(1,j,i)
	452	#endif
	453	ENDDO
	454	DO k = 1, nz - 1
	455	DO i = nxl_z, nxr_z
	456	#if defined( __parallel )
	457	ar1(i,k) = ar(i,j,k+1) - tri(5,i,k) * ar1(i,k-1)
	458	#else
	459	ar1(i,k) = ar(k+1,j,i) - tri(5,i,k) * ar1(i,k-1)
	460	#endif
	461	ENDDO
	462	ENDDO
	463
	464	!
	465	!-- Backward substitution.
		!-- Starts at the uppermost level and proceeds downwards; each level uses
		!-- the solution already stored in ar for the level above.
	466	DO i = nxl_z, nxr_z
	467	#if defined( __parallel )
	468	ar(i,j,nz) = ar1(i,nz-1) / tri(4,i,nz-1)
	469	#else
	470	ar(nz,j,i) = ar1(i,nz-1) / tri(4,i,nz-1)
	471	#endif
	472	ENDDO
	473	DO k = nz-2, 0, -1
	474	DO i = nxl_z, nxr_z
	475	#if defined( __parallel )
	476	ar(i,j,k+1) = ( ar1(i,k) - tri(3,i,k) * ar(i,j,k+2) ) &
	477	/ tri(4,i,k)
	478	#else
	479	ar(k+1,j,i) = ( ar1(i,k) - tri(3,i,k) * ar(k+2,j,i) ) &
	480	/ tri(4,i,k)
	481	#endif
	482	ENDDO
	483	ENDDO
	484
[76]	485	!
	486	!-- Indices i=0, j=0 correspond to horizontally averaged pressure.
	487	!-- The respective values of ar should be zero at all k-levels if
	488	!-- acceleration of horizontally averaged vertical velocity is zero.
		!-- The values are overwritten with zero only if both ibc_p_b and ibc_p_t
		!-- equal 1 and this call handles j = 0 with nxl_z = 0.
	489	IF ( ibc_p_b == 1 .AND. ibc_p_t == 1 ) THEN
	490	IF ( j == 0 .AND. nxl_z == 0 ) THEN
	491	#if defined( __parallel )
	492	DO k = 1, nz
	493	ar(nxl_z,j,k) = 0.0
	494	ENDDO
	495	#else
	496	DO k = 1, nz
	497	ar(k,j,nxl_z) = 0.0
	498	ENDDO
	499	#endif
	500	ENDIF
	501	ENDIF
	502
[1]	503	END SUBROUTINE substi
	504
	505
	506	SUBROUTINE split( tri )
	507
	508	!------------------------------------------------------------------------------!
	509	! Splitting of the tridiagonal matrix (Thomas algorithm)
		!
		! Reads tri(1:3,:,:) and fills tri(4,:,:) (pivots) and tri(5,:,:)
		! (elimination factors). tri(5,:,0) is not set. The pivots are used as
		! divisors here and in substi; no check against zero is made.
	510	!------------------------------------------------------------------------------!
	511
	512	IMPLICIT NONE
	513
	514	INTEGER :: i, k
	515	REAL :: tri(5,nxl_z:nxr_z,0:nz-1)
	516
	517	!
	518	!-- Splitting.
	519	DO i = nxl_z, nxr_z
	520	tri(4,i,0) = tri(1,i,0)
	521	ENDDO
	522	DO k = 1, nz-1
	523	DO i = nxl_z, nxr_z
	524	tri(5,i,k) = tri(2,i,k) / tri(4,i,k-1)
	525	tri(4,i,k) = tri(1,i,k) - tri(3,i,k-1) * tri(5,i,k)
	526	ENDDO
	527	ENDDO
	528
	529	END SUBROUTINE split
	530
	531	END SUBROUTINE tridia
	532
	533
	534	#if defined( __parallel )
	535	SUBROUTINE fftxp( ar, direction )
	536
	537	!------------------------------------------------------------------------------!
	538	! Fourier-transformation along x-direction Parallelized version
		!
		! ar        : array with x as first index; every line ar(0:nx,j,k) with
		!             j = nys_x..nyn_x and k = nzb_x..nzt_x is transformed in place
		! direction : passed on unchanged to fft_x (poisfft calls this routine
		!             with 'forward' and 'backward')
	539	!------------------------------------------------------------------------------!
	540
	541	IMPLICIT NONE
	542
	543	CHARACTER (LEN=*) :: direction
	544	INTEGER :: j, k
	545	REAL :: ar(0:nxa,nys_x:nyn_xa,nzb_x:nzt_xa)
	546
	547	!
	548	!-- Performing the fft with one of the methods implemented
		!-- The OpenMP worksharing applies to the outer k loop.
[683]	549	!$OMP PARALLEL PRIVATE ( j, k )
	550	!$OMP DO
[1]	551	DO k = nzb_x, nzt_x
	552	DO j = nys_x, nyn_x
	553	CALL fft_x( ar(0:nx,j,k), direction )
	554	ENDDO
	555	ENDDO
[683]	556	!$OMP END PARALLEL
[1]	557
	558	END SUBROUTINE fftxp
	559
	560	#else
	561	SUBROUTINE fftx( ar, direction )
	562
	563	!------------------------------------------------------------------------------!
	564	! Fourier-transformation along x-direction Non parallel version
		!
		! ar        : array with x as last index; every line ar(k,j,0:nx) with
		!             k = 1..nz and j = 0..ny is handed to fft_x
		! direction : passed on unchanged to fft_x (poisfft calls this routine
		!             with 'forward' and 'backward')
	565	!------------------------------------------------------------------------------!
	566
	567	IMPLICIT NONE
	568
	569	CHARACTER (LEN=*) :: direction
		!
		!-- NOTE(review): i is declared but not referenced in this routine.
	570	INTEGER :: i, j, k
	571	REAL :: ar(1:nz,0:ny,0:nx)
	572
	573	!
	574	!-- Performing the fft with one of the methods implemented
		!-- The OpenMP worksharing applies to the outer k loop.
[683]	575	!$OMP PARALLEL PRIVATE ( j, k )
	576	!$OMP DO
[1]	577	DO k = 1, nz
	578	DO j = 0, ny
	579	CALL fft_x( ar(k,j,0:nx), direction )
	580	ENDDO
	581	ENDDO
[683]	582	!$OMP END PARALLEL
[1]	583
	584	END SUBROUTINE fftx
	585	#endif
	586
	587
	588	#if defined( __parallel )
	589	SUBROUTINE fftyp( ar, direction )
	590
	591	!------------------------------------------------------------------------------!
	592	! Fourier-transformation along y-direction Parallelized version
		!
		! ar        : array with y as first index; every line ar(0:ny,i,k) with
		!             i = nxl_y..nxr_y and k = nzb_y..nzt_y is transformed in place
		! direction : passed on unchanged to fft_y (poisfft calls this routine
		!             with 'forward' and 'backward')
	593	!------------------------------------------------------------------------------!
	594
	595	IMPLICIT NONE
	596
	597	CHARACTER (LEN=*) :: direction
	598	INTEGER :: i, k
	599	REAL :: ar(0:nya,nxl_y:nxr_ya,nzb_y:nzt_ya)
	600
	601	!
	602	!-- Performing the fft with one of the methods implemented
		!-- The OpenMP worksharing applies to the outer k loop.
[683]	603	!$OMP PARALLEL PRIVATE ( i, k )
	604	!$OMP DO
[1]	605	DO k = nzb_y, nzt_y
	606	DO i = nxl_y, nxr_y
	607	CALL fft_y( ar(0:ny,i,k), direction )
	608	ENDDO
	609	ENDDO
[683]	610	!$OMP END PARALLEL
[1]	611
	612	END SUBROUTINE fftyp
	613
	614	#else
	615	SUBROUTINE ffty( ar, direction )
	616
	617	!------------------------------------------------------------------------------!
	618	! Fourier-transformation along y-direction Non parallel version
		!
		! ar        : array with y as middle index; every line ar(k,0:ny,i) with
		!             k = 1..nz and i = 0..nx is handed to fft_y
		! direction : passed on unchanged to fft_y (poisfft calls this routine
		!             with 'forward' and 'backward')
	619	!------------------------------------------------------------------------------!
	620
	621	IMPLICIT NONE
	622
	623	CHARACTER (LEN=*) :: direction
	624	INTEGER :: i, k
	625	REAL :: ar(1:nz,0:ny,0:nx)
	626
	627	!
	628	!-- Performing the fft with one of the methods implemented
		!-- The OpenMP worksharing applies to the outer k loop.
[683]	629	!$OMP PARALLEL PRIVATE ( i, k )
	630	!$OMP DO
[1]	631	DO k = 1, nz
	632	DO i = 0, nx
	633	CALL fft_y( ar(k,0:ny,i), direction )
	634	ENDDO
	635	ENDDO
[683]	636	!$OMP END PARALLEL
[1]	637
	638	END SUBROUTINE ffty
	639	#endif
	640
	641	#if defined( __parallel )
	642	SUBROUTINE ffty_tr_yx( f_in, work, f_out )
	643
	644	!------------------------------------------------------------------------------!
	645	! Fourier-transformation along y with subsequent transposition y --> x for
	646	! a 1d-decomposition along x
	647	!
		! f_in  : input data, index order (k,j,i)
		! work  : buffer receiving the transformed data resorted to index order
		!         (i,k,j); it is the send buffer of the MPI_ALLTOALL
		! f_out : receive buffer of the MPI_ALLTOALL
		! (poisfft passes the same actual array for f_in and f_out)
		!
	648	! ATTENTION: This routine runs much faster on the NEC-SX6,
	649	! if the first index of work_ffty_vec is odd. Otherwise
	650	! memory bank conflicts may occur (especially if the index is a
	651	! multiple of 128). That's why work_ffty_vec is dimensioned as
	652	! 0:ny+1.
	653	! Of course, this will not work if users are using an odd number
	654	! of gridpoints along y.
	655	!------------------------------------------------------------------------------!
	656
	657	USE control_parameters
	658	USE cpulog
	659	USE indices
	660	USE interfaces
	661	USE pegrid
	662	USE transpose_indices
	663
	664	IMPLICIT NONE
	665
	666	INTEGER :: i, iend, iouter, ir, j, k
		!
		!-- Number of x-columns handled together in the cache optimized branch
	667	INTEGER, PARAMETER :: stridex = 4
	668
	669	REAL, DIMENSION(0:ny,stridex) :: work_ffty
	670	#if defined( __nec )
	671	REAL, DIMENSION(0:ny+1,1:nz,nxl:nxr) :: work_ffty_vec
	672	#endif
	673	REAL, DIMENSION(1:nza,0:nya,nxl:nxra) :: f_in
	674	REAL, DIMENSION(nnx,1:nza,nys_x:nyn_xa,pdims(1)) :: f_out
	675	REAL, DIMENSION(nxl:nxra,1:nza,0:nya) :: work
	676
	677	!
	678	!-- Carry out the FFT along y, where all data are present due to the
	679	!-- 1d-decomposition along x. Resort the data in a way that x becomes
	680	!-- the first index.
	681	CALL cpu_log( log_point_s(7), 'fft_y', 'start' )
	682
		!
		!-- NOTE(review): the body of the following branch only exists if __nec is
		!-- defined at compile time; if host starts with 'nec' but __nec is not
		!-- defined, the branch is empty and work is not filled.
	683	IF ( host(1:3) == 'nec' ) THEN
	684	#if defined( __nec )
	685	!
	686	!-- Code optimized for vector processors
[85]	687	!$OMP PARALLEL PRIVATE ( i, j, k )
[1]	688	!$OMP DO
	689	DO i = nxl, nxr
	690
	691	DO j = 0, ny
	692	DO k = 1, nz
	693	work_ffty_vec(j,k,i) = f_in(k,j,i)
	694	ENDDO
	695	ENDDO
	696
	697	CALL fft_y_m( work_ffty_vec(:,:,i), ny+1, 'forward' )
	698
	699	ENDDO
	700
	701	!$OMP DO
	702	DO k = 1, nz
	703	DO j = 0, ny
	704	DO i = nxl, nxr
	705	work(i,k,j) = work_ffty_vec(j,k,i)
	706	ENDDO
	707	ENDDO
	708	ENDDO
	709	!$OMP END PARALLEL
	710	#endif
	711
	712	ELSE
	713
	714	!
	715	!-- Cache optimized code.
	716	!-- The i-(x-)direction is split into a strided outer loop and an inner
	717	!-- loop for better cache performance
		!-- work_ffty is PRIVATE, i.e. every thread has its own line buffer.
	718	!$OMP PARALLEL PRIVATE (i,iend,iouter,ir,j,k,work_ffty)
	719	!$OMP DO
	720	DO iouter = nxl, nxr, stridex
	721
	722	iend = MIN( iouter+stridex-1, nxr ) ! Upper bound for inner i loop
	723
	724	DO k = 1, nz
	725
	726	DO i = iouter, iend
	727
	728	ir = i-iouter+1 ! counter within a stride
	729	DO j = 0, ny
	730	work_ffty(j,ir) = f_in(k,j,i)
	731	ENDDO
	732	!
	733	!-- FFT along y
	734	CALL fft_y( work_ffty(:,ir), 'forward' )
	735
	736	ENDDO
	737
	738	!
	739	!-- Resort
	740	DO j = 0, ny
	741	DO i = iouter, iend
	742	work(i,k,j) = work_ffty(j,i-iouter+1)
	743	ENDDO
	744	ENDDO
	745
	746	ENDDO
	747
	748	ENDDO
	749	!$OMP END PARALLEL
	750
	751	ENDIF
	752	CALL cpu_log( log_point_s(7), 'fft_y', 'pause' )
	753
	754	!
	755	!-- Transpose array
		!-- The optional barrier acts on comm2d, the exchange itself on comm1dx.
	756	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
[622]	757	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
[1]	758	CALL MPI_ALLTOALL( work(nxl,1,0), sendrecvcount_xy, MPI_REAL, &
	759	f_out(1,1,nys_x,1), sendrecvcount_xy, MPI_REAL, &
	760	comm1dx, ierr )
	761	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
	762
	763	END SUBROUTINE ffty_tr_yx
	764
	765
	766	SUBROUTINE tr_xy_ffty( f_in, work, f_out )
	767
	768	!------------------------------------------------------------------------------!
	769	! Transposition x --> y with a subsequent backward Fourier transformation for
	770	! a 1d-decomposition along x
		!
		! f_in  : send buffer of the MPI_ALLTOALL
		! work  : receive buffer of the MPI_ALLTOALL, index order (i,k,j)
		! f_out : result of the backward FFT along y, index order (k,j,i)
		! (poisfft passes the same actual array for f_in and f_out)
	771	!------------------------------------------------------------------------------!
	772
	773	USE control_parameters
	774	USE cpulog
	775	USE indices
	776	USE interfaces
	777	USE pegrid
	778	USE transpose_indices
	779
	780	IMPLICIT NONE
	781
	782	INTEGER :: i, iend, iouter, ir, j, k
		!
		!-- Number of x-columns handled together in the cache optimized branch
	783	INTEGER, PARAMETER :: stridex = 4
	784
	785	REAL, DIMENSION(0:ny,stridex) :: work_ffty
	786	#if defined( __nec )
	787	REAL, DIMENSION(0:ny+1,1:nz,nxl:nxr) :: work_ffty_vec
	788	#endif
	789	REAL, DIMENSION(nnx,1:nza,nys_x:nyn_xa,pdims(1)) :: f_in
	790	REAL, DIMENSION(1:nza,0:nya,nxl:nxra) :: f_out
	791	REAL, DIMENSION(nxl:nxra,1:nza,0:nya) :: work
	792
	793	!
	794	!-- Transpose array
		!-- The optional barrier acts on comm2d, the exchange itself on comm1dx.
	795	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
[622]	796	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
[1]	797	CALL MPI_ALLTOALL( f_in(1,1,nys_x,1), sendrecvcount_xy, MPI_REAL, &
	798	work(nxl,1,0), sendrecvcount_xy, MPI_REAL, &
	799	comm1dx, ierr )
	800	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
	801
	802	!
	803	!-- Resort the data in a way that y becomes the first index and carry out the
	804	!-- backward fft along y.
	805	CALL cpu_log( log_point_s(7), 'fft_y', 'continue' )
	806
		!
		!-- NOTE(review): the body of the following branch only exists if __nec is
		!-- defined at compile time; if host starts with 'nec' but __nec is not
		!-- defined, the branch is empty and f_out is not filled.
	807	IF ( host(1:3) == 'nec' ) THEN
	808	#if defined( __nec )
	809	!
	810	!-- Code optimized for vector processors
[85]	811	!$OMP PARALLEL PRIVATE ( i, j, k )
[1]	812	!$OMP DO
	813	DO k = 1, nz
	814	DO j = 0, ny
	815	DO i = nxl, nxr
	816	work_ffty_vec(j,k,i) = work(i,k,j)
	817	ENDDO
	818	ENDDO
	819	ENDDO
	820
	821	!$OMP DO
	822	DO i = nxl, nxr
	823
	824	CALL fft_y_m( work_ffty_vec(:,:,i), ny+1, 'backward' )
	825
	826	DO j = 0, ny
	827	DO k = 1, nz
	828	f_out(k,j,i) = work_ffty_vec(j,k,i)
	829	ENDDO
	830	ENDDO
	831
	832	ENDDO
	833	!$OMP END PARALLEL
	834	#endif
	835
	836	ELSE
	837
	838	!
	839	!-- Cache optimized code.
	840	!-- The i-(x-)direction is split into a strided outer loop and an inner
	841	!-- loop for better cache performance
		!-- work_ffty is PRIVATE, i.e. every thread has its own line buffer.
	842	!$OMP PARALLEL PRIVATE ( i, iend, iouter, ir, j, k, work_ffty )
	843	!$OMP DO
	844	DO iouter = nxl, nxr, stridex
	845
	846	iend = MIN( iouter+stridex-1, nxr ) ! Upper bound for inner i loop
	847
	848	DO k = 1, nz
	849	!
	850	!-- Resort
	851	DO j = 0, ny
	852	DO i = iouter, iend
	853	work_ffty(j,i-iouter+1) = work(i,k,j)
	854	ENDDO
	855	ENDDO
	856
	857	DO i = iouter, iend
	858
	859	!
	860	!-- FFT along y
	861	ir = i-iouter+1 ! counter within a stride
	862	CALL fft_y( work_ffty(:,ir), 'backward' )
	863
	864	DO j = 0, ny
	865	f_out(k,j,i) = work_ffty(j,ir)
	866	ENDDO
	867	ENDDO
	868
	869	ENDDO
	870
	871	ENDDO
	872	!$OMP END PARALLEL
	873
	874	ENDIF
	875
	876	CALL cpu_log( log_point_s(7), 'fft_y', 'stop' )
	877
	878	END SUBROUTINE tr_xy_ffty
	879
	880
	881	SUBROUTINE fftx_tri_fftx( ar )
	882
	883	!------------------------------------------------------------------------------!
	884	! FFT along x, solution of the tridiagonal system and backward FFT for
	885	! a 1d-decomposition along x
	886	!
		! ar is read and overwritten in place. For every j the pdims(1) slabs
		! ar(1:nnx_pe(n-1),k,j,n) are concatenated into one x-line of length nx+1
		! before the forward FFT and distributed back in the same order after the
		! backward FFT.
		!
	887	! WARNING: this subroutine may still not work for hybrid parallelization
	888	! with OpenMP (for possible necessary changes see the original
	889	! routine poisfft_hybrid, developed by Klaus Ketelsen, May 2002)
	890	!------------------------------------------------------------------------------!
	891
	892	USE control_parameters
	893	USE cpulog
	894	USE grid_variables
	895	USE indices
	896	USE interfaces
	897	USE pegrid
	898	USE transpose_indices
	899
	900	IMPLICIT NONE
	901
		!
		!-- NOTE(review): myth_char is declared but not referenced in this routine.
	902	character(len=3) :: myth_char
	903
	904	INTEGER :: i, j, k, m, n, omp_get_thread_num, tn
	905
	906	REAL, DIMENSION(0:nx) :: work_fftx
	907	REAL, DIMENSION(0:nx,1:nz) :: work_trix
	908	REAL, DIMENSION(nnx,1:nza,nys_x:nyn_xa,pdims(1)) :: ar
	909	REAL, DIMENSION(:,:,:,:), ALLOCATABLE :: tri
	910
	911
	912	CALL cpu_log( log_point_s(33), 'fft_x + tridia', 'start' )
	913
		!
		!-- One tridiagonal work array per thread (last index = thread number)
	914	ALLOCATE( tri(5,0:nx,0:nz-1,0:threads_per_task-1) )
	915
		!
		!-- tn is PRIVATE in the parallel loop below and is set per thread by the
		!-- conditionally compiled "!$" statement inside the loop; the value
		!-- assigned here is the one used when compiling without OpenMP.
	916	tn = 0 ! Default thread number in case of one thread
	917	!$OMP PARALLEL DO PRIVATE ( i, j, k, m, n, tn, work_fftx, work_trix )
	918	DO j = nys_x, nyn_x
	919
	920	!$ tn = omp_get_thread_num()
	921
	922	IF ( host(1:3) == 'nec' ) THEN
	923	!
	924	!-- Code optimized for vector processors
	925	DO k = 1, nz
	926
	927	m = 0
	928	DO n = 1, pdims(1)
	929	DO i = 1, nnx_pe( n-1 ) ! WARN: pcoord(i) should be used!!
	930	work_trix(m,k) = ar(i,k,j,n)
	931	m = m + 1
	932	ENDDO
	933	ENDDO
	934
	935	ENDDO
	936
	937	CALL fft_x_m( work_trix, 'forward' )
	938
	939	ELSE
	940	!
	941	!-- Cache optimized code
	942	DO k = 1, nz
	943
	944	m = 0
	945	DO n = 1, pdims(1)
	946	DO i = 1, nnx_pe( n-1 ) ! WARN: pcoord(i) should be used!!
	947	work_fftx(m) = ar(i,k,j,n)
	948	m = m + 1
	949	ENDDO
	950	ENDDO
	951
	952	CALL fft_x( work_fftx, 'forward' )
	953
	954	DO i = 0, nx
	955	work_trix(i,k) = work_fftx(i)
	956	ENDDO
	957
	958	ENDDO
	959
	960	ENDIF
	961
	962	!
	963	!-- Solve the linear equation system
		!-- tridia_1dd is not part of the code shown here; it is called with the
		!-- work array slice belonging to the current thread.
	964	CALL tridia_1dd( ddx2, ddy2, nx, ny, j, work_trix, tri(:,:,:,tn) )
	965
	966	IF ( host(1:3) == 'nec' ) THEN
	967	!
	968	!-- Code optimized for vector processors
	969	CALL fft_x_m( work_trix, 'backward' )
	970
	971	DO k = 1, nz
	972
	973	m = 0
	974	DO n = 1, pdims(1)
	975	DO i = 1, nnx_pe( n-1 ) ! WARN: pcoord(i) should be used!!
	976	ar(i,k,j,n) = work_trix(m,k)
	977	m = m + 1
	978	ENDDO
	979	ENDDO
	980
	981	ENDDO
	982
	983	ELSE
	984	!
	985	!-- Cache optimized code
	986	DO k = 1, nz
	987
	988	DO i = 0, nx
	989	work_fftx(i) = work_trix(i,k)
	990	ENDDO
	991
	992	CALL fft_x( work_fftx, 'backward' )
	993
	994	m = 0
	995	DO n = 1, pdims(1)
	996	DO i = 1, nnx_pe( n-1 ) ! WARN: pcoord(i) should be used!!
	997	ar(i,k,j,n) = work_fftx(m)
	998	m = m + 1
	999	ENDDO
	1000	ENDDO
	1001
	1002	ENDDO
	1003
	1004	ENDIF
	1005
	1006	ENDDO
	1007
	1008	DEALLOCATE( tri )
	1009
	1010	CALL cpu_log( log_point_s(33), 'fft_x + tridia', 'stop' )
	1011
	1012	END SUBROUTINE fftx_tri_fftx
	1013
	1014
	1015	SUBROUTINE fftx_tr_xy( f_in, work, f_out )
	1016
	1017	!------------------------------------------------------------------------------!
	1018	! Fourier-transformation along x with subsequent transposition x --> y for
	1019	! a 1d-decomposition along y
	1020	!
		! f_in  : input data, index order (k,j,i)
		! work  : buffer receiving the transformed data resorted to index order
		!         (j,k,i); it is the send buffer of the MPI_ALLTOALL
		! f_out : receive buffer of the MPI_ALLTOALL
		! (poisfft passes the same actual array for f_in and f_out)
		!
	1021	! ATTENTION: The NEC-branch of this routine may significantly profit from
	1022	! further optimizations. So far, performance is much worse than
	1023	! for routine ffty_tr_yx (more than three times slower).
	1024	!------------------------------------------------------------------------------!
	1025
	1026	USE control_parameters
	1027	USE cpulog
	1028	USE indices
	1029	USE interfaces
	1030	USE pegrid
	1031	USE transpose_indices
	1032
	1033	IMPLICIT NONE
	1034
	1035	INTEGER :: i, j, k
	1036
	1037	REAL, DIMENSION(0:nx,1:nz,nys:nyn) :: work_fftx
	1038	REAL, DIMENSION(1:nza,nys:nyna,0:nxa) :: f_in
	1039	REAL, DIMENSION(nny,1:nza,nxl_y:nxr_ya,pdims(2)) :: f_out
	1040	REAL, DIMENSION(nys:nyna,1:nza,0:nxa) :: work
	1041
	1042	!
	1043	!-- Carry out the FFT along x, where all data are present due to the
	1044	!-- 1d-decomposition along y. Resort the data in a way that y becomes
	1045	!-- the first index.
		!-- In both branches work_fftx is shared between the threads (it is not
		!-- listed in the PRIVATE clauses): the first loop fills it in parallel
		!-- over i, the second loop transforms it in parallel over j.
	1046	CALL cpu_log( log_point_s(4), 'fft_x', 'start' )
	1047
	1048	IF ( host(1:3) == 'nec' ) THEN
	1049	!
	1050	!-- Code for vector processors
[85]	1051	!$OMP PARALLEL PRIVATE ( i, j, k )
[1]	1052	!$OMP DO
	1053	DO i = 0, nx
	1054
	1055	DO j = nys, nyn
	1056	DO k = 1, nz
	1057	work_fftx(i,k,j) = f_in(k,j,i)
	1058	ENDDO
	1059	ENDDO
	1060
	1061	ENDDO
	1062
	1063	!$OMP DO
	1064	DO j = nys, nyn
	1065
	1066	CALL fft_x_m( work_fftx(:,:,j), 'forward' )
	1067
	1068	DO k = 1, nz
	1069	DO i = 0, nx
	1070	work(j,k,i) = work_fftx(i,k,j)
	1071	ENDDO
	1072	ENDDO
	1073
	1074	ENDDO
	1075	!$OMP END PARALLEL
	1076
	1077	ELSE
	1078
	1079	!
	1080	!-- Cache optimized code (there might be still a potential for better
	1081	!-- optimization).
[696]	1082	!$OMP PARALLEL PRIVATE (i,j,k)
[1]	1083	!$OMP DO
	1084	DO i = 0, nx
	1085
	1086	DO j = nys, nyn
	1087	DO k = 1, nz
	1088	work_fftx(i,k,j) = f_in(k,j,i)
	1089	ENDDO
	1090	ENDDO
	1091
	1092	ENDDO
	1093
	1094	!$OMP DO
	1095	DO j = nys, nyn
	1096	DO k = 1, nz
	1097
	1098	CALL fft_x( work_fftx(0:nx,k,j), 'forward' )
	1099
	1100	DO i = 0, nx
	1101	work(j,k,i) = work_fftx(i,k,j)
	1102	ENDDO
	1103	ENDDO
	1104
	1105	ENDDO
	1106	!$OMP END PARALLEL
	1107
	1108	ENDIF
	1109	CALL cpu_log( log_point_s(4), 'fft_x', 'pause' )
	1110
	1111	!
	1112	!-- Transpose array
		!-- The optional barrier acts on comm2d, the exchange itself on comm1dy.
	1113	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
[622]	1114	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
[1]	1115	CALL MPI_ALLTOALL( work(nys,1,0), sendrecvcount_xy, MPI_REAL, &
	1116	f_out(1,1,nxl_y,1), sendrecvcount_xy, MPI_REAL, &
	1117	comm1dy, ierr )
	1118	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
	1119
	1120	END SUBROUTINE fftx_tr_xy
	1121
	1122
	1123	SUBROUTINE tr_yx_fftx( f_in, work, f_out )
	1124
	1125	!------------------------------------------------------------------------------!
	1126	! Transposition y --> x with a subsequent backward Fourier transformation for
	1127	! a 1d-decomposition along y
		!
		! f_in  : send buffer of the MPI_ALLTOALL
		! work  : receive buffer of the MPI_ALLTOALL, index order (j,k,i)
		! f_out : result of the backward FFT along x, index order (k,j,i)
		! (poisfft passes the same actual array for f_in and f_out)
	1128	!------------------------------------------------------------------------------!
	1129
	1130	USE control_parameters
	1131	USE cpulog
	1132	USE indices
	1133	USE interfaces
	1134	USE pegrid
	1135	USE transpose_indices
	1136
	1137	IMPLICIT NONE
	1138
	1139	INTEGER :: i, j, k
	1140
	1141	REAL, DIMENSION(0:nx,1:nz,nys:nyn) :: work_fftx
	1142	REAL, DIMENSION(nny,1:nza,nxl_y:nxr_ya,pdims(2)) :: f_in
	1143	REAL, DIMENSION(1:nza,nys:nyna,0:nxa) :: f_out
	1144	REAL, DIMENSION(nys:nyna,1:nza,0:nxa) :: work
	1145
	1146	!
	1147	!-- Transpose array
		!-- The optional barrier acts on comm2d, the exchange itself on comm1dy.
	1148	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
[622]	1149	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
[1]	1150	CALL MPI_ALLTOALL( f_in(1,1,nxl_y,1), sendrecvcount_xy, MPI_REAL, &
	1151	work(nys,1,0), sendrecvcount_xy, MPI_REAL, &
	1152	comm1dy, ierr )
	1153	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
	1154
	1155	!
	1156	!-- Resort the data in a way that x becomes the first index, carry out the
	1157	!-- backward FFT along x (all data along x are present due to the
	1158	!-- 1d-decomposition along y) and store the result in f_out.
		!-- In both branches work_fftx is shared between the threads (it is not
		!-- listed in the PRIVATE clauses): the first loop runs in parallel over j,
		!-- the copy to f_out in parallel over i.
	1159	CALL cpu_log( log_point_s(4), 'fft_x', 'continue' )
	1160
	1161	IF ( host(1:3) == 'nec' ) THEN
	1162	!
	1163	!-- Code optimized for vector processors
[85]	1164	!$OMP PARALLEL PRIVATE ( i, j, k )
[1]	1165	!$OMP DO
	1166	DO j = nys, nyn
	1167
	1168	DO k = 1, nz
	1169	DO i = 0, nx
	1170	work_fftx(i,k,j) = work(j,k,i)
	1171	ENDDO
	1172	ENDDO
	1173
	1174	CALL fft_x_m( work_fftx(:,:,j), 'backward' )
	1175
	1176	ENDDO
	1177
	1178	!$OMP DO
	1179	DO i = 0, nx
	1180	DO j = nys, nyn
	1181	DO k = 1, nz
	1182	f_out(k,j,i) = work_fftx(i,k,j)
	1183	ENDDO
	1184	ENDDO
	1185	ENDDO
	1186	!$OMP END PARALLEL
	1187
	1188	ELSE
	1189
	1190	!
	1191	!-- Cache optimized code (there might be still a potential for better
	1192	!-- optimization).
[696]	1193	!$OMP PARALLEL PRIVATE (i,j,k)
[1]	1194	!$OMP DO
	1195	DO j = nys, nyn
	1196	DO k = 1, nz
	1197
	1198	DO i = 0, nx
	1199	work_fftx(i,k,j) = work(j,k,i)
	1200	ENDDO
	1201
	1202	CALL fft_x( work_fftx(0:nx,k,j), 'backward' )
	1203
	1204	ENDDO
	1205	ENDDO
	1206
	1207	!$OMP DO
	1208	DO i = 0, nx
	1209	DO j = nys, nyn
	1210	DO k = 1, nz
	1211	f_out(k,j,i) = work_fftx(i,k,j)
	1212	ENDDO
	1213	ENDDO
	1214	ENDDO
	1215	!$OMP END PARALLEL
	1216
	1217	ENDIF
	1218	CALL cpu_log( log_point_s(4), 'fft_x', 'stop' )
	1219
	1220	END SUBROUTINE tr_yx_fftx
	1221
	1222
	1223	SUBROUTINE ffty_tri_ffty( ar )
	1224
	1225	!------------------------------------------------------------------------------!
	1226	! FFT along y, solution of the tridiagonal system and backward FFT for
	1227	! a 1d-decomposition along y
	1228	!
	1229	! WARNING: this subroutine may still not work for hybrid parallelization
	1230	! with OpenMP (for possible necessary changes see the original
	1231	! routine poisfft_hybrid, developed by Klaus Ketelsen, May 2002)
	1232	!------------------------------------------------------------------------------!
	1233
	1234	USE control_parameters
	1235	USE cpulog
	1236	USE grid_variables
	1237	USE indices
	1238	USE interfaces
	1239	USE pegrid
	1240	USE transpose_indices
	1241
	1242	IMPLICIT NONE
	1243
	1244	INTEGER :: i, j, k, m, n, omp_get_thread_num, tn
	1245
	1246	REAL, DIMENSION(0:ny) :: work_ffty
	1247	REAL, DIMENSION(0:ny,1:nz) :: work_triy
	1248	REAL, DIMENSION(nny,1:nza,nxl_y:nxr_ya,pdims(2)) :: ar
	1249	REAL, DIMENSION(:,:,:,:), ALLOCATABLE :: tri
	1250
	1251
	1252	CALL cpu_log( log_point_s(39), 'fft_y + tridia', 'start' )
	1253
	1254	ALLOCATE( tri(5,0:ny,0:nz-1,0:threads_per_task-1) )
	1255
	1256	tn = 0 ! Default thread number in case of one thread
[696]	1257	!$OMP PARALLEL DO PRIVATE ( i, j, k, m, n, tn, work_ffty, work_triy )
[1]	1258	DO i = nxl_y, nxr_y
	1259
	1260	!$ tn = omp_get_thread_num()
	1261
	1262	IF ( host(1:3) == 'nec' ) THEN
	1263	!
	1264	!-- Code optimized for vector processors
	1265	DO k = 1, nz
	1266
	1267	m = 0
	1268	DO n = 1, pdims(2)
	1269	DO j = 1, nny_pe( n-1 ) ! WARN: pcoord(j) should be used!!
	1270	work_triy(m,k) = ar(j,k,i,n)
	1271	m = m + 1
	1272	ENDDO
	1273	ENDDO
	1274
	1275	ENDDO
	1276
	1277	CALL fft_y_m( work_triy, ny, 'forward' )
	1278
	1279	ELSE
	1280	!
	1281	!-- Cache optimized code
	1282	DO k = 1, nz
	1283
	1284	m = 0
	1285	DO n = 1, pdims(2)
	1286	DO j = 1, nny_pe( n-1 ) ! WARN: pcoord(j) should be used!!
	1287	work_ffty(m) = ar(j,k,i,n)
	1288	m = m + 1
	1289	ENDDO
	1290	ENDDO
	1291
	1292	CALL fft_y( work_ffty, 'forward' )
	1293
	1294	DO j = 0, ny
	1295	work_triy(j,k) = work_ffty(j)
	1296	ENDDO
	1297
	1298	ENDDO
	1299
	1300	ENDIF
	1301
	1302	!
	1303	!-- Solve the linear equation system
	1304	CALL tridia_1dd( ddy2, ddx2, ny, nx, i, work_triy, tri(:,:,:,tn) )
	1305
	1306	IF ( host(1:3) == 'nec' ) THEN
	1307	!
	1308	!-- Code optimized for vector processors
	1309	CALL fft_y_m( work_triy, ny, 'backward' )
	1310
	1311	DO k = 1, nz
	1312
	1313	m = 0
	1314	DO n = 1, pdims(2)
	1315	DO j = 1, nny_pe( n-1 ) ! WARN: pcoord(j) should be used!!
	1316	ar(j,k,i,n) = work_triy(m,k)
	1317	m = m + 1
	1318	ENDDO
	1319	ENDDO
	1320
	1321	ENDDO
	1322
	1323	ELSE
	1324	!
	1325	!-- Cache optimized code
	1326	DO k = 1, nz
	1327
	1328	DO j = 0, ny
	1329	work_ffty(j) = work_triy(j,k)
	1330	ENDDO
	1331
	1332	CALL fft_y( work_ffty, 'backward' )
	1333
	1334	m = 0
	1335	DO n = 1, pdims(2)
	1336	DO j = 1, nny_pe( n-1 ) ! WARN: pcoord(j) should be used!!
	1337	ar(j,k,i,n) = work_ffty(m)
	1338	m = m + 1
	1339	ENDDO
	1340	ENDDO
	1341
	1342	ENDDO
	1343
	1344	ENDIF
	1345
	1346	ENDDO
	1347
	1348	DEALLOCATE( tri )
	1349
	1350	CALL cpu_log( log_point_s(39), 'fft_y + tridia', 'stop' )
	1351
	1352	END SUBROUTINE ffty_tri_ffty
	1353
	1354
	1355	SUBROUTINE tridia_1dd( ddx2, ddy2, nx, ny, j, ar, tri )
	1356
	1357	!------------------------------------------------------------------------------!
	1358	! Solves the linear system of equations for a 1d-decomposition along x (see
	1359	! tridia)
	1360	!
	1361	! Attention: when using the intel compiler, array tri must be passed as an
	1362	! argument to the contained subroutines. Otherwise addres faults
	1363	! will occur.
	1364	! On NEC, tri should not be passed (except for routine substi_1dd)
	1365	! because this causes very bad performance.
	1366	!------------------------------------------------------------------------------!
	1367
	1368	USE arrays_3d
	1369	USE control_parameters
	1370
	1371	USE pegrid
	1372
	1373	IMPLICIT NONE
	1374
	1375	INTEGER :: i, j, k, nnyh, nx, ny, omp_get_thread_num, tn
	1376
	1377	REAL :: ddx2, ddy2
	1378
	1379	REAL, DIMENSION(0:nx,1:nz) :: ar
	1380	REAL, DIMENSION(0:nx,0:nz-1) :: ar1
	1381	REAL, DIMENSION(5,0:nx,0:nz-1) :: tri
	1382
	1383
	1384	nnyh = ( ny + 1 ) / 2
	1385
	1386	!
	1387	!-- Define constant elements of the tridiagonal matrix.
	1388	!-- The compiler on SX6 does loop exchange. If 0:nx is a high power of 2,
	1389	!-- the exchanged loops create bank conflicts. The following directive
	1390	!-- prohibits loop exchange and the loops perform much better.
	1391	! tn = omp_get_thread_num()
	1392	! WRITE( 120+tn, * ) '+++ id=',myid,' nx=',nx,' thread=', omp_get_thread_num()
[82]	1393	! CALL local_flush( 120+tn )
[1]	1394	!CDIR NOLOOPCHG
	1395	DO k = 0, nz-1
	1396	DO i = 0,nx
[667]	1397	tri(2,i,k) = ddzu_pres(k+1) * ddzw(k+1)
	1398	tri(3,i,k) = ddzu_pres(k+2) * ddzw(k+1)
[1]	1399	ENDDO
	1400	ENDDO
	1401	! WRITE( 120+tn, * ) '+++ id=',myid,' end of first tridia loop thread=', omp_get_thread_num()
[82]	1402	! CALL local_flush( 120+tn )
[1]	1403
	1404	IF ( j <= nnyh ) THEN
[377]	1405	#if defined( __lc )
[1]	1406	CALL maketri_1dd( j, tri )
	1407	#else
	1408	CALL maketri_1dd( j )
	1409	#endif
	1410	ELSE
[377]	1411	#if defined( __lc )
[1]	1412	CALL maketri_1dd( ny+1-j, tri )
	1413	#else
	1414	CALL maketri_1dd( ny+1-j )
	1415	#endif
	1416	ENDIF
[377]	1417	#if defined( __lc )
[1]	1418	CALL split_1dd( tri )
	1419	#else
	1420	CALL split_1dd
	1421	#endif
	1422	CALL substi_1dd( ar, tri )
	1423
	1424	CONTAINS
	1425
[377]	1426	#if defined( __lc )
[1]	1427	SUBROUTINE maketri_1dd( j, tri )
	1428	#else
	1429	SUBROUTINE maketri_1dd( j )
	1430	#endif
	1431
	1432	!------------------------------------------------------------------------------!
	1433	! computes the i- and j-dependent component of the matrix
	1434	!------------------------------------------------------------------------------!
	1435
	1436	USE constants
	1437
	1438	IMPLICIT NONE
	1439
	1440	INTEGER :: i, j, k, nnxh
	1441	REAL :: a, c
	1442
	1443	REAL, DIMENSION(0:nx) :: l
	1444
[377]	1445	#if defined( __lc )
[1]	1446	REAL, DIMENSION(5,0:nx,0:nz-1) :: tri
	1447	#endif
	1448
	1449
	1450	nnxh = ( nx + 1 ) / 2
	1451	!
	1452	!-- Provide the tridiagonal matrix for solution of the Poisson equation in
	1453	!-- Fourier space. The coefficients are computed following the method of
	1454	!-- Schmidt et al. (DFVLR-Mitteilung 84-15), which departs from Stephan
	1455	!-- Siano's original version by discretizing the Poisson equation,
	1456	!-- before it is Fourier-transformed
	1457	DO i = 0, nx
[128]	1458	IF ( i >= 0 .AND. i <= nnxh ) THEN
[1]	1459	l(i) = 2.0 * ( 1.0 - COS( ( 2.0 * pi * i ) / &
	1460	FLOAT( nx+1 ) ) ) * ddx2 + &
	1461	2.0 * ( 1.0 - COS( ( 2.0 * pi * j ) / &
	1462	FLOAT( ny+1 ) ) ) * ddy2
	1463	ELSE
	1464	l(i) = 2.0 * ( 1.0 - COS( ( 2.0 * pi * ( nx+1-i ) ) / &
	1465	FLOAT( nx+1 ) ) ) * ddx2 + &
	1466	2.0 * ( 1.0 - COS( ( 2.0 * pi * j ) / &
	1467	FLOAT( ny+1 ) ) ) * ddy2
	1468	ENDIF
	1469	ENDDO
	1470
	1471	DO k = 0, nz-1
	1472	DO i = 0, nx
[667]	1473	a = -1.0 * ddzu_pres(k+2) * ddzw(k+1)
	1474	c = -1.0 * ddzu_pres(k+1) * ddzw(k+1)
[1]	1475	tri(1,i,k) = a + c - l(i)
	1476	ENDDO
	1477	ENDDO
	1478	IF ( ibc_p_b == 1 .OR. ibc_p_b == 2 ) THEN
	1479	DO i = 0, nx
	1480	tri(1,i,0) = tri(1,i,0) + tri(2,i,0)
	1481	ENDDO
	1482	ENDIF
	1483	IF ( ibc_p_t == 1 ) THEN
	1484	DO i = 0, nx
	1485	tri(1,i,nz-1) = tri(1,i,nz-1) + tri(3,i,nz-1)
	1486	ENDDO
	1487	ENDIF
	1488
	1489	END SUBROUTINE maketri_1dd
	1490
	1491
[377]	1492	#if defined( __lc )
[1]	1493	SUBROUTINE split_1dd( tri )
	1494	#else
	1495	SUBROUTINE split_1dd
	1496	#endif
	1497
	1498	!------------------------------------------------------------------------------!
	1499	! Splitting of the tridiagonal matrix (Thomas algorithm)
	1500	!------------------------------------------------------------------------------!
	1501
	1502	IMPLICIT NONE
	1503
	1504	INTEGER :: i, k
	1505
[377]	1506	#if defined( __lc )
[1]	1507	REAL, DIMENSION(5,0:nx,0:nz-1) :: tri
	1508	#endif
	1509
	1510
	1511	!
	1512	!-- Splitting
	1513	DO i = 0, nx
	1514	tri(4,i,0) = tri(1,i,0)
	1515	ENDDO
	1516	DO k = 1, nz-1
	1517	DO i = 0, nx
	1518	tri(5,i,k) = tri(2,i,k) / tri(4,i,k-1)
	1519	tri(4,i,k) = tri(1,i,k) - tri(3,i,k-1) * tri(5,i,k)
	1520	ENDDO
	1521	ENDDO
	1522
	1523	END SUBROUTINE split_1dd
	1524
	1525
	1526	SUBROUTINE substi_1dd( ar, tri )
	1527
	1528	!------------------------------------------------------------------------------!
	1529	! Substitution (Forward and Backward) (Thomas algorithm)
	1530	!------------------------------------------------------------------------------!
	1531
	1532	IMPLICIT NONE
	1533
[76]	1534	INTEGER :: i, k
[1]	1535
	1536	REAL, DIMENSION(0:nx,nz) :: ar
	1537	REAL, DIMENSION(0:nx,0:nz-1) :: ar1
	1538	REAL, DIMENSION(5,0:nx,0:nz-1) :: tri
	1539
	1540	!
	1541	!-- Forward substitution
	1542	DO i = 0, nx
	1543	ar1(i,0) = ar(i,1)
	1544	ENDDO
	1545	DO k = 1, nz-1
	1546	DO i = 0, nx
	1547	ar1(i,k) = ar(i,k+1) - tri(5,i,k) * ar1(i,k-1)
	1548	ENDDO
	1549	ENDDO
	1550
	1551	!
	1552	!-- Backward substitution
[763]	1553	!-- Note, the add of 1.0E-20 in the denominator is due to avoid divisions
	1554	!-- by zero appearing if the pressure bc is set to neumann at the top of
	1555	!-- the model domain.
[1]	1556	DO i = 0, nx
[761]	1557	ar(i,nz) = ar1(i,nz-1) / ( tri(4,i,nz-1) + 1.0E-20 )
[1]	1558	ENDDO
	1559	DO k = nz-2, 0, -1
	1560	DO i = 0, nx
	1561	ar(i,k+1) = ( ar1(i,k) - tri(3,i,k) * ar(i,k+2) ) &
	1562	/ tri(4,i,k)
	1563	ENDDO
	1564	ENDDO
	1565
[76]	1566	!
	1567	!-- Indices i=0, j=0 correspond to horizontally averaged pressure.
	1568	!-- The respective values of ar should be zero at all k-levels if
	1569	!-- acceleration of horizontally averaged vertical velocity is zero.
	1570	IF ( ibc_p_b == 1 .AND. ibc_p_t == 1 ) THEN
	1571	IF ( j == 0 ) THEN
	1572	DO k = 1, nz
	1573	ar(0,k) = 0.0
	1574	ENDDO
	1575	ENDIF
	1576	ENDIF
	1577
[1]	1578	END SUBROUTINE substi_1dd
	1579
	1580	END SUBROUTINE tridia_1dd
	1581
	1582	#endif
	1583
	1584	END MODULE poisfft_mod

Note: See TracBrowser for help on using the repository browser.

Download in other formats:

| Impressum | ©Leibniz Universität Hannover |