Home

Context Navigation

source: palm/trunk/SOURCE/poisfft.f90 @ 196

Last change on this file since 196 was 164, checked in by raasch, 17 years ago
optimization of transpositions for 2D decompositions, workaround for using -env option with mpiexec, adjustments for lcxt4
Property svn:keywords set to `Id`
File size: 44.4 KB

Rev	Line
[1]	1	MODULE poisfft_mod
	2
	3	!------------------------------------------------------------------------------!
	4	! Actual revisions:
	5	! -----------------
[164]	6	! Arguments removed from transpose routines
[1]	7	!
	8	! Former revisions:
	9	! -----------------
[3]	10	! $Id: poisfft.f90 164 2008-05-15 08:46:15Z raasch $
[77]	11	!
[139]	12	! 128 2007-10-26 13:11:14Z raasch
	13	! Bugfix: wavenumber calculation for even nx in routines maketri
	14	!
[90]	15	! 85 2007-05-11 09:35:14Z raasch
	16	! Bugfix: work_fft*_vec removed from some PRIVATE-declarations
	17	!
[77]	18	! 76 2007-03-29 00:58:32Z raasch
	19	! Tridiagonal coefficients adjusted for Neumann boundary conditions both at
	20	! the bottom and the top.
	21	!
[3]	22	! RCS Log replace by Id keyword, revision history cleaned up
	23	!
[1]	24	! Revision 1.24 2006/08/04 15:00:24 raasch
	25	! Default setting of the thread number tn in case of not using OpenMP
	26	!
	27	! Revision 1.23 2006/02/23 12:48:38 raasch
	28	! Additional compiler directive in routine tridia_1dd for preventing loop
	29	! exchange on NEC-SX6
	30	!
	31	! Revision 1.20 2004/04/30 12:38:09 raasch
	32	! Parts of former poisfft_hybrid moved to this subroutine,
	33	! former subroutine changed to a module, renaming of FFT-subroutines and
	34	! -module, FFTs completely substituted by calls of fft_x and fft_y,
	35	! NAG fft used in the non-parallel case completely removed, l in maketri
	36	! is now a 1d-array, variables passed by modules instead of using parameter
	37	! lists, enlarged transposition arrays introduced
	38	!
	39	! Revision 1.1 1997/07/24 11:24:14 raasch
	40	! Initial revision
	41	!
	42	!
	43	! Description:
	44	! ------------
	45	! See below.
	46	!------------------------------------------------------------------------------!
	47
	48	!--------------------------------------------------------------------------!
	49	! poisfft !
	50	! !
	51	! Original version: Stephan Siano (pois3d) !
	52	! !
	53	! Institute of Meteorology and Climatology, University of Hannover !
	54	! Germany !
	55	! !
	56	! Version as of July 23,1996 !
	57	! !
	58	! !
	59	! Version for parallel computers: Siegfried Raasch !
	60	! !
	61	! Version as of July 03,1997 !
	62	! !
	63	! Solves the Poisson equation with a 2D spectral method !
	64	! d^2 p / dx^2 + d^2 p / dy^2 + d^2 p / dz^2 = s !
	65	! !
	66	! Input: !
	67	! real ar contains in the (nnx,nny,nnz) elements, !
	68	! starting from the element (1,nys,nxl), the !
	69	! values for s !
	70	! real work Temporary array !
	71	! !
	72	! Output: !
	73	! real ar contains the solution for p !
	74	!--------------------------------------------------------------------------!
	75
	76	USE fft_xy
	77	USE indices
	78	USE transpose_indices
	79
	80	IMPLICIT NONE
	81
	82	PRIVATE
	83	PUBLIC poisfft, poisfft_init
	84
	85	INTERFACE poisfft
	86	MODULE PROCEDURE poisfft
	87	END INTERFACE poisfft
	88
	89	INTERFACE poisfft_init
	90	MODULE PROCEDURE poisfft_init
	91	END INTERFACE poisfft_init
	92
	93	CONTAINS
	94
	95	SUBROUTINE poisfft_init
	96
	97	CALL fft_init
	98
	99	END SUBROUTINE poisfft_init
	100
	101
	102	SUBROUTINE poisfft( ar, work )
	103
	104	USE cpulog
	105	USE interfaces
	106	USE pegrid
	107
	108	IMPLICIT NONE
	109
	110	REAL, DIMENSION(1:nza,nys:nyna,nxl:nxra) :: ar, work
	111
	112
	113	CALL cpu_log( log_point_s(3), 'poisfft', 'start' )
	114
	115	!
	116	!-- Two-dimensional Fourier Transformation in x- and y-direction.
	117	#if defined( __parallel )
	118	IF ( pdims(2) == 1 ) THEN
	119
	120	!
	121	!-- 1d-domain-decomposition along x:
	122	!-- FFT along y and transposition y --> x
	123	CALL ffty_tr_yx( ar, work, ar )
	124
	125	!
	126	!-- FFT along x, solving the tridiagonal system and backward FFT
	127	CALL fftx_tri_fftx( ar )
	128
	129	!
	130	!-- Transposition x --> y and backward FFT along y
	131	CALL tr_xy_ffty( ar, work, ar )
	132
	133	ELSEIF ( pdims(1) == 1 ) THEN
	134
	135	!
	136	!-- 1d-domain-decomposition along y:
	137	!-- FFT along x and transposition x --> y
	138	CALL fftx_tr_xy( ar, work, ar )
	139
	140	!
	141	!-- FFT along y, solving the tridiagonal system and backward FFT
	142	CALL ffty_tri_ffty( ar )
	143
	144	!
	145	!-- Transposition y --> x and backward FFT along x
	146	CALL tr_yx_fftx( ar, work, ar )
	147
	148	ELSE
	149
	150	!
	151	!-- 2d-domain-decomposition
	152	!-- Transposition z --> x
	153	CALL cpu_log( log_point_s(5), 'transpo forward', 'start' )
[164]	154	CALL transpose_zx( ar, work, ar )
[1]	155	CALL cpu_log( log_point_s(5), 'transpo forward', 'pause' )
	156
	157	CALL cpu_log( log_point_s(4), 'fft_x', 'start' )
	158	CALL fftxp( ar, 'forward' )
	159	CALL cpu_log( log_point_s(4), 'fft_x', 'pause' )
	160
	161	!
	162	!-- Transposition x --> y
	163	CALL cpu_log( log_point_s(5), 'transpo forward', 'continue' )
[164]	164	CALL transpose_xy( ar, work, ar )
[1]	165	CALL cpu_log( log_point_s(5), 'transpo forward', 'pause' )
	166
	167	CALL cpu_log( log_point_s(7), 'fft_y', 'start' )
	168	CALL fftyp( ar, 'forward' )
	169	CALL cpu_log( log_point_s(7), 'fft_y', 'pause' )
	170
	171	!
	172	!-- Transposition y --> z
	173	CALL cpu_log( log_point_s(5), 'transpo forward', 'continue' )
[164]	174	CALL transpose_yz( ar, work, ar )
[1]	175	CALL cpu_log( log_point_s(5), 'transpo forward', 'stop' )
	176
	177	!
	178	!-- Solve the Poisson equation in z-direction in cartesian space.
	179	CALL cpu_log( log_point_s(6), 'tridia', 'start' )
	180	CALL tridia( ar )
	181	CALL cpu_log( log_point_s(6), 'tridia', 'stop' )
	182
	183	!
	184	!-- Inverse Fourier Transformation
	185	!-- Transposition z --> y
	186	CALL cpu_log( log_point_s(8), 'transpo invers', 'start' )
[164]	187	CALL transpose_zy( ar, work, ar )
[1]	188	CALL cpu_log( log_point_s(8), 'transpo invers', 'pause' )
	189
	190	CALL cpu_log( log_point_s(7), 'fft_y', 'continue' )
	191	CALL fftyp( ar, 'backward' )
	192	CALL cpu_log( log_point_s(7), 'fft_y', 'stop' )
	193
	194	!
	195	!-- Transposition y --> x
	196	CALL cpu_log( log_point_s(8), 'transpo invers', 'continue' )
[164]	197	CALL transpose_yx( ar, work, ar )
[1]	198	CALL cpu_log( log_point_s(8), 'transpo invers', 'pause' )
	199
	200	CALL cpu_log( log_point_s(4), 'fft_x', 'continue' )
	201	CALL fftxp( ar, 'backward' )
	202	CALL cpu_log( log_point_s(4), 'fft_x', 'stop' )
	203
	204	!
	205	!-- Transposition x --> z
	206	CALL cpu_log( log_point_s(8), 'transpo invers', 'continue' )
[164]	207	CALL transpose_xz( ar, work, ar )
[1]	208	CALL cpu_log( log_point_s(8), 'transpo invers', 'stop' )
	209
	210	ENDIF
	211
	212	#else
	213
	214	!
	215	!-- Two-dimensional Fourier Transformation along x- and y-direction.
	216	CALL cpu_log( log_point_s(4), 'fft_x', 'start' )
	217	CALL fftx( ar, 'forward' )
	218	CALL cpu_log( log_point_s(4), 'fft_x', 'pause' )
	219	CALL cpu_log( log_point_s(7), 'fft_y', 'start' )
	220	CALL ffty( ar, 'forward' )
	221	CALL cpu_log( log_point_s(7), 'fft_y', 'pause' )
	222
	223	!
	224	!-- Solve the Poisson equation in z-direction in cartesian space.
	225	CALL cpu_log( log_point_s(6), 'tridia', 'start' )
	226	CALL tridia( ar )
	227	CALL cpu_log( log_point_s(6), 'tridia', 'stop' )
	228
	229	!
	230	!-- Inverse Fourier Transformation.
	231	CALL cpu_log( log_point_s(7), 'fft_y', 'continue' )
	232	CALL ffty( ar, 'backward' )
	233	CALL cpu_log( log_point_s(7), 'fft_y', 'stop' )
	234	CALL cpu_log( log_point_s(4), 'fft_x', 'continue' )
	235	CALL fftx( ar, 'backward' )
	236	CALL cpu_log( log_point_s(4), 'fft_x', 'stop' )
	237
	238	#endif
	239
	240	CALL cpu_log( log_point_s(3), 'poisfft', 'stop' )
	241
	242	END SUBROUTINE poisfft
	243
	244
	245
	246	SUBROUTINE tridia( ar )
	247
	248	!------------------------------------------------------------------------------!
	249	! solves the linear system of equations:
	250	!
	251	! -(4 pi^2(i^2/(dx^2nnx^2)+j^2/(dy^2nny^2))+
	252	! 1/(dzu(k)dzw(k))+1/(dzu(k-1)dzw(k)))*p(i,j,k)+
	253	! 1/(dzu(k)dzw(k))p(i,j,k+1)+1/(dzu(k-1)dzw(k))p(i,j,k-1)=d(i,j,k)
	254	!
	255	! by using the Thomas algorithm
	256	!------------------------------------------------------------------------------!
	257
	258	USE arrays_3d
	259
	260	IMPLICIT NONE
	261
	262	INTEGER :: i, j, k, nnyh
	263
	264	REAL, DIMENSION(nxl_z:nxr_z,0:nz-1) :: ar1
	265	REAL, DIMENSION(5,nxl_z:nxr_z,0:nz-1) :: tri
	266
	267	#if defined( __parallel )
	268	REAL :: ar(nxl_z:nxr_za,nys_z:nyn_za,1:nza)
	269	#else
	270	REAL :: ar(1:nz,nys_z:nyn_z,nxl_z:nxr_z)
	271	#endif
	272
	273
	274	nnyh = (ny+1) / 2
	275
	276	!
	277	!-- Define constant elements of the tridiagonal matrix.
	278	DO k = 0, nz-1
	279	DO i = nxl_z, nxr_z
	280	tri(2,i,k) = ddzu(k+1) * ddzw(k+1)
	281	tri(3,i,k) = ddzu(k+2) * ddzw(k+1)
	282	ENDDO
	283	ENDDO
	284
	285	#if defined( __parallel )
	286	!
	287	!-- Repeat for all y-levels.
	288	DO j = nys_z, nyn_z
	289	IF ( j <= nnyh ) THEN
	290	CALL maketri( tri, j )
	291	ELSE
	292	CALL maketri( tri, ny+1-j )
	293	ENDIF
	294	CALL split( tri )
	295	CALL substi( ar, ar1, tri, j )
	296	ENDDO
	297	#else
	298	!
	299	!-- First y-level.
	300	CALL maketri( tri, nys_z )
	301	CALL split( tri )
	302	CALL substi( ar, ar1, tri, 0 )
	303
	304	!
	305	!-- Further y-levels.
	306	DO j = 1, nnyh - 1
	307	CALL maketri( tri, j )
	308	CALL split( tri )
	309	CALL substi( ar, ar1, tri, j )
	310	CALL substi( ar, ar1, tri, ny+1-j )
	311	ENDDO
	312	CALL maketri( tri, nnyh )
	313	CALL split( tri )
	314	CALL substi( ar, ar1, tri, nnyh+nys )
	315	#endif
	316
	317	CONTAINS
	318
	319	SUBROUTINE maketri( tri, j )
	320
	321	!------------------------------------------------------------------------------!
	322	! Computes the i- and j-dependent component of the matrix
	323	!------------------------------------------------------------------------------!
	324
	325	USE arrays_3d
	326	USE constants
	327	USE control_parameters
	328	USE grid_variables
	329
	330	IMPLICIT NONE
	331
	332	INTEGER :: i, j, k, nnxh
	333	REAL :: a, c
	334	REAL :: ll(nxl_z:nxr_z)
	335	REAL :: tri(5,nxl_z:nxr_z,0:nz-1)
	336
	337
	338	nnxh = ( nx + 1 ) / 2
	339
	340	!
	341	!-- Provide the tridiagonal matrix for solution of the Poisson equation in
	342	!-- Fourier space. The coefficients are computed following the method of
	343	!-- Schmidt et al. (DFVLR-Mitteilung 84-15), which departs from Stephan
	344	!-- Siano's original version by discretizing the Poisson equation,
	345	!-- before it is Fourier-transformed
	346	#if defined( __parallel )
	347	DO i = nxl_z, nxr_z
[128]	348	IF ( i >= 0 .AND. i <= nnxh ) THEN
[1]	349	ll(i) = 2.0 * ( 1.0 - COS( ( 2.0 * pi * i ) / &
	350	FLOAT( nx+1 ) ) ) / ( dx * dx ) + &
	351	2.0 * ( 1.0 - COS( ( 2.0 * pi * j ) / &
	352	FLOAT( ny+1 ) ) ) / ( dy * dy )
	353	ELSE
	354	ll(i) = 2.0 * ( 1.0 - COS( ( 2.0 * pi * ( nx+1-i ) ) / &
	355	FLOAT( nx+1 ) ) ) / ( dx * dx ) + &
	356	2.0 * ( 1.0 - COS( ( 2.0 * pi * j ) / &
	357	FLOAT( ny+1 ) ) ) / ( dy * dy )
	358	ENDIF
	359	DO k = 0,nz-1
	360	a = -1.0 * ddzu(k+2) * ddzw(k+1)
	361	c = -1.0 * ddzu(k+1) * ddzw(k+1)
	362	tri(1,i,k) = a + c - ll(i)
	363	ENDDO
	364	ENDDO
	365	#else
	366	DO i = 0, nnxh
	367	ll(i) = 2.0 * ( 1.0 - COS( ( 2.0 * pi * i ) / FLOAT( nx+1 ) ) ) / &
	368	( dx * dx ) + &
	369	2.0 * ( 1.0 - COS( ( 2.0 * pi * j ) / FLOAT( ny+1 ) ) ) / &
	370	( dy * dy )
	371	DO k = 0, nz-1
	372	a = -1.0 * ddzu(k+2) * ddzw(k+1)
	373	c = -1.0 * ddzu(k+1) * ddzw(k+1)
	374	tri(1,i,k) = a + c - ll(i)
	375	IF ( i >= 1 .and. i < nnxh ) THEN
	376	tri(1,nx+1-i,k) = tri(1,i,k)
	377	ENDIF
	378	ENDDO
	379	ENDDO
	380	#endif
	381	IF ( ibc_p_b == 1 .OR. ibc_p_b == 2 ) THEN
	382	DO i = nxl_z, nxr_z
	383	tri(1,i,0) = tri(1,i,0) + tri(2,i,0)
	384	ENDDO
	385	ENDIF
	386	IF ( ibc_p_t == 1 ) THEN
	387	DO i = nxl_z, nxr_z
	388	tri(1,i,nz-1) = tri(1,i,nz-1) + tri(3,i,nz-1)
	389	ENDDO
	390	ENDIF
	391
	392	END SUBROUTINE maketri
	393
	394
	395	SUBROUTINE substi( ar, ar1, tri, j )
	396
	397	!------------------------------------------------------------------------------!
	398	! Substitution (Forward and Backward) (Thomas algorithm)
	399	!------------------------------------------------------------------------------!
	400
[76]	401	USE control_parameters
	402
[1]	403	IMPLICIT NONE
	404
	405	INTEGER :: i, j, k
	406	REAL :: ar1(nxl_z:nxr_z,0:nz-1)
	407	REAL :: tri(5,nxl_z:nxr_z,0:nz-1)
	408	#if defined( __parallel )
	409	REAL :: ar(nxl_z:nxr_za,nys_z:nyn_za,1:nza)
	410	#else
	411	REAL :: ar(1:nz,nys_z:nyn_z,nxl_z:nxr_z)
	412	#endif
	413
	414	!
	415	!-- Forward substitution.
	416	DO i = nxl_z, nxr_z
	417	#if defined( __parallel )
	418	ar1(i,0) = ar(i,j,1)
	419	#else
	420	ar1(i,0) = ar(1,j,i)
	421	#endif
	422	ENDDO
	423	DO k = 1, nz - 1
	424	DO i = nxl_z, nxr_z
	425	#if defined( __parallel )
	426	ar1(i,k) = ar(i,j,k+1) - tri(5,i,k) * ar1(i,k-1)
	427	#else
	428	ar1(i,k) = ar(k+1,j,i) - tri(5,i,k) * ar1(i,k-1)
	429	#endif
	430	ENDDO
	431	ENDDO
	432
	433	!
	434	!-- Backward substitution.
	435	DO i = nxl_z, nxr_z
	436	#if defined( __parallel )
	437	ar(i,j,nz) = ar1(i,nz-1) / tri(4,i,nz-1)
	438	#else
	439	ar(nz,j,i) = ar1(i,nz-1) / tri(4,i,nz-1)
	440	#endif
	441	ENDDO
	442	DO k = nz-2, 0, -1
	443	DO i = nxl_z, nxr_z
	444	#if defined( __parallel )
	445	ar(i,j,k+1) = ( ar1(i,k) - tri(3,i,k) * ar(i,j,k+2) ) &
	446	/ tri(4,i,k)
	447	#else
	448	ar(k+1,j,i) = ( ar1(i,k) - tri(3,i,k) * ar(k+2,j,i) ) &
	449	/ tri(4,i,k)
	450	#endif
	451	ENDDO
	452	ENDDO
	453
[76]	454	!
	455	!-- Indices i=0, j=0 correspond to horizontally averaged pressure.
	456	!-- The respective values of ar should be zero at all k-levels if
	457	!-- acceleration of horizontally averaged vertical velocity is zero.
	458	IF ( ibc_p_b == 1 .AND. ibc_p_t == 1 ) THEN
	459	IF ( j == 0 .AND. nxl_z == 0 ) THEN
	460	#if defined( __parallel )
	461	DO k = 1, nz
	462	ar(nxl_z,j,k) = 0.0
	463	ENDDO
	464	#else
	465	DO k = 1, nz
	466	ar(k,j,nxl_z) = 0.0
	467	ENDDO
	468	#endif
	469	ENDIF
	470	ENDIF
	471
[1]	472	END SUBROUTINE substi
	473
	474
	475	SUBROUTINE split( tri )
	476
	477	!------------------------------------------------------------------------------!
	478	! Splitting of the tridiagonal matrix (Thomas algorithm)
	479	!------------------------------------------------------------------------------!
	480
	481	IMPLICIT NONE
	482
	483	INTEGER :: i, k
	484	REAL :: tri(5,nxl_z:nxr_z,0:nz-1)
	485
	486	!
	487	!-- Splitting.
	488	DO i = nxl_z, nxr_z
	489	tri(4,i,0) = tri(1,i,0)
	490	ENDDO
	491	DO k = 1, nz-1
	492	DO i = nxl_z, nxr_z
	493	tri(5,i,k) = tri(2,i,k) / tri(4,i,k-1)
	494	tri(4,i,k) = tri(1,i,k) - tri(3,i,k-1) * tri(5,i,k)
	495	ENDDO
	496	ENDDO
	497
	498	END SUBROUTINE split
	499
	500	END SUBROUTINE tridia
	501
	502
	503	#if defined( __parallel )
	504	SUBROUTINE fftxp( ar, direction )
	505
	506	!------------------------------------------------------------------------------!
	507	! Fourier-transformation along x-direction Parallelized version
	508	!------------------------------------------------------------------------------!
	509
	510	IMPLICIT NONE
	511
	512	CHARACTER (LEN=*) :: direction
	513	INTEGER :: j, k
	514	REAL :: ar(0:nxa,nys_x:nyn_xa,nzb_x:nzt_xa)
	515
	516	!
	517	!-- Performing the fft with one of the methods implemented
	518	DO k = nzb_x, nzt_x
	519	DO j = nys_x, nyn_x
	520	CALL fft_x( ar(0:nx,j,k), direction )
	521	ENDDO
	522	ENDDO
	523
	524	END SUBROUTINE fftxp
	525
	526	#else
	527	SUBROUTINE fftx( ar, direction )
	528
	529	!------------------------------------------------------------------------------!
	530	! Fourier-transformation along x-direction Non parallel version
	531	!------------------------------------------------------------------------------!
	532
	533	IMPLICIT NONE
	534
	535	CHARACTER (LEN=*) :: direction
	536	INTEGER :: i, j, k
	537	REAL :: ar(1:nz,0:ny,0:nx)
	538
	539	!
	540	!-- Performing the fft with one of the methods implemented
	541	DO k = 1, nz
	542	DO j = 0, ny
	543	CALL fft_x( ar(k,j,0:nx), direction )
	544	ENDDO
	545	ENDDO
	546
	547	END SUBROUTINE fftx
	548	#endif
	549
	550
	551	#if defined( __parallel )
	552	SUBROUTINE fftyp( ar, direction )
	553
	554	!------------------------------------------------------------------------------!
	555	! Fourier-transformation along y-direction Parallelized version
	556	!------------------------------------------------------------------------------!
	557
	558	IMPLICIT NONE
	559
	560	CHARACTER (LEN=*) :: direction
	561	INTEGER :: i, k
	562	REAL :: ar(0:nya,nxl_y:nxr_ya,nzb_y:nzt_ya)
	563
	564	!
	565	!-- Performing the fft with one of the methods implemented
	566	DO k = nzb_y, nzt_y
	567	DO i = nxl_y, nxr_y
	568	CALL fft_y( ar(0:ny,i,k), direction )
	569	ENDDO
	570	ENDDO
	571
	572	END SUBROUTINE fftyp
	573
	574	#else
	575	SUBROUTINE ffty( ar, direction )
	576
	577	!------------------------------------------------------------------------------!
	578	! Fourier-transformation along y-direction Non parallel version
	579	!------------------------------------------------------------------------------!
	580
	581	IMPLICIT NONE
	582
	583	CHARACTER (LEN=*) :: direction
	584	INTEGER :: i, k
	585	REAL :: ar(1:nz,0:ny,0:nx)
	586
	587	!
	588	!-- Performing the fft with one of the methods implemented
	589	DO k = 1, nz
	590	DO i = 0, nx
	591	CALL fft_y( ar(k,0:ny,i), direction )
	592	ENDDO
	593	ENDDO
	594
	595	END SUBROUTINE ffty
	596	#endif
	597
	598	#if defined( __parallel )
	599	SUBROUTINE ffty_tr_yx( f_in, work, f_out )
	600
	601	!------------------------------------------------------------------------------!
	602	! Fourier-transformation along y with subsequent transposition y --> x for
	603	! a 1d-decomposition along x
	604	!
	605	! ATTENTION: The performance of this routine is much faster on the NEC-SX6,
	606	! if the first index of work_ffty_vec is odd. Otherwise
	607	! memory bank conflicts may occur (especially if the index is a
	608	! multiple of 128). That's why work_ffty_vec is dimensioned as
	609	! 0:ny+1.
	610	! Of course, this will not work if users are using an odd number
	611	! of gridpoints along y.
	612	!------------------------------------------------------------------------------!
	613
	614	USE control_parameters
	615	USE cpulog
	616	USE indices
	617	USE interfaces
	618	USE pegrid
	619	USE transpose_indices
	620
	621	IMPLICIT NONE
	622
	623	INTEGER :: i, iend, iouter, ir, j, k
	624	INTEGER, PARAMETER :: stridex = 4
	625
	626	REAL, DIMENSION(0:ny,stridex) :: work_ffty
	627	#if defined( __nec )
	628	REAL, DIMENSION(0:ny+1,1:nz,nxl:nxr) :: work_ffty_vec
	629	#endif
	630	REAL, DIMENSION(1:nza,0:nya,nxl:nxra) :: f_in
	631	REAL, DIMENSION(nnx,1:nza,nys_x:nyn_xa,pdims(1)) :: f_out
	632	REAL, DIMENSION(nxl:nxra,1:nza,0:nya) :: work
	633
	634	!
	635	!-- Carry out the FFT along y, where all data are present due to the
	636	!-- 1d-decomposition along x. Resort the data in a way that x becomes
	637	!-- the first index.
	638	CALL cpu_log( log_point_s(7), 'fft_y', 'start' )
	639
	640	IF ( host(1:3) == 'nec' ) THEN
	641	#if defined( __nec )
	642	!
	643	!-- Code optimized for vector processors
[85]	644	!$OMP PARALLEL PRIVATE ( i, j, k )
[1]	645	!$OMP DO
	646	DO i = nxl, nxr
	647
	648	DO j = 0, ny
	649	DO k = 1, nz
	650	work_ffty_vec(j,k,i) = f_in(k,j,i)
	651	ENDDO
	652	ENDDO
	653
	654	CALL fft_y_m( work_ffty_vec(:,:,i), ny+1, 'forward' )
	655
	656	ENDDO
	657
	658	!$OMP DO
	659	DO k = 1, nz
	660	DO j = 0, ny
	661	DO i = nxl, nxr
	662	work(i,k,j) = work_ffty_vec(j,k,i)
	663	ENDDO
	664	ENDDO
	665	ENDDO
	666	!$OMP END PARALLEL
	667	#endif
	668
	669	ELSE
	670
	671	!
	672	!-- Cache optimized code.
	673	!-- The i-(x-)direction is split into a strided outer loop and an inner
	674	!-- loop for better cache performance
	675	!$OMP PARALLEL PRIVATE (i,iend,iouter,ir,j,k,work_ffty)
	676	!$OMP DO
	677	DO iouter = nxl, nxr, stridex
	678
	679	iend = MIN( iouter+stridex-1, nxr ) ! Upper bound for inner i loop
	680
	681	DO k = 1, nz
	682
	683	DO i = iouter, iend
	684
	685	ir = i-iouter+1 ! counter within a stride
	686	DO j = 0, ny
	687	work_ffty(j,ir) = f_in(k,j,i)
	688	ENDDO
	689	!
	690	!-- FFT along y
	691	CALL fft_y( work_ffty(:,ir), 'forward' )
	692
	693	ENDDO
	694
	695	!
	696	!-- Resort
	697	DO j = 0, ny
	698	DO i = iouter, iend
	699	work(i,k,j) = work_ffty(j,i-iouter+1)
	700	ENDDO
	701	ENDDO
	702
	703	ENDDO
	704
	705	ENDDO
	706	!$OMP END PARALLEL
	707
	708	ENDIF
	709	CALL cpu_log( log_point_s(7), 'fft_y', 'pause' )
	710
	711	!
	712	!-- Transpose array
	713	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
	714	CALL MPI_ALLTOALL( work(nxl,1,0), sendrecvcount_xy, MPI_REAL, &
	715	f_out(1,1,nys_x,1), sendrecvcount_xy, MPI_REAL, &
	716	comm1dx, ierr )
	717	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
	718
	719	END SUBROUTINE ffty_tr_yx
	720
	721
	722	SUBROUTINE tr_xy_ffty( f_in, work, f_out )
	723
	724	!------------------------------------------------------------------------------!
	725	! Transposition x --> y with a subsequent backward Fourier transformation for
	726	! a 1d-decomposition along x
	727	!------------------------------------------------------------------------------!
	728
	729	USE control_parameters
	730	USE cpulog
	731	USE indices
	732	USE interfaces
	733	USE pegrid
	734	USE transpose_indices
	735
	736	IMPLICIT NONE
	737
	738	INTEGER :: i, iend, iouter, ir, j, k
	739	INTEGER, PARAMETER :: stridex = 4
	740
	741	REAL, DIMENSION(0:ny,stridex) :: work_ffty
	742	#if defined( __nec )
	743	REAL, DIMENSION(0:ny+1,1:nz,nxl:nxr) :: work_ffty_vec
	744	#endif
	745	REAL, DIMENSION(nnx,1:nza,nys_x:nyn_xa,pdims(1)) :: f_in
	746	REAL, DIMENSION(1:nza,0:nya,nxl:nxra) :: f_out
	747	REAL, DIMENSION(nxl:nxra,1:nza,0:nya) :: work
	748
	749	!
	750	!-- Transpose array
	751	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
	752	CALL MPI_ALLTOALL( f_in(1,1,nys_x,1), sendrecvcount_xy, MPI_REAL, &
	753	work(nxl,1,0), sendrecvcount_xy, MPI_REAL, &
	754	comm1dx, ierr )
	755	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
	756
	757	!
	758	!-- Resort the data in a way that y becomes the first index and carry out the
	759	!-- backward fft along y.
	760	CALL cpu_log( log_point_s(7), 'fft_y', 'continue' )
	761
	762	IF ( host(1:3) == 'nec' ) THEN
	763	#if defined( __nec )
	764	!
	765	!-- Code optimized for vector processors
[85]	766	!$OMP PARALLEL PRIVATE ( i, j, k )
[1]	767	!$OMP DO
	768	DO k = 1, nz
	769	DO j = 0, ny
	770	DO i = nxl, nxr
	771	work_ffty_vec(j,k,i) = work(i,k,j)
	772	ENDDO
	773	ENDDO
	774	ENDDO
	775
	776	!$OMP DO
	777	DO i = nxl, nxr
	778
	779	CALL fft_y_m( work_ffty_vec(:,:,i), ny+1, 'backward' )
	780
	781	DO j = 0, ny
	782	DO k = 1, nz
	783	f_out(k,j,i) = work_ffty_vec(j,k,i)
	784	ENDDO
	785	ENDDO
	786
	787	ENDDO
	788	!$OMP END PARALLEL
	789	#endif
	790
	791	ELSE
	792
	793	!
	794	!-- Cache optimized code.
	795	!-- The i-(x-)direction is split into a strided outer loop and an inner
	796	!-- loop for better cache performance
	797	!$OMP PARALLEL PRIVATE ( i, iend, iouter, ir, j, k, work_ffty )
	798	!$OMP DO
	799	DO iouter = nxl, nxr, stridex
	800
	801	iend = MIN( iouter+stridex-1, nxr ) ! Upper bound for inner i loop
	802
	803	DO k = 1, nz
	804	!
	805	!-- Resort
	806	DO j = 0, ny
	807	DO i = iouter, iend
	808	work_ffty(j,i-iouter+1) = work(i,k,j)
	809	ENDDO
	810	ENDDO
	811
	812	DO i = iouter, iend
	813
	814	!
	815	!-- FFT along y
	816	ir = i-iouter+1 ! counter within a stride
	817	CALL fft_y( work_ffty(:,ir), 'backward' )
	818
	819	DO j = 0, ny
	820	f_out(k,j,i) = work_ffty(j,ir)
	821	ENDDO
	822	ENDDO
	823
	824	ENDDO
	825
	826	ENDDO
	827	!$OMP END PARALLEL
	828
	829	ENDIF
	830
	831	CALL cpu_log( log_point_s(7), 'fft_y', 'stop' )
	832
	833	END SUBROUTINE tr_xy_ffty
	834
	835
	836	SUBROUTINE fftx_tri_fftx( ar )
	837
	838	!------------------------------------------------------------------------------!
	839	! FFT along x, solution of the tridiagonal system and backward FFT for
	840	! a 1d-decomposition along x
	841	!
	842	! WARNING: this subroutine may still not work for hybrid parallelization
	843	! with OpenMP (for possible necessary changes see the original
	844	! routine poisfft_hybrid, developed by Klaus Ketelsen, May 2002)
	845	!------------------------------------------------------------------------------!
	846
	847	USE control_parameters
	848	USE cpulog
	849	USE grid_variables
	850	USE indices
	851	USE interfaces
	852	USE pegrid
	853	USE transpose_indices
	854
	855	IMPLICIT NONE
	856
	857	character(len=3) :: myth_char
	858
	859	INTEGER :: i, j, k, m, n, omp_get_thread_num, tn
	860
	861	REAL, DIMENSION(0:nx) :: work_fftx
	862	REAL, DIMENSION(0:nx,1:nz) :: work_trix
	863	REAL, DIMENSION(nnx,1:nza,nys_x:nyn_xa,pdims(1)) :: ar
	864	REAL, DIMENSION(:,:,:,:), ALLOCATABLE :: tri
	865
	866
	867	CALL cpu_log( log_point_s(33), 'fft_x + tridia', 'start' )
	868
	869	ALLOCATE( tri(5,0:nx,0:nz-1,0:threads_per_task-1) )
	870
	871	tn = 0 ! Default thread number in case of one thread
	872	!$OMP PARALLEL DO PRIVATE ( i, j, k, m, n, tn, work_fftx, work_trix )
	873	DO j = nys_x, nyn_x
	874
	875	!$ tn = omp_get_thread_num()
	876
	877	IF ( host(1:3) == 'nec' ) THEN
	878	!
	879	!-- Code optimized for vector processors
	880	DO k = 1, nz
	881
	882	m = 0
	883	DO n = 1, pdims(1)
	884	DO i = 1, nnx_pe( n-1 ) ! WARN: pcoord(i) should be used!!
	885	work_trix(m,k) = ar(i,k,j,n)
	886	m = m + 1
	887	ENDDO
	888	ENDDO
	889
	890	ENDDO
	891
	892	CALL fft_x_m( work_trix, 'forward' )
	893
	894	ELSE
	895	!
	896	!-- Cache optimized code
	897	DO k = 1, nz
	898
	899	m = 0
	900	DO n = 1, pdims(1)
	901	DO i = 1, nnx_pe( n-1 ) ! WARN: pcoord(i) should be used!!
	902	work_fftx(m) = ar(i,k,j,n)
	903	m = m + 1
	904	ENDDO
	905	ENDDO
	906
	907	CALL fft_x( work_fftx, 'forward' )
	908
	909	DO i = 0, nx
	910	work_trix(i,k) = work_fftx(i)
	911	ENDDO
	912
	913	ENDDO
	914
	915	ENDIF
	916
	917	!
	918	!-- Solve the linear equation system
	919	CALL tridia_1dd( ddx2, ddy2, nx, ny, j, work_trix, tri(:,:,:,tn) )
	920
	921	IF ( host(1:3) == 'nec' ) THEN
	922	!
	923	!-- Code optimized for vector processors
	924	CALL fft_x_m( work_trix, 'backward' )
	925
	926	DO k = 1, nz
	927
	928	m = 0
	929	DO n = 1, pdims(1)
	930	DO i = 1, nnx_pe( n-1 ) ! WARN: pcoord(i) should be used!!
	931	ar(i,k,j,n) = work_trix(m,k)
	932	m = m + 1
	933	ENDDO
	934	ENDDO
	935
	936	ENDDO
	937
	938	ELSE
	939	!
	940	!-- Cache optimized code
	941	DO k = 1, nz
	942
	943	DO i = 0, nx
	944	work_fftx(i) = work_trix(i,k)
	945	ENDDO
	946
	947	CALL fft_x( work_fftx, 'backward' )
	948
	949	m = 0
	950	DO n = 1, pdims(1)
	951	DO i = 1, nnx_pe( n-1 ) ! WARN: pcoord(i) should be used!!
	952	ar(i,k,j,n) = work_fftx(m)
	953	m = m + 1
	954	ENDDO
	955	ENDDO
	956
	957	ENDDO
	958
	959	ENDIF
	960
	961	ENDDO
	962
	963	DEALLOCATE( tri )
	964
	965	CALL cpu_log( log_point_s(33), 'fft_x + tridia', 'stop' )
	966
	967	END SUBROUTINE fftx_tri_fftx
	968
	969
	970	SUBROUTINE fftx_tr_xy( f_in, work, f_out )
	971
	972	!------------------------------------------------------------------------------!
	973	! Fourier-transformation along x with subsequent transposition x --> y for
	974	! a 1d-decomposition along y
	975	!
	976	! ATTENTION: The NEC-branch of this routine may significantly profit from
	977	! further optimizations. So far, performance is much worse than
	978	! for routine ffty_tr_yx (more than three times slower).
	979	!------------------------------------------------------------------------------!
	980
	981	USE control_parameters
	982	USE cpulog
	983	USE indices
	984	USE interfaces
	985	USE pegrid
	986	USE transpose_indices
	987
	988	IMPLICIT NONE
	989
	990	INTEGER :: i, j, k
	991
	992	REAL, DIMENSION(0:nx,1:nz,nys:nyn) :: work_fftx
	993	REAL, DIMENSION(1:nza,nys:nyna,0:nxa) :: f_in
	994	REAL, DIMENSION(nny,1:nza,nxl_y:nxr_ya,pdims(2)) :: f_out
	995	REAL, DIMENSION(nys:nyna,1:nza,0:nxa) :: work
	996
	997	!
	998	!-- Carry out the FFT along x, where all data are present due to the
	999	!-- 1d-decomposition along y. Resort the data in a way that y becomes
	1000	!-- the first index.
	1001	CALL cpu_log( log_point_s(4), 'fft_x', 'start' )
	1002
	1003	IF ( host(1:3) == 'nec' ) THEN
	1004	!
	1005	!-- Code for vector processors
[85]	1006	!$OMP PARALLEL PRIVATE ( i, j, k )
[1]	1007	!$OMP DO
	1008	DO i = 0, nx
	1009
	1010	DO j = nys, nyn
	1011	DO k = 1, nz
	1012	work_fftx(i,k,j) = f_in(k,j,i)
	1013	ENDDO
	1014	ENDDO
	1015
	1016	ENDDO
	1017
	1018	!$OMP DO
	1019	DO j = nys, nyn
	1020
	1021	CALL fft_x_m( work_fftx(:,:,j), 'forward' )
	1022
	1023	DO k = 1, nz
	1024	DO i = 0, nx
	1025	work(j,k,i) = work_fftx(i,k,j)
	1026	ENDDO
	1027	ENDDO
	1028
	1029	ENDDO
	1030	!$OMP END PARALLEL
	1031
	1032	ELSE
	1033
	1034	!
	1035	!-- Cache optimized code (there might be still a potential for better
	1036	!-- optimization).
	1037	!$OMP PARALLEL PRIVATE (i,j,k,work_fftx)
	1038	!$OMP DO
	1039	DO i = 0, nx
	1040
	1041	DO j = nys, nyn
	1042	DO k = 1, nz
	1043	work_fftx(i,k,j) = f_in(k,j,i)
	1044	ENDDO
	1045	ENDDO
	1046
	1047	ENDDO
	1048
	1049	!$OMP DO
	1050	DO j = nys, nyn
	1051	DO k = 1, nz
	1052
	1053	CALL fft_x( work_fftx(0:nx,k,j), 'forward' )
	1054
	1055	DO i = 0, nx
	1056	work(j,k,i) = work_fftx(i,k,j)
	1057	ENDDO
	1058	ENDDO
	1059
	1060	ENDDO
	1061	!$OMP END PARALLEL
	1062
	1063	ENDIF
	1064	CALL cpu_log( log_point_s(4), 'fft_x', 'pause' )
	1065
	1066	!
	1067	!-- Transpose array
	1068	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
	1069	CALL MPI_ALLTOALL( work(nys,1,0), sendrecvcount_xy, MPI_REAL, &
	1070	f_out(1,1,nxl_y,1), sendrecvcount_xy, MPI_REAL, &
	1071	comm1dy, ierr )
	1072	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
	1073
	1074	END SUBROUTINE fftx_tr_xy
	1075
	1076
	1077	SUBROUTINE tr_yx_fftx( f_in, work, f_out )
	1078
	1079	!------------------------------------------------------------------------------!
	1080	! Transposition y --> x with a subsequent backward Fourier transformation for
	1081	! a 1d-decomposition along x
	1082	!------------------------------------------------------------------------------!
	1083
	1084	USE control_parameters
	1085	USE cpulog
	1086	USE indices
	1087	USE interfaces
	1088	USE pegrid
	1089	USE transpose_indices
	1090
	1091	IMPLICIT NONE
	1092
	1093	INTEGER :: i, j, k
	1094
	1095	REAL, DIMENSION(0:nx,1:nz,nys:nyn) :: work_fftx
	1096	REAL, DIMENSION(nny,1:nza,nxl_y:nxr_ya,pdims(2)) :: f_in
	1097	REAL, DIMENSION(1:nza,nys:nyna,0:nxa) :: f_out
	1098	REAL, DIMENSION(nys:nyna,1:nza,0:nxa) :: work
	1099
	1100	!
	1101	!-- Transpose array
	1102	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
	1103	CALL MPI_ALLTOALL( f_in(1,1,nxl_y,1), sendrecvcount_xy, MPI_REAL, &
	1104	work(nys,1,0), sendrecvcount_xy, MPI_REAL, &
	1105	comm1dy, ierr )
	1106	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
	1107
	1108	!
	1109	!-- Carry out the FFT along x, where all data are present due to the
	1110	!-- 1d-decomposition along y. Resort the data in a way that y becomes
	1111	!-- the first index.
	1112	CALL cpu_log( log_point_s(4), 'fft_x', 'continue' )
	1113
	1114	IF ( host(1:3) == 'nec' ) THEN
	1115	!
	1116	!-- Code optimized for vector processors
[85]	1117	!$OMP PARALLEL PRIVATE ( i, j, k )
[1]	1118	!$OMP DO
	1119	DO j = nys, nyn
	1120
	1121	DO k = 1, nz
	1122	DO i = 0, nx
	1123	work_fftx(i,k,j) = work(j,k,i)
	1124	ENDDO
	1125	ENDDO
	1126
	1127	CALL fft_x_m( work_fftx(:,:,j), 'backward' )
	1128
	1129	ENDDO
	1130
	1131	!$OMP DO
	1132	DO i = 0, nx
	1133	DO j = nys, nyn
	1134	DO k = 1, nz
	1135	f_out(k,j,i) = work_fftx(i,k,j)
	1136	ENDDO
	1137	ENDDO
	1138	ENDDO
	1139	!$OMP END PARALLEL
	1140
	1141	ELSE
	1142
	1143	!
	1144	!-- Cache optimized code (there might be still a potential for better
	1145	!-- optimization).
	1146	!$OMP PARALLEL PRIVATE (i,j,k,work_fftx)
	1147	!$OMP DO
	1148	DO j = nys, nyn
	1149	DO k = 1, nz
	1150
	1151	DO i = 0, nx
	1152	work_fftx(i,k,j) = work(j,k,i)
	1153	ENDDO
	1154
	1155	CALL fft_x( work_fftx(0:nx,k,j), 'backward' )
	1156
	1157	ENDDO
	1158	ENDDO
	1159
	1160	!$OMP DO
	1161	DO i = 0, nx
	1162	DO j = nys, nyn
	1163	DO k = 1, nz
	1164	f_out(k,j,i) = work_fftx(i,k,j)
	1165	ENDDO
	1166	ENDDO
	1167	ENDDO
	1168	!$OMP END PARALLEL
	1169
	1170	ENDIF
	1171	CALL cpu_log( log_point_s(4), 'fft_x', 'stop' )
	1172
	1173	END SUBROUTINE tr_yx_fftx
	1174
	1175
	1176	SUBROUTINE ffty_tri_ffty( ar )
	1177
	1178	!------------------------------------------------------------------------------!
	1179	! FFT along y, solution of the tridiagonal system and backward FFT for
	1180	! a 1d-decomposition along y
	1181	!
	1182	! WARNING: this subroutine may still not work for hybrid parallelization
	1183	! with OpenMP (for possible necessary changes see the original
	1184	! routine poisfft_hybrid, developed by Klaus Ketelsen, May 2002)
	1185	!------------------------------------------------------------------------------!
	1186
	1187	USE control_parameters
	1188	USE cpulog
	1189	USE grid_variables
	1190	USE indices
	1191	USE interfaces
	1192	USE pegrid
	1193	USE transpose_indices
	1194
	1195	IMPLICIT NONE
	1196
	1197	INTEGER :: i, j, k, m, n, omp_get_thread_num, tn
	1198
	1199	REAL, DIMENSION(0:ny) :: work_ffty
	1200	REAL, DIMENSION(0:ny,1:nz) :: work_triy
	1201	REAL, DIMENSION(nny,1:nza,nxl_y:nxr_ya,pdims(2)) :: ar
	1202	REAL, DIMENSION(:,:,:,:), ALLOCATABLE :: tri
	1203
	1204
	1205	CALL cpu_log( log_point_s(39), 'fft_y + tridia', 'start' )
	1206
	1207	ALLOCATE( tri(5,0:ny,0:nz-1,0:threads_per_task-1) )
	1208
	1209	tn = 0 ! Default thread number in case of one thread
	1210	!$OMP PARALLEL PRIVATE ( i, j, k, m, n, tn, work_ffty, work_triy )
	1211	!$OMP DO
	1212	DO i = nxl_y, nxr_y
	1213
	1214	!$ tn = omp_get_thread_num()
	1215
	1216	IF ( host(1:3) == 'nec' ) THEN
	1217	!
	1218	!-- Code optimized for vector processors
	1219	DO k = 1, nz
	1220
	1221	m = 0
	1222	DO n = 1, pdims(2)
	1223	DO j = 1, nny_pe( n-1 ) ! WARN: pcoord(j) should be used!!
	1224	work_triy(m,k) = ar(j,k,i,n)
	1225	m = m + 1
	1226	ENDDO
	1227	ENDDO
	1228
	1229	ENDDO
	1230
	1231	CALL fft_y_m( work_triy, ny, 'forward' )
	1232
	1233	ELSE
	1234	!
	1235	!-- Cache optimized code
	1236	DO k = 1, nz
	1237
	1238	m = 0
	1239	DO n = 1, pdims(2)
	1240	DO j = 1, nny_pe( n-1 ) ! WARN: pcoord(j) should be used!!
	1241	work_ffty(m) = ar(j,k,i,n)
	1242	m = m + 1
	1243	ENDDO
	1244	ENDDO
	1245
	1246	CALL fft_y( work_ffty, 'forward' )
	1247
	1248	DO j = 0, ny
	1249	work_triy(j,k) = work_ffty(j)
	1250	ENDDO
	1251
	1252	ENDDO
	1253
	1254	ENDIF
	1255
	1256	!
	1257	!-- Solve the linear equation system
	1258	CALL tridia_1dd( ddy2, ddx2, ny, nx, i, work_triy, tri(:,:,:,tn) )
	1259
	1260	IF ( host(1:3) == 'nec' ) THEN
	1261	!
	1262	!-- Code optimized for vector processors
	1263	CALL fft_y_m( work_triy, ny, 'backward' )
	1264
	1265	DO k = 1, nz
	1266
	1267	m = 0
	1268	DO n = 1, pdims(2)
	1269	DO j = 1, nny_pe( n-1 ) ! WARN: pcoord(j) should be used!!
	1270	ar(j,k,i,n) = work_triy(m,k)
	1271	m = m + 1
	1272	ENDDO
	1273	ENDDO
	1274
	1275	ENDDO
	1276
	1277	ELSE
	1278	!
	1279	!-- Cache optimized code
	1280	DO k = 1, nz
	1281
	1282	DO j = 0, ny
	1283	work_ffty(j) = work_triy(j,k)
	1284	ENDDO
	1285
	1286	CALL fft_y( work_ffty, 'backward' )
	1287
	1288	m = 0
	1289	DO n = 1, pdims(2)
	1290	DO j = 1, nny_pe( n-1 ) ! WARN: pcoord(j) should be used!!
	1291	ar(j,k,i,n) = work_ffty(m)
	1292	m = m + 1
	1293	ENDDO
	1294	ENDDO
	1295
	1296	ENDDO
	1297
	1298	ENDIF
	1299
	1300	ENDDO
	1301	!$OMP END PARALLEL
	1302
	1303	DEALLOCATE( tri )
	1304
	1305	CALL cpu_log( log_point_s(39), 'fft_y + tridia', 'stop' )
	1306
	1307	END SUBROUTINE ffty_tri_ffty
	1308
	1309
	1310	SUBROUTINE tridia_1dd( ddx2, ddy2, nx, ny, j, ar, tri )
	1311
	1312	!------------------------------------------------------------------------------!
	1313	! Solves the linear system of equations for a 1d-decomposition along x (see
	1314	! tridia)
	1315	!
	1316	! Attention: when using the intel compiler, array tri must be passed as an
	1317	! argument to the contained subroutines. Otherwise addres faults
	1318	! will occur.
	1319	! On NEC, tri should not be passed (except for routine substi_1dd)
	1320	! because this causes very bad performance.
	1321	!------------------------------------------------------------------------------!
	1322
	1323	USE arrays_3d
	1324	USE control_parameters
	1325
	1326	USE pegrid
	1327
	1328	IMPLICIT NONE
	1329
	1330	INTEGER :: i, j, k, nnyh, nx, ny, omp_get_thread_num, tn
	1331
	1332	REAL :: ddx2, ddy2
	1333
	1334	REAL, DIMENSION(0:nx,1:nz) :: ar
	1335	REAL, DIMENSION(0:nx,0:nz-1) :: ar1
	1336	REAL, DIMENSION(5,0:nx,0:nz-1) :: tri
	1337
	1338
	1339	nnyh = ( ny + 1 ) / 2
	1340
	1341	!
	1342	!-- Define constant elements of the tridiagonal matrix.
	1343	!-- The compiler on SX6 does loop exchange. If 0:nx is a high power of 2,
	1344	!-- the exchanged loops create bank conflicts. The following directive
	1345	!-- prohibits loop exchange and the loops perform much better.
	1346	! tn = omp_get_thread_num()
	1347	! WRITE( 120+tn, * ) '+++ id=',myid,' nx=',nx,' thread=', omp_get_thread_num()
[82]	1348	! CALL local_flush( 120+tn )
[1]	1349	!CDIR NOLOOPCHG
	1350	DO k = 0, nz-1
	1351	DO i = 0,nx
	1352	tri(2,i,k) = ddzu(k+1) * ddzw(k+1)
	1353	tri(3,i,k) = ddzu(k+2) * ddzw(k+1)
	1354	ENDDO
	1355	ENDDO
	1356	! WRITE( 120+tn, * ) '+++ id=',myid,' end of first tridia loop thread=', omp_get_thread_num()
[82]	1357	! CALL local_flush( 120+tn )
[1]	1358
	1359	IF ( j <= nnyh ) THEN
	1360	#if defined( __lcmuk )
	1361	CALL maketri_1dd( j, tri )
	1362	#else
	1363	CALL maketri_1dd( j )
	1364	#endif
	1365	ELSE
	1366	#if defined( __lcmuk )
	1367	CALL maketri_1dd( ny+1-j, tri )
	1368	#else
	1369	CALL maketri_1dd( ny+1-j )
	1370	#endif
	1371	ENDIF
	1372	#if defined( __lcmuk )
	1373	CALL split_1dd( tri )
	1374	#else
	1375	CALL split_1dd
	1376	#endif
	1377	CALL substi_1dd( ar, tri )
	1378
	1379	CONTAINS
	1380
	1381	#if defined( __lcmuk )
	1382	SUBROUTINE maketri_1dd( j, tri )
	1383	#else
	1384	SUBROUTINE maketri_1dd( j )
	1385	#endif
	1386
	1387	!------------------------------------------------------------------------------!
	1388	! computes the i- and j-dependent component of the matrix
	1389	!------------------------------------------------------------------------------!
	1390
	1391	USE constants
	1392
	1393	IMPLICIT NONE
	1394
	1395	INTEGER :: i, j, k, nnxh
	1396	REAL :: a, c
	1397
	1398	REAL, DIMENSION(0:nx) :: l
	1399
	1400	#if defined( __lcmuk )
	1401	REAL, DIMENSION(5,0:nx,0:nz-1) :: tri
	1402	#endif
	1403
	1404
	1405	nnxh = ( nx + 1 ) / 2
	1406	!
	1407	!-- Provide the tridiagonal matrix for solution of the Poisson equation in
	1408	!-- Fourier space. The coefficients are computed following the method of
	1409	!-- Schmidt et al. (DFVLR-Mitteilung 84-15), which departs from Stephan
	1410	!-- Siano's original version by discretizing the Poisson equation,
	1411	!-- before it is Fourier-transformed
	1412	DO i = 0, nx
[128]	1413	IF ( i >= 0 .AND. i <= nnxh ) THEN
[1]	1414	l(i) = 2.0 * ( 1.0 - COS( ( 2.0 * pi * i ) / &
	1415	FLOAT( nx+1 ) ) ) * ddx2 + &
	1416	2.0 * ( 1.0 - COS( ( 2.0 * pi * j ) / &
	1417	FLOAT( ny+1 ) ) ) * ddy2
	1418	ELSE
	1419	l(i) = 2.0 * ( 1.0 - COS( ( 2.0 * pi * ( nx+1-i ) ) / &
	1420	FLOAT( nx+1 ) ) ) * ddx2 + &
	1421	2.0 * ( 1.0 - COS( ( 2.0 * pi * j ) / &
	1422	FLOAT( ny+1 ) ) ) * ddy2
	1423	ENDIF
	1424	ENDDO
	1425
	1426	DO k = 0, nz-1
	1427	DO i = 0, nx
	1428	a = -1.0 * ddzu(k+2) * ddzw(k+1)
	1429	c = -1.0 * ddzu(k+1) * ddzw(k+1)
	1430	tri(1,i,k) = a + c - l(i)
	1431	ENDDO
	1432	ENDDO
	1433	IF ( ibc_p_b == 1 .OR. ibc_p_b == 2 ) THEN
	1434	DO i = 0, nx
	1435	tri(1,i,0) = tri(1,i,0) + tri(2,i,0)
	1436	ENDDO
	1437	ENDIF
	1438	IF ( ibc_p_t == 1 ) THEN
	1439	DO i = 0, nx
	1440	tri(1,i,nz-1) = tri(1,i,nz-1) + tri(3,i,nz-1)
	1441	ENDDO
	1442	ENDIF
	1443
	1444	END SUBROUTINE maketri_1dd
	1445
	1446
	1447	#if defined( __lcmuk )
	1448	SUBROUTINE split_1dd( tri )
	1449	#else
	1450	SUBROUTINE split_1dd
	1451	#endif
	1452
	1453	!------------------------------------------------------------------------------!
	1454	! Splitting of the tridiagonal matrix (Thomas algorithm)
	1455	!------------------------------------------------------------------------------!
	1456
	1457	IMPLICIT NONE
	1458
	1459	INTEGER :: i, k
	1460
	1461	#if defined( __lcmuk )
	1462	REAL, DIMENSION(5,0:nx,0:nz-1) :: tri
	1463	#endif
	1464
	1465
	1466	!
	1467	!-- Splitting
	1468	DO i = 0, nx
	1469	tri(4,i,0) = tri(1,i,0)
	1470	ENDDO
	1471	DO k = 1, nz-1
	1472	DO i = 0, nx
	1473	tri(5,i,k) = tri(2,i,k) / tri(4,i,k-1)
	1474	tri(4,i,k) = tri(1,i,k) - tri(3,i,k-1) * tri(5,i,k)
	1475	ENDDO
	1476	ENDDO
	1477
	1478	END SUBROUTINE split_1dd
	1479
	1480
	1481	SUBROUTINE substi_1dd( ar, tri )
	1482
	1483	!------------------------------------------------------------------------------!
	1484	! Substitution (Forward and Backward) (Thomas algorithm)
	1485	!------------------------------------------------------------------------------!
	1486
	1487	IMPLICIT NONE
	1488
[76]	1489	INTEGER :: i, k
[1]	1490
	1491	REAL, DIMENSION(0:nx,nz) :: ar
	1492	REAL, DIMENSION(0:nx,0:nz-1) :: ar1
	1493	REAL, DIMENSION(5,0:nx,0:nz-1) :: tri
	1494
	1495	!
	1496	!-- Forward substitution
	1497	DO i = 0, nx
	1498	ar1(i,0) = ar(i,1)
	1499	ENDDO
	1500	DO k = 1, nz-1
	1501	DO i = 0, nx
	1502	ar1(i,k) = ar(i,k+1) - tri(5,i,k) * ar1(i,k-1)
	1503	ENDDO
	1504	ENDDO
	1505
	1506	!
	1507	!-- Backward substitution
	1508	DO i = 0, nx
	1509	ar(i,nz) = ar1(i,nz-1) / tri(4,i,nz-1)
	1510	ENDDO
	1511	DO k = nz-2, 0, -1
	1512	DO i = 0, nx
	1513	ar(i,k+1) = ( ar1(i,k) - tri(3,i,k) * ar(i,k+2) ) &
	1514	/ tri(4,i,k)
	1515	ENDDO
	1516	ENDDO
	1517
[76]	1518	!
	1519	!-- Indices i=0, j=0 correspond to horizontally averaged pressure.
	1520	!-- The respective values of ar should be zero at all k-levels if
	1521	!-- acceleration of horizontally averaged vertical velocity is zero.
	1522	IF ( ibc_p_b == 1 .AND. ibc_p_t == 1 ) THEN
	1523	IF ( j == 0 ) THEN
	1524	DO k = 1, nz
	1525	ar(0,k) = 0.0
	1526	ENDDO
	1527	ENDIF
	1528	ENDIF
	1529
[1]	1530	END SUBROUTINE substi_1dd
	1531
	1532	END SUBROUTINE tridia_1dd
	1533
	1534	#endif
	1535
	1536	END MODULE poisfft_mod

Note: See TracBrowser for help on using the repository browser.

Download in other formats:

| Impressum | ©Leibniz Universität Hannover |