Home

Context Navigation

source: palm/trunk/SOURCE/poisfft.f90 @ 690

Last change on this file since 690 was 684, checked in by raasch, 14 years ago
last commit documented
Property svn:keywords set to `Id`
File size: 45.5 KB

Rev	Line
[1]	1	MODULE poisfft_mod
	2
	3	!------------------------------------------------------------------------------!
[484]	4	! Current revisions:
[1]	5	! -----------------
[684]	6	!
[1]	7	!
	8	! Former revisions:
	9	! -----------------
[3]	10	! $Id: poisfft.f90 684 2011-02-09 14:49:31Z gryschka $
[77]	11	!
[684]	12	! 683 2011-02-09 14:25:15Z raasch
	13	! openMP parallelization for 2d-domain-decomposition
	14	!
[668]	15	! 667 2010-12-23 12:06:00Z suehring/gryschka
	16	! ddzu replaced by ddzu_pres due to changes in zu(0)
	17	!
[623]	18	! 622 2010-12-10 08:08:13Z raasch
	19	! optional barriers included in order to speed up collective operations
	20	!
[392]	21	! 377 2009-09-04 11:09:00Z raasch
	22	! __lcmuk changed to __lc to avoid problems with Intel compiler on sgi-ice
	23	!
[198]	24	! 164 2008-05-15 08:46:15Z raasch
	25	! Arguments removed from transpose routines
	26	!
[139]	27	! 128 2007-10-26 13:11:14Z raasch
	28	! Bugfix: wavenumber calculation for even nx in routines maketri
	29	!
[90]	30	! 85 2007-05-11 09:35:14Z raasch
	31	! Bugfix: work_fft*_vec removed from some PRIVATE-declarations
	32	!
[77]	33	! 76 2007-03-29 00:58:32Z raasch
	34	! Tridiagonal coefficients adjusted for Neumann boundary conditions both at
	35	! the bottom and the top.
	36	!
[3]	37	! RCS Log replace by Id keyword, revision history cleaned up
	38	!
[1]	39	! Revision 1.24 2006/08/04 15:00:24 raasch
	40	! Default setting of the thread number tn in case of not using OpenMP
	41	!
	42	! Revision 1.23 2006/02/23 12:48:38 raasch
	43	! Additional compiler directive in routine tridia_1dd for preventing loop
	44	! exchange on NEC-SX6
	45	!
	46	! Revision 1.20 2004/04/30 12:38:09 raasch
	47	! Parts of former poisfft_hybrid moved to this subroutine,
	48	! former subroutine changed to a module, renaming of FFT-subroutines and
	49	! -module, FFTs completely substituted by calls of fft_x and fft_y,
	50	! NAG fft used in the non-parallel case completely removed, l in maketri
	51	! is now a 1d-array, variables passed by modules instead of using parameter
	52	! lists, enlarged transposition arrays introduced
	53	!
	54	! Revision 1.1 1997/07/24 11:24:14 raasch
	55	! Initial revision
	56	!
	57	!
	58	! Description:
	59	! ------------
	60	! See below.
	61	!------------------------------------------------------------------------------!
	62
	63	!--------------------------------------------------------------------------!
	64	! poisfft !
	65	! !
	66	! Original version: Stephan Siano (pois3d) !
	67	! !
	68	! Institute of Meteorology and Climatology, University of Hannover !
	69	! Germany !
	70	! !
	71	! Version as of July 23,1996 !
	72	! !
	73	! !
	74	! Version for parallel computers: Siegfried Raasch !
	75	! !
	76	! Version as of July 03,1997 !
	77	! !
	78	! Solves the Poisson equation with a 2D spectral method !
	79	! d^2 p / dx^2 + d^2 p / dy^2 + d^2 p / dz^2 = s !
	80	! !
	81	! Input: !
	82	! real ar contains in the (nnx,nny,nnz) elements, !
	83	! starting from the element (1,nys,nxl), the !
	84	! values for s !
	85	! real work Temporary array !
	86	! !
	87	! Output: !
	88	! real ar contains the solution for p !
	89	!--------------------------------------------------------------------------!
	90
	91	USE fft_xy
	92	USE indices
	93	USE transpose_indices
	94
	95	IMPLICIT NONE
	96
	97	PRIVATE
	98	PUBLIC poisfft, poisfft_init
	99
	100	INTERFACE poisfft
	101	MODULE PROCEDURE poisfft
	102	END INTERFACE poisfft
	103
	104	INTERFACE poisfft_init
	105	MODULE PROCEDURE poisfft_init
	106	END INTERFACE poisfft_init
	107
	108	CONTAINS
	109
	110	SUBROUTINE poisfft_init
	111
	112	CALL fft_init
	113
	114	END SUBROUTINE poisfft_init
	115
	116
	117	SUBROUTINE poisfft( ar, work )
	118
	119	USE cpulog
	120	USE interfaces
	121	USE pegrid
	122
	123	IMPLICIT NONE
	124
	125	REAL, DIMENSION(1:nza,nys:nyna,nxl:nxra) :: ar, work
	126
	127
	128	CALL cpu_log( log_point_s(3), 'poisfft', 'start' )
	129
	130	!
	131	!-- Two-dimensional Fourier Transformation in x- and y-direction.
	132	#if defined( __parallel )
	133	IF ( pdims(2) == 1 ) THEN
	134
	135	!
	136	!-- 1d-domain-decomposition along x:
	137	!-- FFT along y and transposition y --> x
	138	CALL ffty_tr_yx( ar, work, ar )
	139
	140	!
	141	!-- FFT along x, solving the tridiagonal system and backward FFT
	142	CALL fftx_tri_fftx( ar )
	143
	144	!
	145	!-- Transposition x --> y and backward FFT along y
	146	CALL tr_xy_ffty( ar, work, ar )
	147
	148	ELSEIF ( pdims(1) == 1 ) THEN
	149
	150	!
	151	!-- 1d-domain-decomposition along y:
	152	!-- FFT along x and transposition x --> y
	153	CALL fftx_tr_xy( ar, work, ar )
	154
	155	!
	156	!-- FFT along y, solving the tridiagonal system and backward FFT
	157	CALL ffty_tri_ffty( ar )
	158
	159	!
	160	!-- Transposition y --> x and backward FFT along x
	161	CALL tr_yx_fftx( ar, work, ar )
	162
	163	ELSE
	164
	165	!
	166	!-- 2d-domain-decomposition
	167	!-- Transposition z --> x
	168	CALL cpu_log( log_point_s(5), 'transpo forward', 'start' )
[164]	169	CALL transpose_zx( ar, work, ar )
[1]	170	CALL cpu_log( log_point_s(5), 'transpo forward', 'pause' )
	171
	172	CALL cpu_log( log_point_s(4), 'fft_x', 'start' )
	173	CALL fftxp( ar, 'forward' )
	174	CALL cpu_log( log_point_s(4), 'fft_x', 'pause' )
	175
	176	!
	177	!-- Transposition x --> y
	178	CALL cpu_log( log_point_s(5), 'transpo forward', 'continue' )
[164]	179	CALL transpose_xy( ar, work, ar )
[1]	180	CALL cpu_log( log_point_s(5), 'transpo forward', 'pause' )
	181
	182	CALL cpu_log( log_point_s(7), 'fft_y', 'start' )
	183	CALL fftyp( ar, 'forward' )
	184	CALL cpu_log( log_point_s(7), 'fft_y', 'pause' )
	185
	186	!
	187	!-- Transposition y --> z
	188	CALL cpu_log( log_point_s(5), 'transpo forward', 'continue' )
[164]	189	CALL transpose_yz( ar, work, ar )
[1]	190	CALL cpu_log( log_point_s(5), 'transpo forward', 'stop' )
	191
	192	!
	193	!-- Solve the Poisson equation in z-direction in cartesian space.
	194	CALL cpu_log( log_point_s(6), 'tridia', 'start' )
	195	CALL tridia( ar )
	196	CALL cpu_log( log_point_s(6), 'tridia', 'stop' )
	197
	198	!
	199	!-- Inverse Fourier Transformation
	200	!-- Transposition z --> y
	201	CALL cpu_log( log_point_s(8), 'transpo invers', 'start' )
[164]	202	CALL transpose_zy( ar, work, ar )
[1]	203	CALL cpu_log( log_point_s(8), 'transpo invers', 'pause' )
	204
	205	CALL cpu_log( log_point_s(7), 'fft_y', 'continue' )
	206	CALL fftyp( ar, 'backward' )
	207	CALL cpu_log( log_point_s(7), 'fft_y', 'stop' )
	208
	209	!
	210	!-- Transposition y --> x
	211	CALL cpu_log( log_point_s(8), 'transpo invers', 'continue' )
[164]	212	CALL transpose_yx( ar, work, ar )
[1]	213	CALL cpu_log( log_point_s(8), 'transpo invers', 'pause' )
	214
	215	CALL cpu_log( log_point_s(4), 'fft_x', 'continue' )
	216	CALL fftxp( ar, 'backward' )
	217	CALL cpu_log( log_point_s(4), 'fft_x', 'stop' )
	218
	219	!
	220	!-- Transposition x --> z
	221	CALL cpu_log( log_point_s(8), 'transpo invers', 'continue' )
[164]	222	CALL transpose_xz( ar, work, ar )
[1]	223	CALL cpu_log( log_point_s(8), 'transpo invers', 'stop' )
	224
	225	ENDIF
	226
	227	#else
	228
	229	!
	230	!-- Two-dimensional Fourier Transformation along x- and y-direction.
	231	CALL cpu_log( log_point_s(4), 'fft_x', 'start' )
	232	CALL fftx( ar, 'forward' )
	233	CALL cpu_log( log_point_s(4), 'fft_x', 'pause' )
	234	CALL cpu_log( log_point_s(7), 'fft_y', 'start' )
	235	CALL ffty( ar, 'forward' )
	236	CALL cpu_log( log_point_s(7), 'fft_y', 'pause' )
	237
	238	!
	239	!-- Solve the Poisson equation in z-direction in cartesian space.
	240	CALL cpu_log( log_point_s(6), 'tridia', 'start' )
	241	CALL tridia( ar )
	242	CALL cpu_log( log_point_s(6), 'tridia', 'stop' )
	243
	244	!
	245	!-- Inverse Fourier Transformation.
	246	CALL cpu_log( log_point_s(7), 'fft_y', 'continue' )
	247	CALL ffty( ar, 'backward' )
	248	CALL cpu_log( log_point_s(7), 'fft_y', 'stop' )
	249	CALL cpu_log( log_point_s(4), 'fft_x', 'continue' )
	250	CALL fftx( ar, 'backward' )
	251	CALL cpu_log( log_point_s(4), 'fft_x', 'stop' )
	252
	253	#endif
	254
	255	CALL cpu_log( log_point_s(3), 'poisfft', 'stop' )
	256
	257	END SUBROUTINE poisfft
	258
	259
	260
	261	SUBROUTINE tridia( ar )
	262
	263	!------------------------------------------------------------------------------!
	264	! solves the linear system of equations:
	265	!
	266	! -(4 pi^2(i^2/(dx^2nnx^2)+j^2/(dy^2nny^2))+
	267	! 1/(dzu(k)dzw(k))+1/(dzu(k-1)dzw(k)))*p(i,j,k)+
	268	! 1/(dzu(k)dzw(k))p(i,j,k+1)+1/(dzu(k-1)dzw(k))p(i,j,k-1)=d(i,j,k)
	269	!
	270	! by using the Thomas algorithm
	271	!------------------------------------------------------------------------------!
	272
	273	USE arrays_3d
	274
	275	IMPLICIT NONE
	276
	277	INTEGER :: i, j, k, nnyh
	278
	279	REAL, DIMENSION(nxl_z:nxr_z,0:nz-1) :: ar1
	280	REAL, DIMENSION(5,nxl_z:nxr_z,0:nz-1) :: tri
	281
	282	#if defined( __parallel )
	283	REAL :: ar(nxl_z:nxr_za,nys_z:nyn_za,1:nza)
	284	#else
	285	REAL :: ar(1:nz,nys_z:nyn_z,nxl_z:nxr_z)
	286	#endif
	287
	288
	289	nnyh = (ny+1) / 2
	290
	291	!
	292	!-- Define constant elements of the tridiagonal matrix.
[683]	293	!$OMP PARALLEL PRIVATE ( k, i )
	294	!$OMP DO
[1]	295	DO k = 0, nz-1
	296	DO i = nxl_z, nxr_z
[667]	297	tri(2,i,k) = ddzu_pres(k+1) * ddzw(k+1)
	298	tri(3,i,k) = ddzu_pres(k+2) * ddzw(k+1)
[1]	299	ENDDO
	300	ENDDO
[683]	301	!$OMP END PARALLEL
[1]	302
	303	#if defined( __parallel )
	304	!
	305	!-- Repeat for all y-levels.
[683]	306	!$OMP PARALLEL FIRSTPRIVATE( tri ) PRIVATE ( ar1, j )
	307	!$OMP DO
[1]	308	DO j = nys_z, nyn_z
	309	IF ( j <= nnyh ) THEN
	310	CALL maketri( tri, j )
	311	ELSE
	312	CALL maketri( tri, ny+1-j )
	313	ENDIF
	314	CALL split( tri )
	315	CALL substi( ar, ar1, tri, j )
	316	ENDDO
[683]	317	!$OMP END PARALLEL
[1]	318	#else
	319	!
	320	!-- First y-level.
	321	CALL maketri( tri, nys_z )
	322	CALL split( tri )
	323	CALL substi( ar, ar1, tri, 0 )
	324
	325	!
	326	!-- Further y-levels.
	327	DO j = 1, nnyh - 1
	328	CALL maketri( tri, j )
	329	CALL split( tri )
	330	CALL substi( ar, ar1, tri, j )
	331	CALL substi( ar, ar1, tri, ny+1-j )
	332	ENDDO
	333	CALL maketri( tri, nnyh )
	334	CALL split( tri )
	335	CALL substi( ar, ar1, tri, nnyh+nys )
	336	#endif
	337
	338	CONTAINS
	339
	340	SUBROUTINE maketri( tri, j )
	341
	342	!------------------------------------------------------------------------------!
	343	! Computes the i- and j-dependent component of the matrix
	344	!------------------------------------------------------------------------------!
	345
	346	USE arrays_3d
	347	USE constants
	348	USE control_parameters
	349	USE grid_variables
	350
	351	IMPLICIT NONE
	352
	353	INTEGER :: i, j, k, nnxh
	354	REAL :: a, c
	355	REAL :: ll(nxl_z:nxr_z)
	356	REAL :: tri(5,nxl_z:nxr_z,0:nz-1)
	357
	358
	359	nnxh = ( nx + 1 ) / 2
	360
	361	!
	362	!-- Provide the tridiagonal matrix for solution of the Poisson equation in
	363	!-- Fourier space. The coefficients are computed following the method of
	364	!-- Schmidt et al. (DFVLR-Mitteilung 84-15), which departs from Stephan
	365	!-- Siano's original version by discretizing the Poisson equation,
	366	!-- before it is Fourier-transformed
	367	#if defined( __parallel )
	368	DO i = nxl_z, nxr_z
[128]	369	IF ( i >= 0 .AND. i <= nnxh ) THEN
[1]	370	ll(i) = 2.0 * ( 1.0 - COS( ( 2.0 * pi * i ) / &
	371	FLOAT( nx+1 ) ) ) / ( dx * dx ) + &
	372	2.0 * ( 1.0 - COS( ( 2.0 * pi * j ) / &
	373	FLOAT( ny+1 ) ) ) / ( dy * dy )
	374	ELSE
	375	ll(i) = 2.0 * ( 1.0 - COS( ( 2.0 * pi * ( nx+1-i ) ) / &
	376	FLOAT( nx+1 ) ) ) / ( dx * dx ) + &
	377	2.0 * ( 1.0 - COS( ( 2.0 * pi * j ) / &
	378	FLOAT( ny+1 ) ) ) / ( dy * dy )
	379	ENDIF
	380	DO k = 0,nz-1
[667]	381	a = -1.0 * ddzu_pres(k+2) * ddzw(k+1)
	382	c = -1.0 * ddzu_pres(k+1) * ddzw(k+1)
[1]	383	tri(1,i,k) = a + c - ll(i)
	384	ENDDO
	385	ENDDO
	386	#else
	387	DO i = 0, nnxh
	388	ll(i) = 2.0 * ( 1.0 - COS( ( 2.0 * pi * i ) / FLOAT( nx+1 ) ) ) / &
	389	( dx * dx ) + &
	390	2.0 * ( 1.0 - COS( ( 2.0 * pi * j ) / FLOAT( ny+1 ) ) ) / &
	391	( dy * dy )
	392	DO k = 0, nz-1
[667]	393	a = -1.0 * ddzu_pres(k+2) * ddzw(k+1)
	394	c = -1.0 * ddzu_pres(k+1) * ddzw(k+1)
[1]	395	tri(1,i,k) = a + c - ll(i)
	396	IF ( i >= 1 .and. i < nnxh ) THEN
	397	tri(1,nx+1-i,k) = tri(1,i,k)
	398	ENDIF
	399	ENDDO
	400	ENDDO
	401	#endif
	402	IF ( ibc_p_b == 1 .OR. ibc_p_b == 2 ) THEN
	403	DO i = nxl_z, nxr_z
	404	tri(1,i,0) = tri(1,i,0) + tri(2,i,0)
	405	ENDDO
	406	ENDIF
	407	IF ( ibc_p_t == 1 ) THEN
	408	DO i = nxl_z, nxr_z
	409	tri(1,i,nz-1) = tri(1,i,nz-1) + tri(3,i,nz-1)
	410	ENDDO
	411	ENDIF
	412
	413	END SUBROUTINE maketri
	414
	415
	416	SUBROUTINE substi( ar, ar1, tri, j )
	417
	418	!------------------------------------------------------------------------------!
	419	! Substitution (Forward and Backward) (Thomas algorithm)
	420	!------------------------------------------------------------------------------!
	421
[76]	422	USE control_parameters
	423
[1]	424	IMPLICIT NONE
	425
	426	INTEGER :: i, j, k
	427	REAL :: ar1(nxl_z:nxr_z,0:nz-1)
	428	REAL :: tri(5,nxl_z:nxr_z,0:nz-1)
	429	#if defined( __parallel )
	430	REAL :: ar(nxl_z:nxr_za,nys_z:nyn_za,1:nza)
	431	#else
	432	REAL :: ar(1:nz,nys_z:nyn_z,nxl_z:nxr_z)
	433	#endif
	434
	435	!
	436	!-- Forward substitution.
	437	DO i = nxl_z, nxr_z
	438	#if defined( __parallel )
	439	ar1(i,0) = ar(i,j,1)
	440	#else
	441	ar1(i,0) = ar(1,j,i)
	442	#endif
	443	ENDDO
	444	DO k = 1, nz - 1
	445	DO i = nxl_z, nxr_z
	446	#if defined( __parallel )
	447	ar1(i,k) = ar(i,j,k+1) - tri(5,i,k) * ar1(i,k-1)
	448	#else
	449	ar1(i,k) = ar(k+1,j,i) - tri(5,i,k) * ar1(i,k-1)
	450	#endif
	451	ENDDO
	452	ENDDO
	453
	454	!
	455	!-- Backward substitution.
	456	DO i = nxl_z, nxr_z
	457	#if defined( __parallel )
	458	ar(i,j,nz) = ar1(i,nz-1) / tri(4,i,nz-1)
	459	#else
	460	ar(nz,j,i) = ar1(i,nz-1) / tri(4,i,nz-1)
	461	#endif
	462	ENDDO
	463	DO k = nz-2, 0, -1
	464	DO i = nxl_z, nxr_z
	465	#if defined( __parallel )
	466	ar(i,j,k+1) = ( ar1(i,k) - tri(3,i,k) * ar(i,j,k+2) ) &
	467	/ tri(4,i,k)
	468	#else
	469	ar(k+1,j,i) = ( ar1(i,k) - tri(3,i,k) * ar(k+2,j,i) ) &
	470	/ tri(4,i,k)
	471	#endif
	472	ENDDO
	473	ENDDO
	474
[76]	475	!
	476	!-- Indices i=0, j=0 correspond to horizontally averaged pressure.
	477	!-- The respective values of ar should be zero at all k-levels if
	478	!-- acceleration of horizontally averaged vertical velocity is zero.
	479	IF ( ibc_p_b == 1 .AND. ibc_p_t == 1 ) THEN
	480	IF ( j == 0 .AND. nxl_z == 0 ) THEN
	481	#if defined( __parallel )
	482	DO k = 1, nz
	483	ar(nxl_z,j,k) = 0.0
	484	ENDDO
	485	#else
	486	DO k = 1, nz
	487	ar(k,j,nxl_z) = 0.0
	488	ENDDO
	489	#endif
	490	ENDIF
	491	ENDIF
	492
[1]	493	END SUBROUTINE substi
	494
	495
	496	SUBROUTINE split( tri )
	497
	498	!------------------------------------------------------------------------------!
	499	! Splitting of the tridiagonal matrix (Thomas algorithm)
	500	!------------------------------------------------------------------------------!
	501
	502	IMPLICIT NONE
	503
	504	INTEGER :: i, k
	505	REAL :: tri(5,nxl_z:nxr_z,0:nz-1)
	506
	507	!
	508	!-- Splitting.
	509	DO i = nxl_z, nxr_z
	510	tri(4,i,0) = tri(1,i,0)
	511	ENDDO
	512	DO k = 1, nz-1
	513	DO i = nxl_z, nxr_z
	514	tri(5,i,k) = tri(2,i,k) / tri(4,i,k-1)
	515	tri(4,i,k) = tri(1,i,k) - tri(3,i,k-1) * tri(5,i,k)
	516	ENDDO
	517	ENDDO
	518
	519	END SUBROUTINE split
	520
	521	END SUBROUTINE tridia
	522
	523
	524	#if defined( __parallel )
	525	SUBROUTINE fftxp( ar, direction )
	526
	527	!------------------------------------------------------------------------------!
	528	! Fourier-transformation along x-direction Parallelized version
	529	!------------------------------------------------------------------------------!
	530
	531	IMPLICIT NONE
	532
	533	CHARACTER (LEN=*) :: direction
	534	INTEGER :: j, k
	535	REAL :: ar(0:nxa,nys_x:nyn_xa,nzb_x:nzt_xa)
	536
	537	!
	538	!-- Performing the fft with one of the methods implemented
[683]	539	!$OMP PARALLEL PRIVATE ( j, k )
	540	!$OMP DO
[1]	541	DO k = nzb_x, nzt_x
	542	DO j = nys_x, nyn_x
	543	CALL fft_x( ar(0:nx,j,k), direction )
	544	ENDDO
	545	ENDDO
[683]	546	!$OMP END PARALLEL
[1]	547
	548	END SUBROUTINE fftxp
	549
	550	#else
	551	SUBROUTINE fftx( ar, direction )
	552
	553	!------------------------------------------------------------------------------!
	554	! Fourier-transformation along x-direction Non parallel version
	555	!------------------------------------------------------------------------------!
	556
	557	IMPLICIT NONE
	558
	559	CHARACTER (LEN=*) :: direction
	560	INTEGER :: i, j, k
	561	REAL :: ar(1:nz,0:ny,0:nx)
	562
	563	!
	564	!-- Performing the fft with one of the methods implemented
[683]	565	!$OMP PARALLEL PRIVATE ( j, k )
	566	!$OMP DO
[1]	567	DO k = 1, nz
	568	DO j = 0, ny
	569	CALL fft_x( ar(k,j,0:nx), direction )
	570	ENDDO
	571	ENDDO
[683]	572	!$OMP END PARALLEL
[1]	573
	574	END SUBROUTINE fftx
	575	#endif
	576
	577
	578	#if defined( __parallel )
	579	SUBROUTINE fftyp( ar, direction )
	580
	581	!------------------------------------------------------------------------------!
	582	! Fourier-transformation along y-direction Parallelized version
	583	!------------------------------------------------------------------------------!
	584
	585	IMPLICIT NONE
	586
	587	CHARACTER (LEN=*) :: direction
	588	INTEGER :: i, k
	589	REAL :: ar(0:nya,nxl_y:nxr_ya,nzb_y:nzt_ya)
	590
	591	!
	592	!-- Performing the fft with one of the methods implemented
[683]	593	!$OMP PARALLEL PRIVATE ( i, k )
	594	!$OMP DO
[1]	595	DO k = nzb_y, nzt_y
	596	DO i = nxl_y, nxr_y
	597	CALL fft_y( ar(0:ny,i,k), direction )
	598	ENDDO
	599	ENDDO
[683]	600	!$OMP END PARALLEL
[1]	601
	602	END SUBROUTINE fftyp
	603
	604	#else
	605	SUBROUTINE ffty( ar, direction )
	606
	607	!------------------------------------------------------------------------------!
	608	! Fourier-transformation along y-direction Non parallel version
	609	!------------------------------------------------------------------------------!
	610
	611	IMPLICIT NONE
	612
	613	CHARACTER (LEN=*) :: direction
	614	INTEGER :: i, k
	615	REAL :: ar(1:nz,0:ny,0:nx)
	616
	617	!
	618	!-- Performing the fft with one of the methods implemented
[683]	619	!$OMP PARALLEL PRIVATE ( i, k )
	620	!$OMP DO
[1]	621	DO k = 1, nz
	622	DO i = 0, nx
	623	CALL fft_y( ar(k,0:ny,i), direction )
	624	ENDDO
	625	ENDDO
[683]	626	!$OMP END PARALLEL
[1]	627
	628	END SUBROUTINE ffty
	629	#endif
	630
	631	#if defined( __parallel )
	632	SUBROUTINE ffty_tr_yx( f_in, work, f_out )
	633
	634	!------------------------------------------------------------------------------!
	635	! Fourier-transformation along y with subsequent transposition y --> x for
	636	! a 1d-decomposition along x
	637	!
	638	! ATTENTION: The performance of this routine is much faster on the NEC-SX6,
	639	! if the first index of work_ffty_vec is odd. Otherwise
	640	! memory bank conflicts may occur (especially if the index is a
	641	! multiple of 128). That's why work_ffty_vec is dimensioned as
	642	! 0:ny+1.
	643	! Of course, this will not work if users are using an odd number
	644	! of gridpoints along y.
	645	!------------------------------------------------------------------------------!
	646
	647	USE control_parameters
	648	USE cpulog
	649	USE indices
	650	USE interfaces
	651	USE pegrid
	652	USE transpose_indices
	653
	654	IMPLICIT NONE
	655
	656	INTEGER :: i, iend, iouter, ir, j, k
	657	INTEGER, PARAMETER :: stridex = 4
	658
	659	REAL, DIMENSION(0:ny,stridex) :: work_ffty
	660	#if defined( __nec )
	661	REAL, DIMENSION(0:ny+1,1:nz,nxl:nxr) :: work_ffty_vec
	662	#endif
	663	REAL, DIMENSION(1:nza,0:nya,nxl:nxra) :: f_in
	664	REAL, DIMENSION(nnx,1:nza,nys_x:nyn_xa,pdims(1)) :: f_out
	665	REAL, DIMENSION(nxl:nxra,1:nza,0:nya) :: work
	666
	667	!
	668	!-- Carry out the FFT along y, where all data are present due to the
	669	!-- 1d-decomposition along x. Resort the data in a way that x becomes
	670	!-- the first index.
	671	CALL cpu_log( log_point_s(7), 'fft_y', 'start' )
	672
	673	IF ( host(1:3) == 'nec' ) THEN
	674	#if defined( __nec )
	675	!
	676	!-- Code optimized for vector processors
[85]	677	!$OMP PARALLEL PRIVATE ( i, j, k )
[1]	678	!$OMP DO
	679	DO i = nxl, nxr
	680
	681	DO j = 0, ny
	682	DO k = 1, nz
	683	work_ffty_vec(j,k,i) = f_in(k,j,i)
	684	ENDDO
	685	ENDDO
	686
	687	CALL fft_y_m( work_ffty_vec(:,:,i), ny+1, 'forward' )
	688
	689	ENDDO
	690
	691	!$OMP DO
	692	DO k = 1, nz
	693	DO j = 0, ny
	694	DO i = nxl, nxr
	695	work(i,k,j) = work_ffty_vec(j,k,i)
	696	ENDDO
	697	ENDDO
	698	ENDDO
	699	!$OMP END PARALLEL
	700	#endif
	701
	702	ELSE
	703
	704	!
	705	!-- Cache optimized code.
	706	!-- The i-(x-)direction is split into a strided outer loop and an inner
	707	!-- loop for better cache performance
	708	!$OMP PARALLEL PRIVATE (i,iend,iouter,ir,j,k,work_ffty)
	709	!$OMP DO
	710	DO iouter = nxl, nxr, stridex
	711
	712	iend = MIN( iouter+stridex-1, nxr ) ! Upper bound for inner i loop
	713
	714	DO k = 1, nz
	715
	716	DO i = iouter, iend
	717
	718	ir = i-iouter+1 ! counter within a stride
	719	DO j = 0, ny
	720	work_ffty(j,ir) = f_in(k,j,i)
	721	ENDDO
	722	!
	723	!-- FFT along y
	724	CALL fft_y( work_ffty(:,ir), 'forward' )
	725
	726	ENDDO
	727
	728	!
	729	!-- Resort
	730	DO j = 0, ny
	731	DO i = iouter, iend
	732	work(i,k,j) = work_ffty(j,i-iouter+1)
	733	ENDDO
	734	ENDDO
	735
	736	ENDDO
	737
	738	ENDDO
	739	!$OMP END PARALLEL
	740
	741	ENDIF
	742	CALL cpu_log( log_point_s(7), 'fft_y', 'pause' )
	743
	744	!
	745	!-- Transpose array
	746	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
[622]	747	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
[1]	748	CALL MPI_ALLTOALL( work(nxl,1,0), sendrecvcount_xy, MPI_REAL, &
	749	f_out(1,1,nys_x,1), sendrecvcount_xy, MPI_REAL, &
	750	comm1dx, ierr )
	751	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
	752
	753	END SUBROUTINE ffty_tr_yx
	754
	755
	756	SUBROUTINE tr_xy_ffty( f_in, work, f_out )
	757
	758	!------------------------------------------------------------------------------!
	759	! Transposition x --> y with a subsequent backward Fourier transformation for
	760	! a 1d-decomposition along x
	761	!------------------------------------------------------------------------------!
	762
	763	USE control_parameters
	764	USE cpulog
	765	USE indices
	766	USE interfaces
	767	USE pegrid
	768	USE transpose_indices
	769
	770	IMPLICIT NONE
	771
	772	INTEGER :: i, iend, iouter, ir, j, k
	773	INTEGER, PARAMETER :: stridex = 4
	774
	775	REAL, DIMENSION(0:ny,stridex) :: work_ffty
	776	#if defined( __nec )
	777	REAL, DIMENSION(0:ny+1,1:nz,nxl:nxr) :: work_ffty_vec
	778	#endif
	779	REAL, DIMENSION(nnx,1:nza,nys_x:nyn_xa,pdims(1)) :: f_in
	780	REAL, DIMENSION(1:nza,0:nya,nxl:nxra) :: f_out
	781	REAL, DIMENSION(nxl:nxra,1:nza,0:nya) :: work
	782
	783	!
	784	!-- Transpose array
	785	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
[622]	786	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
[1]	787	CALL MPI_ALLTOALL( f_in(1,1,nys_x,1), sendrecvcount_xy, MPI_REAL, &
	788	work(nxl,1,0), sendrecvcount_xy, MPI_REAL, &
	789	comm1dx, ierr )
	790	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
	791
	792	!
	793	!-- Resort the data in a way that y becomes the first index and carry out the
	794	!-- backward fft along y.
	795	CALL cpu_log( log_point_s(7), 'fft_y', 'continue' )
	796
	797	IF ( host(1:3) == 'nec' ) THEN
	798	#if defined( __nec )
	799	!
	800	!-- Code optimized for vector processors
[85]	801	!$OMP PARALLEL PRIVATE ( i, j, k )
[1]	802	!$OMP DO
	803	DO k = 1, nz
	804	DO j = 0, ny
	805	DO i = nxl, nxr
	806	work_ffty_vec(j,k,i) = work(i,k,j)
	807	ENDDO
	808	ENDDO
	809	ENDDO
	810
	811	!$OMP DO
	812	DO i = nxl, nxr
	813
	814	CALL fft_y_m( work_ffty_vec(:,:,i), ny+1, 'backward' )
	815
	816	DO j = 0, ny
	817	DO k = 1, nz
	818	f_out(k,j,i) = work_ffty_vec(j,k,i)
	819	ENDDO
	820	ENDDO
	821
	822	ENDDO
	823	!$OMP END PARALLEL
	824	#endif
	825
	826	ELSE
	827
	828	!
	829	!-- Cache optimized code.
	830	!-- The i-(x-)direction is split into a strided outer loop and an inner
	831	!-- loop for better cache performance
	832	!$OMP PARALLEL PRIVATE ( i, iend, iouter, ir, j, k, work_ffty )
	833	!$OMP DO
	834	DO iouter = nxl, nxr, stridex
	835
	836	iend = MIN( iouter+stridex-1, nxr ) ! Upper bound for inner i loop
	837
	838	DO k = 1, nz
	839	!
	840	!-- Resort
	841	DO j = 0, ny
	842	DO i = iouter, iend
	843	work_ffty(j,i-iouter+1) = work(i,k,j)
	844	ENDDO
	845	ENDDO
	846
	847	DO i = iouter, iend
	848
	849	!
	850	!-- FFT along y
	851	ir = i-iouter+1 ! counter within a stride
	852	CALL fft_y( work_ffty(:,ir), 'backward' )
	853
	854	DO j = 0, ny
	855	f_out(k,j,i) = work_ffty(j,ir)
	856	ENDDO
	857	ENDDO
	858
	859	ENDDO
	860
	861	ENDDO
	862	!$OMP END PARALLEL
	863
	864	ENDIF
	865
	866	CALL cpu_log( log_point_s(7), 'fft_y', 'stop' )
	867
	868	END SUBROUTINE tr_xy_ffty
	869
	870
	871	SUBROUTINE fftx_tri_fftx( ar )
	872
	873	!------------------------------------------------------------------------------!
	874	! FFT along x, solution of the tridiagonal system and backward FFT for
	875	! a 1d-decomposition along x
	876	!
	877	! WARNING: this subroutine may still not work for hybrid parallelization
	878	! with OpenMP (for possible necessary changes see the original
	879	! routine poisfft_hybrid, developed by Klaus Ketelsen, May 2002)
	880	!------------------------------------------------------------------------------!
	881
	882	USE control_parameters
	883	USE cpulog
	884	USE grid_variables
	885	USE indices
	886	USE interfaces
	887	USE pegrid
	888	USE transpose_indices
	889
	890	IMPLICIT NONE
	891
	892	character(len=3) :: myth_char
	893
	894	INTEGER :: i, j, k, m, n, omp_get_thread_num, tn
	895
	896	REAL, DIMENSION(0:nx) :: work_fftx
	897	REAL, DIMENSION(0:nx,1:nz) :: work_trix
	898	REAL, DIMENSION(nnx,1:nza,nys_x:nyn_xa,pdims(1)) :: ar
	899	REAL, DIMENSION(:,:,:,:), ALLOCATABLE :: tri
	900
	901
	902	CALL cpu_log( log_point_s(33), 'fft_x + tridia', 'start' )
	903
	904	ALLOCATE( tri(5,0:nx,0:nz-1,0:threads_per_task-1) )
	905
	906	tn = 0 ! Default thread number in case of one thread
	907	!$OMP PARALLEL DO PRIVATE ( i, j, k, m, n, tn, work_fftx, work_trix )
	908	DO j = nys_x, nyn_x
	909
	910	!$ tn = omp_get_thread_num()
	911
	912	IF ( host(1:3) == 'nec' ) THEN
	913	!
	914	!-- Code optimized for vector processors
	915	DO k = 1, nz
	916
	917	m = 0
	918	DO n = 1, pdims(1)
	919	DO i = 1, nnx_pe( n-1 ) ! WARN: pcoord(i) should be used!!
	920	work_trix(m,k) = ar(i,k,j,n)
	921	m = m + 1
	922	ENDDO
	923	ENDDO
	924
	925	ENDDO
	926
	927	CALL fft_x_m( work_trix, 'forward' )
	928
	929	ELSE
	930	!
	931	!-- Cache optimized code
	932	DO k = 1, nz
	933
	934	m = 0
	935	DO n = 1, pdims(1)
	936	DO i = 1, nnx_pe( n-1 ) ! WARN: pcoord(i) should be used!!
	937	work_fftx(m) = ar(i,k,j,n)
	938	m = m + 1
	939	ENDDO
	940	ENDDO
	941
	942	CALL fft_x( work_fftx, 'forward' )
	943
	944	DO i = 0, nx
	945	work_trix(i,k) = work_fftx(i)
	946	ENDDO
	947
	948	ENDDO
	949
	950	ENDIF
	951
	952	!
	953	!-- Solve the linear equation system
	954	CALL tridia_1dd( ddx2, ddy2, nx, ny, j, work_trix, tri(:,:,:,tn) )
	955
	956	IF ( host(1:3) == 'nec' ) THEN
	957	!
	958	!-- Code optimized for vector processors
	959	CALL fft_x_m( work_trix, 'backward' )
	960
	961	DO k = 1, nz
	962
	963	m = 0
	964	DO n = 1, pdims(1)
	965	DO i = 1, nnx_pe( n-1 ) ! WARN: pcoord(i) should be used!!
	966	ar(i,k,j,n) = work_trix(m,k)
	967	m = m + 1
	968	ENDDO
	969	ENDDO
	970
	971	ENDDO
	972
	973	ELSE
	974	!
	975	!-- Cache optimized code
	976	DO k = 1, nz
	977
	978	DO i = 0, nx
	979	work_fftx(i) = work_trix(i,k)
	980	ENDDO
	981
	982	CALL fft_x( work_fftx, 'backward' )
	983
	984	m = 0
	985	DO n = 1, pdims(1)
	986	DO i = 1, nnx_pe( n-1 ) ! WARN: pcoord(i) should be used!!
	987	ar(i,k,j,n) = work_fftx(m)
	988	m = m + 1
	989	ENDDO
	990	ENDDO
	991
	992	ENDDO
	993
	994	ENDIF
	995
	996	ENDDO
	997
	998	DEALLOCATE( tri )
	999
	1000	CALL cpu_log( log_point_s(33), 'fft_x + tridia', 'stop' )
	1001
	1002	END SUBROUTINE fftx_tri_fftx
	1003
	1004
	1005	SUBROUTINE fftx_tr_xy( f_in, work, f_out )
	1006
	1007	!------------------------------------------------------------------------------!
	1008	! Fourier-transformation along x with subsequent transposition x --> y for
	1009	! a 1d-decomposition along y
	1010	!
	1011	! ATTENTION: The NEC-branch of this routine may significantly profit from
	1012	! further optimizations. So far, performance is much worse than
	1013	! for routine ffty_tr_yx (more than three times slower).
	1014	!------------------------------------------------------------------------------!
	1015
	1016	USE control_parameters
	1017	USE cpulog
	1018	USE indices
	1019	USE interfaces
	1020	USE pegrid
	1021	USE transpose_indices
	1022
	1023	IMPLICIT NONE
	1024
	1025	INTEGER :: i, j, k
	1026
	1027	REAL, DIMENSION(0:nx,1:nz,nys:nyn) :: work_fftx
	1028	REAL, DIMENSION(1:nza,nys:nyna,0:nxa) :: f_in
	1029	REAL, DIMENSION(nny,1:nza,nxl_y:nxr_ya,pdims(2)) :: f_out
	1030	REAL, DIMENSION(nys:nyna,1:nza,0:nxa) :: work
	1031
	1032	!
	1033	!-- Carry out the FFT along x, where all data are present due to the
	1034	!-- 1d-decomposition along y. Resort the data in a way that y becomes
	1035	!-- the first index.
	1036	CALL cpu_log( log_point_s(4), 'fft_x', 'start' )
	1037
	1038	IF ( host(1:3) == 'nec' ) THEN
	1039	!
	1040	!-- Code for vector processors
[85]	1041	!$OMP PARALLEL PRIVATE ( i, j, k )
[1]	1042	!$OMP DO
	1043	DO i = 0, nx
	1044
	1045	DO j = nys, nyn
	1046	DO k = 1, nz
	1047	work_fftx(i,k,j) = f_in(k,j,i)
	1048	ENDDO
	1049	ENDDO
	1050
	1051	ENDDO
	1052
	1053	!$OMP DO
	1054	DO j = nys, nyn
	1055
	1056	CALL fft_x_m( work_fftx(:,:,j), 'forward' )
	1057
	1058	DO k = 1, nz
	1059	DO i = 0, nx
	1060	work(j,k,i) = work_fftx(i,k,j)
	1061	ENDDO
	1062	ENDDO
	1063
	1064	ENDDO
	1065	!$OMP END PARALLEL
	1066
	1067	ELSE
	1068
	1069	!
	1070	!-- Cache optimized code (there might be still a potential for better
	1071	!-- optimization).
	1072	!$OMP PARALLEL PRIVATE (i,j,k,work_fftx)
	1073	!$OMP DO
	1074	DO i = 0, nx
	1075
	1076	DO j = nys, nyn
	1077	DO k = 1, nz
	1078	work_fftx(i,k,j) = f_in(k,j,i)
	1079	ENDDO
	1080	ENDDO
	1081
	1082	ENDDO
	1083
	1084	!$OMP DO
	1085	DO j = nys, nyn
	1086	DO k = 1, nz
	1087
	1088	CALL fft_x( work_fftx(0:nx,k,j), 'forward' )
	1089
	1090	DO i = 0, nx
	1091	work(j,k,i) = work_fftx(i,k,j)
	1092	ENDDO
	1093	ENDDO
	1094
	1095	ENDDO
	1096	!$OMP END PARALLEL
	1097
	1098	ENDIF
	1099	CALL cpu_log( log_point_s(4), 'fft_x', 'pause' )
	1100
	1101	!
	1102	!-- Transpose array
	1103	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
[622]	1104	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
[1]	1105	CALL MPI_ALLTOALL( work(nys,1,0), sendrecvcount_xy, MPI_REAL, &
	1106	f_out(1,1,nxl_y,1), sendrecvcount_xy, MPI_REAL, &
	1107	comm1dy, ierr )
	1108	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
	1109
	1110	END SUBROUTINE fftx_tr_xy
	1111
	1112
	1113	SUBROUTINE tr_yx_fftx( f_in, work, f_out )
	1114
	1115	!------------------------------------------------------------------------------!
	1116	! Transposition y --> x with a subsequent backward Fourier transformation for
	1117	! a 1d-decomposition along x
	1118	!------------------------------------------------------------------------------!
	1119
	1120	USE control_parameters
	1121	USE cpulog
	1122	USE indices
	1123	USE interfaces
	1124	USE pegrid
	1125	USE transpose_indices
	1126
	1127	IMPLICIT NONE
	1128
	1129	INTEGER :: i, j, k
	1130
	1131	REAL, DIMENSION(0:nx,1:nz,nys:nyn) :: work_fftx
	1132	REAL, DIMENSION(nny,1:nza,nxl_y:nxr_ya,pdims(2)) :: f_in
	1133	REAL, DIMENSION(1:nza,nys:nyna,0:nxa) :: f_out
	1134	REAL, DIMENSION(nys:nyna,1:nza,0:nxa) :: work
	1135
	1136	!
	1137	!-- Transpose array
	1138	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
[622]	1139	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
[1]	1140	CALL MPI_ALLTOALL( f_in(1,1,nxl_y,1), sendrecvcount_xy, MPI_REAL, &
	1141	work(nys,1,0), sendrecvcount_xy, MPI_REAL, &
	1142	comm1dy, ierr )
	1143	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
	1144
	1145	!
	1146	!-- Carry out the FFT along x, where all data are present due to the
	1147	!-- 1d-decomposition along y. Resort the data in a way that y becomes
	1148	!-- the first index.
	1149	CALL cpu_log( log_point_s(4), 'fft_x', 'continue' )
	1150
	1151	IF ( host(1:3) == 'nec' ) THEN
	1152	!
	1153	!-- Code optimized for vector processors
[85]	1154	!$OMP PARALLEL PRIVATE ( i, j, k )
[1]	1155	!$OMP DO
	1156	DO j = nys, nyn
	1157
	1158	DO k = 1, nz
	1159	DO i = 0, nx
	1160	work_fftx(i,k,j) = work(j,k,i)
	1161	ENDDO
	1162	ENDDO
	1163
	1164	CALL fft_x_m( work_fftx(:,:,j), 'backward' )
	1165
	1166	ENDDO
	1167
	1168	!$OMP DO
	1169	DO i = 0, nx
	1170	DO j = nys, nyn
	1171	DO k = 1, nz
	1172	f_out(k,j,i) = work_fftx(i,k,j)
	1173	ENDDO
	1174	ENDDO
	1175	ENDDO
	1176	!$OMP END PARALLEL
	1177
	1178	ELSE
	1179
	1180	!
	1181	!-- Cache optimized code (there might be still a potential for better
	1182	!-- optimization).
	1183	!$OMP PARALLEL PRIVATE (i,j,k,work_fftx)
	1184	!$OMP DO
	1185	DO j = nys, nyn
	1186	DO k = 1, nz
	1187
	1188	DO i = 0, nx
	1189	work_fftx(i,k,j) = work(j,k,i)
	1190	ENDDO
	1191
	1192	CALL fft_x( work_fftx(0:nx,k,j), 'backward' )
	1193
	1194	ENDDO
	1195	ENDDO
	1196
	1197	!$OMP DO
	1198	DO i = 0, nx
	1199	DO j = nys, nyn
	1200	DO k = 1, nz
	1201	f_out(k,j,i) = work_fftx(i,k,j)
	1202	ENDDO
	1203	ENDDO
	1204	ENDDO
	1205	!$OMP END PARALLEL
	1206
	1207	ENDIF
	1208	CALL cpu_log( log_point_s(4), 'fft_x', 'stop' )
	1209
	1210	END SUBROUTINE tr_yx_fftx
	1211
	1212
	1213	SUBROUTINE ffty_tri_ffty( ar )
	1214
	1215	!------------------------------------------------------------------------------!
	1216	! FFT along y, solution of the tridiagonal system and backward FFT for
	1217	! a 1d-decomposition along y
	1218	!
	1219	! WARNING: this subroutine may still not work for hybrid parallelization
	1220	! with OpenMP (for possible necessary changes see the original
	1221	! routine poisfft_hybrid, developed by Klaus Ketelsen, May 2002)
	1222	!------------------------------------------------------------------------------!
	1223
	1224	USE control_parameters
	1225	USE cpulog
	1226	USE grid_variables
	1227	USE indices
	1228	USE interfaces
	1229	USE pegrid
	1230	USE transpose_indices
	1231
	1232	IMPLICIT NONE
	1233
	1234	INTEGER :: i, j, k, m, n, omp_get_thread_num, tn
	1235
	1236	REAL, DIMENSION(0:ny) :: work_ffty
	1237	REAL, DIMENSION(0:ny,1:nz) :: work_triy
	1238	REAL, DIMENSION(nny,1:nza,nxl_y:nxr_ya,pdims(2)) :: ar
	1239	REAL, DIMENSION(:,:,:,:), ALLOCATABLE :: tri
	1240
	1241
	1242	CALL cpu_log( log_point_s(39), 'fft_y + tridia', 'start' )
	1243
	1244	ALLOCATE( tri(5,0:ny,0:nz-1,0:threads_per_task-1) )
	1245
	1246	tn = 0 ! Default thread number in case of one thread
	1247	!$OMP PARALLEL PRIVATE ( i, j, k, m, n, tn, work_ffty, work_triy )
	1248	!$OMP DO
	1249	DO i = nxl_y, nxr_y
	1250
	1251	!$ tn = omp_get_thread_num()
	1252
	1253	IF ( host(1:3) == 'nec' ) THEN
	1254	!
	1255	!-- Code optimized for vector processors
	1256	DO k = 1, nz
	1257
	1258	m = 0
	1259	DO n = 1, pdims(2)
	1260	DO j = 1, nny_pe( n-1 ) ! WARN: pcoord(j) should be used!!
	1261	work_triy(m,k) = ar(j,k,i,n)
	1262	m = m + 1
	1263	ENDDO
	1264	ENDDO
	1265
	1266	ENDDO
	1267
	1268	CALL fft_y_m( work_triy, ny, 'forward' )
	1269
	1270	ELSE
	1271	!
	1272	!-- Cache optimized code
	1273	DO k = 1, nz
	1274
	1275	m = 0
	1276	DO n = 1, pdims(2)
	1277	DO j = 1, nny_pe( n-1 ) ! WARN: pcoord(j) should be used!!
	1278	work_ffty(m) = ar(j,k,i,n)
	1279	m = m + 1
	1280	ENDDO
	1281	ENDDO
	1282
	1283	CALL fft_y( work_ffty, 'forward' )
	1284
	1285	DO j = 0, ny
	1286	work_triy(j,k) = work_ffty(j)
	1287	ENDDO
	1288
	1289	ENDDO
	1290
	1291	ENDIF
	1292
	1293	!
	1294	!-- Solve the linear equation system
	1295	CALL tridia_1dd( ddy2, ddx2, ny, nx, i, work_triy, tri(:,:,:,tn) )
	1296
	1297	IF ( host(1:3) == 'nec' ) THEN
	1298	!
	1299	!-- Code optimized for vector processors
	1300	CALL fft_y_m( work_triy, ny, 'backward' )
	1301
	1302	DO k = 1, nz
	1303
	1304	m = 0
	1305	DO n = 1, pdims(2)
	1306	DO j = 1, nny_pe( n-1 ) ! WARN: pcoord(j) should be used!!
	1307	ar(j,k,i,n) = work_triy(m,k)
	1308	m = m + 1
	1309	ENDDO
	1310	ENDDO
	1311
	1312	ENDDO
	1313
	1314	ELSE
	1315	!
	1316	!-- Cache optimized code
	1317	DO k = 1, nz
	1318
	1319	DO j = 0, ny
	1320	work_ffty(j) = work_triy(j,k)
	1321	ENDDO
	1322
	1323	CALL fft_y( work_ffty, 'backward' )
	1324
	1325	m = 0
	1326	DO n = 1, pdims(2)
	1327	DO j = 1, nny_pe( n-1 ) ! WARN: pcoord(j) should be used!!
	1328	ar(j,k,i,n) = work_ffty(m)
	1329	m = m + 1
	1330	ENDDO
	1331	ENDDO
	1332
	1333	ENDDO
	1334
	1335	ENDIF
	1336
	1337	ENDDO
	1338	!$OMP END PARALLEL
	1339
	1340	DEALLOCATE( tri )
	1341
	1342	CALL cpu_log( log_point_s(39), 'fft_y + tridia', 'stop' )
	1343
	1344	END SUBROUTINE ffty_tri_ffty
	1345
	1346
	1347	SUBROUTINE tridia_1dd( ddx2, ddy2, nx, ny, j, ar, tri )
	1348
	1349	!------------------------------------------------------------------------------!
	1350	! Solves the linear system of equations for a 1d-decomposition along x (see
	1351	! tridia)
	1352	!
	1353	! Attention: when using the intel compiler, array tri must be passed as an
	1354	! argument to the contained subroutines. Otherwise addres faults
	1355	! will occur.
	1356	! On NEC, tri should not be passed (except for routine substi_1dd)
	1357	! because this causes very bad performance.
	1358	!------------------------------------------------------------------------------!
	1359
	1360	USE arrays_3d
	1361	USE control_parameters
	1362
	1363	USE pegrid
	1364
	1365	IMPLICIT NONE
	1366
	1367	INTEGER :: i, j, k, nnyh, nx, ny, omp_get_thread_num, tn
	1368
	1369	REAL :: ddx2, ddy2
	1370
	1371	REAL, DIMENSION(0:nx,1:nz) :: ar
	1372	REAL, DIMENSION(0:nx,0:nz-1) :: ar1
	1373	REAL, DIMENSION(5,0:nx,0:nz-1) :: tri
	1374
	1375
	1376	nnyh = ( ny + 1 ) / 2
	1377
	1378	!
	1379	!-- Define constant elements of the tridiagonal matrix.
	1380	!-- The compiler on SX6 does loop exchange. If 0:nx is a high power of 2,
	1381	!-- the exchanged loops create bank conflicts. The following directive
	1382	!-- prohibits loop exchange and the loops perform much better.
	1383	! tn = omp_get_thread_num()
	1384	! WRITE( 120+tn, * ) '+++ id=',myid,' nx=',nx,' thread=', omp_get_thread_num()
[82]	1385	! CALL local_flush( 120+tn )
[1]	1386	!CDIR NOLOOPCHG
	1387	DO k = 0, nz-1
	1388	DO i = 0,nx
[667]	1389	tri(2,i,k) = ddzu_pres(k+1) * ddzw(k+1)
	1390	tri(3,i,k) = ddzu_pres(k+2) * ddzw(k+1)
[1]	1391	ENDDO
	1392	ENDDO
	1393	! WRITE( 120+tn, * ) '+++ id=',myid,' end of first tridia loop thread=', omp_get_thread_num()
[82]	1394	! CALL local_flush( 120+tn )
[1]	1395
	1396	IF ( j <= nnyh ) THEN
[377]	1397	#if defined( __lc )
[1]	1398	CALL maketri_1dd( j, tri )
	1399	#else
	1400	CALL maketri_1dd( j )
	1401	#endif
	1402	ELSE
[377]	1403	#if defined( __lc )
[1]	1404	CALL maketri_1dd( ny+1-j, tri )
	1405	#else
	1406	CALL maketri_1dd( ny+1-j )
	1407	#endif
	1408	ENDIF
[377]	1409	#if defined( __lc )
[1]	1410	CALL split_1dd( tri )
	1411	#else
	1412	CALL split_1dd
	1413	#endif
	1414	CALL substi_1dd( ar, tri )
	1415
	1416	CONTAINS
	1417
[377]	1418	#if defined( __lc )
[1]	1419	SUBROUTINE maketri_1dd( j, tri )
	1420	#else
	1421	SUBROUTINE maketri_1dd( j )
	1422	#endif
	1423
	1424	!------------------------------------------------------------------------------!
	1425	! computes the i- and j-dependent component of the matrix
	1426	!------------------------------------------------------------------------------!
	1427
	1428	USE constants
	1429
	1430	IMPLICIT NONE
	1431
	1432	INTEGER :: i, j, k, nnxh
	1433	REAL :: a, c
	1434
	1435	REAL, DIMENSION(0:nx) :: l
	1436
[377]	1437	#if defined( __lc )
[1]	1438	REAL, DIMENSION(5,0:nx,0:nz-1) :: tri
	1439	#endif
	1440
	1441
	1442	nnxh = ( nx + 1 ) / 2
	1443	!
	1444	!-- Provide the tridiagonal matrix for solution of the Poisson equation in
	1445	!-- Fourier space. The coefficients are computed following the method of
	1446	!-- Schmidt et al. (DFVLR-Mitteilung 84-15), which departs from Stephan
	1447	!-- Siano's original version by discretizing the Poisson equation,
	1448	!-- before it is Fourier-transformed
	1449	DO i = 0, nx
[128]	1450	IF ( i >= 0 .AND. i <= nnxh ) THEN
[1]	1451	l(i) = 2.0 * ( 1.0 - COS( ( 2.0 * pi * i ) / &
	1452	FLOAT( nx+1 ) ) ) * ddx2 + &
	1453	2.0 * ( 1.0 - COS( ( 2.0 * pi * j ) / &
	1454	FLOAT( ny+1 ) ) ) * ddy2
	1455	ELSE
	1456	l(i) = 2.0 * ( 1.0 - COS( ( 2.0 * pi * ( nx+1-i ) ) / &
	1457	FLOAT( nx+1 ) ) ) * ddx2 + &
	1458	2.0 * ( 1.0 - COS( ( 2.0 * pi * j ) / &
	1459	FLOAT( ny+1 ) ) ) * ddy2
	1460	ENDIF
	1461	ENDDO
	1462
	1463	DO k = 0, nz-1
	1464	DO i = 0, nx
[667]	1465	a = -1.0 * ddzu_pres(k+2) * ddzw(k+1)
	1466	c = -1.0 * ddzu_pres(k+1) * ddzw(k+1)
[1]	1467	tri(1,i,k) = a + c - l(i)
	1468	ENDDO
	1469	ENDDO
	1470	IF ( ibc_p_b == 1 .OR. ibc_p_b == 2 ) THEN
	1471	DO i = 0, nx
	1472	tri(1,i,0) = tri(1,i,0) + tri(2,i,0)
	1473	ENDDO
	1474	ENDIF
	1475	IF ( ibc_p_t == 1 ) THEN
	1476	DO i = 0, nx
	1477	tri(1,i,nz-1) = tri(1,i,nz-1) + tri(3,i,nz-1)
	1478	ENDDO
	1479	ENDIF
	1480
	1481	END SUBROUTINE maketri_1dd
	1482
	1483
[377]	1484	#if defined( __lc )
[1]	1485	SUBROUTINE split_1dd( tri )
	1486	#else
	1487	SUBROUTINE split_1dd
	1488	#endif
	1489
	1490	!------------------------------------------------------------------------------!
	1491	! Splitting of the tridiagonal matrix (Thomas algorithm)
	1492	!------------------------------------------------------------------------------!
	1493
	1494	IMPLICIT NONE
	1495
	1496	INTEGER :: i, k
	1497
[377]	1498	#if defined( __lc )
[1]	1499	REAL, DIMENSION(5,0:nx,0:nz-1) :: tri
	1500	#endif
	1501
	1502
	1503	!
	1504	!-- Splitting
	1505	DO i = 0, nx
	1506	tri(4,i,0) = tri(1,i,0)
	1507	ENDDO
	1508	DO k = 1, nz-1
	1509	DO i = 0, nx
	1510	tri(5,i,k) = tri(2,i,k) / tri(4,i,k-1)
	1511	tri(4,i,k) = tri(1,i,k) - tri(3,i,k-1) * tri(5,i,k)
	1512	ENDDO
	1513	ENDDO
	1514
	1515	END SUBROUTINE split_1dd
	1516
	1517
	1518	SUBROUTINE substi_1dd( ar, tri )
	1519
	1520	!------------------------------------------------------------------------------!
	1521	! Substitution (Forward and Backward) (Thomas algorithm)
	1522	!------------------------------------------------------------------------------!
	1523
	1524	IMPLICIT NONE
	1525
[76]	1526	INTEGER :: i, k
[1]	1527
	1528	REAL, DIMENSION(0:nx,nz) :: ar
	1529	REAL, DIMENSION(0:nx,0:nz-1) :: ar1
	1530	REAL, DIMENSION(5,0:nx,0:nz-1) :: tri
	1531
	1532	!
	1533	!-- Forward substitution
	1534	DO i = 0, nx
	1535	ar1(i,0) = ar(i,1)
	1536	ENDDO
	1537	DO k = 1, nz-1
	1538	DO i = 0, nx
	1539	ar1(i,k) = ar(i,k+1) - tri(5,i,k) * ar1(i,k-1)
	1540	ENDDO
	1541	ENDDO
	1542
	1543	!
	1544	!-- Backward substitution
	1545	DO i = 0, nx
	1546	ar(i,nz) = ar1(i,nz-1) / tri(4,i,nz-1)
	1547	ENDDO
	1548	DO k = nz-2, 0, -1
	1549	DO i = 0, nx
	1550	ar(i,k+1) = ( ar1(i,k) - tri(3,i,k) * ar(i,k+2) ) &
	1551	/ tri(4,i,k)
	1552	ENDDO
	1553	ENDDO
	1554
[76]	1555	!
	1556	!-- Indices i=0, j=0 correspond to horizontally averaged pressure.
	1557	!-- The respective values of ar should be zero at all k-levels if
	1558	!-- acceleration of horizontally averaged vertical velocity is zero.
	1559	IF ( ibc_p_b == 1 .AND. ibc_p_t == 1 ) THEN
	1560	IF ( j == 0 ) THEN
	1561	DO k = 1, nz
	1562	ar(0,k) = 0.0
	1563	ENDDO
	1564	ENDIF
	1565	ENDIF
	1566
[1]	1567	END SUBROUTINE substi_1dd
	1568
	1569	END SUBROUTINE tridia_1dd
	1570
	1571	#endif
	1572
	1573	END MODULE poisfft_mod

Note: See TracBrowser for help on using the repository browser.

Download in other formats:

| Impressum | ©Leibniz Universität Hannover |