Home

Context Navigation

source: palm/trunk/SOURCE/poisfft.f90 @ 215

Last change on this file since 215 was 198, checked in by raasch, 16 years ago
file headers updated for the next release 3.5
Property svn:keywords set to `Id`
File size: 44.5 KB

Rev	Line
[1]	1	MODULE poisfft_mod
	2
	3	!------------------------------------------------------------------------------!
	4	! Actual revisions:
	5	! -----------------
[198]	6	!
[1]	7	!
	8	! Former revisions:
	9	! -----------------
[3]	10	! $Id: poisfft.f90 198 2008-09-17 08:55:28Z raasch $
[77]	11	!
[198]	12	! 164 2008-05-15 08:46:15Z raasch
	13	! Arguments removed from transpose routines
	14	!
[139]	15	! 128 2007-10-26 13:11:14Z raasch
	16	! Bugfix: wavenumber calculation for even nx in routines maketri
	17	!
[90]	18	! 85 2007-05-11 09:35:14Z raasch
	19	! Bugfix: work_fft*_vec removed from some PRIVATE-declarations
	20	!
[77]	21	! 76 2007-03-29 00:58:32Z raasch
	22	! Tridiagonal coefficients adjusted for Neumann boundary conditions both at
	23	! the bottom and the top.
	24	!
[3]	25	! RCS Log replace by Id keyword, revision history cleaned up
	26	!
[1]	27	! Revision 1.24 2006/08/04 15:00:24 raasch
	28	! Default setting of the thread number tn in case of not using OpenMP
	29	!
	30	! Revision 1.23 2006/02/23 12:48:38 raasch
	31	! Additional compiler directive in routine tridia_1dd for preventing loop
	32	! exchange on NEC-SX6
	33	!
	34	! Revision 1.20 2004/04/30 12:38:09 raasch
	35	! Parts of former poisfft_hybrid moved to this subroutine,
	36	! former subroutine changed to a module, renaming of FFT-subroutines and
	37	! -module, FFTs completely substituted by calls of fft_x and fft_y,
	38	! NAG fft used in the non-parallel case completely removed, l in maketri
	39	! is now a 1d-array, variables passed by modules instead of using parameter
	40	! lists, enlarged transposition arrays introduced
	41	!
	42	! Revision 1.1 1997/07/24 11:24:14 raasch
	43	! Initial revision
	44	!
	45	!
	46	! Description:
	47	! ------------
	48	! See below.
	49	!------------------------------------------------------------------------------!
	50
	51	!--------------------------------------------------------------------------!
	52	! poisfft !
	53	! !
	54	! Original version: Stephan Siano (pois3d) !
	55	! !
	56	! Institute of Meteorology and Climatology, University of Hannover !
	57	! Germany !
	58	! !
	59	! Version as of July 23,1996 !
	60	! !
	61	! !
	62	! Version for parallel computers: Siegfried Raasch !
	63	! !
	64	! Version as of July 03,1997 !
	65	! !
	66	! Solves the Poisson equation with a 2D spectral method !
	67	! d^2 p / dx^2 + d^2 p / dy^2 + d^2 p / dz^2 = s !
	68	! !
	69	! Input: !
	70	! real ar contains in the (nnx,nny,nnz) elements, !
	71	! starting from the element (1,nys,nxl), the !
	72	! values for s !
	73	! real work Temporary array !
	74	! !
	75	! Output: !
	76	! real ar contains the solution for p !
	77	!--------------------------------------------------------------------------!
	78
	79	USE fft_xy
	80	USE indices
	81	USE transpose_indices
	82
	83	IMPLICIT NONE
	84
	85	PRIVATE
	86	PUBLIC poisfft, poisfft_init
	87
	88	INTERFACE poisfft
	89	MODULE PROCEDURE poisfft
	90	END INTERFACE poisfft
	91
	92	INTERFACE poisfft_init
	93	MODULE PROCEDURE poisfft_init
	94	END INTERFACE poisfft_init
	95
	96	CONTAINS
	97
	98	SUBROUTINE poisfft_init
	99
	100	CALL fft_init
	101
	102	END SUBROUTINE poisfft_init
	103
	104
	105	SUBROUTINE poisfft( ar, work )
	106
	107	USE cpulog
	108	USE interfaces
	109	USE pegrid
	110
	111	IMPLICIT NONE
	112
	113	REAL, DIMENSION(1:nza,nys:nyna,nxl:nxra) :: ar, work
	114
	115
	116	CALL cpu_log( log_point_s(3), 'poisfft', 'start' )
	117
	118	!
	119	!-- Two-dimensional Fourier Transformation in x- and y-direction.
	120	#if defined( __parallel )
	121	IF ( pdims(2) == 1 ) THEN
	122
	123	!
	124	!-- 1d-domain-decomposition along x:
	125	!-- FFT along y and transposition y --> x
	126	CALL ffty_tr_yx( ar, work, ar )
	127
	128	!
	129	!-- FFT along x, solving the tridiagonal system and backward FFT
	130	CALL fftx_tri_fftx( ar )
	131
	132	!
	133	!-- Transposition x --> y and backward FFT along y
	134	CALL tr_xy_ffty( ar, work, ar )
	135
	136	ELSEIF ( pdims(1) == 1 ) THEN
	137
	138	!
	139	!-- 1d-domain-decomposition along y:
	140	!-- FFT along x and transposition x --> y
	141	CALL fftx_tr_xy( ar, work, ar )
	142
	143	!
	144	!-- FFT along y, solving the tridiagonal system and backward FFT
	145	CALL ffty_tri_ffty( ar )
	146
	147	!
	148	!-- Transposition y --> x and backward FFT along x
	149	CALL tr_yx_fftx( ar, work, ar )
	150
	151	ELSE
	152
	153	!
	154	!-- 2d-domain-decomposition
	155	!-- Transposition z --> x
	156	CALL cpu_log( log_point_s(5), 'transpo forward', 'start' )
[164]	157	CALL transpose_zx( ar, work, ar )
[1]	158	CALL cpu_log( log_point_s(5), 'transpo forward', 'pause' )
	159
	160	CALL cpu_log( log_point_s(4), 'fft_x', 'start' )
	161	CALL fftxp( ar, 'forward' )
	162	CALL cpu_log( log_point_s(4), 'fft_x', 'pause' )
	163
	164	!
	165	!-- Transposition x --> y
	166	CALL cpu_log( log_point_s(5), 'transpo forward', 'continue' )
[164]	167	CALL transpose_xy( ar, work, ar )
[1]	168	CALL cpu_log( log_point_s(5), 'transpo forward', 'pause' )
	169
	170	CALL cpu_log( log_point_s(7), 'fft_y', 'start' )
	171	CALL fftyp( ar, 'forward' )
	172	CALL cpu_log( log_point_s(7), 'fft_y', 'pause' )
	173
	174	!
	175	!-- Transposition y --> z
	176	CALL cpu_log( log_point_s(5), 'transpo forward', 'continue' )
[164]	177	CALL transpose_yz( ar, work, ar )
[1]	178	CALL cpu_log( log_point_s(5), 'transpo forward', 'stop' )
	179
	180	!
	181	!-- Solve the Poisson equation in z-direction in cartesian space.
	182	CALL cpu_log( log_point_s(6), 'tridia', 'start' )
	183	CALL tridia( ar )
	184	CALL cpu_log( log_point_s(6), 'tridia', 'stop' )
	185
	186	!
	187	!-- Inverse Fourier Transformation
	188	!-- Transposition z --> y
	189	CALL cpu_log( log_point_s(8), 'transpo invers', 'start' )
[164]	190	CALL transpose_zy( ar, work, ar )
[1]	191	CALL cpu_log( log_point_s(8), 'transpo invers', 'pause' )
	192
	193	CALL cpu_log( log_point_s(7), 'fft_y', 'continue' )
	194	CALL fftyp( ar, 'backward' )
	195	CALL cpu_log( log_point_s(7), 'fft_y', 'stop' )
	196
	197	!
	198	!-- Transposition y --> x
	199	CALL cpu_log( log_point_s(8), 'transpo invers', 'continue' )
[164]	200	CALL transpose_yx( ar, work, ar )
[1]	201	CALL cpu_log( log_point_s(8), 'transpo invers', 'pause' )
	202
	203	CALL cpu_log( log_point_s(4), 'fft_x', 'continue' )
	204	CALL fftxp( ar, 'backward' )
	205	CALL cpu_log( log_point_s(4), 'fft_x', 'stop' )
	206
	207	!
	208	!-- Transposition x --> z
	209	CALL cpu_log( log_point_s(8), 'transpo invers', 'continue' )
[164]	210	CALL transpose_xz( ar, work, ar )
[1]	211	CALL cpu_log( log_point_s(8), 'transpo invers', 'stop' )
	212
	213	ENDIF
	214
	215	#else
	216
	217	!
	218	!-- Two-dimensional Fourier Transformation along x- and y-direction.
	219	CALL cpu_log( log_point_s(4), 'fft_x', 'start' )
	220	CALL fftx( ar, 'forward' )
	221	CALL cpu_log( log_point_s(4), 'fft_x', 'pause' )
	222	CALL cpu_log( log_point_s(7), 'fft_y', 'start' )
	223	CALL ffty( ar, 'forward' )
	224	CALL cpu_log( log_point_s(7), 'fft_y', 'pause' )
	225
	226	!
	227	!-- Solve the Poisson equation in z-direction in cartesian space.
	228	CALL cpu_log( log_point_s(6), 'tridia', 'start' )
	229	CALL tridia( ar )
	230	CALL cpu_log( log_point_s(6), 'tridia', 'stop' )
	231
	232	!
	233	!-- Inverse Fourier Transformation.
	234	CALL cpu_log( log_point_s(7), 'fft_y', 'continue' )
	235	CALL ffty( ar, 'backward' )
	236	CALL cpu_log( log_point_s(7), 'fft_y', 'stop' )
	237	CALL cpu_log( log_point_s(4), 'fft_x', 'continue' )
	238	CALL fftx( ar, 'backward' )
	239	CALL cpu_log( log_point_s(4), 'fft_x', 'stop' )
	240
	241	#endif
	242
	243	CALL cpu_log( log_point_s(3), 'poisfft', 'stop' )
	244
	245	END SUBROUTINE poisfft
	246
	247
	248
	249	SUBROUTINE tridia( ar )
	250
	251	!------------------------------------------------------------------------------!
	252	! solves the linear system of equations:
	253	!
	254	! -(4 pi^2(i^2/(dx^2nnx^2)+j^2/(dy^2nny^2))+
	255	! 1/(dzu(k)dzw(k))+1/(dzu(k-1)dzw(k)))*p(i,j,k)+
	256	! 1/(dzu(k)dzw(k))p(i,j,k+1)+1/(dzu(k-1)dzw(k))p(i,j,k-1)=d(i,j,k)
	257	!
	258	! by using the Thomas algorithm
	259	!------------------------------------------------------------------------------!
	260
	261	USE arrays_3d
	262
	263	IMPLICIT NONE
	264
	265	INTEGER :: i, j, k, nnyh
	266
	267	REAL, DIMENSION(nxl_z:nxr_z,0:nz-1) :: ar1
	268	REAL, DIMENSION(5,nxl_z:nxr_z,0:nz-1) :: tri
	269
	270	#if defined( __parallel )
	271	REAL :: ar(nxl_z:nxr_za,nys_z:nyn_za,1:nza)
	272	#else
	273	REAL :: ar(1:nz,nys_z:nyn_z,nxl_z:nxr_z)
	274	#endif
	275
	276
	277	nnyh = (ny+1) / 2
	278
	279	!
	280	!-- Define constant elements of the tridiagonal matrix.
	281	DO k = 0, nz-1
	282	DO i = nxl_z, nxr_z
	283	tri(2,i,k) = ddzu(k+1) * ddzw(k+1)
	284	tri(3,i,k) = ddzu(k+2) * ddzw(k+1)
	285	ENDDO
	286	ENDDO
	287
	288	#if defined( __parallel )
	289	!
	290	!-- Repeat for all y-levels.
	291	DO j = nys_z, nyn_z
	292	IF ( j <= nnyh ) THEN
	293	CALL maketri( tri, j )
	294	ELSE
	295	CALL maketri( tri, ny+1-j )
	296	ENDIF
	297	CALL split( tri )
	298	CALL substi( ar, ar1, tri, j )
	299	ENDDO
	300	#else
	301	!
	302	!-- First y-level.
	303	CALL maketri( tri, nys_z )
	304	CALL split( tri )
	305	CALL substi( ar, ar1, tri, 0 )
	306
	307	!
	308	!-- Further y-levels.
	309	DO j = 1, nnyh - 1
	310	CALL maketri( tri, j )
	311	CALL split( tri )
	312	CALL substi( ar, ar1, tri, j )
	313	CALL substi( ar, ar1, tri, ny+1-j )
	314	ENDDO
	315	CALL maketri( tri, nnyh )
	316	CALL split( tri )
	317	CALL substi( ar, ar1, tri, nnyh+nys )
	318	#endif
	319
	320	CONTAINS
	321
	322	SUBROUTINE maketri( tri, j )
	323
	324	!------------------------------------------------------------------------------!
	325	! Computes the i- and j-dependent component of the matrix
	326	!------------------------------------------------------------------------------!
	327
	328	USE arrays_3d
	329	USE constants
	330	USE control_parameters
	331	USE grid_variables
	332
	333	IMPLICIT NONE
	334
	335	INTEGER :: i, j, k, nnxh
	336	REAL :: a, c
	337	REAL :: ll(nxl_z:nxr_z)
	338	REAL :: tri(5,nxl_z:nxr_z,0:nz-1)
	339
	340
	341	nnxh = ( nx + 1 ) / 2
	342
	343	!
	344	!-- Provide the tridiagonal matrix for solution of the Poisson equation in
	345	!-- Fourier space. The coefficients are computed following the method of
	346	!-- Schmidt et al. (DFVLR-Mitteilung 84-15), which departs from Stephan
	347	!-- Siano's original version by discretizing the Poisson equation,
	348	!-- before it is Fourier-transformed
	349	#if defined( __parallel )
	350	DO i = nxl_z, nxr_z
[128]	351	IF ( i >= 0 .AND. i <= nnxh ) THEN
[1]	352	ll(i) = 2.0 * ( 1.0 - COS( ( 2.0 * pi * i ) / &
	353	FLOAT( nx+1 ) ) ) / ( dx * dx ) + &
	354	2.0 * ( 1.0 - COS( ( 2.0 * pi * j ) / &
	355	FLOAT( ny+1 ) ) ) / ( dy * dy )
	356	ELSE
	357	ll(i) = 2.0 * ( 1.0 - COS( ( 2.0 * pi * ( nx+1-i ) ) / &
	358	FLOAT( nx+1 ) ) ) / ( dx * dx ) + &
	359	2.0 * ( 1.0 - COS( ( 2.0 * pi * j ) / &
	360	FLOAT( ny+1 ) ) ) / ( dy * dy )
	361	ENDIF
	362	DO k = 0,nz-1
	363	a = -1.0 * ddzu(k+2) * ddzw(k+1)
	364	c = -1.0 * ddzu(k+1) * ddzw(k+1)
	365	tri(1,i,k) = a + c - ll(i)
	366	ENDDO
	367	ENDDO
	368	#else
	369	DO i = 0, nnxh
	370	ll(i) = 2.0 * ( 1.0 - COS( ( 2.0 * pi * i ) / FLOAT( nx+1 ) ) ) / &
	371	( dx * dx ) + &
	372	2.0 * ( 1.0 - COS( ( 2.0 * pi * j ) / FLOAT( ny+1 ) ) ) / &
	373	( dy * dy )
	374	DO k = 0, nz-1
	375	a = -1.0 * ddzu(k+2) * ddzw(k+1)
	376	c = -1.0 * ddzu(k+1) * ddzw(k+1)
	377	tri(1,i,k) = a + c - ll(i)
	378	IF ( i >= 1 .and. i < nnxh ) THEN
	379	tri(1,nx+1-i,k) = tri(1,i,k)
	380	ENDIF
	381	ENDDO
	382	ENDDO
	383	#endif
	384	IF ( ibc_p_b == 1 .OR. ibc_p_b == 2 ) THEN
	385	DO i = nxl_z, nxr_z
	386	tri(1,i,0) = tri(1,i,0) + tri(2,i,0)
	387	ENDDO
	388	ENDIF
	389	IF ( ibc_p_t == 1 ) THEN
	390	DO i = nxl_z, nxr_z
	391	tri(1,i,nz-1) = tri(1,i,nz-1) + tri(3,i,nz-1)
	392	ENDDO
	393	ENDIF
	394
	395	END SUBROUTINE maketri
	396
	397
	398	SUBROUTINE substi( ar, ar1, tri, j )
	399
	400	!------------------------------------------------------------------------------!
	401	! Substitution (Forward and Backward) (Thomas algorithm)
	402	!------------------------------------------------------------------------------!
	403
[76]	404	USE control_parameters
	405
[1]	406	IMPLICIT NONE
	407
	408	INTEGER :: i, j, k
	409	REAL :: ar1(nxl_z:nxr_z,0:nz-1)
	410	REAL :: tri(5,nxl_z:nxr_z,0:nz-1)
	411	#if defined( __parallel )
	412	REAL :: ar(nxl_z:nxr_za,nys_z:nyn_za,1:nza)
	413	#else
	414	REAL :: ar(1:nz,nys_z:nyn_z,nxl_z:nxr_z)
	415	#endif
	416
	417	!
	418	!-- Forward substitution.
	419	DO i = nxl_z, nxr_z
	420	#if defined( __parallel )
	421	ar1(i,0) = ar(i,j,1)
	422	#else
	423	ar1(i,0) = ar(1,j,i)
	424	#endif
	425	ENDDO
	426	DO k = 1, nz - 1
	427	DO i = nxl_z, nxr_z
	428	#if defined( __parallel )
	429	ar1(i,k) = ar(i,j,k+1) - tri(5,i,k) * ar1(i,k-1)
	430	#else
	431	ar1(i,k) = ar(k+1,j,i) - tri(5,i,k) * ar1(i,k-1)
	432	#endif
	433	ENDDO
	434	ENDDO
	435
	436	!
	437	!-- Backward substitution.
	438	DO i = nxl_z, nxr_z
	439	#if defined( __parallel )
	440	ar(i,j,nz) = ar1(i,nz-1) / tri(4,i,nz-1)
	441	#else
	442	ar(nz,j,i) = ar1(i,nz-1) / tri(4,i,nz-1)
	443	#endif
	444	ENDDO
	445	DO k = nz-2, 0, -1
	446	DO i = nxl_z, nxr_z
	447	#if defined( __parallel )
	448	ar(i,j,k+1) = ( ar1(i,k) - tri(3,i,k) * ar(i,j,k+2) ) &
	449	/ tri(4,i,k)
	450	#else
	451	ar(k+1,j,i) = ( ar1(i,k) - tri(3,i,k) * ar(k+2,j,i) ) &
	452	/ tri(4,i,k)
	453	#endif
	454	ENDDO
	455	ENDDO
	456
[76]	457	!
	458	!-- Indices i=0, j=0 correspond to horizontally averaged pressure.
	459	!-- The respective values of ar should be zero at all k-levels if
	460	!-- acceleration of horizontally averaged vertical velocity is zero.
	461	IF ( ibc_p_b == 1 .AND. ibc_p_t == 1 ) THEN
	462	IF ( j == 0 .AND. nxl_z == 0 ) THEN
	463	#if defined( __parallel )
	464	DO k = 1, nz
	465	ar(nxl_z,j,k) = 0.0
	466	ENDDO
	467	#else
	468	DO k = 1, nz
	469	ar(k,j,nxl_z) = 0.0
	470	ENDDO
	471	#endif
	472	ENDIF
	473	ENDIF
	474
[1]	475	END SUBROUTINE substi
	476
	477
	478	SUBROUTINE split( tri )
	479
	480	!------------------------------------------------------------------------------!
	481	! Splitting of the tridiagonal matrix (Thomas algorithm)
	482	!------------------------------------------------------------------------------!
	483
	484	IMPLICIT NONE
	485
	486	INTEGER :: i, k
	487	REAL :: tri(5,nxl_z:nxr_z,0:nz-1)
	488
	489	!
	490	!-- Splitting.
	491	DO i = nxl_z, nxr_z
	492	tri(4,i,0) = tri(1,i,0)
	493	ENDDO
	494	DO k = 1, nz-1
	495	DO i = nxl_z, nxr_z
	496	tri(5,i,k) = tri(2,i,k) / tri(4,i,k-1)
	497	tri(4,i,k) = tri(1,i,k) - tri(3,i,k-1) * tri(5,i,k)
	498	ENDDO
	499	ENDDO
	500
	501	END SUBROUTINE split
	502
	503	END SUBROUTINE tridia
	504
	505
	506	#if defined( __parallel )
	507	SUBROUTINE fftxp( ar, direction )
	508
	509	!------------------------------------------------------------------------------!
	510	! Fourier-transformation along x-direction Parallelized version
	511	!------------------------------------------------------------------------------!
	512
	513	IMPLICIT NONE
	514
	515	CHARACTER (LEN=*) :: direction
	516	INTEGER :: j, k
	517	REAL :: ar(0:nxa,nys_x:nyn_xa,nzb_x:nzt_xa)
	518
	519	!
	520	!-- Performing the fft with one of the methods implemented
	521	DO k = nzb_x, nzt_x
	522	DO j = nys_x, nyn_x
	523	CALL fft_x( ar(0:nx,j,k), direction )
	524	ENDDO
	525	ENDDO
	526
	527	END SUBROUTINE fftxp
	528
	529	#else
	530	SUBROUTINE fftx( ar, direction )
	531
	532	!------------------------------------------------------------------------------!
	533	! Fourier-transformation along x-direction Non parallel version
	534	!------------------------------------------------------------------------------!
	535
	536	IMPLICIT NONE
	537
	538	CHARACTER (LEN=*) :: direction
	539	INTEGER :: i, j, k
	540	REAL :: ar(1:nz,0:ny,0:nx)
	541
	542	!
	543	!-- Performing the fft with one of the methods implemented
	544	DO k = 1, nz
	545	DO j = 0, ny
	546	CALL fft_x( ar(k,j,0:nx), direction )
	547	ENDDO
	548	ENDDO
	549
	550	END SUBROUTINE fftx
	551	#endif
	552
	553
	554	#if defined( __parallel )
	555	SUBROUTINE fftyp( ar, direction )
	556
	557	!------------------------------------------------------------------------------!
	558	! Fourier-transformation along y-direction Parallelized version
	559	!------------------------------------------------------------------------------!
	560
	561	IMPLICIT NONE
	562
	563	CHARACTER (LEN=*) :: direction
	564	INTEGER :: i, k
	565	REAL :: ar(0:nya,nxl_y:nxr_ya,nzb_y:nzt_ya)
	566
	567	!
	568	!-- Performing the fft with one of the methods implemented
	569	DO k = nzb_y, nzt_y
	570	DO i = nxl_y, nxr_y
	571	CALL fft_y( ar(0:ny,i,k), direction )
	572	ENDDO
	573	ENDDO
	574
	575	END SUBROUTINE fftyp
	576
	577	#else
	578	SUBROUTINE ffty( ar, direction )
	579
	580	!------------------------------------------------------------------------------!
	581	! Fourier-transformation along y-direction Non parallel version
	582	!------------------------------------------------------------------------------!
	583
	584	IMPLICIT NONE
	585
	586	CHARACTER (LEN=*) :: direction
	587	INTEGER :: i, k
	588	REAL :: ar(1:nz,0:ny,0:nx)
	589
	590	!
	591	!-- Performing the fft with one of the methods implemented
	592	DO k = 1, nz
	593	DO i = 0, nx
	594	CALL fft_y( ar(k,0:ny,i), direction )
	595	ENDDO
	596	ENDDO
	597
	598	END SUBROUTINE ffty
	599	#endif
	600
	601	#if defined( __parallel )
	602	SUBROUTINE ffty_tr_yx( f_in, work, f_out )
	603
	604	!------------------------------------------------------------------------------!
	605	! Fourier-transformation along y with subsequent transposition y --> x for
	606	! a 1d-decomposition along x
	607	!
	608	! ATTENTION: The performance of this routine is much faster on the NEC-SX6,
	609	! if the first index of work_ffty_vec is odd. Otherwise
	610	! memory bank conflicts may occur (especially if the index is a
	611	! multiple of 128). That's why work_ffty_vec is dimensioned as
	612	! 0:ny+1.
	613	! Of course, this will not work if users are using an odd number
	614	! of gridpoints along y.
	615	!------------------------------------------------------------------------------!
	616
	617	USE control_parameters
	618	USE cpulog
	619	USE indices
	620	USE interfaces
	621	USE pegrid
	622	USE transpose_indices
	623
	624	IMPLICIT NONE
	625
	626	INTEGER :: i, iend, iouter, ir, j, k
	627	INTEGER, PARAMETER :: stridex = 4
	628
	629	REAL, DIMENSION(0:ny,stridex) :: work_ffty
	630	#if defined( __nec )
	631	REAL, DIMENSION(0:ny+1,1:nz,nxl:nxr) :: work_ffty_vec
	632	#endif
	633	REAL, DIMENSION(1:nza,0:nya,nxl:nxra) :: f_in
	634	REAL, DIMENSION(nnx,1:nza,nys_x:nyn_xa,pdims(1)) :: f_out
	635	REAL, DIMENSION(nxl:nxra,1:nza,0:nya) :: work
	636
	637	!
	638	!-- Carry out the FFT along y, where all data are present due to the
	639	!-- 1d-decomposition along x. Resort the data in a way that x becomes
	640	!-- the first index.
	641	CALL cpu_log( log_point_s(7), 'fft_y', 'start' )
	642
	643	IF ( host(1:3) == 'nec' ) THEN
	644	#if defined( __nec )
	645	!
	646	!-- Code optimized for vector processors
[85]	647	!$OMP PARALLEL PRIVATE ( i, j, k )
[1]	648	!$OMP DO
	649	DO i = nxl, nxr
	650
	651	DO j = 0, ny
	652	DO k = 1, nz
	653	work_ffty_vec(j,k,i) = f_in(k,j,i)
	654	ENDDO
	655	ENDDO
	656
	657	CALL fft_y_m( work_ffty_vec(:,:,i), ny+1, 'forward' )
	658
	659	ENDDO
	660
	661	!$OMP DO
	662	DO k = 1, nz
	663	DO j = 0, ny
	664	DO i = nxl, nxr
	665	work(i,k,j) = work_ffty_vec(j,k,i)
	666	ENDDO
	667	ENDDO
	668	ENDDO
	669	!$OMP END PARALLEL
	670	#endif
	671
	672	ELSE
	673
	674	!
	675	!-- Cache optimized code.
	676	!-- The i-(x-)direction is split into a strided outer loop and an inner
	677	!-- loop for better cache performance
	678	!$OMP PARALLEL PRIVATE (i,iend,iouter,ir,j,k,work_ffty)
	679	!$OMP DO
	680	DO iouter = nxl, nxr, stridex
	681
	682	iend = MIN( iouter+stridex-1, nxr ) ! Upper bound for inner i loop
	683
	684	DO k = 1, nz
	685
	686	DO i = iouter, iend
	687
	688	ir = i-iouter+1 ! counter within a stride
	689	DO j = 0, ny
	690	work_ffty(j,ir) = f_in(k,j,i)
	691	ENDDO
	692	!
	693	!-- FFT along y
	694	CALL fft_y( work_ffty(:,ir), 'forward' )
	695
	696	ENDDO
	697
	698	!
	699	!-- Resort
	700	DO j = 0, ny
	701	DO i = iouter, iend
	702	work(i,k,j) = work_ffty(j,i-iouter+1)
	703	ENDDO
	704	ENDDO
	705
	706	ENDDO
	707
	708	ENDDO
	709	!$OMP END PARALLEL
	710
	711	ENDIF
	712	CALL cpu_log( log_point_s(7), 'fft_y', 'pause' )
	713
	714	!
	715	!-- Transpose array
	716	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
	717	CALL MPI_ALLTOALL( work(nxl,1,0), sendrecvcount_xy, MPI_REAL, &
	718	f_out(1,1,nys_x,1), sendrecvcount_xy, MPI_REAL, &
	719	comm1dx, ierr )
	720	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
	721
	722	END SUBROUTINE ffty_tr_yx
	723
	724
	725	SUBROUTINE tr_xy_ffty( f_in, work, f_out )
	726
	727	!------------------------------------------------------------------------------!
	728	! Transposition x --> y with a subsequent backward Fourier transformation for
	729	! a 1d-decomposition along x
	730	!------------------------------------------------------------------------------!
	731
	732	USE control_parameters
	733	USE cpulog
	734	USE indices
	735	USE interfaces
	736	USE pegrid
	737	USE transpose_indices
	738
	739	IMPLICIT NONE
	740
	741	INTEGER :: i, iend, iouter, ir, j, k
	742	INTEGER, PARAMETER :: stridex = 4
	743
	744	REAL, DIMENSION(0:ny,stridex) :: work_ffty
	745	#if defined( __nec )
	746	REAL, DIMENSION(0:ny+1,1:nz,nxl:nxr) :: work_ffty_vec
	747	#endif
	748	REAL, DIMENSION(nnx,1:nza,nys_x:nyn_xa,pdims(1)) :: f_in
	749	REAL, DIMENSION(1:nza,0:nya,nxl:nxra) :: f_out
	750	REAL, DIMENSION(nxl:nxra,1:nza,0:nya) :: work
	751
	752	!
	753	!-- Transpose array
	754	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
	755	CALL MPI_ALLTOALL( f_in(1,1,nys_x,1), sendrecvcount_xy, MPI_REAL, &
	756	work(nxl,1,0), sendrecvcount_xy, MPI_REAL, &
	757	comm1dx, ierr )
	758	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
	759
	760	!
	761	!-- Resort the data in a way that y becomes the first index and carry out the
	762	!-- backward fft along y.
	763	CALL cpu_log( log_point_s(7), 'fft_y', 'continue' )
	764
	765	IF ( host(1:3) == 'nec' ) THEN
	766	#if defined( __nec )
	767	!
	768	!-- Code optimized for vector processors
[85]	769	!$OMP PARALLEL PRIVATE ( i, j, k )
[1]	770	!$OMP DO
	771	DO k = 1, nz
	772	DO j = 0, ny
	773	DO i = nxl, nxr
	774	work_ffty_vec(j,k,i) = work(i,k,j)
	775	ENDDO
	776	ENDDO
	777	ENDDO
	778
	779	!$OMP DO
	780	DO i = nxl, nxr
	781
	782	CALL fft_y_m( work_ffty_vec(:,:,i), ny+1, 'backward' )
	783
	784	DO j = 0, ny
	785	DO k = 1, nz
	786	f_out(k,j,i) = work_ffty_vec(j,k,i)
	787	ENDDO
	788	ENDDO
	789
	790	ENDDO
	791	!$OMP END PARALLEL
	792	#endif
	793
	794	ELSE
	795
	796	!
	797	!-- Cache optimized code.
	798	!-- The i-(x-)direction is split into a strided outer loop and an inner
	799	!-- loop for better cache performance
	800	!$OMP PARALLEL PRIVATE ( i, iend, iouter, ir, j, k, work_ffty )
	801	!$OMP DO
	802	DO iouter = nxl, nxr, stridex
	803
	804	iend = MIN( iouter+stridex-1, nxr ) ! Upper bound for inner i loop
	805
	806	DO k = 1, nz
	807	!
	808	!-- Resort
	809	DO j = 0, ny
	810	DO i = iouter, iend
	811	work_ffty(j,i-iouter+1) = work(i,k,j)
	812	ENDDO
	813	ENDDO
	814
	815	DO i = iouter, iend
	816
	817	!
	818	!-- FFT along y
	819	ir = i-iouter+1 ! counter within a stride
	820	CALL fft_y( work_ffty(:,ir), 'backward' )
	821
	822	DO j = 0, ny
	823	f_out(k,j,i) = work_ffty(j,ir)
	824	ENDDO
	825	ENDDO
	826
	827	ENDDO
	828
	829	ENDDO
	830	!$OMP END PARALLEL
	831
	832	ENDIF
	833
	834	CALL cpu_log( log_point_s(7), 'fft_y', 'stop' )
	835
	836	END SUBROUTINE tr_xy_ffty
	837
	838
	839	SUBROUTINE fftx_tri_fftx( ar )
	840
	841	!------------------------------------------------------------------------------!
	842	! FFT along x, solution of the tridiagonal system and backward FFT for
	843	! a 1d-decomposition along x
	844	!
	845	! WARNING: this subroutine may still not work for hybrid parallelization
	846	! with OpenMP (for possible necessary changes see the original
	847	! routine poisfft_hybrid, developed by Klaus Ketelsen, May 2002)
	848	!------------------------------------------------------------------------------!
	849
	850	USE control_parameters
	851	USE cpulog
	852	USE grid_variables
	853	USE indices
	854	USE interfaces
	855	USE pegrid
	856	USE transpose_indices
	857
	858	IMPLICIT NONE
	859
	860	character(len=3) :: myth_char
	861
	862	INTEGER :: i, j, k, m, n, omp_get_thread_num, tn
	863
	864	REAL, DIMENSION(0:nx) :: work_fftx
	865	REAL, DIMENSION(0:nx,1:nz) :: work_trix
	866	REAL, DIMENSION(nnx,1:nza,nys_x:nyn_xa,pdims(1)) :: ar
	867	REAL, DIMENSION(:,:,:,:), ALLOCATABLE :: tri
	868
	869
	870	CALL cpu_log( log_point_s(33), 'fft_x + tridia', 'start' )
	871
	872	ALLOCATE( tri(5,0:nx,0:nz-1,0:threads_per_task-1) )
	873
	874	tn = 0 ! Default thread number in case of one thread
	875	!$OMP PARALLEL DO PRIVATE ( i, j, k, m, n, tn, work_fftx, work_trix )
	876	DO j = nys_x, nyn_x
	877
	878	!$ tn = omp_get_thread_num()
	879
	880	IF ( host(1:3) == 'nec' ) THEN
	881	!
	882	!-- Code optimized for vector processors
	883	DO k = 1, nz
	884
	885	m = 0
	886	DO n = 1, pdims(1)
	887	DO i = 1, nnx_pe( n-1 ) ! WARN: pcoord(i) should be used!!
	888	work_trix(m,k) = ar(i,k,j,n)
	889	m = m + 1
	890	ENDDO
	891	ENDDO
	892
	893	ENDDO
	894
	895	CALL fft_x_m( work_trix, 'forward' )
	896
	897	ELSE
	898	!
	899	!-- Cache optimized code
	900	DO k = 1, nz
	901
	902	m = 0
	903	DO n = 1, pdims(1)
	904	DO i = 1, nnx_pe( n-1 ) ! WARN: pcoord(i) should be used!!
	905	work_fftx(m) = ar(i,k,j,n)
	906	m = m + 1
	907	ENDDO
	908	ENDDO
	909
	910	CALL fft_x( work_fftx, 'forward' )
	911
	912	DO i = 0, nx
	913	work_trix(i,k) = work_fftx(i)
	914	ENDDO
	915
	916	ENDDO
	917
	918	ENDIF
	919
	920	!
	921	!-- Solve the linear equation system
	922	CALL tridia_1dd( ddx2, ddy2, nx, ny, j, work_trix, tri(:,:,:,tn) )
	923
	924	IF ( host(1:3) == 'nec' ) THEN
	925	!
	926	!-- Code optimized for vector processors
	927	CALL fft_x_m( work_trix, 'backward' )
	928
	929	DO k = 1, nz
	930
	931	m = 0
	932	DO n = 1, pdims(1)
	933	DO i = 1, nnx_pe( n-1 ) ! WARN: pcoord(i) should be used!!
	934	ar(i,k,j,n) = work_trix(m,k)
	935	m = m + 1
	936	ENDDO
	937	ENDDO
	938
	939	ENDDO
	940
	941	ELSE
	942	!
	943	!-- Cache optimized code
	944	DO k = 1, nz
	945
	946	DO i = 0, nx
	947	work_fftx(i) = work_trix(i,k)
	948	ENDDO
	949
	950	CALL fft_x( work_fftx, 'backward' )
	951
	952	m = 0
	953	DO n = 1, pdims(1)
	954	DO i = 1, nnx_pe( n-1 ) ! WARN: pcoord(i) should be used!!
	955	ar(i,k,j,n) = work_fftx(m)
	956	m = m + 1
	957	ENDDO
	958	ENDDO
	959
	960	ENDDO
	961
	962	ENDIF
	963
	964	ENDDO
	965
	966	DEALLOCATE( tri )
	967
	968	CALL cpu_log( log_point_s(33), 'fft_x + tridia', 'stop' )
	969
	970	END SUBROUTINE fftx_tri_fftx
	971
	972
	973	SUBROUTINE fftx_tr_xy( f_in, work, f_out )
	974
	975	!------------------------------------------------------------------------------!
	976	! Fourier-transformation along x with subsequent transposition x --> y for
	977	! a 1d-decomposition along y
	978	!
	979	! ATTENTION: The NEC-branch of this routine may significantly profit from
	980	! further optimizations. So far, performance is much worse than
	981	! for routine ffty_tr_yx (more than three times slower).
	982	!------------------------------------------------------------------------------!
	983
	984	USE control_parameters
	985	USE cpulog
	986	USE indices
	987	USE interfaces
	988	USE pegrid
	989	USE transpose_indices
	990
	991	IMPLICIT NONE
	992
	993	INTEGER :: i, j, k
	994
	995	REAL, DIMENSION(0:nx,1:nz,nys:nyn) :: work_fftx
	996	REAL, DIMENSION(1:nza,nys:nyna,0:nxa) :: f_in
	997	REAL, DIMENSION(nny,1:nza,nxl_y:nxr_ya,pdims(2)) :: f_out
	998	REAL, DIMENSION(nys:nyna,1:nza,0:nxa) :: work
	999
	1000	!
	1001	!-- Carry out the FFT along x, where all data are present due to the
	1002	!-- 1d-decomposition along y. Resort the data in a way that y becomes
	1003	!-- the first index.
	1004	CALL cpu_log( log_point_s(4), 'fft_x', 'start' )
	1005
	1006	IF ( host(1:3) == 'nec' ) THEN
	1007	!
	1008	!-- Code for vector processors
[85]	1009	!$OMP PARALLEL PRIVATE ( i, j, k )
[1]	1010	!$OMP DO
	1011	DO i = 0, nx
	1012
	1013	DO j = nys, nyn
	1014	DO k = 1, nz
	1015	work_fftx(i,k,j) = f_in(k,j,i)
	1016	ENDDO
	1017	ENDDO
	1018
	1019	ENDDO
	1020
	1021	!$OMP DO
	1022	DO j = nys, nyn
	1023
	1024	CALL fft_x_m( work_fftx(:,:,j), 'forward' )
	1025
	1026	DO k = 1, nz
	1027	DO i = 0, nx
	1028	work(j,k,i) = work_fftx(i,k,j)
	1029	ENDDO
	1030	ENDDO
	1031
	1032	ENDDO
	1033	!$OMP END PARALLEL
	1034
	1035	ELSE
	1036
	1037	!
	1038	!-- Cache optimized code (there might be still a potential for better
	1039	!-- optimization).
	1040	!$OMP PARALLEL PRIVATE (i,j,k,work_fftx)
	1041	!$OMP DO
	1042	DO i = 0, nx
	1043
	1044	DO j = nys, nyn
	1045	DO k = 1, nz
	1046	work_fftx(i,k,j) = f_in(k,j,i)
	1047	ENDDO
	1048	ENDDO
	1049
	1050	ENDDO
	1051
	1052	!$OMP DO
	1053	DO j = nys, nyn
	1054	DO k = 1, nz
	1055
	1056	CALL fft_x( work_fftx(0:nx,k,j), 'forward' )
	1057
	1058	DO i = 0, nx
	1059	work(j,k,i) = work_fftx(i,k,j)
	1060	ENDDO
	1061	ENDDO
	1062
	1063	ENDDO
	1064	!$OMP END PARALLEL
	1065
	1066	ENDIF
	1067	CALL cpu_log( log_point_s(4), 'fft_x', 'pause' )
	1068
	1069	!
	1070	!-- Transpose array
	1071	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
	1072	CALL MPI_ALLTOALL( work(nys,1,0), sendrecvcount_xy, MPI_REAL, &
	1073	f_out(1,1,nxl_y,1), sendrecvcount_xy, MPI_REAL, &
	1074	comm1dy, ierr )
	1075	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
	1076
	1077	END SUBROUTINE fftx_tr_xy
	1078
	1079
	1080	SUBROUTINE tr_yx_fftx( f_in, work, f_out )
	1081
	1082	!------------------------------------------------------------------------------!
	1083	! Transposition y --> x with a subsequent backward Fourier transformation for
	1084	! a 1d-decomposition along x
	1085	!------------------------------------------------------------------------------!
	1086
	1087	USE control_parameters
	1088	USE cpulog
	1089	USE indices
	1090	USE interfaces
	1091	USE pegrid
	1092	USE transpose_indices
	1093
	1094	IMPLICIT NONE
	1095
	1096	INTEGER :: i, j, k
	1097
	1098	REAL, DIMENSION(0:nx,1:nz,nys:nyn) :: work_fftx
	1099	REAL, DIMENSION(nny,1:nza,nxl_y:nxr_ya,pdims(2)) :: f_in
	1100	REAL, DIMENSION(1:nza,nys:nyna,0:nxa) :: f_out
	1101	REAL, DIMENSION(nys:nyna,1:nza,0:nxa) :: work
	1102
	1103	!
	1104	!-- Transpose array
	1105	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
	1106	CALL MPI_ALLTOALL( f_in(1,1,nxl_y,1), sendrecvcount_xy, MPI_REAL, &
	1107	work(nys,1,0), sendrecvcount_xy, MPI_REAL, &
	1108	comm1dy, ierr )
	1109	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
	1110
	1111	!
	1112	!-- Carry out the FFT along x, where all data are present due to the
	1113	!-- 1d-decomposition along y. Resort the data in a way that y becomes
	1114	!-- the first index.
	1115	CALL cpu_log( log_point_s(4), 'fft_x', 'continue' )
	1116
	1117	IF ( host(1:3) == 'nec' ) THEN
	1118	!
	1119	!-- Code optimized for vector processors
[85]	1120	!$OMP PARALLEL PRIVATE ( i, j, k )
[1]	1121	!$OMP DO
	1122	DO j = nys, nyn
	1123
	1124	DO k = 1, nz
	1125	DO i = 0, nx
	1126	work_fftx(i,k,j) = work(j,k,i)
	1127	ENDDO
	1128	ENDDO
	1129
	1130	CALL fft_x_m( work_fftx(:,:,j), 'backward' )
	1131
	1132	ENDDO
	1133
	1134	!$OMP DO
	1135	DO i = 0, nx
	1136	DO j = nys, nyn
	1137	DO k = 1, nz
	1138	f_out(k,j,i) = work_fftx(i,k,j)
	1139	ENDDO
	1140	ENDDO
	1141	ENDDO
	1142	!$OMP END PARALLEL
	1143
	1144	ELSE
	1145
	1146	!
	1147	!-- Cache optimized code (there might be still a potential for better
	1148	!-- optimization).
	1149	!$OMP PARALLEL PRIVATE (i,j,k,work_fftx)
	1150	!$OMP DO
	1151	DO j = nys, nyn
	1152	DO k = 1, nz
	1153
	1154	DO i = 0, nx
	1155	work_fftx(i,k,j) = work(j,k,i)
	1156	ENDDO
	1157
	1158	CALL fft_x( work_fftx(0:nx,k,j), 'backward' )
	1159
	1160	ENDDO
	1161	ENDDO
	1162
	1163	!$OMP DO
	1164	DO i = 0, nx
	1165	DO j = nys, nyn
	1166	DO k = 1, nz
	1167	f_out(k,j,i) = work_fftx(i,k,j)
	1168	ENDDO
	1169	ENDDO
	1170	ENDDO
	1171	!$OMP END PARALLEL
	1172
	1173	ENDIF
	1174	CALL cpu_log( log_point_s(4), 'fft_x', 'stop' )
	1175
	1176	END SUBROUTINE tr_yx_fftx
	1177
	1178
	1179	SUBROUTINE ffty_tri_ffty( ar )
	1180
	1181	!------------------------------------------------------------------------------!
	1182	! FFT along y, solution of the tridiagonal system and backward FFT for
	1183	! a 1d-decomposition along y
	1184	!
	1185	! WARNING: this subroutine may still not work for hybrid parallelization
	1186	! with OpenMP (for possible necessary changes see the original
	1187	! routine poisfft_hybrid, developed by Klaus Ketelsen, May 2002)
	1188	!------------------------------------------------------------------------------!
	1189
	1190	USE control_parameters
	1191	USE cpulog
	1192	USE grid_variables
	1193	USE indices
	1194	USE interfaces
	1195	USE pegrid
	1196	USE transpose_indices
	1197
	1198	IMPLICIT NONE
	1199
	1200	INTEGER :: i, j, k, m, n, omp_get_thread_num, tn
	1201
	1202	REAL, DIMENSION(0:ny) :: work_ffty
	1203	REAL, DIMENSION(0:ny,1:nz) :: work_triy
	1204	REAL, DIMENSION(nny,1:nza,nxl_y:nxr_ya,pdims(2)) :: ar
	1205	REAL, DIMENSION(:,:,:,:), ALLOCATABLE :: tri
	1206
	1207
	1208	CALL cpu_log( log_point_s(39), 'fft_y + tridia', 'start' )
	1209
	1210	ALLOCATE( tri(5,0:ny,0:nz-1,0:threads_per_task-1) )
	1211
	1212	tn = 0 ! Default thread number in case of one thread
	1213	!$OMP PARALLEL PRIVATE ( i, j, k, m, n, tn, work_ffty, work_triy )
	1214	!$OMP DO
	1215	DO i = nxl_y, nxr_y
	1216
	1217	!$ tn = omp_get_thread_num()
	1218
	1219	IF ( host(1:3) == 'nec' ) THEN
	1220	!
	1221	!-- Code optimized for vector processors
	1222	DO k = 1, nz
	1223
	1224	m = 0
	1225	DO n = 1, pdims(2)
	1226	DO j = 1, nny_pe( n-1 ) ! WARN: pcoord(j) should be used!!
	1227	work_triy(m,k) = ar(j,k,i,n)
	1228	m = m + 1
	1229	ENDDO
	1230	ENDDO
	1231
	1232	ENDDO
	1233
	1234	CALL fft_y_m( work_triy, ny, 'forward' )
	1235
	1236	ELSE
	1237	!
	1238	!-- Cache optimized code
	1239	DO k = 1, nz
	1240
	1241	m = 0
	1242	DO n = 1, pdims(2)
	1243	DO j = 1, nny_pe( n-1 ) ! WARN: pcoord(j) should be used!!
	1244	work_ffty(m) = ar(j,k,i,n)
	1245	m = m + 1
	1246	ENDDO
	1247	ENDDO
	1248
	1249	CALL fft_y( work_ffty, 'forward' )
	1250
	1251	DO j = 0, ny
	1252	work_triy(j,k) = work_ffty(j)
	1253	ENDDO
	1254
	1255	ENDDO
	1256
	1257	ENDIF
	1258
	1259	!
	1260	!-- Solve the linear equation system
	1261	CALL tridia_1dd( ddy2, ddx2, ny, nx, i, work_triy, tri(:,:,:,tn) )
	1262
	1263	IF ( host(1:3) == 'nec' ) THEN
	1264	!
	1265	!-- Code optimized for vector processors
	1266	CALL fft_y_m( work_triy, ny, 'backward' )
	1267
	1268	DO k = 1, nz
	1269
	1270	m = 0
	1271	DO n = 1, pdims(2)
	1272	DO j = 1, nny_pe( n-1 ) ! WARN: pcoord(j) should be used!!
	1273	ar(j,k,i,n) = work_triy(m,k)
	1274	m = m + 1
	1275	ENDDO
	1276	ENDDO
	1277
	1278	ENDDO
	1279
	1280	ELSE
	1281	!
	1282	!-- Cache optimized code
	1283	DO k = 1, nz
	1284
	1285	DO j = 0, ny
	1286	work_ffty(j) = work_triy(j,k)
	1287	ENDDO
	1288
	1289	CALL fft_y( work_ffty, 'backward' )
	1290
	1291	m = 0
	1292	DO n = 1, pdims(2)
	1293	DO j = 1, nny_pe( n-1 ) ! WARN: pcoord(j) should be used!!
	1294	ar(j,k,i,n) = work_ffty(m)
	1295	m = m + 1
	1296	ENDDO
	1297	ENDDO
	1298
	1299	ENDDO
	1300
	1301	ENDIF
	1302
	1303	ENDDO
	1304	!$OMP END PARALLEL
	1305
	1306	DEALLOCATE( tri )
	1307
	1308	CALL cpu_log( log_point_s(39), 'fft_y + tridia', 'stop' )
	1309
	1310	END SUBROUTINE ffty_tri_ffty
	1311
	1312
	1313	SUBROUTINE tridia_1dd( ddx2, ddy2, nx, ny, j, ar, tri )
	1314
	1315	!------------------------------------------------------------------------------!
	1316	! Solves the linear system of equations for a 1d-decomposition along x (see
	1317	! tridia)
	1318	!
	1319	! Attention: when using the intel compiler, array tri must be passed as an
	1320	! argument to the contained subroutines. Otherwise addres faults
	1321	! will occur.
	1322	! On NEC, tri should not be passed (except for routine substi_1dd)
	1323	! because this causes very bad performance.
	1324	!------------------------------------------------------------------------------!
	1325
	1326	USE arrays_3d
	1327	USE control_parameters
	1328
	1329	USE pegrid
	1330
	1331	IMPLICIT NONE
	1332
	1333	INTEGER :: i, j, k, nnyh, nx, ny, omp_get_thread_num, tn
	1334
	1335	REAL :: ddx2, ddy2
	1336
	1337	REAL, DIMENSION(0:nx,1:nz) :: ar
	1338	REAL, DIMENSION(0:nx,0:nz-1) :: ar1
	1339	REAL, DIMENSION(5,0:nx,0:nz-1) :: tri
	1340
	1341
	1342	nnyh = ( ny + 1 ) / 2
	1343
	1344	!
	1345	!-- Define constant elements of the tridiagonal matrix.
	1346	!-- The compiler on SX6 does loop exchange. If 0:nx is a high power of 2,
	1347	!-- the exchanged loops create bank conflicts. The following directive
	1348	!-- prohibits loop exchange and the loops perform much better.
	1349	! tn = omp_get_thread_num()
	1350	! WRITE( 120+tn, * ) '+++ id=',myid,' nx=',nx,' thread=', omp_get_thread_num()
[82]	1351	! CALL local_flush( 120+tn )
[1]	1352	!CDIR NOLOOPCHG
	1353	DO k = 0, nz-1
	1354	DO i = 0,nx
	1355	tri(2,i,k) = ddzu(k+1) * ddzw(k+1)
	1356	tri(3,i,k) = ddzu(k+2) * ddzw(k+1)
	1357	ENDDO
	1358	ENDDO
	1359	! WRITE( 120+tn, * ) '+++ id=',myid,' end of first tridia loop thread=', omp_get_thread_num()
[82]	1360	! CALL local_flush( 120+tn )
[1]	1361
	1362	IF ( j <= nnyh ) THEN
	1363	#if defined( __lcmuk )
	1364	CALL maketri_1dd( j, tri )
	1365	#else
	1366	CALL maketri_1dd( j )
	1367	#endif
	1368	ELSE
	1369	#if defined( __lcmuk )
	1370	CALL maketri_1dd( ny+1-j, tri )
	1371	#else
	1372	CALL maketri_1dd( ny+1-j )
	1373	#endif
	1374	ENDIF
	1375	#if defined( __lcmuk )
	1376	CALL split_1dd( tri )
	1377	#else
	1378	CALL split_1dd
	1379	#endif
	1380	CALL substi_1dd( ar, tri )
	1381
	1382	CONTAINS
	1383
	1384	#if defined( __lcmuk )
	1385	SUBROUTINE maketri_1dd( j, tri )
	1386	#else
	1387	SUBROUTINE maketri_1dd( j )
	1388	#endif
	1389
	1390	!------------------------------------------------------------------------------!
	1391	! computes the i- and j-dependent component of the matrix
	1392	!------------------------------------------------------------------------------!
	1393
	1394	USE constants
	1395
	1396	IMPLICIT NONE
	1397
	1398	INTEGER :: i, j, k, nnxh
	1399	REAL :: a, c
	1400
	1401	REAL, DIMENSION(0:nx) :: l
	1402
	1403	#if defined( __lcmuk )
	1404	REAL, DIMENSION(5,0:nx,0:nz-1) :: tri
	1405	#endif
	1406
	1407
	1408	nnxh = ( nx + 1 ) / 2
	1409	!
	1410	!-- Provide the tridiagonal matrix for solution of the Poisson equation in
	1411	!-- Fourier space. The coefficients are computed following the method of
	1412	!-- Schmidt et al. (DFVLR-Mitteilung 84-15), which departs from Stephan
	1413	!-- Siano's original version by discretizing the Poisson equation,
	1414	!-- before it is Fourier-transformed
	1415	DO i = 0, nx
[128]	1416	IF ( i >= 0 .AND. i <= nnxh ) THEN
[1]	1417	l(i) = 2.0 * ( 1.0 - COS( ( 2.0 * pi * i ) / &
	1418	FLOAT( nx+1 ) ) ) * ddx2 + &
	1419	2.0 * ( 1.0 - COS( ( 2.0 * pi * j ) / &
	1420	FLOAT( ny+1 ) ) ) * ddy2
	1421	ELSE
	1422	l(i) = 2.0 * ( 1.0 - COS( ( 2.0 * pi * ( nx+1-i ) ) / &
	1423	FLOAT( nx+1 ) ) ) * ddx2 + &
	1424	2.0 * ( 1.0 - COS( ( 2.0 * pi * j ) / &
	1425	FLOAT( ny+1 ) ) ) * ddy2
	1426	ENDIF
	1427	ENDDO
	1428
	1429	DO k = 0, nz-1
	1430	DO i = 0, nx
	1431	a = -1.0 * ddzu(k+2) * ddzw(k+1)
	1432	c = -1.0 * ddzu(k+1) * ddzw(k+1)
	1433	tri(1,i,k) = a + c - l(i)
	1434	ENDDO
	1435	ENDDO
	1436	IF ( ibc_p_b == 1 .OR. ibc_p_b == 2 ) THEN
	1437	DO i = 0, nx
	1438	tri(1,i,0) = tri(1,i,0) + tri(2,i,0)
	1439	ENDDO
	1440	ENDIF
	1441	IF ( ibc_p_t == 1 ) THEN
	1442	DO i = 0, nx
	1443	tri(1,i,nz-1) = tri(1,i,nz-1) + tri(3,i,nz-1)
	1444	ENDDO
	1445	ENDIF
	1446
	1447	END SUBROUTINE maketri_1dd
	1448
	1449
	1450	#if defined( __lcmuk )
	1451	SUBROUTINE split_1dd( tri )
	1452	#else
	1453	SUBROUTINE split_1dd
	1454	#endif
	1455
	1456	!------------------------------------------------------------------------------!
	1457	! Splitting of the tridiagonal matrix (Thomas algorithm)
	1458	!------------------------------------------------------------------------------!
	1459
	1460	IMPLICIT NONE
	1461
	1462	INTEGER :: i, k
	1463
	1464	#if defined( __lcmuk )
	1465	REAL, DIMENSION(5,0:nx,0:nz-1) :: tri
	1466	#endif
	1467
	1468
	1469	!
	1470	!-- Splitting
	1471	DO i = 0, nx
	1472	tri(4,i,0) = tri(1,i,0)
	1473	ENDDO
	1474	DO k = 1, nz-1
	1475	DO i = 0, nx
	1476	tri(5,i,k) = tri(2,i,k) / tri(4,i,k-1)
	1477	tri(4,i,k) = tri(1,i,k) - tri(3,i,k-1) * tri(5,i,k)
	1478	ENDDO
	1479	ENDDO
	1480
	1481	END SUBROUTINE split_1dd
	1482
	1483
	1484	SUBROUTINE substi_1dd( ar, tri )
	1485
	1486	!------------------------------------------------------------------------------!
	1487	! Substitution (Forward and Backward) (Thomas algorithm)
	1488	!------------------------------------------------------------------------------!
	1489
	1490	IMPLICIT NONE
	1491
[76]	1492	INTEGER :: i, k
[1]	1493
	1494	REAL, DIMENSION(0:nx,nz) :: ar
	1495	REAL, DIMENSION(0:nx,0:nz-1) :: ar1
	1496	REAL, DIMENSION(5,0:nx,0:nz-1) :: tri
	1497
	1498	!
	1499	!-- Forward substitution
	1500	DO i = 0, nx
	1501	ar1(i,0) = ar(i,1)
	1502	ENDDO
	1503	DO k = 1, nz-1
	1504	DO i = 0, nx
	1505	ar1(i,k) = ar(i,k+1) - tri(5,i,k) * ar1(i,k-1)
	1506	ENDDO
	1507	ENDDO
	1508
	1509	!
	1510	!-- Backward substitution
	1511	DO i = 0, nx
	1512	ar(i,nz) = ar1(i,nz-1) / tri(4,i,nz-1)
	1513	ENDDO
	1514	DO k = nz-2, 0, -1
	1515	DO i = 0, nx
	1516	ar(i,k+1) = ( ar1(i,k) - tri(3,i,k) * ar(i,k+2) ) &
	1517	/ tri(4,i,k)
	1518	ENDDO
	1519	ENDDO
	1520
[76]	1521	!
	1522	!-- Indices i=0, j=0 correspond to horizontally averaged pressure.
	1523	!-- The respective values of ar should be zero at all k-levels if
	1524	!-- acceleration of horizontally averaged vertical velocity is zero.
	1525	IF ( ibc_p_b == 1 .AND. ibc_p_t == 1 ) THEN
	1526	IF ( j == 0 ) THEN
	1527	DO k = 1, nz
	1528	ar(0,k) = 0.0
	1529	ENDDO
	1530	ENDIF
	1531	ENDIF
	1532
[1]	1533	END SUBROUTINE substi_1dd
	1534
	1535	END SUBROUTINE tridia_1dd
	1536
	1537	#endif
	1538
	1539	END MODULE poisfft_mod

Note: See TracBrowser for help on using the repository browser.

Download in other formats:

| Impressum | ©Leibniz Universität Hannover |