Home

Context Navigation

source: palm/trunk/SOURCE/poisfft.f90 @ 86

Last change on this file since 86 was 85, checked in by raasch, 18 years ago
openmp bugfixes found by NEC benchmarker
Property svn:keywords set to `Id`
File size: 44.9 KB

Rev	Line
[1]	1	MODULE poisfft_mod
	2
	3	!------------------------------------------------------------------------------!
	4	! Actual revisions:
	5	! -----------------
[85]	6	! Bugfix: work_fft*_vec removed from some PRIVATE-declarations
[1]	7	!
	8	! Former revisions:
	9	! -----------------
[3]	10	! $Id: poisfft.f90 85 2007-05-11 09:35:14Z raasch $
[77]	11	!
	12	! 76 2007-03-29 00:58:32Z raasch
	13	! Tridiagonal coefficients adjusted for Neumann boundary conditions both at
	14	! the bottom and the top.
	15	!
[3]	16	! RCS Log replace by Id keyword, revision history cleaned up
	17	!
[1]	18	! Revision 1.24 2006/08/04 15:00:24 raasch
	19	! Default setting of the thread number tn in case of not using OpenMP
	20	!
	21	! Revision 1.23 2006/02/23 12:48:38 raasch
	22	! Additional compiler directive in routine tridia_1dd for preventing loop
	23	! exchange on NEC-SX6
	24	!
	25	! Revision 1.20 2004/04/30 12:38:09 raasch
	26	! Parts of former poisfft_hybrid moved to this subroutine,
	27	! former subroutine changed to a module, renaming of FFT-subroutines and
	28	! -module, FFTs completely substituted by calls of fft_x and fft_y,
	29	! NAG fft used in the non-parallel case completely removed, l in maketri
	30	! is now a 1d-array, variables passed by modules instead of using parameter
	31	! lists, enlarged transposition arrays introduced
	32	!
	33	! Revision 1.1 1997/07/24 11:24:14 raasch
	34	! Initial revision
	35	!
	36	!
	37	! Description:
	38	! ------------
	39	! See below.
	40	!------------------------------------------------------------------------------!
	41
	42	!--------------------------------------------------------------------------!
	43	! poisfft !
	44	! !
	45	! Original version: Stephan Siano (pois3d) !
	46	! !
	47	! Institute of Meteorology and Climatology, University of Hannover !
	48	! Germany !
	49	! !
	50	! Version as of July 23,1996 !
	51	! !
	52	! !
	53	! Version for parallel computers: Siegfried Raasch !
	54	! !
	55	! Version as of July 03,1997 !
	56	! !
	57	! Solves the Poisson equation with a 2D spectral method !
	58	! d^2 p / dx^2 + d^2 p / dy^2 + d^2 p / dz^2 = s !
	59	! !
	60	! Input: !
	61	! real ar contains in the (nnx,nny,nnz) elements, !
	62	! starting from the element (1,nys,nxl), the !
	63	! values for s !
	64	! real work Temporary array !
	65	! !
	66	! Output: !
	67	! real ar contains the solution for p !
	68	!--------------------------------------------------------------------------!
	69
	70	USE fft_xy
	71	USE indices
	72	USE transpose_indices
	73
	74	IMPLICIT NONE
	75
	76	PRIVATE
	77	PUBLIC poisfft, poisfft_init
	78
	79	INTERFACE poisfft
	80	MODULE PROCEDURE poisfft
	81	END INTERFACE poisfft
	82
	83	INTERFACE poisfft_init
	84	MODULE PROCEDURE poisfft_init
	85	END INTERFACE poisfft_init
	86
	87	CONTAINS
	88
	89	SUBROUTINE poisfft_init
	90
	91	CALL fft_init
	92
	93	END SUBROUTINE poisfft_init
	94
	95
	96	SUBROUTINE poisfft( ar, work )
	97
	98	USE cpulog
	99	USE interfaces
	100	USE pegrid
	101
	102	IMPLICIT NONE
	103
	104	REAL, DIMENSION(1:nza,nys:nyna,nxl:nxra) :: ar, work
	105
	106
	107	CALL cpu_log( log_point_s(3), 'poisfft', 'start' )
	108
	109	!
	110	!-- Two-dimensional Fourier Transformation in x- and y-direction.
	111	#if defined( __parallel )
	112	IF ( pdims(2) == 1 ) THEN
	113
	114	!
	115	!-- 1d-domain-decomposition along x:
	116	!-- FFT along y and transposition y --> x
	117	CALL ffty_tr_yx( ar, work, ar )
	118
	119	!
	120	!-- FFT along x, solving the tridiagonal system and backward FFT
	121	CALL fftx_tri_fftx( ar )
	122
	123	!
	124	!-- Transposition x --> y and backward FFT along y
	125	CALL tr_xy_ffty( ar, work, ar )
	126
	127	ELSEIF ( pdims(1) == 1 ) THEN
	128
	129	!
	130	!-- 1d-domain-decomposition along y:
	131	!-- FFT along x and transposition x --> y
	132	CALL fftx_tr_xy( ar, work, ar )
	133
	134	!
	135	!-- FFT along y, solving the tridiagonal system and backward FFT
	136	CALL ffty_tri_ffty( ar )
	137
	138	!
	139	!-- Transposition y --> x and backward FFT along x
	140	CALL tr_yx_fftx( ar, work, ar )
	141
	142	ELSE
	143
	144	!
	145	!-- 2d-domain-decomposition
	146	!-- Transposition z --> x
	147	CALL cpu_log( log_point_s(5), 'transpo forward', 'start' )
	148	CALL transpose_zx( ar, work, ar, work, ar )
	149	CALL cpu_log( log_point_s(5), 'transpo forward', 'pause' )
	150
	151	CALL cpu_log( log_point_s(4), 'fft_x', 'start' )
	152	CALL fftxp( ar, 'forward' )
	153	CALL cpu_log( log_point_s(4), 'fft_x', 'pause' )
	154
	155	!
	156	!-- Transposition x --> y
	157	CALL cpu_log( log_point_s(5), 'transpo forward', 'continue' )
	158	CALL transpose_xy( ar, work, ar, work, ar )
	159	CALL cpu_log( log_point_s(5), 'transpo forward', 'pause' )
	160
	161	CALL cpu_log( log_point_s(7), 'fft_y', 'start' )
	162	CALL fftyp( ar, 'forward' )
	163	CALL cpu_log( log_point_s(7), 'fft_y', 'pause' )
	164
	165	!
	166	!-- Transposition y --> z
	167	CALL cpu_log( log_point_s(5), 'transpo forward', 'continue' )
	168	CALL transpose_yz( ar, work, ar, work, ar )
	169	CALL cpu_log( log_point_s(5), 'transpo forward', 'stop' )
	170
	171	!
	172	!-- Solve the Poisson equation in z-direction in cartesian space.
	173	CALL cpu_log( log_point_s(6), 'tridia', 'start' )
	174	CALL tridia( ar )
	175	CALL cpu_log( log_point_s(6), 'tridia', 'stop' )
	176
	177	!
	178	!-- Inverse Fourier Transformation
	179	!-- Transposition z --> y
	180	CALL cpu_log( log_point_s(8), 'transpo invers', 'start' )
	181	CALL transpose_zy( ar, work, ar, work, ar )
	182	CALL cpu_log( log_point_s(8), 'transpo invers', 'pause' )
	183
	184	CALL cpu_log( log_point_s(7), 'fft_y', 'continue' )
	185	CALL fftyp( ar, 'backward' )
	186	CALL cpu_log( log_point_s(7), 'fft_y', 'stop' )
	187
	188	!
	189	!-- Transposition y --> x
	190	CALL cpu_log( log_point_s(8), 'transpo invers', 'continue' )
	191	CALL transpose_yx( ar, work, ar, work, ar )
	192	CALL cpu_log( log_point_s(8), 'transpo invers', 'pause' )
	193
	194	CALL cpu_log( log_point_s(4), 'fft_x', 'continue' )
	195	CALL fftxp( ar, 'backward' )
	196	CALL cpu_log( log_point_s(4), 'fft_x', 'stop' )
	197
	198	!
	199	!-- Transposition x --> z
	200	CALL cpu_log( log_point_s(8), 'transpo invers', 'continue' )
	201	CALL transpose_xz( ar, work, ar, work, ar )
	202	CALL cpu_log( log_point_s(8), 'transpo invers', 'stop' )
	203
	204	ENDIF
	205
	206	#else
	207
	208	!
	209	!-- Two-dimensional Fourier Transformation along x- and y-direction.
	210	CALL cpu_log( log_point_s(4), 'fft_x', 'start' )
	211	CALL fftx( ar, 'forward' )
	212	CALL cpu_log( log_point_s(4), 'fft_x', 'pause' )
	213	CALL cpu_log( log_point_s(7), 'fft_y', 'start' )
	214	CALL ffty( ar, 'forward' )
	215	CALL cpu_log( log_point_s(7), 'fft_y', 'pause' )
	216
	217	!
	218	!-- Solve the Poisson equation in z-direction in cartesian space.
	219	CALL cpu_log( log_point_s(6), 'tridia', 'start' )
	220	CALL tridia( ar )
	221	CALL cpu_log( log_point_s(6), 'tridia', 'stop' )
	222
	223	!
	224	!-- Inverse Fourier Transformation.
	225	CALL cpu_log( log_point_s(7), 'fft_y', 'continue' )
	226	CALL ffty( ar, 'backward' )
	227	CALL cpu_log( log_point_s(7), 'fft_y', 'stop' )
	228	CALL cpu_log( log_point_s(4), 'fft_x', 'continue' )
	229	CALL fftx( ar, 'backward' )
	230	CALL cpu_log( log_point_s(4), 'fft_x', 'stop' )
	231
	232	#endif
	233
	234	CALL cpu_log( log_point_s(3), 'poisfft', 'stop' )
	235
	236	END SUBROUTINE poisfft
	237
	238
	239
	240	SUBROUTINE tridia( ar )
	241
	242	!------------------------------------------------------------------------------!
	243	! solves the linear system of equations:
	244	!
	245	! -(4 pi^2(i^2/(dx^2nnx^2)+j^2/(dy^2nny^2))+
	246	! 1/(dzu(k)dzw(k))+1/(dzu(k-1)dzw(k)))*p(i,j,k)+
	247	! 1/(dzu(k)dzw(k))p(i,j,k+1)+1/(dzu(k-1)dzw(k))p(i,j,k-1)=d(i,j,k)
	248	!
	249	! by using the Thomas algorithm
	250	!------------------------------------------------------------------------------!
	251
	252	USE arrays_3d
	253
	254	IMPLICIT NONE
	255
	256	INTEGER :: i, j, k, nnyh
	257
	258	REAL, DIMENSION(nxl_z:nxr_z,0:nz-1) :: ar1
	259	REAL, DIMENSION(5,nxl_z:nxr_z,0:nz-1) :: tri
	260
	261	#if defined( __parallel )
	262	REAL :: ar(nxl_z:nxr_za,nys_z:nyn_za,1:nza)
	263	#else
	264	REAL :: ar(1:nz,nys_z:nyn_z,nxl_z:nxr_z)
	265	#endif
	266
	267
	268	nnyh = (ny+1) / 2
	269
	270	!
	271	!-- Define constant elements of the tridiagonal matrix.
	272	DO k = 0, nz-1
	273	DO i = nxl_z, nxr_z
	274	tri(2,i,k) = ddzu(k+1) * ddzw(k+1)
	275	tri(3,i,k) = ddzu(k+2) * ddzw(k+1)
	276	ENDDO
	277	ENDDO
	278
	279	#if defined( __parallel )
	280	!
	281	!-- Repeat for all y-levels.
	282	DO j = nys_z, nyn_z
	283	IF ( j <= nnyh ) THEN
	284	CALL maketri( tri, j )
	285	ELSE
	286	CALL maketri( tri, ny+1-j )
	287	ENDIF
	288	CALL split( tri )
	289	CALL substi( ar, ar1, tri, j )
	290	ENDDO
	291	#else
	292	!
	293	!-- First y-level.
	294	CALL maketri( tri, nys_z )
	295	CALL split( tri )
	296	CALL substi( ar, ar1, tri, 0 )
	297
	298	!
	299	!-- Further y-levels.
	300	DO j = 1, nnyh - 1
	301	CALL maketri( tri, j )
	302	CALL split( tri )
	303	CALL substi( ar, ar1, tri, j )
	304	CALL substi( ar, ar1, tri, ny+1-j )
	305	ENDDO
	306	CALL maketri( tri, nnyh )
	307	CALL split( tri )
	308	CALL substi( ar, ar1, tri, nnyh+nys )
	309	#endif
	310
	311	CONTAINS
	312
	313	SUBROUTINE maketri( tri, j )
	314
	315	!------------------------------------------------------------------------------!
	316	! Computes the i- and j-dependent component of the matrix
	317	!------------------------------------------------------------------------------!
	318
	319	USE arrays_3d
	320	USE constants
	321	USE control_parameters
	322	USE grid_variables
	323
	324	IMPLICIT NONE
	325
	326	INTEGER :: i, j, k, nnxh
	327	REAL :: a, c
	328	REAL :: ll(nxl_z:nxr_z)
	329	REAL :: tri(5,nxl_z:nxr_z,0:nz-1)
	330
	331
	332	nnxh = ( nx + 1 ) / 2
	333
	334	!
	335	!-- Provide the tridiagonal matrix for solution of the Poisson equation in
	336	!-- Fourier space. The coefficients are computed following the method of
	337	!-- Schmidt et al. (DFVLR-Mitteilung 84-15), which departs from Stephan
	338	!-- Siano's original version by discretizing the Poisson equation,
	339	!-- before it is Fourier-transformed
	340	#if defined( __parallel )
	341	DO i = nxl_z, nxr_z
	342	IF ( i >= 0 .AND. i < nnxh ) THEN
	343	ll(i) = 2.0 * ( 1.0 - COS( ( 2.0 * pi * i ) / &
	344	FLOAT( nx+1 ) ) ) / ( dx * dx ) + &
	345	2.0 * ( 1.0 - COS( ( 2.0 * pi * j ) / &
	346	FLOAT( ny+1 ) ) ) / ( dy * dy )
	347	ELSEIF ( i == nnxh ) THEN
	348	ll(i) = 2.0 * ( 1.0 - COS( ( 2.0 * pi * ( nx+1-i ) ) / &
	349	FLOAT( nx+1 ) ) ) / ( dx * dx ) + &
	350	2.0 * ( 1.0 - COS( ( 2.0 * pi * j ) / &
	351	FLOAT( ny+1 ) ) ) / ( dy * dy )
	352	ELSE
	353	ll(i) = 2.0 * ( 1.0 - COS( ( 2.0 * pi * ( nx+1-i ) ) / &
	354	FLOAT( nx+1 ) ) ) / ( dx * dx ) + &
	355	2.0 * ( 1.0 - COS( ( 2.0 * pi * j ) / &
	356	FLOAT( ny+1 ) ) ) / ( dy * dy )
	357	ENDIF
	358	DO k = 0,nz-1
	359	a = -1.0 * ddzu(k+2) * ddzw(k+1)
	360	c = -1.0 * ddzu(k+1) * ddzw(k+1)
	361	tri(1,i,k) = a + c - ll(i)
	362	ENDDO
	363	ENDDO
	364	#else
	365	DO i = 0, nnxh
	366	ll(i) = 2.0 * ( 1.0 - COS( ( 2.0 * pi * i ) / FLOAT( nx+1 ) ) ) / &
	367	( dx * dx ) + &
	368	2.0 * ( 1.0 - COS( ( 2.0 * pi * j ) / FLOAT( ny+1 ) ) ) / &
	369	( dy * dy )
	370	DO k = 0, nz-1
	371	a = -1.0 * ddzu(k+2) * ddzw(k+1)
	372	c = -1.0 * ddzu(k+1) * ddzw(k+1)
	373	tri(1,i,k) = a + c - ll(i)
	374	IF ( i >= 1 .and. i < nnxh ) THEN
	375	tri(1,nx+1-i,k) = tri(1,i,k)
	376	ENDIF
	377	ENDDO
	378	ENDDO
	379	#endif
	380	IF ( ibc_p_b == 1 .OR. ibc_p_b == 2 ) THEN
	381	DO i = nxl_z, nxr_z
	382	tri(1,i,0) = tri(1,i,0) + tri(2,i,0)
	383	ENDDO
	384	ENDIF
	385	IF ( ibc_p_t == 1 ) THEN
	386	DO i = nxl_z, nxr_z
	387	tri(1,i,nz-1) = tri(1,i,nz-1) + tri(3,i,nz-1)
	388	ENDDO
	389	ENDIF
	390
	391	END SUBROUTINE maketri
	392
	393
	394	SUBROUTINE substi( ar, ar1, tri, j )
	395
	396	!------------------------------------------------------------------------------!
	397	! Substitution (Forward and Backward) (Thomas algorithm)
	398	!------------------------------------------------------------------------------!
	399
[76]	400	USE control_parameters
	401
[1]	402	IMPLICIT NONE
	403
	404	INTEGER :: i, j, k
	405	REAL :: ar1(nxl_z:nxr_z,0:nz-1)
	406	REAL :: tri(5,nxl_z:nxr_z,0:nz-1)
	407	#if defined( __parallel )
	408	REAL :: ar(nxl_z:nxr_za,nys_z:nyn_za,1:nza)
	409	#else
	410	REAL :: ar(1:nz,nys_z:nyn_z,nxl_z:nxr_z)
	411	#endif
	412
	413	!
	414	!-- Forward substitution.
	415	DO i = nxl_z, nxr_z
	416	#if defined( __parallel )
	417	ar1(i,0) = ar(i,j,1)
	418	#else
	419	ar1(i,0) = ar(1,j,i)
	420	#endif
	421	ENDDO
	422	DO k = 1, nz - 1
	423	DO i = nxl_z, nxr_z
	424	#if defined( __parallel )
	425	ar1(i,k) = ar(i,j,k+1) - tri(5,i,k) * ar1(i,k-1)
	426	#else
	427	ar1(i,k) = ar(k+1,j,i) - tri(5,i,k) * ar1(i,k-1)
	428	#endif
	429	ENDDO
	430	ENDDO
	431
	432	!
	433	!-- Backward substitution.
	434	DO i = nxl_z, nxr_z
	435	#if defined( __parallel )
	436	ar(i,j,nz) = ar1(i,nz-1) / tri(4,i,nz-1)
	437	#else
	438	ar(nz,j,i) = ar1(i,nz-1) / tri(4,i,nz-1)
	439	#endif
	440	ENDDO
	441	DO k = nz-2, 0, -1
	442	DO i = nxl_z, nxr_z
	443	#if defined( __parallel )
	444	ar(i,j,k+1) = ( ar1(i,k) - tri(3,i,k) * ar(i,j,k+2) ) &
	445	/ tri(4,i,k)
	446	#else
	447	ar(k+1,j,i) = ( ar1(i,k) - tri(3,i,k) * ar(k+2,j,i) ) &
	448	/ tri(4,i,k)
	449	#endif
	450	ENDDO
	451	ENDDO
	452
[76]	453	!
	454	!-- Indices i=0, j=0 correspond to horizontally averaged pressure.
	455	!-- The respective values of ar should be zero at all k-levels if
	456	!-- acceleration of horizontally averaged vertical velocity is zero.
	457	IF ( ibc_p_b == 1 .AND. ibc_p_t == 1 ) THEN
	458	IF ( j == 0 .AND. nxl_z == 0 ) THEN
	459	#if defined( __parallel )
	460	DO k = 1, nz
	461	ar(nxl_z,j,k) = 0.0
	462	ENDDO
	463	#else
	464	DO k = 1, nz
	465	ar(k,j,nxl_z) = 0.0
	466	ENDDO
	467	#endif
	468	ENDIF
	469	ENDIF
	470
[1]	471	END SUBROUTINE substi
	472
	473
	474	SUBROUTINE split( tri )
	475
	476	!------------------------------------------------------------------------------!
	477	! Splitting of the tridiagonal matrix (Thomas algorithm)
	478	!------------------------------------------------------------------------------!
	479
	480	IMPLICIT NONE
	481
	482	INTEGER :: i, k
	483	REAL :: tri(5,nxl_z:nxr_z,0:nz-1)
	484
	485	!
	486	!-- Splitting.
	487	DO i = nxl_z, nxr_z
	488	tri(4,i,0) = tri(1,i,0)
	489	ENDDO
	490	DO k = 1, nz-1
	491	DO i = nxl_z, nxr_z
	492	tri(5,i,k) = tri(2,i,k) / tri(4,i,k-1)
	493	tri(4,i,k) = tri(1,i,k) - tri(3,i,k-1) * tri(5,i,k)
	494	ENDDO
	495	ENDDO
	496
	497	END SUBROUTINE split
	498
	499	END SUBROUTINE tridia
	500
	501
	502	#if defined( __parallel )
	503	SUBROUTINE fftxp( ar, direction )
	504
	505	!------------------------------------------------------------------------------!
	506	! Fourier-transformation along x-direction Parallelized version
	507	!------------------------------------------------------------------------------!
	508
	509	IMPLICIT NONE
	510
	511	CHARACTER (LEN=*) :: direction
	512	INTEGER :: j, k
	513	REAL :: ar(0:nxa,nys_x:nyn_xa,nzb_x:nzt_xa)
	514
	515	!
	516	!-- Performing the fft with one of the methods implemented
	517	DO k = nzb_x, nzt_x
	518	DO j = nys_x, nyn_x
	519	CALL fft_x( ar(0:nx,j,k), direction )
	520	ENDDO
	521	ENDDO
	522
	523	END SUBROUTINE fftxp
	524
	525	#else
	526	SUBROUTINE fftx( ar, direction )
	527
	528	!------------------------------------------------------------------------------!
	529	! Fourier-transformation along x-direction Non parallel version
	530	!------------------------------------------------------------------------------!
	531
	532	IMPLICIT NONE
	533
	534	CHARACTER (LEN=*) :: direction
	535	INTEGER :: i, j, k
	536	REAL :: ar(1:nz,0:ny,0:nx)
	537
	538	!
	539	!-- Performing the fft with one of the methods implemented
	540	DO k = 1, nz
	541	DO j = 0, ny
	542	CALL fft_x( ar(k,j,0:nx), direction )
	543	ENDDO
	544	ENDDO
	545
	546	END SUBROUTINE fftx
	547	#endif
	548
	549
	550	#if defined( __parallel )
	551	SUBROUTINE fftyp( ar, direction )
	552
	553	!------------------------------------------------------------------------------!
	554	! Fourier-transformation along y-direction Parallelized version
	555	!------------------------------------------------------------------------------!
	556
	557	IMPLICIT NONE
	558
	559	CHARACTER (LEN=*) :: direction
	560	INTEGER :: i, k
	561	REAL :: ar(0:nya,nxl_y:nxr_ya,nzb_y:nzt_ya)
	562
	563	!
	564	!-- Performing the fft with one of the methods implemented
	565	DO k = nzb_y, nzt_y
	566	DO i = nxl_y, nxr_y
	567	CALL fft_y( ar(0:ny,i,k), direction )
	568	ENDDO
	569	ENDDO
	570
	571	END SUBROUTINE fftyp
	572
	573	#else
	574	SUBROUTINE ffty( ar, direction )
	575
	576	!------------------------------------------------------------------------------!
	577	! Fourier-transformation along y-direction Non parallel version
	578	!------------------------------------------------------------------------------!
	579
	580	IMPLICIT NONE
	581
	582	CHARACTER (LEN=*) :: direction
	583	INTEGER :: i, k
	584	REAL :: ar(1:nz,0:ny,0:nx)
	585
	586	!
	587	!-- Performing the fft with one of the methods implemented
	588	DO k = 1, nz
	589	DO i = 0, nx
	590	CALL fft_y( ar(k,0:ny,i), direction )
	591	ENDDO
	592	ENDDO
	593
	594	END SUBROUTINE ffty
	595	#endif
	596
	597	#if defined( __parallel )
	598	SUBROUTINE ffty_tr_yx( f_in, work, f_out )
	599
	600	!------------------------------------------------------------------------------!
	601	! Fourier-transformation along y with subsequent transposition y --> x for
	602	! a 1d-decomposition along x
	603	!
	604	! ATTENTION: The performance of this routine is much faster on the NEC-SX6,
	605	! if the first index of work_ffty_vec is odd. Otherwise
	606	! memory bank conflicts may occur (especially if the index is a
	607	! multiple of 128). That's why work_ffty_vec is dimensioned as
	608	! 0:ny+1.
	609	! Of course, this will not work if users are using an odd number
	610	! of gridpoints along y.
	611	!------------------------------------------------------------------------------!
	612
	613	USE control_parameters
	614	USE cpulog
	615	USE indices
	616	USE interfaces
	617	USE pegrid
	618	USE transpose_indices
	619
	620	IMPLICIT NONE
	621
	622	INTEGER :: i, iend, iouter, ir, j, k
	623	INTEGER, PARAMETER :: stridex = 4
	624
	625	REAL, DIMENSION(0:ny,stridex) :: work_ffty
	626	#if defined( __nec )
	627	REAL, DIMENSION(0:ny+1,1:nz,nxl:nxr) :: work_ffty_vec
	628	#endif
	629	REAL, DIMENSION(1:nza,0:nya,nxl:nxra) :: f_in
	630	REAL, DIMENSION(nnx,1:nza,nys_x:nyn_xa,pdims(1)) :: f_out
	631	REAL, DIMENSION(nxl:nxra,1:nza,0:nya) :: work
	632
	633	!
	634	!-- Carry out the FFT along y, where all data are present due to the
	635	!-- 1d-decomposition along x. Resort the data in a way that x becomes
	636	!-- the first index.
	637	CALL cpu_log( log_point_s(7), 'fft_y', 'start' )
	638
	639	IF ( host(1:3) == 'nec' ) THEN
	640	#if defined( __nec )
	641	!
	642	!-- Code optimized for vector processors
[85]	643	!$OMP PARALLEL PRIVATE ( i, j, k )
[1]	644	!$OMP DO
	645	DO i = nxl, nxr
	646
	647	DO j = 0, ny
	648	DO k = 1, nz
	649	work_ffty_vec(j,k,i) = f_in(k,j,i)
	650	ENDDO
	651	ENDDO
	652
	653	CALL fft_y_m( work_ffty_vec(:,:,i), ny+1, 'forward' )
	654
	655	ENDDO
	656
	657	!$OMP DO
	658	DO k = 1, nz
	659	DO j = 0, ny
	660	DO i = nxl, nxr
	661	work(i,k,j) = work_ffty_vec(j,k,i)
	662	ENDDO
	663	ENDDO
	664	ENDDO
	665	!$OMP END PARALLEL
	666	#endif
	667
	668	ELSE
	669
	670	!
	671	!-- Cache optimized code.
	672	!-- The i-(x-)direction is split into a strided outer loop and an inner
	673	!-- loop for better cache performance
	674	!$OMP PARALLEL PRIVATE (i,iend,iouter,ir,j,k,work_ffty)
	675	!$OMP DO
	676	DO iouter = nxl, nxr, stridex
	677
	678	iend = MIN( iouter+stridex-1, nxr ) ! Upper bound for inner i loop
	679
	680	DO k = 1, nz
	681
	682	DO i = iouter, iend
	683
	684	ir = i-iouter+1 ! counter within a stride
	685	DO j = 0, ny
	686	work_ffty(j,ir) = f_in(k,j,i)
	687	ENDDO
	688	!
	689	!-- FFT along y
	690	CALL fft_y( work_ffty(:,ir), 'forward' )
	691
	692	ENDDO
	693
	694	!
	695	!-- Resort
	696	DO j = 0, ny
	697	DO i = iouter, iend
	698	work(i,k,j) = work_ffty(j,i-iouter+1)
	699	ENDDO
	700	ENDDO
	701
	702	ENDDO
	703
	704	ENDDO
	705	!$OMP END PARALLEL
	706
	707	ENDIF
	708	CALL cpu_log( log_point_s(7), 'fft_y', 'pause' )
	709
	710	!
	711	!-- Transpose array
	712	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
	713	CALL MPI_ALLTOALL( work(nxl,1,0), sendrecvcount_xy, MPI_REAL, &
	714	f_out(1,1,nys_x,1), sendrecvcount_xy, MPI_REAL, &
	715	comm1dx, ierr )
	716	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
	717
	718	END SUBROUTINE ffty_tr_yx
	719
	720
	721	SUBROUTINE tr_xy_ffty( f_in, work, f_out )
	722
	723	!------------------------------------------------------------------------------!
	724	! Transposition x --> y with a subsequent backward Fourier transformation for
	725	! a 1d-decomposition along x
	726	!------------------------------------------------------------------------------!
	727
	728	USE control_parameters
	729	USE cpulog
	730	USE indices
	731	USE interfaces
	732	USE pegrid
	733	USE transpose_indices
	734
	735	IMPLICIT NONE
	736
	737	INTEGER :: i, iend, iouter, ir, j, k
	738	INTEGER, PARAMETER :: stridex = 4
	739
	740	REAL, DIMENSION(0:ny,stridex) :: work_ffty
	741	#if defined( __nec )
	742	REAL, DIMENSION(0:ny+1,1:nz,nxl:nxr) :: work_ffty_vec
	743	#endif
	744	REAL, DIMENSION(nnx,1:nza,nys_x:nyn_xa,pdims(1)) :: f_in
	745	REAL, DIMENSION(1:nza,0:nya,nxl:nxra) :: f_out
	746	REAL, DIMENSION(nxl:nxra,1:nza,0:nya) :: work
	747
	748	!
	749	!-- Transpose array
	750	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
	751	CALL MPI_ALLTOALL( f_in(1,1,nys_x,1), sendrecvcount_xy, MPI_REAL, &
	752	work(nxl,1,0), sendrecvcount_xy, MPI_REAL, &
	753	comm1dx, ierr )
	754	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
	755
	756	!
	757	!-- Resort the data in a way that y becomes the first index and carry out the
	758	!-- backward fft along y.
	759	CALL cpu_log( log_point_s(7), 'fft_y', 'continue' )
	760
	761	IF ( host(1:3) == 'nec' ) THEN
	762	#if defined( __nec )
	763	!
	764	!-- Code optimized for vector processors
[85]	765	!$OMP PARALLEL PRIVATE ( i, j, k )
[1]	766	!$OMP DO
	767	DO k = 1, nz
	768	DO j = 0, ny
	769	DO i = nxl, nxr
	770	work_ffty_vec(j,k,i) = work(i,k,j)
	771	ENDDO
	772	ENDDO
	773	ENDDO
	774
	775	!$OMP DO
	776	DO i = nxl, nxr
	777
	778	CALL fft_y_m( work_ffty_vec(:,:,i), ny+1, 'backward' )
	779
	780	DO j = 0, ny
	781	DO k = 1, nz
	782	f_out(k,j,i) = work_ffty_vec(j,k,i)
	783	ENDDO
	784	ENDDO
	785
	786	ENDDO
	787	!$OMP END PARALLEL
	788	#endif
	789
	790	ELSE
	791
	792	!
	793	!-- Cache optimized code.
	794	!-- The i-(x-)direction is split into a strided outer loop and an inner
	795	!-- loop for better cache performance
	796	!$OMP PARALLEL PRIVATE ( i, iend, iouter, ir, j, k, work_ffty )
	797	!$OMP DO
	798	DO iouter = nxl, nxr, stridex
	799
	800	iend = MIN( iouter+stridex-1, nxr ) ! Upper bound for inner i loop
	801
	802	DO k = 1, nz
	803	!
	804	!-- Resort
	805	DO j = 0, ny
	806	DO i = iouter, iend
	807	work_ffty(j,i-iouter+1) = work(i,k,j)
	808	ENDDO
	809	ENDDO
	810
	811	DO i = iouter, iend
	812
	813	!
	814	!-- FFT along y
	815	ir = i-iouter+1 ! counter within a stride
	816	CALL fft_y( work_ffty(:,ir), 'backward' )
	817
	818	DO j = 0, ny
	819	f_out(k,j,i) = work_ffty(j,ir)
	820	ENDDO
	821	ENDDO
	822
	823	ENDDO
	824
	825	ENDDO
	826	!$OMP END PARALLEL
	827
	828	ENDIF
	829
	830	CALL cpu_log( log_point_s(7), 'fft_y', 'stop' )
	831
	832	END SUBROUTINE tr_xy_ffty
	833
	834
	835	SUBROUTINE fftx_tri_fftx( ar )
	836
	837	!------------------------------------------------------------------------------!
	838	! FFT along x, solution of the tridiagonal system and backward FFT for
	839	! a 1d-decomposition along x
	840	!
	841	! WARNING: this subroutine may still not work for hybrid parallelization
	842	! with OpenMP (for possible necessary changes see the original
	843	! routine poisfft_hybrid, developed by Klaus Ketelsen, May 2002)
	844	!------------------------------------------------------------------------------!
	845
	846	USE control_parameters
	847	USE cpulog
	848	USE grid_variables
	849	USE indices
	850	USE interfaces
	851	USE pegrid
	852	USE transpose_indices
	853
	854	IMPLICIT NONE
	855
	856	character(len=3) :: myth_char
	857
	858	INTEGER :: i, j, k, m, n, omp_get_thread_num, tn
	859
	860	REAL, DIMENSION(0:nx) :: work_fftx
	861	REAL, DIMENSION(0:nx,1:nz) :: work_trix
	862	REAL, DIMENSION(nnx,1:nza,nys_x:nyn_xa,pdims(1)) :: ar
	863	REAL, DIMENSION(:,:,:,:), ALLOCATABLE :: tri
	864
	865
	866	CALL cpu_log( log_point_s(33), 'fft_x + tridia', 'start' )
	867
	868	ALLOCATE( tri(5,0:nx,0:nz-1,0:threads_per_task-1) )
	869
	870	tn = 0 ! Default thread number in case of one thread
	871	!$OMP PARALLEL DO PRIVATE ( i, j, k, m, n, tn, work_fftx, work_trix )
	872	DO j = nys_x, nyn_x
	873
	874	!$ tn = omp_get_thread_num()
	875
	876	IF ( host(1:3) == 'nec' ) THEN
	877	!
	878	!-- Code optimized for vector processors
	879	DO k = 1, nz
	880
	881	m = 0
	882	DO n = 1, pdims(1)
	883	DO i = 1, nnx_pe( n-1 ) ! WARN: pcoord(i) should be used!!
	884	work_trix(m,k) = ar(i,k,j,n)
	885	m = m + 1
	886	ENDDO
	887	ENDDO
	888
	889	ENDDO
	890
	891	CALL fft_x_m( work_trix, 'forward' )
	892
	893	ELSE
	894	!
	895	!-- Cache optimized code
	896	DO k = 1, nz
	897
	898	m = 0
	899	DO n = 1, pdims(1)
	900	DO i = 1, nnx_pe( n-1 ) ! WARN: pcoord(i) should be used!!
	901	work_fftx(m) = ar(i,k,j,n)
	902	m = m + 1
	903	ENDDO
	904	ENDDO
	905
	906	CALL fft_x( work_fftx, 'forward' )
	907
	908	DO i = 0, nx
	909	work_trix(i,k) = work_fftx(i)
	910	ENDDO
	911
	912	ENDDO
	913
	914	ENDIF
	915
	916	!
	917	!-- Solve the linear equation system
	918	CALL tridia_1dd( ddx2, ddy2, nx, ny, j, work_trix, tri(:,:,:,tn) )
	919
	920	IF ( host(1:3) == 'nec' ) THEN
	921	!
	922	!-- Code optimized for vector processors
	923	CALL fft_x_m( work_trix, 'backward' )
	924
	925	DO k = 1, nz
	926
	927	m = 0
	928	DO n = 1, pdims(1)
	929	DO i = 1, nnx_pe( n-1 ) ! WARN: pcoord(i) should be used!!
	930	ar(i,k,j,n) = work_trix(m,k)
	931	m = m + 1
	932	ENDDO
	933	ENDDO
	934
	935	ENDDO
	936
	937	ELSE
	938	!
	939	!-- Cache optimized code
	940	DO k = 1, nz
	941
	942	DO i = 0, nx
	943	work_fftx(i) = work_trix(i,k)
	944	ENDDO
	945
	946	CALL fft_x( work_fftx, 'backward' )
	947
	948	m = 0
	949	DO n = 1, pdims(1)
	950	DO i = 1, nnx_pe( n-1 ) ! WARN: pcoord(i) should be used!!
	951	ar(i,k,j,n) = work_fftx(m)
	952	m = m + 1
	953	ENDDO
	954	ENDDO
	955
	956	ENDDO
	957
	958	ENDIF
	959
	960	ENDDO
	961
	962	DEALLOCATE( tri )
	963
	964	CALL cpu_log( log_point_s(33), 'fft_x + tridia', 'stop' )
	965
	966	END SUBROUTINE fftx_tri_fftx
	967
	968
	969	SUBROUTINE fftx_tr_xy( f_in, work, f_out )
	970
	971	!------------------------------------------------------------------------------!
	972	! Fourier-transformation along x with subsequent transposition x --> y for
	973	! a 1d-decomposition along y
	974	!
	975	! ATTENTION: The NEC-branch of this routine may significantly profit from
	976	! further optimizations. So far, performance is much worse than
	977	! for routine ffty_tr_yx (more than three times slower).
	978	!------------------------------------------------------------------------------!
	979
	980	USE control_parameters
	981	USE cpulog
	982	USE indices
	983	USE interfaces
	984	USE pegrid
	985	USE transpose_indices
	986
	987	IMPLICIT NONE
	988
	989	INTEGER :: i, j, k
	990
	991	REAL, DIMENSION(0:nx,1:nz,nys:nyn) :: work_fftx
	992	REAL, DIMENSION(1:nza,nys:nyna,0:nxa) :: f_in
	993	REAL, DIMENSION(nny,1:nza,nxl_y:nxr_ya,pdims(2)) :: f_out
	994	REAL, DIMENSION(nys:nyna,1:nza,0:nxa) :: work
	995
	996	!
	997	!-- Carry out the FFT along x, where all data are present due to the
	998	!-- 1d-decomposition along y. Resort the data in a way that y becomes
	999	!-- the first index.
	1000	CALL cpu_log( log_point_s(4), 'fft_x', 'start' )
	1001
	1002	IF ( host(1:3) == 'nec' ) THEN
	1003	!
	1004	!-- Code for vector processors
[85]	1005	!$OMP PARALLEL PRIVATE ( i, j, k )
[1]	1006	!$OMP DO
	1007	DO i = 0, nx
	1008
	1009	DO j = nys, nyn
	1010	DO k = 1, nz
	1011	work_fftx(i,k,j) = f_in(k,j,i)
	1012	ENDDO
	1013	ENDDO
	1014
	1015	ENDDO
	1016
	1017	!$OMP DO
	1018	DO j = nys, nyn
	1019
	1020	CALL fft_x_m( work_fftx(:,:,j), 'forward' )
	1021
	1022	DO k = 1, nz
	1023	DO i = 0, nx
	1024	work(j,k,i) = work_fftx(i,k,j)
	1025	ENDDO
	1026	ENDDO
	1027
	1028	ENDDO
	1029	!$OMP END PARALLEL
	1030
	1031	ELSE
	1032
	1033	!
	1034	!-- Cache optimized code (there might be still a potential for better
	1035	!-- optimization).
	1036	!$OMP PARALLEL PRIVATE (i,j,k,work_fftx)
	1037	!$OMP DO
	1038	DO i = 0, nx
	1039
	1040	DO j = nys, nyn
	1041	DO k = 1, nz
	1042	work_fftx(i,k,j) = f_in(k,j,i)
	1043	ENDDO
	1044	ENDDO
	1045
	1046	ENDDO
	1047
	1048	!$OMP DO
	1049	DO j = nys, nyn
	1050	DO k = 1, nz
	1051
	1052	CALL fft_x( work_fftx(0:nx,k,j), 'forward' )
	1053
	1054	DO i = 0, nx
	1055	work(j,k,i) = work_fftx(i,k,j)
	1056	ENDDO
	1057	ENDDO
	1058
	1059	ENDDO
	1060	!$OMP END PARALLEL
	1061
	1062	ENDIF
	1063	CALL cpu_log( log_point_s(4), 'fft_x', 'pause' )
	1064
	1065	!
	1066	!-- Transpose array
	1067	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
	1068	CALL MPI_ALLTOALL( work(nys,1,0), sendrecvcount_xy, MPI_REAL, &
	1069	f_out(1,1,nxl_y,1), sendrecvcount_xy, MPI_REAL, &
	1070	comm1dy, ierr )
	1071	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
	1072
	1073	END SUBROUTINE fftx_tr_xy
	1074
	1075
	1076	SUBROUTINE tr_yx_fftx( f_in, work, f_out )
	1077
	1078	!------------------------------------------------------------------------------!
	1079	! Transposition y --> x with a subsequent backward Fourier transformation for
	1080	! a 1d-decomposition along x
	1081	!------------------------------------------------------------------------------!
	1082
	1083	USE control_parameters
	1084	USE cpulog
	1085	USE indices
	1086	USE interfaces
	1087	USE pegrid
	1088	USE transpose_indices
	1089
	1090	IMPLICIT NONE
	1091
	1092	INTEGER :: i, j, k
	1093
	1094	REAL, DIMENSION(0:nx,1:nz,nys:nyn) :: work_fftx
	1095	REAL, DIMENSION(nny,1:nza,nxl_y:nxr_ya,pdims(2)) :: f_in
	1096	REAL, DIMENSION(1:nza,nys:nyna,0:nxa) :: f_out
	1097	REAL, DIMENSION(nys:nyna,1:nza,0:nxa) :: work
	1098
	1099	!
	1100	!-- Transpose array
	1101	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
	1102	CALL MPI_ALLTOALL( f_in(1,1,nxl_y,1), sendrecvcount_xy, MPI_REAL, &
	1103	work(nys,1,0), sendrecvcount_xy, MPI_REAL, &
	1104	comm1dy, ierr )
	1105	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
	1106
	1107	!
	1108	!-- Carry out the FFT along x, where all data are present due to the
	1109	!-- 1d-decomposition along y. Resort the data in a way that y becomes
	1110	!-- the first index.
	1111	CALL cpu_log( log_point_s(4), 'fft_x', 'continue' )
	1112
	1113	IF ( host(1:3) == 'nec' ) THEN
	1114	!
	1115	!-- Code optimized for vector processors
[85]	1116	!$OMP PARALLEL PRIVATE ( i, j, k )
[1]	1117	!$OMP DO
	1118	DO j = nys, nyn
	1119
	1120	DO k = 1, nz
	1121	DO i = 0, nx
	1122	work_fftx(i,k,j) = work(j,k,i)
	1123	ENDDO
	1124	ENDDO
	1125
	1126	CALL fft_x_m( work_fftx(:,:,j), 'backward' )
	1127
	1128	ENDDO
	1129
	1130	!$OMP DO
	1131	DO i = 0, nx
	1132	DO j = nys, nyn
	1133	DO k = 1, nz
	1134	f_out(k,j,i) = work_fftx(i,k,j)
	1135	ENDDO
	1136	ENDDO
	1137	ENDDO
	1138	!$OMP END PARALLEL
	1139
	1140	ELSE
	1141
	1142	!
	1143	!-- Cache optimized code (there might be still a potential for better
	1144	!-- optimization).
	1145	!$OMP PARALLEL PRIVATE (i,j,k,work_fftx)
	1146	!$OMP DO
	1147	DO j = nys, nyn
	1148	DO k = 1, nz
	1149
	1150	DO i = 0, nx
	1151	work_fftx(i,k,j) = work(j,k,i)
	1152	ENDDO
	1153
	1154	CALL fft_x( work_fftx(0:nx,k,j), 'backward' )
	1155
	1156	ENDDO
	1157	ENDDO
	1158
	1159	!$OMP DO
	1160	DO i = 0, nx
	1161	DO j = nys, nyn
	1162	DO k = 1, nz
	1163	f_out(k,j,i) = work_fftx(i,k,j)
	1164	ENDDO
	1165	ENDDO
	1166	ENDDO
	1167	!$OMP END PARALLEL
	1168
	1169	ENDIF
	1170	CALL cpu_log( log_point_s(4), 'fft_x', 'stop' )
	1171
	1172	END SUBROUTINE tr_yx_fftx
	1173
	1174
	1175	SUBROUTINE ffty_tri_ffty( ar )
	1176
	1177	!------------------------------------------------------------------------------!
	1178	! FFT along y, solution of the tridiagonal system and backward FFT for
	1179	! a 1d-decomposition along y
	1180	!
	1181	! WARNING: this subroutine may still not work for hybrid parallelization
	1182	! with OpenMP (for possible necessary changes see the original
	1183	! routine poisfft_hybrid, developed by Klaus Ketelsen, May 2002)
	1184	!------------------------------------------------------------------------------!
	1185
	1186	USE control_parameters
	1187	USE cpulog
	1188	USE grid_variables
	1189	USE indices
	1190	USE interfaces
	1191	USE pegrid
	1192	USE transpose_indices
	1193
	1194	IMPLICIT NONE
	1195
	1196	INTEGER :: i, j, k, m, n, omp_get_thread_num, tn
	1197
	1198	REAL, DIMENSION(0:ny) :: work_ffty
	1199	REAL, DIMENSION(0:ny,1:nz) :: work_triy
	1200	REAL, DIMENSION(nny,1:nza,nxl_y:nxr_ya,pdims(2)) :: ar
	1201	REAL, DIMENSION(:,:,:,:), ALLOCATABLE :: tri
	1202
	1203
	1204	CALL cpu_log( log_point_s(39), 'fft_y + tridia', 'start' )
	1205
	1206	ALLOCATE( tri(5,0:ny,0:nz-1,0:threads_per_task-1) )
	1207
	1208	tn = 0 ! Default thread number in case of one thread
	1209	!$OMP PARALLEL PRIVATE ( i, j, k, m, n, tn, work_ffty, work_triy )
	1210	!$OMP DO
	1211	DO i = nxl_y, nxr_y
	1212
	1213	!$ tn = omp_get_thread_num()
	1214
	1215	IF ( host(1:3) == 'nec' ) THEN
	1216	!
	1217	!-- Code optimized for vector processors
	1218	DO k = 1, nz
	1219
	1220	m = 0
	1221	DO n = 1, pdims(2)
	1222	DO j = 1, nny_pe( n-1 ) ! WARN: pcoord(j) should be used!!
	1223	work_triy(m,k) = ar(j,k,i,n)
	1224	m = m + 1
	1225	ENDDO
	1226	ENDDO
	1227
	1228	ENDDO
	1229
	1230	CALL fft_y_m( work_triy, ny, 'forward' )
	1231
	1232	ELSE
	1233	!
	1234	!-- Cache optimized code
	1235	DO k = 1, nz
	1236
	1237	m = 0
	1238	DO n = 1, pdims(2)
	1239	DO j = 1, nny_pe( n-1 ) ! WARN: pcoord(j) should be used!!
	1240	work_ffty(m) = ar(j,k,i,n)
	1241	m = m + 1
	1242	ENDDO
	1243	ENDDO
	1244
	1245	CALL fft_y( work_ffty, 'forward' )
	1246
	1247	DO j = 0, ny
	1248	work_triy(j,k) = work_ffty(j)
	1249	ENDDO
	1250
	1251	ENDDO
	1252
	1253	ENDIF
	1254
	1255	!
	1256	!-- Solve the linear equation system
	1257	CALL tridia_1dd( ddy2, ddx2, ny, nx, i, work_triy, tri(:,:,:,tn) )
	1258
	1259	IF ( host(1:3) == 'nec' ) THEN
	1260	!
	1261	!-- Code optimized for vector processors
	1262	CALL fft_y_m( work_triy, ny, 'backward' )
	1263
	1264	DO k = 1, nz
	1265
	1266	m = 0
	1267	DO n = 1, pdims(2)
	1268	DO j = 1, nny_pe( n-1 ) ! WARN: pcoord(j) should be used!!
	1269	ar(j,k,i,n) = work_triy(m,k)
	1270	m = m + 1
	1271	ENDDO
	1272	ENDDO
	1273
	1274	ENDDO
	1275
	1276	ELSE
	1277	!
	1278	!-- Cache optimized code
	1279	DO k = 1, nz
	1280
	1281	DO j = 0, ny
	1282	work_ffty(j) = work_triy(j,k)
	1283	ENDDO
	1284
	1285	CALL fft_y( work_ffty, 'backward' )
	1286
	1287	m = 0
	1288	DO n = 1, pdims(2)
	1289	DO j = 1, nny_pe( n-1 ) ! WARN: pcoord(j) should be used!!
	1290	ar(j,k,i,n) = work_ffty(m)
	1291	m = m + 1
	1292	ENDDO
	1293	ENDDO
	1294
	1295	ENDDO
	1296
	1297	ENDIF
	1298
	1299	ENDDO
	1300	!$OMP END PARALLEL
	1301
	1302	DEALLOCATE( tri )
	1303
	1304	CALL cpu_log( log_point_s(39), 'fft_y + tridia', 'stop' )
	1305
	1306	END SUBROUTINE ffty_tri_ffty
	1307
	1308
	1309	SUBROUTINE tridia_1dd( ddx2, ddy2, nx, ny, j, ar, tri )
	1310
	1311	!------------------------------------------------------------------------------!
	1312	! Solves the linear system of equations for a 1d-decomposition along x (see
	1313	! tridia)
	1314	!
	1315	! Attention: when using the intel compiler, array tri must be passed as an
	1316	! argument to the contained subroutines. Otherwise addres faults
	1317	! will occur.
	1318	! On NEC, tri should not be passed (except for routine substi_1dd)
	1319	! because this causes very bad performance.
	1320	!------------------------------------------------------------------------------!
	1321
	1322	USE arrays_3d
	1323	USE control_parameters
	1324
	1325	USE pegrid
	1326
	1327	IMPLICIT NONE
	1328
	1329	INTEGER :: i, j, k, nnyh, nx, ny, omp_get_thread_num, tn
	1330
	1331	REAL :: ddx2, ddy2
	1332
	1333	REAL, DIMENSION(0:nx,1:nz) :: ar
	1334	REAL, DIMENSION(0:nx,0:nz-1) :: ar1
	1335	REAL, DIMENSION(5,0:nx,0:nz-1) :: tri
	1336
	1337
	1338	nnyh = ( ny + 1 ) / 2
	1339
	1340	!
	1341	!-- Define constant elements of the tridiagonal matrix.
	1342	!-- The compiler on SX6 does loop exchange. If 0:nx is a high power of 2,
	1343	!-- the exchanged loops create bank conflicts. The following directive
	1344	!-- prohibits loop exchange and the loops perform much better.
	1345	! tn = omp_get_thread_num()
	1346	! WRITE( 120+tn, * ) '+++ id=',myid,' nx=',nx,' thread=', omp_get_thread_num()
[82]	1347	! CALL local_flush( 120+tn )
[1]	1348	!CDIR NOLOOPCHG
	1349	DO k = 0, nz-1
	1350	DO i = 0,nx
	1351	tri(2,i,k) = ddzu(k+1) * ddzw(k+1)
	1352	tri(3,i,k) = ddzu(k+2) * ddzw(k+1)
	1353	ENDDO
	1354	ENDDO
	1355	! WRITE( 120+tn, * ) '+++ id=',myid,' end of first tridia loop thread=', omp_get_thread_num()
[82]	1356	! CALL local_flush( 120+tn )
[1]	1357
	1358	IF ( j <= nnyh ) THEN
	1359	#if defined( __lcmuk )
	1360	CALL maketri_1dd( j, tri )
	1361	#else
	1362	CALL maketri_1dd( j )
	1363	#endif
	1364	ELSE
	1365	#if defined( __lcmuk )
	1366	CALL maketri_1dd( ny+1-j, tri )
	1367	#else
	1368	CALL maketri_1dd( ny+1-j )
	1369	#endif
	1370	ENDIF
	1371	#if defined( __lcmuk )
	1372	CALL split_1dd( tri )
	1373	#else
	1374	CALL split_1dd
	1375	#endif
	1376	CALL substi_1dd( ar, tri )
	1377
	1378	CONTAINS
	1379
	1380	#if defined( __lcmuk )
	1381	SUBROUTINE maketri_1dd( j, tri )
	1382	#else
	1383	SUBROUTINE maketri_1dd( j )
	1384	#endif
	1385
	1386	!------------------------------------------------------------------------------!
	1387	! computes the i- and j-dependent component of the matrix
	1388	!------------------------------------------------------------------------------!
	1389
	1390	USE constants
	1391
	1392	IMPLICIT NONE
	1393
	1394	INTEGER :: i, j, k, nnxh
	1395	REAL :: a, c
	1396
	1397	REAL, DIMENSION(0:nx) :: l
	1398
	1399	#if defined( __lcmuk )
	1400	REAL, DIMENSION(5,0:nx,0:nz-1) :: tri
	1401	#endif
	1402
	1403
	1404	nnxh = ( nx + 1 ) / 2
	1405	!
	1406	!-- Provide the tridiagonal matrix for solution of the Poisson equation in
	1407	!-- Fourier space. The coefficients are computed following the method of
	1408	!-- Schmidt et al. (DFVLR-Mitteilung 84-15), which departs from Stephan
	1409	!-- Siano's original version by discretizing the Poisson equation,
	1410	!-- before it is Fourier-transformed
	1411	DO i = 0, nx
	1412	IF ( i >= 0 .AND. i < nnxh ) THEN
	1413	l(i) = 2.0 * ( 1.0 - COS( ( 2.0 * pi * i ) / &
	1414	FLOAT( nx+1 ) ) ) * ddx2 + &
	1415	2.0 * ( 1.0 - COS( ( 2.0 * pi * j ) / &
	1416	FLOAT( ny+1 ) ) ) * ddy2
	1417	ELSEIF ( i == nnxh ) THEN
	1418	l(i) = 2.0 * ( 1.0 - COS( ( 2.0 * pi * ( nx+1-i ) ) / &
	1419	FLOAT( nx+1 ) ) ) * ddx2 + &
	1420	2.0 * ( 1.0 - COS( ( 2.0 * pi * j ) / &
	1421	FLOAT( ny+1 ) ) ) * ddy2
	1422	ELSE
	1423	l(i) = 2.0 * ( 1.0 - COS( ( 2.0 * pi * ( nx+1-i ) ) / &
	1424	FLOAT( nx+1 ) ) ) * ddx2 + &
	1425	2.0 * ( 1.0 - COS( ( 2.0 * pi * j ) / &
	1426	FLOAT( ny+1 ) ) ) * ddy2
	1427	ENDIF
	1428	ENDDO
	1429
	1430	DO k = 0, nz-1
	1431	DO i = 0, nx
	1432	a = -1.0 * ddzu(k+2) * ddzw(k+1)
	1433	c = -1.0 * ddzu(k+1) * ddzw(k+1)
	1434	tri(1,i,k) = a + c - l(i)
	1435	ENDDO
	1436	ENDDO
	1437	IF ( ibc_p_b == 1 .OR. ibc_p_b == 2 ) THEN
	1438	DO i = 0, nx
	1439	tri(1,i,0) = tri(1,i,0) + tri(2,i,0)
	1440	ENDDO
	1441	ENDIF
	1442	IF ( ibc_p_t == 1 ) THEN
	1443	DO i = 0, nx
	1444	tri(1,i,nz-1) = tri(1,i,nz-1) + tri(3,i,nz-1)
	1445	ENDDO
	1446	ENDIF
	1447
	1448	END SUBROUTINE maketri_1dd
	1449
	1450
	1451	#if defined( __lcmuk )
	1452	SUBROUTINE split_1dd( tri )
	1453	#else
	1454	SUBROUTINE split_1dd
	1455	#endif
	1456
	1457	!------------------------------------------------------------------------------!
	1458	! Splitting of the tridiagonal matrix (Thomas algorithm)
	1459	!------------------------------------------------------------------------------!
	1460
	1461	IMPLICIT NONE
	1462
	1463	INTEGER :: i, k
	1464
	1465	#if defined( __lcmuk )
	1466	REAL, DIMENSION(5,0:nx,0:nz-1) :: tri
	1467	#endif
	1468
	1469
	1470	!
	1471	!-- Splitting
	1472	DO i = 0, nx
	1473	tri(4,i,0) = tri(1,i,0)
	1474	ENDDO
	1475	DO k = 1, nz-1
	1476	DO i = 0, nx
	1477	tri(5,i,k) = tri(2,i,k) / tri(4,i,k-1)
	1478	tri(4,i,k) = tri(1,i,k) - tri(3,i,k-1) * tri(5,i,k)
	1479	ENDDO
	1480	ENDDO
	1481
	1482	END SUBROUTINE split_1dd
	1483
	1484
	1485	SUBROUTINE substi_1dd( ar, tri )
	1486
	1487	!------------------------------------------------------------------------------!
	1488	! Substitution (Forward and Backward) (Thomas algorithm)
	1489	!------------------------------------------------------------------------------!
	1490
	1491	IMPLICIT NONE
	1492
[76]	1493	INTEGER :: i, k
[1]	1494
	1495	REAL, DIMENSION(0:nx,nz) :: ar
	1496	REAL, DIMENSION(0:nx,0:nz-1) :: ar1
	1497	REAL, DIMENSION(5,0:nx,0:nz-1) :: tri
	1498
	1499	!
	1500	!-- Forward substitution
	1501	DO i = 0, nx
	1502	ar1(i,0) = ar(i,1)
	1503	ENDDO
	1504	DO k = 1, nz-1
	1505	DO i = 0, nx
	1506	ar1(i,k) = ar(i,k+1) - tri(5,i,k) * ar1(i,k-1)
	1507	ENDDO
	1508	ENDDO
	1509
	1510	!
	1511	!-- Backward substitution
	1512	DO i = 0, nx
	1513	ar(i,nz) = ar1(i,nz-1) / tri(4,i,nz-1)
	1514	ENDDO
	1515	DO k = nz-2, 0, -1
	1516	DO i = 0, nx
	1517	ar(i,k+1) = ( ar1(i,k) - tri(3,i,k) * ar(i,k+2) ) &
	1518	/ tri(4,i,k)
	1519	ENDDO
	1520	ENDDO
	1521
[76]	1522	!
	1523	!-- Indices i=0, j=0 correspond to horizontally averaged pressure.
	1524	!-- The respective values of ar should be zero at all k-levels if
	1525	!-- acceleration of horizontally averaged vertical velocity is zero.
	1526	IF ( ibc_p_b == 1 .AND. ibc_p_t == 1 ) THEN
	1527	IF ( j == 0 ) THEN
	1528	DO k = 1, nz
	1529	ar(0,k) = 0.0
	1530	ENDDO
	1531	ENDIF
	1532	ENDIF
	1533
[1]	1534	END SUBROUTINE substi_1dd
	1535
	1536	END SUBROUTINE tridia_1dd
	1537
	1538	#endif
	1539
	1540	END MODULE poisfft_mod

Note: See TracBrowser for help on using the repository browser.

Download in other formats:

| Impressum | ©Leibniz Universität Hannover |