Home

Context Navigation

source: palm/trunk/SOURCE/poisfft.f90 @ 9

Last change on this file since 9 was 4, checked in by raasch, 18 years ago
Id keyword set as property for all *.f90 files
Property svn:keywords set to `Id`
File size: 43.8 KB

Rev	Line
[1]	1	MODULE poisfft_mod
	2
	3	!------------------------------------------------------------------------------!
	4	! Actual revisions:
	5	! -----------------
	6	!
	7	!
	8	! Former revisions:
	9	! -----------------
[3]	10	! $Id: poisfft.f90 4 2007-02-13 11:33:16Z raasch $
	11	! RCS Log replace by Id keyword, revision history cleaned up
	12	!
[1]	13	! Revision 1.24 2006/08/04 15:00:24 raasch
	14	! Default setting of the thread number tn in case of not using OpenMP
	15	!
	16	! Revision 1.23 2006/02/23 12:48:38 raasch
	17	! Additional compiler directive in routine tridia_1dd for preventing loop
	18	! exchange on NEC-SX6
	19	!
	20	! Revision 1.20 2004/04/30 12:38:09 raasch
	21	! Parts of former poisfft_hybrid moved to this subroutine,
	22	! former subroutine changed to a module, renaming of FFT-subroutines and
	23	! -module, FFTs completely substituted by calls of fft_x and fft_y,
	24	! NAG fft used in the non-parallel case completely removed, l in maketri
	25	! is now a 1d-array, variables passed by modules instead of using parameter
	26	! lists, enlarged transposition arrays introduced
	27	!
	28	! Revision 1.1 1997/07/24 11:24:14 raasch
	29	! Initial revision
	30	!
	31	!
	32	! Description:
	33	! ------------
	34	! See below.
	35	!------------------------------------------------------------------------------!
	36
	37	!--------------------------------------------------------------------------!
	38	! poisfft !
	39	! !
	40	! Original version: Stephan Siano (pois3d) !
	41	! !
	42	! Institute of Meteorology and Climatology, University of Hannover !
	43	! Germany !
	44	! !
	45	! Version as of July 23,1996 !
	46	! !
	47	! !
	48	! Version for parallel computers: Siegfried Raasch !
	49	! !
	50	! Version as of July 03,1997 !
	51	! !
	52	! Solves the Poisson equation with a 2D spectral method !
	53	! d^2 p / dx^2 + d^2 p / dy^2 + d^2 p / dz^2 = s !
	54	! !
	55	! Input: !
	56	! real ar contains in the (nnx,nny,nnz) elements, !
	57	! starting from the element (1,nys,nxl), the !
	58	! values for s !
	59	! real work Temporary array !
	60	! !
	61	! Output: !
	62	! real ar contains the solution for p !
	63	!--------------------------------------------------------------------------!
	64
	65	USE fft_xy
	66	USE indices
	67	USE transpose_indices
	68
	69	IMPLICIT NONE
	70
	71	PRIVATE
	72	PUBLIC poisfft, poisfft_init
	73
	74	INTERFACE poisfft
	75	MODULE PROCEDURE poisfft
	76	END INTERFACE poisfft
	77
	78	INTERFACE poisfft_init
	79	MODULE PROCEDURE poisfft_init
	80	END INTERFACE poisfft_init
	81
	82	CONTAINS
	83
	84	SUBROUTINE poisfft_init
	85
	86	CALL fft_init
	87
	88	END SUBROUTINE poisfft_init
	89
	90
	91	SUBROUTINE poisfft( ar, work )
	92
	93	USE cpulog
	94	USE interfaces
	95	USE pegrid
	96
	97	IMPLICIT NONE
	98
	99	REAL, DIMENSION(1:nza,nys:nyna,nxl:nxra) :: ar, work
	100
	101
	102	CALL cpu_log( log_point_s(3), 'poisfft', 'start' )
	103
	104	!
	105	!-- Two-dimensional Fourier Transformation in x- and y-direction.
	106	#if defined( __parallel )
	107	IF ( pdims(2) == 1 ) THEN
	108
	109	!
	110	!-- 1d-domain-decomposition along x:
	111	!-- FFT along y and transposition y --> x
	112	CALL ffty_tr_yx( ar, work, ar )
	113
	114	!
	115	!-- FFT along x, solving the tridiagonal system and backward FFT
	116	CALL fftx_tri_fftx( ar )
	117
	118	!
	119	!-- Transposition x --> y and backward FFT along y
	120	CALL tr_xy_ffty( ar, work, ar )
	121
	122	ELSEIF ( pdims(1) == 1 ) THEN
	123
	124	!
	125	!-- 1d-domain-decomposition along y:
	126	!-- FFT along x and transposition x --> y
	127	CALL fftx_tr_xy( ar, work, ar )
	128
	129	!
	130	!-- FFT along y, solving the tridiagonal system and backward FFT
	131	CALL ffty_tri_ffty( ar )
	132
	133	!
	134	!-- Transposition y --> x and backward FFT along x
	135	CALL tr_yx_fftx( ar, work, ar )
	136
	137	ELSE
	138
	139	!
	140	!-- 2d-domain-decomposition
	141	!-- Transposition z --> x
	142	CALL cpu_log( log_point_s(5), 'transpo forward', 'start' )
	143	CALL transpose_zx( ar, work, ar, work, ar )
	144	CALL cpu_log( log_point_s(5), 'transpo forward', 'pause' )
	145
	146	CALL cpu_log( log_point_s(4), 'fft_x', 'start' )
	147	CALL fftxp( ar, 'forward' )
	148	CALL cpu_log( log_point_s(4), 'fft_x', 'pause' )
	149
	150	!
	151	!-- Transposition x --> y
	152	CALL cpu_log( log_point_s(5), 'transpo forward', 'continue' )
	153	CALL transpose_xy( ar, work, ar, work, ar )
	154	CALL cpu_log( log_point_s(5), 'transpo forward', 'pause' )
	155
	156	CALL cpu_log( log_point_s(7), 'fft_y', 'start' )
	157	CALL fftyp( ar, 'forward' )
	158	CALL cpu_log( log_point_s(7), 'fft_y', 'pause' )
	159
	160	!
	161	!-- Transposition y --> z
	162	CALL cpu_log( log_point_s(5), 'transpo forward', 'continue' )
	163	CALL transpose_yz( ar, work, ar, work, ar )
	164	CALL cpu_log( log_point_s(5), 'transpo forward', 'stop' )
	165
	166	!
	167	!-- Solve the Poisson equation in z-direction in cartesian space.
	168	CALL cpu_log( log_point_s(6), 'tridia', 'start' )
	169	CALL tridia( ar )
	170	CALL cpu_log( log_point_s(6), 'tridia', 'stop' )
	171
	172	!
	173	!-- Inverse Fourier Transformation
	174	!-- Transposition z --> y
	175	CALL cpu_log( log_point_s(8), 'transpo invers', 'start' )
	176	CALL transpose_zy( ar, work, ar, work, ar )
	177	CALL cpu_log( log_point_s(8), 'transpo invers', 'pause' )
	178
	179	CALL cpu_log( log_point_s(7), 'fft_y', 'continue' )
	180	CALL fftyp( ar, 'backward' )
	181	CALL cpu_log( log_point_s(7), 'fft_y', 'stop' )
	182
	183	!
	184	!-- Transposition y --> x
	185	CALL cpu_log( log_point_s(8), 'transpo invers', 'continue' )
	186	CALL transpose_yx( ar, work, ar, work, ar )
	187	CALL cpu_log( log_point_s(8), 'transpo invers', 'pause' )
	188
	189	CALL cpu_log( log_point_s(4), 'fft_x', 'continue' )
	190	CALL fftxp( ar, 'backward' )
	191	CALL cpu_log( log_point_s(4), 'fft_x', 'stop' )
	192
	193	!
	194	!-- Transposition x --> z
	195	CALL cpu_log( log_point_s(8), 'transpo invers', 'continue' )
	196	CALL transpose_xz( ar, work, ar, work, ar )
	197	CALL cpu_log( log_point_s(8), 'transpo invers', 'stop' )
	198
	199	ENDIF
	200
	201	#else
	202
	203	!
	204	!-- Two-dimensional Fourier Transformation along x- and y-direction.
	205	CALL cpu_log( log_point_s(4), 'fft_x', 'start' )
	206	CALL fftx( ar, 'forward' )
	207	CALL cpu_log( log_point_s(4), 'fft_x', 'pause' )
	208	CALL cpu_log( log_point_s(7), 'fft_y', 'start' )
	209	CALL ffty( ar, 'forward' )
	210	CALL cpu_log( log_point_s(7), 'fft_y', 'pause' )
	211
	212	!
	213	!-- Solve the Poisson equation in z-direction in cartesian space.
	214	CALL cpu_log( log_point_s(6), 'tridia', 'start' )
	215	CALL tridia( ar )
	216	CALL cpu_log( log_point_s(6), 'tridia', 'stop' )
	217
	218	!
	219	!-- Inverse Fourier Transformation.
	220	CALL cpu_log( log_point_s(7), 'fft_y', 'continue' )
	221	CALL ffty( ar, 'backward' )
	222	CALL cpu_log( log_point_s(7), 'fft_y', 'stop' )
	223	CALL cpu_log( log_point_s(4), 'fft_x', 'continue' )
	224	CALL fftx( ar, 'backward' )
	225	CALL cpu_log( log_point_s(4), 'fft_x', 'stop' )
	226
	227	#endif
	228
	229	CALL cpu_log( log_point_s(3), 'poisfft', 'stop' )
	230
	231	END SUBROUTINE poisfft
	232
	233
	234
	235	SUBROUTINE tridia( ar )
	236
	237	!------------------------------------------------------------------------------!
	238	! solves the linear system of equations:
	239	!
	240	! -(4 pi^2(i^2/(dx^2nnx^2)+j^2/(dy^2nny^2))+
	241	! 1/(dzu(k)dzw(k))+1/(dzu(k-1)dzw(k)))*p(i,j,k)+
	242	! 1/(dzu(k)dzw(k))p(i,j,k+1)+1/(dzu(k-1)dzw(k))p(i,j,k-1)=d(i,j,k)
	243	!
	244	! by using the Thomas algorithm
	245	!------------------------------------------------------------------------------!
	246
	247	USE arrays_3d
	248
	249	IMPLICIT NONE
	250
	251	INTEGER :: i, j, k, nnyh
	252
	253	REAL, DIMENSION(nxl_z:nxr_z,0:nz-1) :: ar1
	254	REAL, DIMENSION(5,nxl_z:nxr_z,0:nz-1) :: tri
	255
	256	#if defined( __parallel )
	257	REAL :: ar(nxl_z:nxr_za,nys_z:nyn_za,1:nza)
	258	#else
	259	REAL :: ar(1:nz,nys_z:nyn_z,nxl_z:nxr_z)
	260	#endif
	261
	262
	263	nnyh = (ny+1) / 2
	264
	265	!
	266	!-- Define constant elements of the tridiagonal matrix.
	267	DO k = 0, nz-1
	268	DO i = nxl_z, nxr_z
	269	tri(2,i,k) = ddzu(k+1) * ddzw(k+1)
	270	tri(3,i,k) = ddzu(k+2) * ddzw(k+1)
	271	ENDDO
	272	ENDDO
	273
	274	#if defined( __parallel )
	275	!
	276	!-- Repeat for all y-levels.
	277	DO j = nys_z, nyn_z
	278	IF ( j <= nnyh ) THEN
	279	CALL maketri( tri, j )
	280	ELSE
	281	CALL maketri( tri, ny+1-j )
	282	ENDIF
	283	CALL split( tri )
	284	CALL substi( ar, ar1, tri, j )
	285	ENDDO
	286	#else
	287	!
	288	!-- First y-level.
	289	CALL maketri( tri, nys_z )
	290	CALL split( tri )
	291	CALL substi( ar, ar1, tri, 0 )
	292
	293	!
	294	!-- Further y-levels.
	295	DO j = 1, nnyh - 1
	296	CALL maketri( tri, j )
	297	CALL split( tri )
	298	CALL substi( ar, ar1, tri, j )
	299	CALL substi( ar, ar1, tri, ny+1-j )
	300	ENDDO
	301	CALL maketri( tri, nnyh )
	302	CALL split( tri )
	303	CALL substi( ar, ar1, tri, nnyh+nys )
	304	#endif
	305
	306	CONTAINS
	307
	308	SUBROUTINE maketri( tri, j )
	309
	310	!------------------------------------------------------------------------------!
	311	! Computes the i- and j-dependent component of the matrix
	312	!------------------------------------------------------------------------------!
	313
	314	USE arrays_3d
	315	USE constants
	316	USE control_parameters
	317	USE grid_variables
	318
	319	IMPLICIT NONE
	320
	321	INTEGER :: i, j, k, nnxh
	322	REAL :: a, c
	323	REAL :: ll(nxl_z:nxr_z)
	324	REAL :: tri(5,nxl_z:nxr_z,0:nz-1)
	325
	326
	327	nnxh = ( nx + 1 ) / 2
	328
	329	!
	330	!-- Provide the tridiagonal matrix for solution of the Poisson equation in
	331	!-- Fourier space. The coefficients are computed following the method of
	332	!-- Schmidt et al. (DFVLR-Mitteilung 84-15), which departs from Stephan
	333	!-- Siano's original version by discretizing the Poisson equation,
	334	!-- before it is Fourier-transformed
	335	#if defined( __parallel )
	336	DO i = nxl_z, nxr_z
	337	IF ( i >= 0 .AND. i < nnxh ) THEN
	338	ll(i) = 2.0 * ( 1.0 - COS( ( 2.0 * pi * i ) / &
	339	FLOAT( nx+1 ) ) ) / ( dx * dx ) + &
	340	2.0 * ( 1.0 - COS( ( 2.0 * pi * j ) / &
	341	FLOAT( ny+1 ) ) ) / ( dy * dy )
	342	ELSEIF ( i == nnxh ) THEN
	343	ll(i) = 2.0 * ( 1.0 - COS( ( 2.0 * pi * ( nx+1-i ) ) / &
	344	FLOAT( nx+1 ) ) ) / ( dx * dx ) + &
	345	2.0 * ( 1.0 - COS( ( 2.0 * pi * j ) / &
	346	FLOAT( ny+1 ) ) ) / ( dy * dy )
	347	ELSE
	348	ll(i) = 2.0 * ( 1.0 - COS( ( 2.0 * pi * ( nx+1-i ) ) / &
	349	FLOAT( nx+1 ) ) ) / ( dx * dx ) + &
	350	2.0 * ( 1.0 - COS( ( 2.0 * pi * j ) / &
	351	FLOAT( ny+1 ) ) ) / ( dy * dy )
	352	ENDIF
	353	DO k = 0,nz-1
	354	a = -1.0 * ddzu(k+2) * ddzw(k+1)
	355	c = -1.0 * ddzu(k+1) * ddzw(k+1)
	356	tri(1,i,k) = a + c - ll(i)
	357	ENDDO
	358	ENDDO
	359	#else
	360	DO i = 0, nnxh
	361	ll(i) = 2.0 * ( 1.0 - COS( ( 2.0 * pi * i ) / FLOAT( nx+1 ) ) ) / &
	362	( dx * dx ) + &
	363	2.0 * ( 1.0 - COS( ( 2.0 * pi * j ) / FLOAT( ny+1 ) ) ) / &
	364	( dy * dy )
	365	DO k = 0, nz-1
	366	a = -1.0 * ddzu(k+2) * ddzw(k+1)
	367	c = -1.0 * ddzu(k+1) * ddzw(k+1)
	368	tri(1,i,k) = a + c - ll(i)
	369	IF ( i >= 1 .and. i < nnxh ) THEN
	370	tri(1,nx+1-i,k) = tri(1,i,k)
	371	ENDIF
	372	ENDDO
	373	ENDDO
	374	#endif
	375	IF ( ibc_p_b == 1 .OR. ibc_p_b == 2 ) THEN
	376	DO i = nxl_z, nxr_z
	377	tri(1,i,0) = tri(1,i,0) + tri(2,i,0)
	378	ENDDO
	379	ENDIF
	380	IF ( ibc_p_t == 1 ) THEN
	381	DO i = nxl_z, nxr_z
	382	tri(1,i,nz-1) = tri(1,i,nz-1) + tri(3,i,nz-1)
	383	ENDDO
	384	ENDIF
	385
	386	END SUBROUTINE maketri
	387
	388
	389	SUBROUTINE substi( ar, ar1, tri, j )
	390
	391	!------------------------------------------------------------------------------!
	392	! Substitution (Forward and Backward) (Thomas algorithm)
	393	!------------------------------------------------------------------------------!
	394
	395	IMPLICIT NONE
	396
	397	INTEGER :: i, j, k
	398	REAL :: ar1(nxl_z:nxr_z,0:nz-1)
	399	REAL :: tri(5,nxl_z:nxr_z,0:nz-1)
	400	#if defined( __parallel )
	401	REAL :: ar(nxl_z:nxr_za,nys_z:nyn_za,1:nza)
	402	#else
	403	REAL :: ar(1:nz,nys_z:nyn_z,nxl_z:nxr_z)
	404	#endif
	405
	406	!
	407	!-- Forward substitution.
	408	DO i = nxl_z, nxr_z
	409	#if defined( __parallel )
	410	ar1(i,0) = ar(i,j,1)
	411	#else
	412	ar1(i,0) = ar(1,j,i)
	413	#endif
	414	ENDDO
	415	DO k = 1, nz - 1
	416	DO i = nxl_z, nxr_z
	417	#if defined( __parallel )
	418	ar1(i,k) = ar(i,j,k+1) - tri(5,i,k) * ar1(i,k-1)
	419	#else
	420	ar1(i,k) = ar(k+1,j,i) - tri(5,i,k) * ar1(i,k-1)
	421	#endif
	422	ENDDO
	423	ENDDO
	424
	425	!
	426	!-- Backward substitution.
	427	DO i = nxl_z, nxr_z
	428	#if defined( __parallel )
	429	ar(i,j,nz) = ar1(i,nz-1) / tri(4,i,nz-1)
	430	#else
	431	ar(nz,j,i) = ar1(i,nz-1) / tri(4,i,nz-1)
	432	#endif
	433	ENDDO
	434	DO k = nz-2, 0, -1
	435	DO i = nxl_z, nxr_z
	436	#if defined( __parallel )
	437	ar(i,j,k+1) = ( ar1(i,k) - tri(3,i,k) * ar(i,j,k+2) ) &
	438	/ tri(4,i,k)
	439	#else
	440	ar(k+1,j,i) = ( ar1(i,k) - tri(3,i,k) * ar(k+2,j,i) ) &
	441	/ tri(4,i,k)
	442	#endif
	443	ENDDO
	444	ENDDO
	445
	446	END SUBROUTINE substi
	447
	448
	449	SUBROUTINE split( tri )
	450
	451	!------------------------------------------------------------------------------!
	452	! Splitting of the tridiagonal matrix (Thomas algorithm)
	453	!------------------------------------------------------------------------------!
	454
	455	IMPLICIT NONE
	456
	457	INTEGER :: i, k
	458	REAL :: tri(5,nxl_z:nxr_z,0:nz-1)
	459
	460	!
	461	!-- Splitting.
	462	DO i = nxl_z, nxr_z
	463	tri(4,i,0) = tri(1,i,0)
	464	ENDDO
	465	DO k = 1, nz-1
	466	DO i = nxl_z, nxr_z
	467	tri(5,i,k) = tri(2,i,k) / tri(4,i,k-1)
	468	tri(4,i,k) = tri(1,i,k) - tri(3,i,k-1) * tri(5,i,k)
	469	ENDDO
	470	ENDDO
	471
	472	END SUBROUTINE split
	473
	474	END SUBROUTINE tridia
	475
	476
	477	#if defined( __parallel )
	478	SUBROUTINE fftxp( ar, direction )
	479
	480	!------------------------------------------------------------------------------!
	481	! Fourier-transformation along x-direction Parallelized version
	482	!------------------------------------------------------------------------------!
	483
	484	IMPLICIT NONE
	485
	486	CHARACTER (LEN=*) :: direction
	487	INTEGER :: j, k
	488	REAL :: ar(0:nxa,nys_x:nyn_xa,nzb_x:nzt_xa)
	489
	490	!
	491	!-- Performing the fft with one of the methods implemented
	492	DO k = nzb_x, nzt_x
	493	DO j = nys_x, nyn_x
	494	CALL fft_x( ar(0:nx,j,k), direction )
	495	ENDDO
	496	ENDDO
	497
	498	END SUBROUTINE fftxp
	499
	500	#else
	501	SUBROUTINE fftx( ar, direction )
	502
	503	!------------------------------------------------------------------------------!
	504	! Fourier-transformation along x-direction Non parallel version
	505	!------------------------------------------------------------------------------!
	506
	507	IMPLICIT NONE
	508
	509	CHARACTER (LEN=*) :: direction
	510	INTEGER :: i, j, k
	511	REAL :: ar(1:nz,0:ny,0:nx)
	512
	513	!
	514	!-- Performing the fft with one of the methods implemented
	515	DO k = 1, nz
	516	DO j = 0, ny
	517	CALL fft_x( ar(k,j,0:nx), direction )
	518	ENDDO
	519	ENDDO
	520
	521	END SUBROUTINE fftx
	522	#endif
	523
	524
	525	#if defined( __parallel )
	526	SUBROUTINE fftyp( ar, direction )
	527
	528	!------------------------------------------------------------------------------!
	529	! Fourier-transformation along y-direction Parallelized version
	530	!------------------------------------------------------------------------------!
	531
	532	IMPLICIT NONE
	533
	534	CHARACTER (LEN=*) :: direction
	535	INTEGER :: i, k
	536	REAL :: ar(0:nya,nxl_y:nxr_ya,nzb_y:nzt_ya)
	537
	538	!
	539	!-- Performing the fft with one of the methods implemented
	540	DO k = nzb_y, nzt_y
	541	DO i = nxl_y, nxr_y
	542	CALL fft_y( ar(0:ny,i,k), direction )
	543	ENDDO
	544	ENDDO
	545
	546	END SUBROUTINE fftyp
	547
	548	#else
	549	SUBROUTINE ffty( ar, direction )
	550
	551	!------------------------------------------------------------------------------!
	552	! Fourier-transformation along y-direction Non parallel version
	553	!------------------------------------------------------------------------------!
	554
	555	IMPLICIT NONE
	556
	557	CHARACTER (LEN=*) :: direction
	558	INTEGER :: i, k
	559	REAL :: ar(1:nz,0:ny,0:nx)
	560
	561	!
	562	!-- Performing the fft with one of the methods implemented
	563	DO k = 1, nz
	564	DO i = 0, nx
	565	CALL fft_y( ar(k,0:ny,i), direction )
	566	ENDDO
	567	ENDDO
	568
	569	END SUBROUTINE ffty
	570	#endif
	571
	572	#if defined( __parallel )
	573	SUBROUTINE ffty_tr_yx( f_in, work, f_out )
	574
	575	!------------------------------------------------------------------------------!
	576	! Fourier-transformation along y with subsequent transposition y --> x for
	577	! a 1d-decomposition along x
	578	!
	579	! ATTENTION: The performance of this routine is much faster on the NEC-SX6,
	580	! if the first index of work_ffty_vec is odd. Otherwise
	581	! memory bank conflicts may occur (especially if the index is a
	582	! multiple of 128). That's why work_ffty_vec is dimensioned as
	583	! 0:ny+1.
	584	! Of course, this will not work if users are using an odd number
	585	! of gridpoints along y.
	586	!------------------------------------------------------------------------------!
	587
	588	USE control_parameters
	589	USE cpulog
	590	USE indices
	591	USE interfaces
	592	USE pegrid
	593	USE transpose_indices
	594
	595	IMPLICIT NONE
	596
	597	INTEGER :: i, iend, iouter, ir, j, k
	598	INTEGER, PARAMETER :: stridex = 4
	599
	600	REAL, DIMENSION(0:ny,stridex) :: work_ffty
	601	#if defined( __nec )
	602	REAL, DIMENSION(0:ny+1,1:nz,nxl:nxr) :: work_ffty_vec
	603	#endif
	604	REAL, DIMENSION(1:nza,0:nya,nxl:nxra) :: f_in
	605	REAL, DIMENSION(nnx,1:nza,nys_x:nyn_xa,pdims(1)) :: f_out
	606	REAL, DIMENSION(nxl:nxra,1:nza,0:nya) :: work
	607
	608	!
	609	!-- Carry out the FFT along y, where all data are present due to the
	610	!-- 1d-decomposition along x. Resort the data in a way that x becomes
	611	!-- the first index.
	612	CALL cpu_log( log_point_s(7), 'fft_y', 'start' )
	613
	614	IF ( host(1:3) == 'nec' ) THEN
	615	#if defined( __nec )
	616	!
	617	!-- Code optimized for vector processors
	618	!$OMP PARALLEL PRIVATE ( i, j, k, work_ffty_vec )
	619	!$OMP DO
	620	DO i = nxl, nxr
	621
	622	DO j = 0, ny
	623	DO k = 1, nz
	624	work_ffty_vec(j,k,i) = f_in(k,j,i)
	625	ENDDO
	626	ENDDO
	627
	628	CALL fft_y_m( work_ffty_vec(:,:,i), ny+1, 'forward' )
	629
	630	ENDDO
	631
	632	!$OMP DO
	633	DO k = 1, nz
	634	DO j = 0, ny
	635	DO i = nxl, nxr
	636	work(i,k,j) = work_ffty_vec(j,k,i)
	637	ENDDO
	638	ENDDO
	639	ENDDO
	640	!$OMP END PARALLEL
	641	#endif
	642
	643	ELSE
	644
	645	!
	646	!-- Cache optimized code.
	647	!-- The i-(x-)direction is split into a strided outer loop and an inner
	648	!-- loop for better cache performance
	649	!$OMP PARALLEL PRIVATE (i,iend,iouter,ir,j,k,work_ffty)
	650	!$OMP DO
	651	DO iouter = nxl, nxr, stridex
	652
	653	iend = MIN( iouter+stridex-1, nxr ) ! Upper bound for inner i loop
	654
	655	DO k = 1, nz
	656
	657	DO i = iouter, iend
	658
	659	ir = i-iouter+1 ! counter within a stride
	660	DO j = 0, ny
	661	work_ffty(j,ir) = f_in(k,j,i)
	662	ENDDO
	663	!
	664	!-- FFT along y
	665	CALL fft_y( work_ffty(:,ir), 'forward' )
	666
	667	ENDDO
	668
	669	!
	670	!-- Resort
	671	DO j = 0, ny
	672	DO i = iouter, iend
	673	work(i,k,j) = work_ffty(j,i-iouter+1)
	674	ENDDO
	675	ENDDO
	676
	677	ENDDO
	678
	679	ENDDO
	680	!$OMP END PARALLEL
	681
	682	ENDIF
	683	CALL cpu_log( log_point_s(7), 'fft_y', 'pause' )
	684
	685	!
	686	!-- Transpose array
	687	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
	688	CALL MPI_ALLTOALL( work(nxl,1,0), sendrecvcount_xy, MPI_REAL, &
	689	f_out(1,1,nys_x,1), sendrecvcount_xy, MPI_REAL, &
	690	comm1dx, ierr )
	691	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
	692
	693	END SUBROUTINE ffty_tr_yx
	694
	695
	696	SUBROUTINE tr_xy_ffty( f_in, work, f_out )
	697
	698	!------------------------------------------------------------------------------!
	699	! Transposition x --> y with a subsequent backward Fourier transformation for
	700	! a 1d-decomposition along x
	701	!------------------------------------------------------------------------------!
	702
	703	USE control_parameters
	704	USE cpulog
	705	USE indices
	706	USE interfaces
	707	USE pegrid
	708	USE transpose_indices
	709
	710	IMPLICIT NONE
	711
	712	INTEGER :: i, iend, iouter, ir, j, k
	713	INTEGER, PARAMETER :: stridex = 4
	714
	715	REAL, DIMENSION(0:ny,stridex) :: work_ffty
	716	#if defined( __nec )
	717	REAL, DIMENSION(0:ny+1,1:nz,nxl:nxr) :: work_ffty_vec
	718	#endif
	719	REAL, DIMENSION(nnx,1:nza,nys_x:nyn_xa,pdims(1)) :: f_in
	720	REAL, DIMENSION(1:nza,0:nya,nxl:nxra) :: f_out
	721	REAL, DIMENSION(nxl:nxra,1:nza,0:nya) :: work
	722
	723	!
	724	!-- Transpose array
	725	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
	726	CALL MPI_ALLTOALL( f_in(1,1,nys_x,1), sendrecvcount_xy, MPI_REAL, &
	727	work(nxl,1,0), sendrecvcount_xy, MPI_REAL, &
	728	comm1dx, ierr )
	729	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
	730
	731	!
	732	!-- Resort the data in a way that y becomes the first index and carry out the
	733	!-- backward fft along y.
	734	CALL cpu_log( log_point_s(7), 'fft_y', 'continue' )
	735
	736	IF ( host(1:3) == 'nec' ) THEN
	737	#if defined( __nec )
	738	!
	739	!-- Code optimized for vector processors
	740	!$OMP PARALLEL PRIVATE ( i, j, k, work_ffty_vec )
	741	!$OMP DO
	742	DO k = 1, nz
	743	DO j = 0, ny
	744	DO i = nxl, nxr
	745	work_ffty_vec(j,k,i) = work(i,k,j)
	746	ENDDO
	747	ENDDO
	748	ENDDO
	749
	750	!$OMP DO
	751	DO i = nxl, nxr
	752
	753	CALL fft_y_m( work_ffty_vec(:,:,i), ny+1, 'backward' )
	754
	755	DO j = 0, ny
	756	DO k = 1, nz
	757	f_out(k,j,i) = work_ffty_vec(j,k,i)
	758	ENDDO
	759	ENDDO
	760
	761	ENDDO
	762	!$OMP END PARALLEL
	763	#endif
	764
	765	ELSE
	766
	767	!
	768	!-- Cache optimized code.
	769	!-- The i-(x-)direction is split into a strided outer loop and an inner
	770	!-- loop for better cache performance
	771	!$OMP PARALLEL PRIVATE ( i, iend, iouter, ir, j, k, work_ffty )
	772	!$OMP DO
	773	DO iouter = nxl, nxr, stridex
	774
	775	iend = MIN( iouter+stridex-1, nxr ) ! Upper bound for inner i loop
	776
	777	DO k = 1, nz
	778	!
	779	!-- Resort
	780	DO j = 0, ny
	781	DO i = iouter, iend
	782	work_ffty(j,i-iouter+1) = work(i,k,j)
	783	ENDDO
	784	ENDDO
	785
	786	DO i = iouter, iend
	787
	788	!
	789	!-- FFT along y
	790	ir = i-iouter+1 ! counter within a stride
	791	CALL fft_y( work_ffty(:,ir), 'backward' )
	792
	793	DO j = 0, ny
	794	f_out(k,j,i) = work_ffty(j,ir)
	795	ENDDO
	796	ENDDO
	797
	798	ENDDO
	799
	800	ENDDO
	801	!$OMP END PARALLEL
	802
	803	ENDIF
	804
	805	CALL cpu_log( log_point_s(7), 'fft_y', 'stop' )
	806
	807	END SUBROUTINE tr_xy_ffty
	808
	809
	810	SUBROUTINE fftx_tri_fftx( ar )
	811
	812	!------------------------------------------------------------------------------!
	813	! FFT along x, solution of the tridiagonal system and backward FFT for
	814	! a 1d-decomposition along x
	815	!
	816	! WARNING: this subroutine may still not work for hybrid parallelization
	817	! with OpenMP (for possible necessary changes see the original
	818	! routine poisfft_hybrid, developed by Klaus Ketelsen, May 2002)
	819	!------------------------------------------------------------------------------!
	820
	821	USE control_parameters
	822	USE cpulog
	823	USE grid_variables
	824	USE indices
	825	USE interfaces
	826	USE pegrid
	827	USE transpose_indices
	828
	829	IMPLICIT NONE
	830
	831	character(len=3) :: myth_char
	832
	833	INTEGER :: i, j, k, m, n, omp_get_thread_num, tn
	834
	835	REAL, DIMENSION(0:nx) :: work_fftx
	836	REAL, DIMENSION(0:nx,1:nz) :: work_trix
	837	REAL, DIMENSION(nnx,1:nza,nys_x:nyn_xa,pdims(1)) :: ar
	838	REAL, DIMENSION(:,:,:,:), ALLOCATABLE :: tri
	839
	840
	841	CALL cpu_log( log_point_s(33), 'fft_x + tridia', 'start' )
	842
	843	ALLOCATE( tri(5,0:nx,0:nz-1,0:threads_per_task-1) )
	844
	845	tn = 0 ! Default thread number in case of one thread
	846	!$OMP PARALLEL DO PRIVATE ( i, j, k, m, n, tn, work_fftx, work_trix )
	847	DO j = nys_x, nyn_x
	848
	849	!$ tn = omp_get_thread_num()
	850
	851	IF ( host(1:3) == 'nec' ) THEN
	852	!
	853	!-- Code optimized for vector processors
	854	DO k = 1, nz
	855
	856	m = 0
	857	DO n = 1, pdims(1)
	858	DO i = 1, nnx_pe( n-1 ) ! WARN: pcoord(i) should be used!!
	859	work_trix(m,k) = ar(i,k,j,n)
	860	m = m + 1
	861	ENDDO
	862	ENDDO
	863
	864	ENDDO
	865
	866	CALL fft_x_m( work_trix, 'forward' )
	867
	868	ELSE
	869	!
	870	!-- Cache optimized code
	871	DO k = 1, nz
	872
	873	m = 0
	874	DO n = 1, pdims(1)
	875	DO i = 1, nnx_pe( n-1 ) ! WARN: pcoord(i) should be used!!
	876	work_fftx(m) = ar(i,k,j,n)
	877	m = m + 1
	878	ENDDO
	879	ENDDO
	880
	881	CALL fft_x( work_fftx, 'forward' )
	882
	883	DO i = 0, nx
	884	work_trix(i,k) = work_fftx(i)
	885	ENDDO
	886
	887	ENDDO
	888
	889	ENDIF
	890
	891	!
	892	!-- Solve the linear equation system
	893	CALL tridia_1dd( ddx2, ddy2, nx, ny, j, work_trix, tri(:,:,:,tn) )
	894
	895	IF ( host(1:3) == 'nec' ) THEN
	896	!
	897	!-- Code optimized for vector processors
	898	CALL fft_x_m( work_trix, 'backward' )
	899
	900	DO k = 1, nz
	901
	902	m = 0
	903	DO n = 1, pdims(1)
	904	DO i = 1, nnx_pe( n-1 ) ! WARN: pcoord(i) should be used!!
	905	ar(i,k,j,n) = work_trix(m,k)
	906	m = m + 1
	907	ENDDO
	908	ENDDO
	909
	910	ENDDO
	911
	912	ELSE
	913	!
	914	!-- Cache optimized code
	915	DO k = 1, nz
	916
	917	DO i = 0, nx
	918	work_fftx(i) = work_trix(i,k)
	919	ENDDO
	920
	921	CALL fft_x( work_fftx, 'backward' )
	922
	923	m = 0
	924	DO n = 1, pdims(1)
	925	DO i = 1, nnx_pe( n-1 ) ! WARN: pcoord(i) should be used!!
	926	ar(i,k,j,n) = work_fftx(m)
	927	m = m + 1
	928	ENDDO
	929	ENDDO
	930
	931	ENDDO
	932
	933	ENDIF
	934
	935	ENDDO
	936
	937	DEALLOCATE( tri )
	938
	939	CALL cpu_log( log_point_s(33), 'fft_x + tridia', 'stop' )
	940
	941	END SUBROUTINE fftx_tri_fftx
	942
	943
	944	SUBROUTINE fftx_tr_xy( f_in, work, f_out )
	945
	946	!------------------------------------------------------------------------------!
	947	! Fourier-transformation along x with subsequent transposition x --> y for
	948	! a 1d-decomposition along y
	949	!
	950	! ATTENTION: The NEC-branch of this routine may significantly profit from
	951	! further optimizations. So far, performance is much worse than
	952	! for routine ffty_tr_yx (more than three times slower).
	953	!------------------------------------------------------------------------------!
	954
	955	USE control_parameters
	956	USE cpulog
	957	USE indices
	958	USE interfaces
	959	USE pegrid
	960	USE transpose_indices
	961
	962	IMPLICIT NONE
	963
	964	INTEGER :: i, j, k
	965
	966	REAL, DIMENSION(0:nx,1:nz,nys:nyn) :: work_fftx
	967	REAL, DIMENSION(1:nza,nys:nyna,0:nxa) :: f_in
	968	REAL, DIMENSION(nny,1:nza,nxl_y:nxr_ya,pdims(2)) :: f_out
	969	REAL, DIMENSION(nys:nyna,1:nza,0:nxa) :: work
	970
	971	!
	972	!-- Carry out the FFT along x, where all data are present due to the
	973	!-- 1d-decomposition along y. Resort the data in a way that y becomes
	974	!-- the first index.
	975	CALL cpu_log( log_point_s(4), 'fft_x', 'start' )
	976
	977	IF ( host(1:3) == 'nec' ) THEN
	978	!
	979	!-- Code for vector processors
	980	!$OMP PARALLEL PRIVATE ( i, j, k, work_fftx )
	981	!$OMP DO
	982	DO i = 0, nx
	983
	984	DO j = nys, nyn
	985	DO k = 1, nz
	986	work_fftx(i,k,j) = f_in(k,j,i)
	987	ENDDO
	988	ENDDO
	989
	990	ENDDO
	991
	992	!$OMP DO
	993	DO j = nys, nyn
	994
	995	CALL fft_x_m( work_fftx(:,:,j), 'forward' )
	996
	997	DO k = 1, nz
	998	DO i = 0, nx
	999	work(j,k,i) = work_fftx(i,k,j)
	1000	ENDDO
	1001	ENDDO
	1002
	1003	ENDDO
	1004	!$OMP END PARALLEL
	1005
	1006	ELSE
	1007
	1008	!
	1009	!-- Cache optimized code (there might be still a potential for better
	1010	!-- optimization).
	1011	!$OMP PARALLEL PRIVATE (i,j,k,work_fftx)
	1012	!$OMP DO
	1013	DO i = 0, nx
	1014
	1015	DO j = nys, nyn
	1016	DO k = 1, nz
	1017	work_fftx(i,k,j) = f_in(k,j,i)
	1018	ENDDO
	1019	ENDDO
	1020
	1021	ENDDO
	1022
	1023	!$OMP DO
	1024	DO j = nys, nyn
	1025	DO k = 1, nz
	1026
	1027	CALL fft_x( work_fftx(0:nx,k,j), 'forward' )
	1028
	1029	DO i = 0, nx
	1030	work(j,k,i) = work_fftx(i,k,j)
	1031	ENDDO
	1032	ENDDO
	1033
	1034	ENDDO
	1035	!$OMP END PARALLEL
	1036
	1037	ENDIF
	1038	CALL cpu_log( log_point_s(4), 'fft_x', 'pause' )
	1039
	1040	!
	1041	!-- Transpose array
	1042	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
	1043	CALL MPI_ALLTOALL( work(nys,1,0), sendrecvcount_xy, MPI_REAL, &
	1044	f_out(1,1,nxl_y,1), sendrecvcount_xy, MPI_REAL, &
	1045	comm1dy, ierr )
	1046	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
	1047
	1048	END SUBROUTINE fftx_tr_xy
	1049
	1050
	1051	SUBROUTINE tr_yx_fftx( f_in, work, f_out )
	1052
	1053	!------------------------------------------------------------------------------!
	1054	! Transposition y --> x with a subsequent backward Fourier transformation for
	1055	! a 1d-decomposition along x
	1056	!------------------------------------------------------------------------------!
	1057
	1058	USE control_parameters
	1059	USE cpulog
	1060	USE indices
	1061	USE interfaces
	1062	USE pegrid
	1063	USE transpose_indices
	1064
	1065	IMPLICIT NONE
	1066
	1067	INTEGER :: i, j, k
	1068
	1069	REAL, DIMENSION(0:nx,1:nz,nys:nyn) :: work_fftx
	1070	REAL, DIMENSION(nny,1:nza,nxl_y:nxr_ya,pdims(2)) :: f_in
	1071	REAL, DIMENSION(1:nza,nys:nyna,0:nxa) :: f_out
	1072	REAL, DIMENSION(nys:nyna,1:nza,0:nxa) :: work
	1073
	1074	!
	1075	!-- Transpose array
	1076	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
	1077	CALL MPI_ALLTOALL( f_in(1,1,nxl_y,1), sendrecvcount_xy, MPI_REAL, &
	1078	work(nys,1,0), sendrecvcount_xy, MPI_REAL, &
	1079	comm1dy, ierr )
	1080	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
	1081
	1082	!
	1083	!-- Carry out the FFT along x, where all data are present due to the
	1084	!-- 1d-decomposition along y. Resort the data in a way that y becomes
	1085	!-- the first index.
	1086	CALL cpu_log( log_point_s(4), 'fft_x', 'continue' )
	1087
	1088	IF ( host(1:3) == 'nec' ) THEN
	1089	!
	1090	!-- Code optimized for vector processors
	1091	!$OMP PARALLEL PRIVATE ( i, j, k, work_fftx )
	1092	!$OMP DO
	1093	DO j = nys, nyn
	1094
	1095	DO k = 1, nz
	1096	DO i = 0, nx
	1097	work_fftx(i,k,j) = work(j,k,i)
	1098	ENDDO
	1099	ENDDO
	1100
	1101	CALL fft_x_m( work_fftx(:,:,j), 'backward' )
	1102
	1103	ENDDO
	1104
	1105	!$OMP DO
	1106	DO i = 0, nx
	1107	DO j = nys, nyn
	1108	DO k = 1, nz
	1109	f_out(k,j,i) = work_fftx(i,k,j)
	1110	ENDDO
	1111	ENDDO
	1112	ENDDO
	1113	!$OMP END PARALLEL
	1114
	1115	ELSE
	1116
	1117	!
	1118	!-- Cache optimized code (there might be still a potential for better
	1119	!-- optimization).
	1120	!$OMP PARALLEL PRIVATE (i,j,k,work_fftx)
	1121	!$OMP DO
	1122	DO j = nys, nyn
	1123	DO k = 1, nz
	1124
	1125	DO i = 0, nx
	1126	work_fftx(i,k,j) = work(j,k,i)
	1127	ENDDO
	1128
	1129	CALL fft_x( work_fftx(0:nx,k,j), 'backward' )
	1130
	1131	ENDDO
	1132	ENDDO
	1133
	1134	!$OMP DO
	1135	DO i = 0, nx
	1136	DO j = nys, nyn
	1137	DO k = 1, nz
	1138	f_out(k,j,i) = work_fftx(i,k,j)
	1139	ENDDO
	1140	ENDDO
	1141	ENDDO
	1142	!$OMP END PARALLEL
	1143
	1144	ENDIF
	1145	CALL cpu_log( log_point_s(4), 'fft_x', 'stop' )
	1146
	1147	END SUBROUTINE tr_yx_fftx
	1148
	1149
	1150	SUBROUTINE ffty_tri_ffty( ar )
	1151
	1152	!------------------------------------------------------------------------------!
	1153	! FFT along y, solution of the tridiagonal system and backward FFT for
	1154	! a 1d-decomposition along y
	1155	!
	1156	! WARNING: this subroutine may still not work for hybrid parallelization
	1157	! with OpenMP (for possible necessary changes see the original
	1158	! routine poisfft_hybrid, developed by Klaus Ketelsen, May 2002)
	1159	!------------------------------------------------------------------------------!
	1160
	1161	USE control_parameters
	1162	USE cpulog
	1163	USE grid_variables
	1164	USE indices
	1165	USE interfaces
	1166	USE pegrid
	1167	USE transpose_indices
	1168
	1169	IMPLICIT NONE
	1170
	1171	INTEGER :: i, j, k, m, n, omp_get_thread_num, tn
	1172
	1173	REAL, DIMENSION(0:ny) :: work_ffty
	1174	REAL, DIMENSION(0:ny,1:nz) :: work_triy
	1175	REAL, DIMENSION(nny,1:nza,nxl_y:nxr_ya,pdims(2)) :: ar
	1176	REAL, DIMENSION(:,:,:,:), ALLOCATABLE :: tri
	1177
	1178
	1179	CALL cpu_log( log_point_s(39), 'fft_y + tridia', 'start' )
	1180
	1181	ALLOCATE( tri(5,0:ny,0:nz-1,0:threads_per_task-1) )
	1182
	1183	tn = 0 ! Default thread number in case of one thread
	1184	!$OMP PARALLEL PRIVATE ( i, j, k, m, n, tn, work_ffty, work_triy )
	1185	!$OMP DO
	1186	DO i = nxl_y, nxr_y
	1187
	1188	!$ tn = omp_get_thread_num()
	1189
	1190	IF ( host(1:3) == 'nec' ) THEN
	1191	!
	1192	!-- Code optimized for vector processors
	1193	DO k = 1, nz
	1194
	1195	m = 0
	1196	DO n = 1, pdims(2)
	1197	DO j = 1, nny_pe( n-1 ) ! WARN: pcoord(j) should be used!!
	1198	work_triy(m,k) = ar(j,k,i,n)
	1199	m = m + 1
	1200	ENDDO
	1201	ENDDO
	1202
	1203	ENDDO
	1204
	1205	CALL fft_y_m( work_triy, ny, 'forward' )
	1206
	1207	ELSE
	1208	!
	1209	!-- Cache optimized code
	1210	DO k = 1, nz
	1211
	1212	m = 0
	1213	DO n = 1, pdims(2)
	1214	DO j = 1, nny_pe( n-1 ) ! WARN: pcoord(j) should be used!!
	1215	work_ffty(m) = ar(j,k,i,n)
	1216	m = m + 1
	1217	ENDDO
	1218	ENDDO
	1219
	1220	CALL fft_y( work_ffty, 'forward' )
	1221
	1222	DO j = 0, ny
	1223	work_triy(j,k) = work_ffty(j)
	1224	ENDDO
	1225
	1226	ENDDO
	1227
	1228	ENDIF
	1229
	1230	!
	1231	!-- Solve the linear equation system
	1232	CALL tridia_1dd( ddy2, ddx2, ny, nx, i, work_triy, tri(:,:,:,tn) )
	1233
	1234	IF ( host(1:3) == 'nec' ) THEN
	1235	!
	1236	!-- Code optimized for vector processors
	1237	CALL fft_y_m( work_triy, ny, 'backward' )
	1238
	1239	DO k = 1, nz
	1240
	1241	m = 0
	1242	DO n = 1, pdims(2)
	1243	DO j = 1, nny_pe( n-1 ) ! WARN: pcoord(j) should be used!!
	1244	ar(j,k,i,n) = work_triy(m,k)
	1245	m = m + 1
	1246	ENDDO
	1247	ENDDO
	1248
	1249	ENDDO
	1250
	1251	ELSE
	1252	!
	1253	!-- Cache optimized code
	1254	DO k = 1, nz
	1255
	1256	DO j = 0, ny
	1257	work_ffty(j) = work_triy(j,k)
	1258	ENDDO
	1259
	1260	CALL fft_y( work_ffty, 'backward' )
	1261
	1262	m = 0
	1263	DO n = 1, pdims(2)
	1264	DO j = 1, nny_pe( n-1 ) ! WARN: pcoord(j) should be used!!
	1265	ar(j,k,i,n) = work_ffty(m)
	1266	m = m + 1
	1267	ENDDO
	1268	ENDDO
	1269
	1270	ENDDO
	1271
	1272	ENDIF
	1273
	1274	ENDDO
	1275	!$OMP END PARALLEL
	1276
	1277	DEALLOCATE( tri )
	1278
	1279	CALL cpu_log( log_point_s(39), 'fft_y + tridia', 'stop' )
	1280
	1281	END SUBROUTINE ffty_tri_ffty
	1282
	1283
	1284	SUBROUTINE tridia_1dd( ddx2, ddy2, nx, ny, j, ar, tri )
	1285
	1286	!------------------------------------------------------------------------------!
	1287	! Solves the linear system of equations for a 1d-decomposition along x (see
	1288	! tridia)
	1289	!
	1290	! Attention: when using the intel compiler, array tri must be passed as an
	1291	! argument to the contained subroutines. Otherwise addres faults
	1292	! will occur.
	1293	! On NEC, tri should not be passed (except for routine substi_1dd)
	1294	! because this causes very bad performance.
	1295	!------------------------------------------------------------------------------!
	1296
	1297	USE arrays_3d
	1298	USE control_parameters
	1299
	1300	USE pegrid
	1301
	1302	IMPLICIT NONE
	1303
	1304	INTEGER :: i, j, k, nnyh, nx, ny, omp_get_thread_num, tn
	1305
	1306	REAL :: ddx2, ddy2
	1307
	1308	REAL, DIMENSION(0:nx,1:nz) :: ar
	1309	REAL, DIMENSION(0:nx,0:nz-1) :: ar1
	1310	REAL, DIMENSION(5,0:nx,0:nz-1) :: tri
	1311
	1312
	1313	nnyh = ( ny + 1 ) / 2
	1314
	1315	!
	1316	!-- Define constant elements of the tridiagonal matrix.
	1317	!-- The compiler on SX6 does loop exchange. If 0:nx is a high power of 2,
	1318	!-- the exchanged loops create bank conflicts. The following directive
	1319	!-- prohibits loop exchange and the loops perform much better.
	1320	! tn = omp_get_thread_num()
	1321	! WRITE( 120+tn, * ) '+++ id=',myid,' nx=',nx,' thread=', omp_get_thread_num()
	1322	! CALL FLUSH_( 120+tn )
	1323	!CDIR NOLOOPCHG
	1324	DO k = 0, nz-1
	1325	DO i = 0,nx
	1326	tri(2,i,k) = ddzu(k+1) * ddzw(k+1)
	1327	tri(3,i,k) = ddzu(k+2) * ddzw(k+1)
	1328	ENDDO
	1329	ENDDO
	1330	! WRITE( 120+tn, * ) '+++ id=',myid,' end of first tridia loop thread=', omp_get_thread_num()
	1331	! CALL FLUSH_( 120+tn )
	1332
	1333	IF ( j <= nnyh ) THEN
	1334	#if defined( __lcmuk )
	1335	CALL maketri_1dd( j, tri )
	1336	#else
	1337	CALL maketri_1dd( j )
	1338	#endif
	1339	ELSE
	1340	#if defined( __lcmuk )
	1341	CALL maketri_1dd( ny+1-j, tri )
	1342	#else
	1343	CALL maketri_1dd( ny+1-j )
	1344	#endif
	1345	ENDIF
	1346	#if defined( __lcmuk )
	1347	CALL split_1dd( tri )
	1348	#else
	1349	CALL split_1dd
	1350	#endif
	1351	CALL substi_1dd( ar, tri )
	1352
	1353	CONTAINS
	1354
	1355	#if defined( __lcmuk )
	1356	SUBROUTINE maketri_1dd( j, tri )
	1357	#else
	1358	SUBROUTINE maketri_1dd( j )
	1359	#endif
	1360
	1361	!------------------------------------------------------------------------------!
	1362	! computes the i- and j-dependent component of the matrix
	1363	!------------------------------------------------------------------------------!
	1364
	1365	USE constants
	1366
	1367	IMPLICIT NONE
	1368
	1369	INTEGER :: i, j, k, nnxh
	1370	REAL :: a, c
	1371
	1372	REAL, DIMENSION(0:nx) :: l
	1373
	1374	#if defined( __lcmuk )
	1375	REAL, DIMENSION(5,0:nx,0:nz-1) :: tri
	1376	#endif
	1377
	1378
	1379	nnxh = ( nx + 1 ) / 2
	1380	!
	1381	!-- Provide the tridiagonal matrix for solution of the Poisson equation in
	1382	!-- Fourier space. The coefficients are computed following the method of
	1383	!-- Schmidt et al. (DFVLR-Mitteilung 84-15), which departs from Stephan
	1384	!-- Siano's original version by discretizing the Poisson equation,
	1385	!-- before it is Fourier-transformed
	1386	DO i = 0, nx
	1387	IF ( i >= 0 .AND. i < nnxh ) THEN
	1388	l(i) = 2.0 * ( 1.0 - COS( ( 2.0 * pi * i ) / &
	1389	FLOAT( nx+1 ) ) ) * ddx2 + &
	1390	2.0 * ( 1.0 - COS( ( 2.0 * pi * j ) / &
	1391	FLOAT( ny+1 ) ) ) * ddy2
	1392	ELSEIF ( i == nnxh ) THEN
	1393	l(i) = 2.0 * ( 1.0 - COS( ( 2.0 * pi * ( nx+1-i ) ) / &
	1394	FLOAT( nx+1 ) ) ) * ddx2 + &
	1395	2.0 * ( 1.0 - COS( ( 2.0 * pi * j ) / &
	1396	FLOAT( ny+1 ) ) ) * ddy2
	1397	ELSE
	1398	l(i) = 2.0 * ( 1.0 - COS( ( 2.0 * pi * ( nx+1-i ) ) / &
	1399	FLOAT( nx+1 ) ) ) * ddx2 + &
	1400	2.0 * ( 1.0 - COS( ( 2.0 * pi * j ) / &
	1401	FLOAT( ny+1 ) ) ) * ddy2
	1402	ENDIF
	1403	ENDDO
	1404
	1405	DO k = 0, nz-1
	1406	DO i = 0, nx
	1407	a = -1.0 * ddzu(k+2) * ddzw(k+1)
	1408	c = -1.0 * ddzu(k+1) * ddzw(k+1)
	1409	tri(1,i,k) = a + c - l(i)
	1410	ENDDO
	1411	ENDDO
	1412	IF ( ibc_p_b == 1 .OR. ibc_p_b == 2 ) THEN
	1413	DO i = 0, nx
	1414	tri(1,i,0) = tri(1,i,0) + tri(2,i,0)
	1415	ENDDO
	1416	ENDIF
	1417	IF ( ibc_p_t == 1 ) THEN
	1418	DO i = 0, nx
	1419	tri(1,i,nz-1) = tri(1,i,nz-1) + tri(3,i,nz-1)
	1420	ENDDO
	1421	ENDIF
	1422
	1423	END SUBROUTINE maketri_1dd
	1424
	1425
	1426	#if defined( __lcmuk )
	1427	SUBROUTINE split_1dd( tri )
	1428	#else
	1429	SUBROUTINE split_1dd
	1430	#endif
	1431
	1432	!------------------------------------------------------------------------------!
	1433	! Splitting of the tridiagonal matrix (Thomas algorithm)
	1434	!------------------------------------------------------------------------------!
	1435
	1436	IMPLICIT NONE
	1437
	1438	INTEGER :: i, k
	1439
	1440	#if defined( __lcmuk )
	1441	REAL, DIMENSION(5,0:nx,0:nz-1) :: tri
	1442	#endif
	1443
	1444
	1445	!
	1446	!-- Splitting
	1447	DO i = 0, nx
	1448	tri(4,i,0) = tri(1,i,0)
	1449	ENDDO
	1450	DO k = 1, nz-1
	1451	DO i = 0, nx
	1452	tri(5,i,k) = tri(2,i,k) / tri(4,i,k-1)
	1453	tri(4,i,k) = tri(1,i,k) - tri(3,i,k-1) * tri(5,i,k)
	1454	ENDDO
	1455	ENDDO
	1456
	1457	END SUBROUTINE split_1dd
	1458
	1459
	1460	SUBROUTINE substi_1dd( ar, tri )
	1461
	1462	!------------------------------------------------------------------------------!
	1463	! Substitution (Forward and Backward) (Thomas algorithm)
	1464	!------------------------------------------------------------------------------!
	1465
	1466	IMPLICIT NONE
	1467
	1468	INTEGER :: i, j, k
	1469
	1470	REAL, DIMENSION(0:nx,nz) :: ar
	1471	REAL, DIMENSION(0:nx,0:nz-1) :: ar1
	1472	REAL, DIMENSION(5,0:nx,0:nz-1) :: tri
	1473
	1474	!
	1475	!-- Forward substitution
	1476	DO i = 0, nx
	1477	ar1(i,0) = ar(i,1)
	1478	ENDDO
	1479	DO k = 1, nz-1
	1480	DO i = 0, nx
	1481	ar1(i,k) = ar(i,k+1) - tri(5,i,k) * ar1(i,k-1)
	1482	ENDDO
	1483	ENDDO
	1484
	1485	!
	1486	!-- Backward substitution
	1487	DO i = 0, nx
	1488	ar(i,nz) = ar1(i,nz-1) / tri(4,i,nz-1)
	1489	ENDDO
	1490	DO k = nz-2, 0, -1
	1491	DO i = 0, nx
	1492	ar(i,k+1) = ( ar1(i,k) - tri(3,i,k) * ar(i,k+2) ) &
	1493	/ tri(4,i,k)
	1494	ENDDO
	1495	ENDDO
	1496
	1497	END SUBROUTINE substi_1dd
	1498
	1499	END SUBROUTINE tridia_1dd
	1500
	1501	#endif
	1502
	1503	END MODULE poisfft_mod

Note: See TracBrowser for help on using the repository browser.

Download in other formats:

| Impressum | ©Leibniz Universität Hannover |