Home

Context Navigation

source: palm/trunk/SOURCE/poisfft.f90 @ 76

Last change on this file since 76 was 76, checked in by raasch, 17 years ago
changes for Neumann p conditions both at bottom and top
Property svn:keywords set to `Id`
File size: 44.9 KB

Rev	Line
[1]	1	MODULE poisfft_mod
	2
	3	!------------------------------------------------------------------------------!
	4	! Actual revisions:
	5	! -----------------
[76]	6	! Tridiagonal coefficients adjusted for Neumann boundary conditions both at
	7	! the bottom and the top.
[1]	8	!
	9	! Former revisions:
	10	! -----------------
[3]	11	! $Id: poisfft.f90 76 2007-03-29 00:58:32Z raasch $
	12	! RCS Log replace by Id keyword, revision history cleaned up
	13	!
[1]	14	! Revision 1.24 2006/08/04 15:00:24 raasch
	15	! Default setting of the thread number tn in case of not using OpenMP
	16	!
	17	! Revision 1.23 2006/02/23 12:48:38 raasch
	18	! Additional compiler directive in routine tridia_1dd for preventing loop
	19	! exchange on NEC-SX6
	20	!
	21	! Revision 1.20 2004/04/30 12:38:09 raasch
	22	! Parts of former poisfft_hybrid moved to this subroutine,
	23	! former subroutine changed to a module, renaming of FFT-subroutines and
	24	! -module, FFTs completely substituted by calls of fft_x and fft_y,
	25	! NAG fft used in the non-parallel case completely removed, l in maketri
	26	! is now a 1d-array, variables passed by modules instead of using parameter
	27	! lists, enlarged transposition arrays introduced
	28	!
	29	! Revision 1.1 1997/07/24 11:24:14 raasch
	30	! Initial revision
	31	!
	32	!
	33	! Description:
	34	! ------------
	35	! See below.
	36	!------------------------------------------------------------------------------!
	37
	38	!--------------------------------------------------------------------------!
	39	! poisfft !
	40	! !
	41	! Original version: Stephan Siano (pois3d) !
	42	! !
	43	! Institute of Meteorology and Climatology, University of Hannover !
	44	! Germany !
	45	! !
	46	! Version as of July 23,1996 !
	47	! !
	48	! !
	49	! Version for parallel computers: Siegfried Raasch !
	50	! !
	51	! Version as of July 03,1997 !
	52	! !
	53	! Solves the Poisson equation with a 2D spectral method !
	54	! d^2 p / dx^2 + d^2 p / dy^2 + d^2 p / dz^2 = s !
	55	! !
	56	! Input: !
	57	! real ar contains in the (nnx,nny,nnz) elements, !
	58	! starting from the element (1,nys,nxl), the !
	59	! values for s !
	60	! real work Temporary array !
	61	! !
	62	! Output: !
	63	! real ar contains the solution for p !
	64	!--------------------------------------------------------------------------!
	65
	66	USE fft_xy
	67	USE indices
	68	USE transpose_indices
	69
	70	IMPLICIT NONE
	71
	72	PRIVATE
	73	PUBLIC poisfft, poisfft_init
	74
	75	INTERFACE poisfft
	76	MODULE PROCEDURE poisfft
	77	END INTERFACE poisfft
	78
	79	INTERFACE poisfft_init
	80	MODULE PROCEDURE poisfft_init
	81	END INTERFACE poisfft_init
	82
	83	CONTAINS
	84
	85	SUBROUTINE poisfft_init
	86
	87	CALL fft_init
	88
	89	END SUBROUTINE poisfft_init
	90
	91
	92	SUBROUTINE poisfft( ar, work )
	93
	94	USE cpulog
	95	USE interfaces
	96	USE pegrid
	97
	98	IMPLICIT NONE
	99
	100	REAL, DIMENSION(1:nza,nys:nyna,nxl:nxra) :: ar, work
	101
	102
	103	CALL cpu_log( log_point_s(3), 'poisfft', 'start' )
	104
	105	!
	106	!-- Two-dimensional Fourier Transformation in x- and y-direction.
	107	#if defined( __parallel )
	108	IF ( pdims(2) == 1 ) THEN
	109
	110	!
	111	!-- 1d-domain-decomposition along x:
	112	!-- FFT along y and transposition y --> x
	113	CALL ffty_tr_yx( ar, work, ar )
	114
	115	!
	116	!-- FFT along x, solving the tridiagonal system and backward FFT
	117	CALL fftx_tri_fftx( ar )
	118
	119	!
	120	!-- Transposition x --> y and backward FFT along y
	121	CALL tr_xy_ffty( ar, work, ar )
	122
	123	ELSEIF ( pdims(1) == 1 ) THEN
	124
	125	!
	126	!-- 1d-domain-decomposition along y:
	127	!-- FFT along x and transposition x --> y
	128	CALL fftx_tr_xy( ar, work, ar )
	129
	130	!
	131	!-- FFT along y, solving the tridiagonal system and backward FFT
	132	CALL ffty_tri_ffty( ar )
	133
	134	!
	135	!-- Transposition y --> x and backward FFT along x
	136	CALL tr_yx_fftx( ar, work, ar )
	137
	138	ELSE
	139
	140	!
	141	!-- 2d-domain-decomposition
	142	!-- Transposition z --> x
	143	CALL cpu_log( log_point_s(5), 'transpo forward', 'start' )
	144	CALL transpose_zx( ar, work, ar, work, ar )
	145	CALL cpu_log( log_point_s(5), 'transpo forward', 'pause' )
	146
	147	CALL cpu_log( log_point_s(4), 'fft_x', 'start' )
	148	CALL fftxp( ar, 'forward' )
	149	CALL cpu_log( log_point_s(4), 'fft_x', 'pause' )
	150
	151	!
	152	!-- Transposition x --> y
	153	CALL cpu_log( log_point_s(5), 'transpo forward', 'continue' )
	154	CALL transpose_xy( ar, work, ar, work, ar )
	155	CALL cpu_log( log_point_s(5), 'transpo forward', 'pause' )
	156
	157	CALL cpu_log( log_point_s(7), 'fft_y', 'start' )
	158	CALL fftyp( ar, 'forward' )
	159	CALL cpu_log( log_point_s(7), 'fft_y', 'pause' )
	160
	161	!
	162	!-- Transposition y --> z
	163	CALL cpu_log( log_point_s(5), 'transpo forward', 'continue' )
	164	CALL transpose_yz( ar, work, ar, work, ar )
	165	CALL cpu_log( log_point_s(5), 'transpo forward', 'stop' )
	166
	167	!
	168	!-- Solve the Poisson equation in z-direction in cartesian space.
	169	CALL cpu_log( log_point_s(6), 'tridia', 'start' )
	170	CALL tridia( ar )
	171	CALL cpu_log( log_point_s(6), 'tridia', 'stop' )
	172
	173	!
	174	!-- Inverse Fourier Transformation
	175	!-- Transposition z --> y
	176	CALL cpu_log( log_point_s(8), 'transpo invers', 'start' )
	177	CALL transpose_zy( ar, work, ar, work, ar )
	178	CALL cpu_log( log_point_s(8), 'transpo invers', 'pause' )
	179
	180	CALL cpu_log( log_point_s(7), 'fft_y', 'continue' )
	181	CALL fftyp( ar, 'backward' )
	182	CALL cpu_log( log_point_s(7), 'fft_y', 'stop' )
	183
	184	!
	185	!-- Transposition y --> x
	186	CALL cpu_log( log_point_s(8), 'transpo invers', 'continue' )
	187	CALL transpose_yx( ar, work, ar, work, ar )
	188	CALL cpu_log( log_point_s(8), 'transpo invers', 'pause' )
	189
	190	CALL cpu_log( log_point_s(4), 'fft_x', 'continue' )
	191	CALL fftxp( ar, 'backward' )
	192	CALL cpu_log( log_point_s(4), 'fft_x', 'stop' )
	193
	194	!
	195	!-- Transposition x --> z
	196	CALL cpu_log( log_point_s(8), 'transpo invers', 'continue' )
	197	CALL transpose_xz( ar, work, ar, work, ar )
	198	CALL cpu_log( log_point_s(8), 'transpo invers', 'stop' )
	199
	200	ENDIF
	201
	202	#else
	203
	204	!
	205	!-- Two-dimensional Fourier Transformation along x- and y-direction.
	206	CALL cpu_log( log_point_s(4), 'fft_x', 'start' )
	207	CALL fftx( ar, 'forward' )
	208	CALL cpu_log( log_point_s(4), 'fft_x', 'pause' )
	209	CALL cpu_log( log_point_s(7), 'fft_y', 'start' )
	210	CALL ffty( ar, 'forward' )
	211	CALL cpu_log( log_point_s(7), 'fft_y', 'pause' )
	212
	213	!
	214	!-- Solve the Poisson equation in z-direction in cartesian space.
	215	CALL cpu_log( log_point_s(6), 'tridia', 'start' )
	216	CALL tridia( ar )
	217	CALL cpu_log( log_point_s(6), 'tridia', 'stop' )
	218
	219	!
	220	!-- Inverse Fourier Transformation.
	221	CALL cpu_log( log_point_s(7), 'fft_y', 'continue' )
	222	CALL ffty( ar, 'backward' )
	223	CALL cpu_log( log_point_s(7), 'fft_y', 'stop' )
	224	CALL cpu_log( log_point_s(4), 'fft_x', 'continue' )
	225	CALL fftx( ar, 'backward' )
	226	CALL cpu_log( log_point_s(4), 'fft_x', 'stop' )
	227
	228	#endif
	229
	230	CALL cpu_log( log_point_s(3), 'poisfft', 'stop' )
	231
	232	END SUBROUTINE poisfft
	233
	234
	235
	236	SUBROUTINE tridia( ar )
	237
	238	!------------------------------------------------------------------------------!
	239	! solves the linear system of equations:
	240	!
	241	! -(4 pi^2(i^2/(dx^2nnx^2)+j^2/(dy^2nny^2))+
	242	! 1/(dzu(k)dzw(k))+1/(dzu(k-1)dzw(k)))*p(i,j,k)+
	243	! 1/(dzu(k)dzw(k))p(i,j,k+1)+1/(dzu(k-1)dzw(k))p(i,j,k-1)=d(i,j,k)
	244	!
	245	! by using the Thomas algorithm
	246	!------------------------------------------------------------------------------!
	247
	248	USE arrays_3d
	249
	250	IMPLICIT NONE
	251
	252	INTEGER :: i, j, k, nnyh
	253
	254	REAL, DIMENSION(nxl_z:nxr_z,0:nz-1) :: ar1
	255	REAL, DIMENSION(5,nxl_z:nxr_z,0:nz-1) :: tri
	256
	257	#if defined( __parallel )
	258	REAL :: ar(nxl_z:nxr_za,nys_z:nyn_za,1:nza)
	259	#else
	260	REAL :: ar(1:nz,nys_z:nyn_z,nxl_z:nxr_z)
	261	#endif
	262
	263
	264	nnyh = (ny+1) / 2
	265
	266	!
	267	!-- Define constant elements of the tridiagonal matrix.
	268	DO k = 0, nz-1
	269	DO i = nxl_z, nxr_z
	270	tri(2,i,k) = ddzu(k+1) * ddzw(k+1)
	271	tri(3,i,k) = ddzu(k+2) * ddzw(k+1)
	272	ENDDO
	273	ENDDO
	274
	275	#if defined( __parallel )
	276	!
	277	!-- Repeat for all y-levels.
	278	DO j = nys_z, nyn_z
	279	IF ( j <= nnyh ) THEN
	280	CALL maketri( tri, j )
	281	ELSE
	282	CALL maketri( tri, ny+1-j )
	283	ENDIF
	284	CALL split( tri )
	285	CALL substi( ar, ar1, tri, j )
	286	ENDDO
	287	#else
	288	!
	289	!-- First y-level.
	290	CALL maketri( tri, nys_z )
	291	CALL split( tri )
	292	CALL substi( ar, ar1, tri, 0 )
	293
	294	!
	295	!-- Further y-levels.
	296	DO j = 1, nnyh - 1
	297	CALL maketri( tri, j )
	298	CALL split( tri )
	299	CALL substi( ar, ar1, tri, j )
	300	CALL substi( ar, ar1, tri, ny+1-j )
	301	ENDDO
	302	CALL maketri( tri, nnyh )
	303	CALL split( tri )
	304	CALL substi( ar, ar1, tri, nnyh+nys )
	305	#endif
	306
	307	CONTAINS
	308
	309	SUBROUTINE maketri( tri, j )
	310
	311	!------------------------------------------------------------------------------!
	312	! Computes the i- and j-dependent component of the matrix
	313	!------------------------------------------------------------------------------!
	314
	315	USE arrays_3d
	316	USE constants
	317	USE control_parameters
	318	USE grid_variables
	319
	320	IMPLICIT NONE
	321
	322	INTEGER :: i, j, k, nnxh
	323	REAL :: a, c
	324	REAL :: ll(nxl_z:nxr_z)
	325	REAL :: tri(5,nxl_z:nxr_z,0:nz-1)
	326
	327
	328	nnxh = ( nx + 1 ) / 2
	329
	330	!
	331	!-- Provide the tridiagonal matrix for solution of the Poisson equation in
	332	!-- Fourier space. The coefficients are computed following the method of
	333	!-- Schmidt et al. (DFVLR-Mitteilung 84-15), which departs from Stephan
	334	!-- Siano's original version by discretizing the Poisson equation,
	335	!-- before it is Fourier-transformed
	336	#if defined( __parallel )
	337	DO i = nxl_z, nxr_z
	338	IF ( i >= 0 .AND. i < nnxh ) THEN
	339	ll(i) = 2.0 * ( 1.0 - COS( ( 2.0 * pi * i ) / &
	340	FLOAT( nx+1 ) ) ) / ( dx * dx ) + &
	341	2.0 * ( 1.0 - COS( ( 2.0 * pi * j ) / &
	342	FLOAT( ny+1 ) ) ) / ( dy * dy )
	343	ELSEIF ( i == nnxh ) THEN
	344	ll(i) = 2.0 * ( 1.0 - COS( ( 2.0 * pi * ( nx+1-i ) ) / &
	345	FLOAT( nx+1 ) ) ) / ( dx * dx ) + &
	346	2.0 * ( 1.0 - COS( ( 2.0 * pi * j ) / &
	347	FLOAT( ny+1 ) ) ) / ( dy * dy )
	348	ELSE
	349	ll(i) = 2.0 * ( 1.0 - COS( ( 2.0 * pi * ( nx+1-i ) ) / &
	350	FLOAT( nx+1 ) ) ) / ( dx * dx ) + &
	351	2.0 * ( 1.0 - COS( ( 2.0 * pi * j ) / &
	352	FLOAT( ny+1 ) ) ) / ( dy * dy )
	353	ENDIF
	354	DO k = 0,nz-1
	355	a = -1.0 * ddzu(k+2) * ddzw(k+1)
	356	c = -1.0 * ddzu(k+1) * ddzw(k+1)
	357	tri(1,i,k) = a + c - ll(i)
	358	ENDDO
	359	ENDDO
	360	#else
	361	DO i = 0, nnxh
	362	ll(i) = 2.0 * ( 1.0 - COS( ( 2.0 * pi * i ) / FLOAT( nx+1 ) ) ) / &
	363	( dx * dx ) + &
	364	2.0 * ( 1.0 - COS( ( 2.0 * pi * j ) / FLOAT( ny+1 ) ) ) / &
	365	( dy * dy )
	366	DO k = 0, nz-1
	367	a = -1.0 * ddzu(k+2) * ddzw(k+1)
	368	c = -1.0 * ddzu(k+1) * ddzw(k+1)
	369	tri(1,i,k) = a + c - ll(i)
	370	IF ( i >= 1 .and. i < nnxh ) THEN
	371	tri(1,nx+1-i,k) = tri(1,i,k)
	372	ENDIF
	373	ENDDO
	374	ENDDO
	375	#endif
	376	IF ( ibc_p_b == 1 .OR. ibc_p_b == 2 ) THEN
	377	DO i = nxl_z, nxr_z
	378	tri(1,i,0) = tri(1,i,0) + tri(2,i,0)
	379	ENDDO
	380	ENDIF
	381	IF ( ibc_p_t == 1 ) THEN
	382	DO i = nxl_z, nxr_z
	383	tri(1,i,nz-1) = tri(1,i,nz-1) + tri(3,i,nz-1)
	384	ENDDO
	385	ENDIF
	386
	387	END SUBROUTINE maketri
	388
	389
	390	SUBROUTINE substi( ar, ar1, tri, j )
	391
	392	!------------------------------------------------------------------------------!
	393	! Substitution (Forward and Backward) (Thomas algorithm)
	394	!------------------------------------------------------------------------------!
	395
[76]	396	USE control_parameters
	397
[1]	398	IMPLICIT NONE
	399
	400	INTEGER :: i, j, k
	401	REAL :: ar1(nxl_z:nxr_z,0:nz-1)
	402	REAL :: tri(5,nxl_z:nxr_z,0:nz-1)
	403	#if defined( __parallel )
	404	REAL :: ar(nxl_z:nxr_za,nys_z:nyn_za,1:nza)
	405	#else
	406	REAL :: ar(1:nz,nys_z:nyn_z,nxl_z:nxr_z)
	407	#endif
	408
	409	!
	410	!-- Forward substitution.
	411	DO i = nxl_z, nxr_z
	412	#if defined( __parallel )
	413	ar1(i,0) = ar(i,j,1)
	414	#else
	415	ar1(i,0) = ar(1,j,i)
	416	#endif
	417	ENDDO
	418	DO k = 1, nz - 1
	419	DO i = nxl_z, nxr_z
	420	#if defined( __parallel )
	421	ar1(i,k) = ar(i,j,k+1) - tri(5,i,k) * ar1(i,k-1)
	422	#else
	423	ar1(i,k) = ar(k+1,j,i) - tri(5,i,k) * ar1(i,k-1)
	424	#endif
	425	ENDDO
	426	ENDDO
	427
	428	!
	429	!-- Backward substitution.
	430	DO i = nxl_z, nxr_z
	431	#if defined( __parallel )
	432	ar(i,j,nz) = ar1(i,nz-1) / tri(4,i,nz-1)
	433	#else
	434	ar(nz,j,i) = ar1(i,nz-1) / tri(4,i,nz-1)
	435	#endif
	436	ENDDO
	437	DO k = nz-2, 0, -1
	438	DO i = nxl_z, nxr_z
	439	#if defined( __parallel )
	440	ar(i,j,k+1) = ( ar1(i,k) - tri(3,i,k) * ar(i,j,k+2) ) &
	441	/ tri(4,i,k)
	442	#else
	443	ar(k+1,j,i) = ( ar1(i,k) - tri(3,i,k) * ar(k+2,j,i) ) &
	444	/ tri(4,i,k)
	445	#endif
	446	ENDDO
	447	ENDDO
	448
[76]	449	!
	450	!-- Indices i=0, j=0 correspond to horizontally averaged pressure.
	451	!-- The respective values of ar should be zero at all k-levels if
	452	!-- acceleration of horizontally averaged vertical velocity is zero.
	453	IF ( ibc_p_b == 1 .AND. ibc_p_t == 1 ) THEN
	454	IF ( j == 0 .AND. nxl_z == 0 ) THEN
	455	#if defined( __parallel )
	456	DO k = 1, nz
	457	ar(nxl_z,j,k) = 0.0
	458	ENDDO
	459	#else
	460	DO k = 1, nz
	461	ar(k,j,nxl_z) = 0.0
	462	ENDDO
	463	#endif
	464	ENDIF
	465	ENDIF
	466
[1]	467	END SUBROUTINE substi
	468
	469
	470	SUBROUTINE split( tri )
	471
	472	!------------------------------------------------------------------------------!
	473	! Splitting of the tridiagonal matrix (Thomas algorithm)
	474	!------------------------------------------------------------------------------!
	475
	476	IMPLICIT NONE
	477
	478	INTEGER :: i, k
	479	REAL :: tri(5,nxl_z:nxr_z,0:nz-1)
	480
	481	!
	482	!-- Splitting.
	483	DO i = nxl_z, nxr_z
	484	tri(4,i,0) = tri(1,i,0)
	485	ENDDO
	486	DO k = 1, nz-1
	487	DO i = nxl_z, nxr_z
	488	tri(5,i,k) = tri(2,i,k) / tri(4,i,k-1)
	489	tri(4,i,k) = tri(1,i,k) - tri(3,i,k-1) * tri(5,i,k)
	490	ENDDO
	491	ENDDO
	492
	493	END SUBROUTINE split
	494
	495	END SUBROUTINE tridia
	496
	497
	498	#if defined( __parallel )
	499	SUBROUTINE fftxp( ar, direction )
	500
	501	!------------------------------------------------------------------------------!
	502	! Fourier-transformation along x-direction Parallelized version
	503	!------------------------------------------------------------------------------!
	504
	505	IMPLICIT NONE
	506
	507	CHARACTER (LEN=*) :: direction
	508	INTEGER :: j, k
	509	REAL :: ar(0:nxa,nys_x:nyn_xa,nzb_x:nzt_xa)
	510
	511	!
	512	!-- Performing the fft with one of the methods implemented
	513	DO k = nzb_x, nzt_x
	514	DO j = nys_x, nyn_x
	515	CALL fft_x( ar(0:nx,j,k), direction )
	516	ENDDO
	517	ENDDO
	518
	519	END SUBROUTINE fftxp
	520
	521	#else
	522	SUBROUTINE fftx( ar, direction )
	523
	524	!------------------------------------------------------------------------------!
	525	! Fourier-transformation along x-direction Non parallel version
	526	!------------------------------------------------------------------------------!
	527
	528	IMPLICIT NONE
	529
	530	CHARACTER (LEN=*) :: direction
	531	INTEGER :: i, j, k
	532	REAL :: ar(1:nz,0:ny,0:nx)
	533
	534	!
	535	!-- Performing the fft with one of the methods implemented
	536	DO k = 1, nz
	537	DO j = 0, ny
	538	CALL fft_x( ar(k,j,0:nx), direction )
	539	ENDDO
	540	ENDDO
	541
	542	END SUBROUTINE fftx
	543	#endif
	544
	545
	546	#if defined( __parallel )
	547	SUBROUTINE fftyp( ar, direction )
	548
	549	!------------------------------------------------------------------------------!
	550	! Fourier-transformation along y-direction Parallelized version
	551	!------------------------------------------------------------------------------!
	552
	553	IMPLICIT NONE
	554
	555	CHARACTER (LEN=*) :: direction
	556	INTEGER :: i, k
	557	REAL :: ar(0:nya,nxl_y:nxr_ya,nzb_y:nzt_ya)
	558
	559	!
	560	!-- Performing the fft with one of the methods implemented
	561	DO k = nzb_y, nzt_y
	562	DO i = nxl_y, nxr_y
	563	CALL fft_y( ar(0:ny,i,k), direction )
	564	ENDDO
	565	ENDDO
	566
	567	END SUBROUTINE fftyp
	568
	569	#else
	570	SUBROUTINE ffty( ar, direction )
	571
	572	!------------------------------------------------------------------------------!
	573	! Fourier-transformation along y-direction Non parallel version
	574	!------------------------------------------------------------------------------!
	575
	576	IMPLICIT NONE
	577
	578	CHARACTER (LEN=*) :: direction
	579	INTEGER :: i, k
	580	REAL :: ar(1:nz,0:ny,0:nx)
	581
	582	!
	583	!-- Performing the fft with one of the methods implemented
	584	DO k = 1, nz
	585	DO i = 0, nx
	586	CALL fft_y( ar(k,0:ny,i), direction )
	587	ENDDO
	588	ENDDO
	589
	590	END SUBROUTINE ffty
	591	#endif
	592
	593	#if defined( __parallel )
	594	SUBROUTINE ffty_tr_yx( f_in, work, f_out )
	595
	596	!------------------------------------------------------------------------------!
	597	! Fourier-transformation along y with subsequent transposition y --> x for
	598	! a 1d-decomposition along x
	599	!
	600	! ATTENTION: The performance of this routine is much faster on the NEC-SX6,
	601	! if the first index of work_ffty_vec is odd. Otherwise
	602	! memory bank conflicts may occur (especially if the index is a
	603	! multiple of 128). That's why work_ffty_vec is dimensioned as
	604	! 0:ny+1.
	605	! Of course, this will not work if users are using an odd number
	606	! of gridpoints along y.
	607	!------------------------------------------------------------------------------!
	608
	609	USE control_parameters
	610	USE cpulog
	611	USE indices
	612	USE interfaces
	613	USE pegrid
	614	USE transpose_indices
	615
	616	IMPLICIT NONE
	617
	618	INTEGER :: i, iend, iouter, ir, j, k
	619	INTEGER, PARAMETER :: stridex = 4
	620
	621	REAL, DIMENSION(0:ny,stridex) :: work_ffty
	622	#if defined( __nec )
	623	REAL, DIMENSION(0:ny+1,1:nz,nxl:nxr) :: work_ffty_vec
	624	#endif
	625	REAL, DIMENSION(1:nza,0:nya,nxl:nxra) :: f_in
	626	REAL, DIMENSION(nnx,1:nza,nys_x:nyn_xa,pdims(1)) :: f_out
	627	REAL, DIMENSION(nxl:nxra,1:nza,0:nya) :: work
	628
	629	!
	630	!-- Carry out the FFT along y, where all data are present due to the
	631	!-- 1d-decomposition along x. Resort the data in a way that x becomes
	632	!-- the first index.
	633	CALL cpu_log( log_point_s(7), 'fft_y', 'start' )
	634
	635	IF ( host(1:3) == 'nec' ) THEN
	636	#if defined( __nec )
	637	!
	638	!-- Code optimized for vector processors
	639	!$OMP PARALLEL PRIVATE ( i, j, k, work_ffty_vec )
	640	!$OMP DO
	641	DO i = nxl, nxr
	642
	643	DO j = 0, ny
	644	DO k = 1, nz
	645	work_ffty_vec(j,k,i) = f_in(k,j,i)
	646	ENDDO
	647	ENDDO
	648
	649	CALL fft_y_m( work_ffty_vec(:,:,i), ny+1, 'forward' )
	650
	651	ENDDO
	652
	653	!$OMP DO
	654	DO k = 1, nz
	655	DO j = 0, ny
	656	DO i = nxl, nxr
	657	work(i,k,j) = work_ffty_vec(j,k,i)
	658	ENDDO
	659	ENDDO
	660	ENDDO
	661	!$OMP END PARALLEL
	662	#endif
	663
	664	ELSE
	665
	666	!
	667	!-- Cache optimized code.
	668	!-- The i-(x-)direction is split into a strided outer loop and an inner
	669	!-- loop for better cache performance
	670	!$OMP PARALLEL PRIVATE (i,iend,iouter,ir,j,k,work_ffty)
	671	!$OMP DO
	672	DO iouter = nxl, nxr, stridex
	673
	674	iend = MIN( iouter+stridex-1, nxr ) ! Upper bound for inner i loop
	675
	676	DO k = 1, nz
	677
	678	DO i = iouter, iend
	679
	680	ir = i-iouter+1 ! counter within a stride
	681	DO j = 0, ny
	682	work_ffty(j,ir) = f_in(k,j,i)
	683	ENDDO
	684	!
	685	!-- FFT along y
	686	CALL fft_y( work_ffty(:,ir), 'forward' )
	687
	688	ENDDO
	689
	690	!
	691	!-- Resort
	692	DO j = 0, ny
	693	DO i = iouter, iend
	694	work(i,k,j) = work_ffty(j,i-iouter+1)
	695	ENDDO
	696	ENDDO
	697
	698	ENDDO
	699
	700	ENDDO
	701	!$OMP END PARALLEL
	702
	703	ENDIF
	704	CALL cpu_log( log_point_s(7), 'fft_y', 'pause' )
	705
	706	!
	707	!-- Transpose array
	708	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
	709	CALL MPI_ALLTOALL( work(nxl,1,0), sendrecvcount_xy, MPI_REAL, &
	710	f_out(1,1,nys_x,1), sendrecvcount_xy, MPI_REAL, &
	711	comm1dx, ierr )
	712	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
	713
	714	END SUBROUTINE ffty_tr_yx
	715
	716
	717	SUBROUTINE tr_xy_ffty( f_in, work, f_out )
	718
	719	!------------------------------------------------------------------------------!
	720	! Transposition x --> y with a subsequent backward Fourier transformation for
	721	! a 1d-decomposition along x
	722	!------------------------------------------------------------------------------!
	723
	724	USE control_parameters
	725	USE cpulog
	726	USE indices
	727	USE interfaces
	728	USE pegrid
	729	USE transpose_indices
	730
	731	IMPLICIT NONE
	732
	733	INTEGER :: i, iend, iouter, ir, j, k
	734	INTEGER, PARAMETER :: stridex = 4
	735
	736	REAL, DIMENSION(0:ny,stridex) :: work_ffty
	737	#if defined( __nec )
	738	REAL, DIMENSION(0:ny+1,1:nz,nxl:nxr) :: work_ffty_vec
	739	#endif
	740	REAL, DIMENSION(nnx,1:nza,nys_x:nyn_xa,pdims(1)) :: f_in
	741	REAL, DIMENSION(1:nza,0:nya,nxl:nxra) :: f_out
	742	REAL, DIMENSION(nxl:nxra,1:nza,0:nya) :: work
	743
	744	!
	745	!-- Transpose array
	746	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
	747	CALL MPI_ALLTOALL( f_in(1,1,nys_x,1), sendrecvcount_xy, MPI_REAL, &
	748	work(nxl,1,0), sendrecvcount_xy, MPI_REAL, &
	749	comm1dx, ierr )
	750	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
	751
	752	!
	753	!-- Resort the data in a way that y becomes the first index and carry out the
	754	!-- backward fft along y.
	755	CALL cpu_log( log_point_s(7), 'fft_y', 'continue' )
	756
	757	IF ( host(1:3) == 'nec' ) THEN
	758	#if defined( __nec )
	759	!
	760	!-- Code optimized for vector processors
	761	!$OMP PARALLEL PRIVATE ( i, j, k, work_ffty_vec )
	762	!$OMP DO
	763	DO k = 1, nz
	764	DO j = 0, ny
	765	DO i = nxl, nxr
	766	work_ffty_vec(j,k,i) = work(i,k,j)
	767	ENDDO
	768	ENDDO
	769	ENDDO
	770
	771	!$OMP DO
	772	DO i = nxl, nxr
	773
	774	CALL fft_y_m( work_ffty_vec(:,:,i), ny+1, 'backward' )
	775
	776	DO j = 0, ny
	777	DO k = 1, nz
	778	f_out(k,j,i) = work_ffty_vec(j,k,i)
	779	ENDDO
	780	ENDDO
	781
	782	ENDDO
	783	!$OMP END PARALLEL
	784	#endif
	785
	786	ELSE
	787
	788	!
	789	!-- Cache optimized code.
	790	!-- The i-(x-)direction is split into a strided outer loop and an inner
	791	!-- loop for better cache performance
	792	!$OMP PARALLEL PRIVATE ( i, iend, iouter, ir, j, k, work_ffty )
	793	!$OMP DO
	794	DO iouter = nxl, nxr, stridex
	795
	796	iend = MIN( iouter+stridex-1, nxr ) ! Upper bound for inner i loop
	797
	798	DO k = 1, nz
	799	!
	800	!-- Resort
	801	DO j = 0, ny
	802	DO i = iouter, iend
	803	work_ffty(j,i-iouter+1) = work(i,k,j)
	804	ENDDO
	805	ENDDO
	806
	807	DO i = iouter, iend
	808
	809	!
	810	!-- FFT along y
	811	ir = i-iouter+1 ! counter within a stride
	812	CALL fft_y( work_ffty(:,ir), 'backward' )
	813
	814	DO j = 0, ny
	815	f_out(k,j,i) = work_ffty(j,ir)
	816	ENDDO
	817	ENDDO
	818
	819	ENDDO
	820
	821	ENDDO
	822	!$OMP END PARALLEL
	823
	824	ENDIF
	825
	826	CALL cpu_log( log_point_s(7), 'fft_y', 'stop' )
	827
	828	END SUBROUTINE tr_xy_ffty
	829
	830
	831	SUBROUTINE fftx_tri_fftx( ar )
	832
	833	!------------------------------------------------------------------------------!
	834	! FFT along x, solution of the tridiagonal system and backward FFT for
	835	! a 1d-decomposition along x
	836	!
	837	! WARNING: this subroutine may still not work for hybrid parallelization
	838	! with OpenMP (for possible necessary changes see the original
	839	! routine poisfft_hybrid, developed by Klaus Ketelsen, May 2002)
	840	!------------------------------------------------------------------------------!
	841
	842	USE control_parameters
	843	USE cpulog
	844	USE grid_variables
	845	USE indices
	846	USE interfaces
	847	USE pegrid
	848	USE transpose_indices
	849
	850	IMPLICIT NONE
	851
	852	character(len=3) :: myth_char
	853
	854	INTEGER :: i, j, k, m, n, omp_get_thread_num, tn
	855
	856	REAL, DIMENSION(0:nx) :: work_fftx
	857	REAL, DIMENSION(0:nx,1:nz) :: work_trix
	858	REAL, DIMENSION(nnx,1:nza,nys_x:nyn_xa,pdims(1)) :: ar
	859	REAL, DIMENSION(:,:,:,:), ALLOCATABLE :: tri
	860
	861
	862	CALL cpu_log( log_point_s(33), 'fft_x + tridia', 'start' )
	863
	864	ALLOCATE( tri(5,0:nx,0:nz-1,0:threads_per_task-1) )
	865
	866	tn = 0 ! Default thread number in case of one thread
	867	!$OMP PARALLEL DO PRIVATE ( i, j, k, m, n, tn, work_fftx, work_trix )
	868	DO j = nys_x, nyn_x
	869
	870	!$ tn = omp_get_thread_num()
	871
	872	IF ( host(1:3) == 'nec' ) THEN
	873	!
	874	!-- Code optimized for vector processors
	875	DO k = 1, nz
	876
	877	m = 0
	878	DO n = 1, pdims(1)
	879	DO i = 1, nnx_pe( n-1 ) ! WARN: pcoord(i) should be used!!
	880	work_trix(m,k) = ar(i,k,j,n)
	881	m = m + 1
	882	ENDDO
	883	ENDDO
	884
	885	ENDDO
	886
	887	CALL fft_x_m( work_trix, 'forward' )
	888
	889	ELSE
	890	!
	891	!-- Cache optimized code
	892	DO k = 1, nz
	893
	894	m = 0
	895	DO n = 1, pdims(1)
	896	DO i = 1, nnx_pe( n-1 ) ! WARN: pcoord(i) should be used!!
	897	work_fftx(m) = ar(i,k,j,n)
	898	m = m + 1
	899	ENDDO
	900	ENDDO
	901
	902	CALL fft_x( work_fftx, 'forward' )
	903
	904	DO i = 0, nx
	905	work_trix(i,k) = work_fftx(i)
	906	ENDDO
	907
	908	ENDDO
	909
	910	ENDIF
	911
	912	!
	913	!-- Solve the linear equation system
	914	CALL tridia_1dd( ddx2, ddy2, nx, ny, j, work_trix, tri(:,:,:,tn) )
	915
	916	IF ( host(1:3) == 'nec' ) THEN
	917	!
	918	!-- Code optimized for vector processors
	919	CALL fft_x_m( work_trix, 'backward' )
	920
	921	DO k = 1, nz
	922
	923	m = 0
	924	DO n = 1, pdims(1)
	925	DO i = 1, nnx_pe( n-1 ) ! WARN: pcoord(i) should be used!!
	926	ar(i,k,j,n) = work_trix(m,k)
	927	m = m + 1
	928	ENDDO
	929	ENDDO
	930
	931	ENDDO
	932
	933	ELSE
	934	!
	935	!-- Cache optimized code
	936	DO k = 1, nz
	937
	938	DO i = 0, nx
	939	work_fftx(i) = work_trix(i,k)
	940	ENDDO
	941
	942	CALL fft_x( work_fftx, 'backward' )
	943
	944	m = 0
	945	DO n = 1, pdims(1)
	946	DO i = 1, nnx_pe( n-1 ) ! WARN: pcoord(i) should be used!!
	947	ar(i,k,j,n) = work_fftx(m)
	948	m = m + 1
	949	ENDDO
	950	ENDDO
	951
	952	ENDDO
	953
	954	ENDIF
	955
	956	ENDDO
	957
	958	DEALLOCATE( tri )
	959
	960	CALL cpu_log( log_point_s(33), 'fft_x + tridia', 'stop' )
	961
	962	END SUBROUTINE fftx_tri_fftx
	963
	964
	965	SUBROUTINE fftx_tr_xy( f_in, work, f_out )
	966
	967	!------------------------------------------------------------------------------!
	968	! Fourier-transformation along x with subsequent transposition x --> y for
	969	! a 1d-decomposition along y
	970	!
	971	! ATTENTION: The NEC-branch of this routine may significantly profit from
	972	! further optimizations. So far, performance is much worse than
	973	! for routine ffty_tr_yx (more than three times slower).
	974	!------------------------------------------------------------------------------!
	975
	976	USE control_parameters
	977	USE cpulog
	978	USE indices
	979	USE interfaces
	980	USE pegrid
	981	USE transpose_indices
	982
	983	IMPLICIT NONE
	984
	985	INTEGER :: i, j, k
	986
	987	REAL, DIMENSION(0:nx,1:nz,nys:nyn) :: work_fftx
	988	REAL, DIMENSION(1:nza,nys:nyna,0:nxa) :: f_in
	989	REAL, DIMENSION(nny,1:nza,nxl_y:nxr_ya,pdims(2)) :: f_out
	990	REAL, DIMENSION(nys:nyna,1:nza,0:nxa) :: work
	991
	992	!
	993	!-- Carry out the FFT along x, where all data are present due to the
	994	!-- 1d-decomposition along y. Resort the data in a way that y becomes
	995	!-- the first index.
	996	CALL cpu_log( log_point_s(4), 'fft_x', 'start' )
	997
	998	IF ( host(1:3) == 'nec' ) THEN
	999	!
	1000	!-- Code for vector processors
	1001	!$OMP PARALLEL PRIVATE ( i, j, k, work_fftx )
	1002	!$OMP DO
	1003	DO i = 0, nx
	1004
	1005	DO j = nys, nyn
	1006	DO k = 1, nz
	1007	work_fftx(i,k,j) = f_in(k,j,i)
	1008	ENDDO
	1009	ENDDO
	1010
	1011	ENDDO
	1012
	1013	!$OMP DO
	1014	DO j = nys, nyn
	1015
	1016	CALL fft_x_m( work_fftx(:,:,j), 'forward' )
	1017
	1018	DO k = 1, nz
	1019	DO i = 0, nx
	1020	work(j,k,i) = work_fftx(i,k,j)
	1021	ENDDO
	1022	ENDDO
	1023
	1024	ENDDO
	1025	!$OMP END PARALLEL
	1026
	1027	ELSE
	1028
	1029	!
	1030	!-- Cache optimized code (there might be still a potential for better
	1031	!-- optimization).
	1032	!$OMP PARALLEL PRIVATE (i,j,k,work_fftx)
	1033	!$OMP DO
	1034	DO i = 0, nx
	1035
	1036	DO j = nys, nyn
	1037	DO k = 1, nz
	1038	work_fftx(i,k,j) = f_in(k,j,i)
	1039	ENDDO
	1040	ENDDO
	1041
	1042	ENDDO
	1043
	1044	!$OMP DO
	1045	DO j = nys, nyn
	1046	DO k = 1, nz
	1047
	1048	CALL fft_x( work_fftx(0:nx,k,j), 'forward' )
	1049
	1050	DO i = 0, nx
	1051	work(j,k,i) = work_fftx(i,k,j)
	1052	ENDDO
	1053	ENDDO
	1054
	1055	ENDDO
	1056	!$OMP END PARALLEL
	1057
	1058	ENDIF
	1059	CALL cpu_log( log_point_s(4), 'fft_x', 'pause' )
	1060
	1061	!
	1062	!-- Transpose array
	1063	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
	1064	CALL MPI_ALLTOALL( work(nys,1,0), sendrecvcount_xy, MPI_REAL, &
	1065	f_out(1,1,nxl_y,1), sendrecvcount_xy, MPI_REAL, &
	1066	comm1dy, ierr )
	1067	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
	1068
	1069	END SUBROUTINE fftx_tr_xy
	1070
	1071
	1072	SUBROUTINE tr_yx_fftx( f_in, work, f_out )
	1073
	1074	!------------------------------------------------------------------------------!
	1075	! Transposition y --> x with a subsequent backward Fourier transformation for
	1076	! a 1d-decomposition along x
	1077	!------------------------------------------------------------------------------!
	1078
	1079	USE control_parameters
	1080	USE cpulog
	1081	USE indices
	1082	USE interfaces
	1083	USE pegrid
	1084	USE transpose_indices
	1085
	1086	IMPLICIT NONE
	1087
	1088	INTEGER :: i, j, k
	1089
	1090	REAL, DIMENSION(0:nx,1:nz,nys:nyn) :: work_fftx
	1091	REAL, DIMENSION(nny,1:nza,nxl_y:nxr_ya,pdims(2)) :: f_in
	1092	REAL, DIMENSION(1:nza,nys:nyna,0:nxa) :: f_out
	1093	REAL, DIMENSION(nys:nyna,1:nza,0:nxa) :: work
	1094
	1095	!
	1096	!-- Transpose array
	1097	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
	1098	CALL MPI_ALLTOALL( f_in(1,1,nxl_y,1), sendrecvcount_xy, MPI_REAL, &
	1099	work(nys,1,0), sendrecvcount_xy, MPI_REAL, &
	1100	comm1dy, ierr )
	1101	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
	1102
	1103	!
	1104	!-- Carry out the FFT along x, where all data are present due to the
	1105	!-- 1d-decomposition along y. Resort the data in a way that y becomes
	1106	!-- the first index.
	1107	CALL cpu_log( log_point_s(4), 'fft_x', 'continue' )
	1108
	1109	IF ( host(1:3) == 'nec' ) THEN
	1110	!
	1111	!-- Code optimized for vector processors
	1112	!$OMP PARALLEL PRIVATE ( i, j, k, work_fftx )
	1113	!$OMP DO
	1114	DO j = nys, nyn
	1115
	1116	DO k = 1, nz
	1117	DO i = 0, nx
	1118	work_fftx(i,k,j) = work(j,k,i)
	1119	ENDDO
	1120	ENDDO
	1121
	1122	CALL fft_x_m( work_fftx(:,:,j), 'backward' )
	1123
	1124	ENDDO
	1125
	1126	!$OMP DO
	1127	DO i = 0, nx
	1128	DO j = nys, nyn
	1129	DO k = 1, nz
	1130	f_out(k,j,i) = work_fftx(i,k,j)
	1131	ENDDO
	1132	ENDDO
	1133	ENDDO
	1134	!$OMP END PARALLEL
	1135
	1136	ELSE
	1137
	1138	!
	1139	!-- Cache optimized code (there might be still a potential for better
	1140	!-- optimization).
	1141	!$OMP PARALLEL PRIVATE (i,j,k,work_fftx)
	1142	!$OMP DO
	1143	DO j = nys, nyn
	1144	DO k = 1, nz
	1145
	1146	DO i = 0, nx
	1147	work_fftx(i,k,j) = work(j,k,i)
	1148	ENDDO
	1149
	1150	CALL fft_x( work_fftx(0:nx,k,j), 'backward' )
	1151
	1152	ENDDO
	1153	ENDDO
	1154
	1155	!$OMP DO
	1156	DO i = 0, nx
	1157	DO j = nys, nyn
	1158	DO k = 1, nz
	1159	f_out(k,j,i) = work_fftx(i,k,j)
	1160	ENDDO
	1161	ENDDO
	1162	ENDDO
	1163	!$OMP END PARALLEL
	1164
	1165	ENDIF
	1166	CALL cpu_log( log_point_s(4), 'fft_x', 'stop' )
	1167
	1168	END SUBROUTINE tr_yx_fftx
	1169
	1170
	1171	SUBROUTINE ffty_tri_ffty( ar )
	1172
	1173	!------------------------------------------------------------------------------!
	1174	! FFT along y, solution of the tridiagonal system and backward FFT for
	1175	! a 1d-decomposition along y
	1176	!
	1177	! WARNING: this subroutine may still not work for hybrid parallelization
	1178	! with OpenMP (for possible necessary changes see the original
	1179	! routine poisfft_hybrid, developed by Klaus Ketelsen, May 2002)
	1180	!------------------------------------------------------------------------------!
	1181
	1182	USE control_parameters
	1183	USE cpulog
	1184	USE grid_variables
	1185	USE indices
	1186	USE interfaces
	1187	USE pegrid
	1188	USE transpose_indices
	1189
	1190	IMPLICIT NONE
	1191
	1192	INTEGER :: i, j, k, m, n, omp_get_thread_num, tn
	1193
	1194	REAL, DIMENSION(0:ny) :: work_ffty
	1195	REAL, DIMENSION(0:ny,1:nz) :: work_triy
	1196	REAL, DIMENSION(nny,1:nza,nxl_y:nxr_ya,pdims(2)) :: ar
	1197	REAL, DIMENSION(:,:,:,:), ALLOCATABLE :: tri
	1198
	1199
	1200	CALL cpu_log( log_point_s(39), 'fft_y + tridia', 'start' )
	1201
	1202	ALLOCATE( tri(5,0:ny,0:nz-1,0:threads_per_task-1) )
	1203
	1204	tn = 0 ! Default thread number in case of one thread
	1205	!$OMP PARALLEL PRIVATE ( i, j, k, m, n, tn, work_ffty, work_triy )
	1206	!$OMP DO
	1207	DO i = nxl_y, nxr_y
	1208
	1209	!$ tn = omp_get_thread_num()
	1210
	1211	IF ( host(1:3) == 'nec' ) THEN
	1212	!
	1213	!-- Code optimized for vector processors
	1214	DO k = 1, nz
	1215
	1216	m = 0
	1217	DO n = 1, pdims(2)
	1218	DO j = 1, nny_pe( n-1 ) ! WARN: pcoord(j) should be used!!
	1219	work_triy(m,k) = ar(j,k,i,n)
	1220	m = m + 1
	1221	ENDDO
	1222	ENDDO
	1223
	1224	ENDDO
	1225
	1226	CALL fft_y_m( work_triy, ny, 'forward' )
	1227
	1228	ELSE
	1229	!
	1230	!-- Cache optimized code
	1231	DO k = 1, nz
	1232
	1233	m = 0
	1234	DO n = 1, pdims(2)
	1235	DO j = 1, nny_pe( n-1 ) ! WARN: pcoord(j) should be used!!
	1236	work_ffty(m) = ar(j,k,i,n)
	1237	m = m + 1
	1238	ENDDO
	1239	ENDDO
	1240
	1241	CALL fft_y( work_ffty, 'forward' )
	1242
	1243	DO j = 0, ny
	1244	work_triy(j,k) = work_ffty(j)
	1245	ENDDO
	1246
	1247	ENDDO
	1248
	1249	ENDIF
	1250
	1251	!
	1252	!-- Solve the linear equation system
	1253	CALL tridia_1dd( ddy2, ddx2, ny, nx, i, work_triy, tri(:,:,:,tn) )
	1254
	1255	IF ( host(1:3) == 'nec' ) THEN
	1256	!
	1257	!-- Code optimized for vector processors
	1258	CALL fft_y_m( work_triy, ny, 'backward' )
	1259
	1260	DO k = 1, nz
	1261
	1262	m = 0
	1263	DO n = 1, pdims(2)
	1264	DO j = 1, nny_pe( n-1 ) ! WARN: pcoord(j) should be used!!
	1265	ar(j,k,i,n) = work_triy(m,k)
	1266	m = m + 1
	1267	ENDDO
	1268	ENDDO
	1269
	1270	ENDDO
	1271
	1272	ELSE
	1273	!
	1274	!-- Cache optimized code
	1275	DO k = 1, nz
	1276
	1277	DO j = 0, ny
	1278	work_ffty(j) = work_triy(j,k)
	1279	ENDDO
	1280
	1281	CALL fft_y( work_ffty, 'backward' )
	1282
	1283	m = 0
	1284	DO n = 1, pdims(2)
	1285	DO j = 1, nny_pe( n-1 ) ! WARN: pcoord(j) should be used!!
	1286	ar(j,k,i,n) = work_ffty(m)
	1287	m = m + 1
	1288	ENDDO
	1289	ENDDO
	1290
	1291	ENDDO
	1292
	1293	ENDIF
	1294
	1295	ENDDO
	1296	!$OMP END PARALLEL
	1297
	1298	DEALLOCATE( tri )
	1299
	1300	CALL cpu_log( log_point_s(39), 'fft_y + tridia', 'stop' )
	1301
	1302	END SUBROUTINE ffty_tri_ffty
	1303
	1304
	1305	SUBROUTINE tridia_1dd( ddx2, ddy2, nx, ny, j, ar, tri )
	1306
	1307	!------------------------------------------------------------------------------!
	1308	! Solves the linear system of equations for a 1d-decomposition along x (see
	1309	! tridia)
	1310	!
	1311	! Attention: when using the intel compiler, array tri must be passed as an
	1312	! argument to the contained subroutines. Otherwise addres faults
	1313	! will occur.
	1314	! On NEC, tri should not be passed (except for routine substi_1dd)
	1315	! because this causes very bad performance.
	1316	!------------------------------------------------------------------------------!
	1317
	1318	USE arrays_3d
	1319	USE control_parameters
	1320
	1321	USE pegrid
	1322
	1323	IMPLICIT NONE
	1324
	1325	INTEGER :: i, j, k, nnyh, nx, ny, omp_get_thread_num, tn
	1326
	1327	REAL :: ddx2, ddy2
	1328
	1329	REAL, DIMENSION(0:nx,1:nz) :: ar
	1330	REAL, DIMENSION(0:nx,0:nz-1) :: ar1
	1331	REAL, DIMENSION(5,0:nx,0:nz-1) :: tri
	1332
	1333
	1334	nnyh = ( ny + 1 ) / 2
	1335
	1336	!
	1337	!-- Define constant elements of the tridiagonal matrix.
	1338	!-- The compiler on SX6 does loop exchange. If 0:nx is a high power of 2,
	1339	!-- the exchanged loops create bank conflicts. The following directive
	1340	!-- prohibits loop exchange and the loops perform much better.
	1341	! tn = omp_get_thread_num()
	1342	! WRITE( 120+tn, * ) '+++ id=',myid,' nx=',nx,' thread=', omp_get_thread_num()
	1343	! CALL FLUSH_( 120+tn )
	1344	!CDIR NOLOOPCHG
	1345	DO k = 0, nz-1
	1346	DO i = 0,nx
	1347	tri(2,i,k) = ddzu(k+1) * ddzw(k+1)
	1348	tri(3,i,k) = ddzu(k+2) * ddzw(k+1)
	1349	ENDDO
	1350	ENDDO
	1351	! WRITE( 120+tn, * ) '+++ id=',myid,' end of first tridia loop thread=', omp_get_thread_num()
	1352	! CALL FLUSH_( 120+tn )
	1353
	1354	IF ( j <= nnyh ) THEN
	1355	#if defined( __lcmuk )
	1356	CALL maketri_1dd( j, tri )
	1357	#else
	1358	CALL maketri_1dd( j )
	1359	#endif
	1360	ELSE
	1361	#if defined( __lcmuk )
	1362	CALL maketri_1dd( ny+1-j, tri )
	1363	#else
	1364	CALL maketri_1dd( ny+1-j )
	1365	#endif
	1366	ENDIF
	1367	#if defined( __lcmuk )
	1368	CALL split_1dd( tri )
	1369	#else
	1370	CALL split_1dd
	1371	#endif
	1372	CALL substi_1dd( ar, tri )
	1373
	1374	CONTAINS
	1375
	1376	#if defined( __lcmuk )
	1377	SUBROUTINE maketri_1dd( j, tri )
	1378	#else
	1379	SUBROUTINE maketri_1dd( j )
	1380	#endif
	1381
	1382	!------------------------------------------------------------------------------!
	1383	! computes the i- and j-dependent component of the matrix
	1384	!------------------------------------------------------------------------------!
	1385
	1386	USE constants
	1387
	1388	IMPLICIT NONE
	1389
	1390	INTEGER :: i, j, k, nnxh
	1391	REAL :: a, c
	1392
	1393	REAL, DIMENSION(0:nx) :: l
	1394
	1395	#if defined( __lcmuk )
	1396	REAL, DIMENSION(5,0:nx,0:nz-1) :: tri
	1397	#endif
	1398
	1399
	1400	nnxh = ( nx + 1 ) / 2
	1401	!
	1402	!-- Provide the tridiagonal matrix for solution of the Poisson equation in
	1403	!-- Fourier space. The coefficients are computed following the method of
	1404	!-- Schmidt et al. (DFVLR-Mitteilung 84-15), which departs from Stephan
	1405	!-- Siano's original version by discretizing the Poisson equation,
	1406	!-- before it is Fourier-transformed
	1407	DO i = 0, nx
	1408	IF ( i >= 0 .AND. i < nnxh ) THEN
	1409	l(i) = 2.0 * ( 1.0 - COS( ( 2.0 * pi * i ) / &
	1410	FLOAT( nx+1 ) ) ) * ddx2 + &
	1411	2.0 * ( 1.0 - COS( ( 2.0 * pi * j ) / &
	1412	FLOAT( ny+1 ) ) ) * ddy2
	1413	ELSEIF ( i == nnxh ) THEN
	1414	l(i) = 2.0 * ( 1.0 - COS( ( 2.0 * pi * ( nx+1-i ) ) / &
	1415	FLOAT( nx+1 ) ) ) * ddx2 + &
	1416	2.0 * ( 1.0 - COS( ( 2.0 * pi * j ) / &
	1417	FLOAT( ny+1 ) ) ) * ddy2
	1418	ELSE
	1419	l(i) = 2.0 * ( 1.0 - COS( ( 2.0 * pi * ( nx+1-i ) ) / &
	1420	FLOAT( nx+1 ) ) ) * ddx2 + &
	1421	2.0 * ( 1.0 - COS( ( 2.0 * pi * j ) / &
	1422	FLOAT( ny+1 ) ) ) * ddy2
	1423	ENDIF
	1424	ENDDO
	1425
	1426	DO k = 0, nz-1
	1427	DO i = 0, nx
	1428	a = -1.0 * ddzu(k+2) * ddzw(k+1)
	1429	c = -1.0 * ddzu(k+1) * ddzw(k+1)
	1430	tri(1,i,k) = a + c - l(i)
	1431	ENDDO
	1432	ENDDO
	1433	IF ( ibc_p_b == 1 .OR. ibc_p_b == 2 ) THEN
	1434	DO i = 0, nx
	1435	tri(1,i,0) = tri(1,i,0) + tri(2,i,0)
	1436	ENDDO
	1437	ENDIF
	1438	IF ( ibc_p_t == 1 ) THEN
	1439	DO i = 0, nx
	1440	tri(1,i,nz-1) = tri(1,i,nz-1) + tri(3,i,nz-1)
	1441	ENDDO
	1442	ENDIF
	1443
	1444	END SUBROUTINE maketri_1dd
	1445
	1446
	1447	#if defined( __lcmuk )
	1448	SUBROUTINE split_1dd( tri )
	1449	#else
	1450	SUBROUTINE split_1dd
	1451	#endif
	1452
	1453	!------------------------------------------------------------------------------!
	1454	! Splitting of the tridiagonal matrix (Thomas algorithm)
	1455	!------------------------------------------------------------------------------!
	1456
	1457	IMPLICIT NONE
	1458
	1459	INTEGER :: i, k
	1460
	1461	#if defined( __lcmuk )
	1462	REAL, DIMENSION(5,0:nx,0:nz-1) :: tri
	1463	#endif
	1464
	1465
	1466	!
	1467	!-- Splitting
	1468	DO i = 0, nx
	1469	tri(4,i,0) = tri(1,i,0)
	1470	ENDDO
	1471	DO k = 1, nz-1
	1472	DO i = 0, nx
	1473	tri(5,i,k) = tri(2,i,k) / tri(4,i,k-1)
	1474	tri(4,i,k) = tri(1,i,k) - tri(3,i,k-1) * tri(5,i,k)
	1475	ENDDO
	1476	ENDDO
	1477
	1478	END SUBROUTINE split_1dd
	1479
	1480
	1481	SUBROUTINE substi_1dd( ar, tri )
	1482
	1483	!------------------------------------------------------------------------------!
	1484	! Substitution (Forward and Backward) (Thomas algorithm)
	1485	!------------------------------------------------------------------------------!
	1486
	1487	IMPLICIT NONE
	1488
[76]	1489	INTEGER :: i, k
[1]	1490
	1491	REAL, DIMENSION(0:nx,nz) :: ar
	1492	REAL, DIMENSION(0:nx,0:nz-1) :: ar1
	1493	REAL, DIMENSION(5,0:nx,0:nz-1) :: tri
	1494
	1495	!
	1496	!-- Forward substitution
	1497	DO i = 0, nx
	1498	ar1(i,0) = ar(i,1)
	1499	ENDDO
	1500	DO k = 1, nz-1
	1501	DO i = 0, nx
	1502	ar1(i,k) = ar(i,k+1) - tri(5,i,k) * ar1(i,k-1)
	1503	ENDDO
	1504	ENDDO
	1505
	1506	!
	1507	!-- Backward substitution
	1508	DO i = 0, nx
	1509	ar(i,nz) = ar1(i,nz-1) / tri(4,i,nz-1)
	1510	ENDDO
	1511	DO k = nz-2, 0, -1
	1512	DO i = 0, nx
	1513	ar(i,k+1) = ( ar1(i,k) - tri(3,i,k) * ar(i,k+2) ) &
	1514	/ tri(4,i,k)
	1515	ENDDO
	1516	ENDDO
	1517
[76]	1518	!
	1519	!-- Indices i=0, j=0 correspond to horizontally averaged pressure.
	1520	!-- The respective values of ar should be zero at all k-levels if
	1521	!-- acceleration of horizontally averaged vertical velocity is zero.
	1522	IF ( ibc_p_b == 1 .AND. ibc_p_t == 1 ) THEN
	1523	IF ( j == 0 ) THEN
	1524	DO k = 1, nz
	1525	ar(0,k) = 0.0
	1526	ENDDO
	1527	ENDIF
	1528	ENDIF
	1529
[1]	1530	END SUBROUTINE substi_1dd
	1531
	1532	END SUBROUTINE tridia_1dd
	1533
	1534	#endif
	1535
	1536	END MODULE poisfft_mod

Note: See TracBrowser for help on using the repository browser.

Download in other formats:

| Impressum | ©Leibniz Universität Hannover |