Home

Context Navigation

source: palm/trunk/SOURCE/poisfft.f90 @ 624

Last change on this file since 624 was 623, checked in by raasch, 14 years ago
last commit documented
Property svn:keywords set to `Id`
File size: 44.9 KB

Rev	Line
[1]	1	MODULE poisfft_mod
	2
	3	!------------------------------------------------------------------------------!
[484]	4	! Current revisions:
[1]	5	! -----------------
[623]	6	!
[1]	7	!
	8	! Former revisions:
	9	! -----------------
[3]	10	! $Id: poisfft.f90 623 2010-12-10 08:52:17Z heinze $
[77]	11	!
[623]	12	! 622 2010-12-10 08:08:13Z raasch
	13	! optional barriers included in order to speed up collective operations
	14	!
[392]	15	! 377 2009-09-04 11:09:00Z raasch
	16	! __lcmuk changed to __lc to avoid problems with Intel compiler on sgi-ice
	17	!
[198]	18	! 164 2008-05-15 08:46:15Z raasch
	19	! Arguments removed from transpose routines
	20	!
[139]	21	! 128 2007-10-26 13:11:14Z raasch
	22	! Bugfix: wavenumber calculation for even nx in routines maketri
	23	!
[90]	24	! 85 2007-05-11 09:35:14Z raasch
	25	! Bugfix: work_fft*_vec removed from some PRIVATE-declarations
	26	!
[77]	27	! 76 2007-03-29 00:58:32Z raasch
	28	! Tridiagonal coefficients adjusted for Neumann boundary conditions both at
	29	! the bottom and the top.
	30	!
[3]	31	! RCS Log replace by Id keyword, revision history cleaned up
	32	!
[1]	33	! Revision 1.24 2006/08/04 15:00:24 raasch
	34	! Default setting of the thread number tn in case of not using OpenMP
	35	!
	36	! Revision 1.23 2006/02/23 12:48:38 raasch
	37	! Additional compiler directive in routine tridia_1dd for preventing loop
	38	! exchange on NEC-SX6
	39	!
	40	! Revision 1.20 2004/04/30 12:38:09 raasch
	41	! Parts of former poisfft_hybrid moved to this subroutine,
	42	! former subroutine changed to a module, renaming of FFT-subroutines and
	43	! -module, FFTs completely substituted by calls of fft_x and fft_y,
	44	! NAG fft used in the non-parallel case completely removed, l in maketri
	45	! is now a 1d-array, variables passed by modules instead of using parameter
	46	! lists, enlarged transposition arrays introduced
	47	!
	48	! Revision 1.1 1997/07/24 11:24:14 raasch
	49	! Initial revision
	50	!
	51	!
	52	! Description:
	53	! ------------
	54	! See below.
	55	!------------------------------------------------------------------------------!
	56
	57	!--------------------------------------------------------------------------!
	58	! poisfft !
	59	! !
	60	! Original version: Stephan Siano (pois3d) !
	61	! !
	62	! Institute of Meteorology and Climatology, University of Hannover !
	63	! Germany !
	64	! !
	65	! Version as of July 23,1996 !
	66	! !
	67	! !
	68	! Version for parallel computers: Siegfried Raasch !
	69	! !
	70	! Version as of July 03,1997 !
	71	! !
	72	! Solves the Poisson equation with a 2D spectral method !
	73	! d^2 p / dx^2 + d^2 p / dy^2 + d^2 p / dz^2 = s !
	74	! !
	75	! Input: !
	76	! real ar contains in the (nnx,nny,nnz) elements, !
	77	! starting from the element (1,nys,nxl), the !
	78	! values for s !
	79	! real work Temporary array !
	80	! !
	81	! Output: !
	82	! real ar contains the solution for p !
	83	!--------------------------------------------------------------------------!
	84
	85	USE fft_xy
	86	USE indices
	87	USE transpose_indices
	88
	89	IMPLICIT NONE
	90
	91	PRIVATE
	92	PUBLIC poisfft, poisfft_init
	93
	94	INTERFACE poisfft
	95	MODULE PROCEDURE poisfft
	96	END INTERFACE poisfft
	97
	98	INTERFACE poisfft_init
	99	MODULE PROCEDURE poisfft_init
	100	END INTERFACE poisfft_init
	101
	102	CONTAINS
	103
	104	SUBROUTINE poisfft_init
	105
	106	CALL fft_init
	107
	108	END SUBROUTINE poisfft_init
	109
	110
	111	SUBROUTINE poisfft( ar, work )
	112
	113	USE cpulog
	114	USE interfaces
	115	USE pegrid
	116
	117	IMPLICIT NONE
	118
	119	REAL, DIMENSION(1:nza,nys:nyna,nxl:nxra) :: ar, work
	120
	121
	122	CALL cpu_log( log_point_s(3), 'poisfft', 'start' )
	123
	124	!
	125	!-- Two-dimensional Fourier Transformation in x- and y-direction.
	126	#if defined( __parallel )
	127	IF ( pdims(2) == 1 ) THEN
	128
	129	!
	130	!-- 1d-domain-decomposition along x:
	131	!-- FFT along y and transposition y --> x
	132	CALL ffty_tr_yx( ar, work, ar )
	133
	134	!
	135	!-- FFT along x, solving the tridiagonal system and backward FFT
	136	CALL fftx_tri_fftx( ar )
	137
	138	!
	139	!-- Transposition x --> y and backward FFT along y
	140	CALL tr_xy_ffty( ar, work, ar )
	141
	142	ELSEIF ( pdims(1) == 1 ) THEN
	143
	144	!
	145	!-- 1d-domain-decomposition along y:
	146	!-- FFT along x and transposition x --> y
	147	CALL fftx_tr_xy( ar, work, ar )
	148
	149	!
	150	!-- FFT along y, solving the tridiagonal system and backward FFT
	151	CALL ffty_tri_ffty( ar )
	152
	153	!
	154	!-- Transposition y --> x and backward FFT along x
	155	CALL tr_yx_fftx( ar, work, ar )
	156
	157	ELSE
	158
	159	!
	160	!-- 2d-domain-decomposition
	161	!-- Transposition z --> x
	162	CALL cpu_log( log_point_s(5), 'transpo forward', 'start' )
[164]	163	CALL transpose_zx( ar, work, ar )
[1]	164	CALL cpu_log( log_point_s(5), 'transpo forward', 'pause' )
	165
	166	CALL cpu_log( log_point_s(4), 'fft_x', 'start' )
	167	CALL fftxp( ar, 'forward' )
	168	CALL cpu_log( log_point_s(4), 'fft_x', 'pause' )
	169
	170	!
	171	!-- Transposition x --> y
	172	CALL cpu_log( log_point_s(5), 'transpo forward', 'continue' )
[164]	173	CALL transpose_xy( ar, work, ar )
[1]	174	CALL cpu_log( log_point_s(5), 'transpo forward', 'pause' )
	175
	176	CALL cpu_log( log_point_s(7), 'fft_y', 'start' )
	177	CALL fftyp( ar, 'forward' )
	178	CALL cpu_log( log_point_s(7), 'fft_y', 'pause' )
	179
	180	!
	181	!-- Transposition y --> z
	182	CALL cpu_log( log_point_s(5), 'transpo forward', 'continue' )
[164]	183	CALL transpose_yz( ar, work, ar )
[1]	184	CALL cpu_log( log_point_s(5), 'transpo forward', 'stop' )
	185
	186	!
	187	!-- Solve the Poisson equation in z-direction in cartesian space.
	188	CALL cpu_log( log_point_s(6), 'tridia', 'start' )
	189	CALL tridia( ar )
	190	CALL cpu_log( log_point_s(6), 'tridia', 'stop' )
	191
	192	!
	193	!-- Inverse Fourier Transformation
	194	!-- Transposition z --> y
	195	CALL cpu_log( log_point_s(8), 'transpo invers', 'start' )
[164]	196	CALL transpose_zy( ar, work, ar )
[1]	197	CALL cpu_log( log_point_s(8), 'transpo invers', 'pause' )
	198
	199	CALL cpu_log( log_point_s(7), 'fft_y', 'continue' )
	200	CALL fftyp( ar, 'backward' )
	201	CALL cpu_log( log_point_s(7), 'fft_y', 'stop' )
	202
	203	!
	204	!-- Transposition y --> x
	205	CALL cpu_log( log_point_s(8), 'transpo invers', 'continue' )
[164]	206	CALL transpose_yx( ar, work, ar )
[1]	207	CALL cpu_log( log_point_s(8), 'transpo invers', 'pause' )
	208
	209	CALL cpu_log( log_point_s(4), 'fft_x', 'continue' )
	210	CALL fftxp( ar, 'backward' )
	211	CALL cpu_log( log_point_s(4), 'fft_x', 'stop' )
	212
	213	!
	214	!-- Transposition x --> z
	215	CALL cpu_log( log_point_s(8), 'transpo invers', 'continue' )
[164]	216	CALL transpose_xz( ar, work, ar )
[1]	217	CALL cpu_log( log_point_s(8), 'transpo invers', 'stop' )
	218
	219	ENDIF
	220
	221	#else
	222
	223	!
	224	!-- Two-dimensional Fourier Transformation along x- and y-direction.
	225	CALL cpu_log( log_point_s(4), 'fft_x', 'start' )
	226	CALL fftx( ar, 'forward' )
	227	CALL cpu_log( log_point_s(4), 'fft_x', 'pause' )
	228	CALL cpu_log( log_point_s(7), 'fft_y', 'start' )
	229	CALL ffty( ar, 'forward' )
	230	CALL cpu_log( log_point_s(7), 'fft_y', 'pause' )
	231
	232	!
	233	!-- Solve the Poisson equation in z-direction in cartesian space.
	234	CALL cpu_log( log_point_s(6), 'tridia', 'start' )
	235	CALL tridia( ar )
	236	CALL cpu_log( log_point_s(6), 'tridia', 'stop' )
	237
	238	!
	239	!-- Inverse Fourier Transformation.
	240	CALL cpu_log( log_point_s(7), 'fft_y', 'continue' )
	241	CALL ffty( ar, 'backward' )
	242	CALL cpu_log( log_point_s(7), 'fft_y', 'stop' )
	243	CALL cpu_log( log_point_s(4), 'fft_x', 'continue' )
	244	CALL fftx( ar, 'backward' )
	245	CALL cpu_log( log_point_s(4), 'fft_x', 'stop' )
	246
	247	#endif
	248
	249	CALL cpu_log( log_point_s(3), 'poisfft', 'stop' )
	250
	251	END SUBROUTINE poisfft
	252
	253
	254
	255	SUBROUTINE tridia( ar )
	256
	257	!------------------------------------------------------------------------------!
	258	! solves the linear system of equations:
	259	!
	260	! -(4 pi^2(i^2/(dx^2nnx^2)+j^2/(dy^2nny^2))+
	261	! 1/(dzu(k)dzw(k))+1/(dzu(k-1)dzw(k)))*p(i,j,k)+
	262	! 1/(dzu(k)dzw(k))p(i,j,k+1)+1/(dzu(k-1)dzw(k))p(i,j,k-1)=d(i,j,k)
	263	!
	264	! by using the Thomas algorithm
	265	!------------------------------------------------------------------------------!
	266
	267	USE arrays_3d
	268
	269	IMPLICIT NONE
	270
	271	INTEGER :: i, j, k, nnyh
	272
	273	REAL, DIMENSION(nxl_z:nxr_z,0:nz-1) :: ar1
	274	REAL, DIMENSION(5,nxl_z:nxr_z,0:nz-1) :: tri
	275
	276	#if defined( __parallel )
	277	REAL :: ar(nxl_z:nxr_za,nys_z:nyn_za,1:nza)
	278	#else
	279	REAL :: ar(1:nz,nys_z:nyn_z,nxl_z:nxr_z)
	280	#endif
	281
	282
	283	nnyh = (ny+1) / 2
	284
	285	!
	286	!-- Define constant elements of the tridiagonal matrix.
	287	DO k = 0, nz-1
	288	DO i = nxl_z, nxr_z
	289	tri(2,i,k) = ddzu(k+1) * ddzw(k+1)
	290	tri(3,i,k) = ddzu(k+2) * ddzw(k+1)
	291	ENDDO
	292	ENDDO
	293
	294	#if defined( __parallel )
	295	!
	296	!-- Repeat for all y-levels.
	297	DO j = nys_z, nyn_z
	298	IF ( j <= nnyh ) THEN
	299	CALL maketri( tri, j )
	300	ELSE
	301	CALL maketri( tri, ny+1-j )
	302	ENDIF
	303	CALL split( tri )
	304	CALL substi( ar, ar1, tri, j )
	305	ENDDO
	306	#else
	307	!
	308	!-- First y-level.
	309	CALL maketri( tri, nys_z )
	310	CALL split( tri )
	311	CALL substi( ar, ar1, tri, 0 )
	312
	313	!
	314	!-- Further y-levels.
	315	DO j = 1, nnyh - 1
	316	CALL maketri( tri, j )
	317	CALL split( tri )
	318	CALL substi( ar, ar1, tri, j )
	319	CALL substi( ar, ar1, tri, ny+1-j )
	320	ENDDO
	321	CALL maketri( tri, nnyh )
	322	CALL split( tri )
	323	CALL substi( ar, ar1, tri, nnyh+nys )
	324	#endif
	325
	326	CONTAINS
	327
	328	SUBROUTINE maketri( tri, j )
	329
	330	!------------------------------------------------------------------------------!
	331	! Computes the i- and j-dependent component of the matrix
	332	!------------------------------------------------------------------------------!
	333
	334	USE arrays_3d
	335	USE constants
	336	USE control_parameters
	337	USE grid_variables
	338
	339	IMPLICIT NONE
	340
	341	INTEGER :: i, j, k, nnxh
	342	REAL :: a, c
	343	REAL :: ll(nxl_z:nxr_z)
	344	REAL :: tri(5,nxl_z:nxr_z,0:nz-1)
	345
	346
	347	nnxh = ( nx + 1 ) / 2
	348
	349	!
	350	!-- Provide the tridiagonal matrix for solution of the Poisson equation in
	351	!-- Fourier space. The coefficients are computed following the method of
	352	!-- Schmidt et al. (DFVLR-Mitteilung 84-15), which departs from Stephan
	353	!-- Siano's original version by discretizing the Poisson equation,
	354	!-- before it is Fourier-transformed
	355	#if defined( __parallel )
	356	DO i = nxl_z, nxr_z
[128]	357	IF ( i >= 0 .AND. i <= nnxh ) THEN
[1]	358	ll(i) = 2.0 * ( 1.0 - COS( ( 2.0 * pi * i ) / &
	359	FLOAT( nx+1 ) ) ) / ( dx * dx ) + &
	360	2.0 * ( 1.0 - COS( ( 2.0 * pi * j ) / &
	361	FLOAT( ny+1 ) ) ) / ( dy * dy )
	362	ELSE
	363	ll(i) = 2.0 * ( 1.0 - COS( ( 2.0 * pi * ( nx+1-i ) ) / &
	364	FLOAT( nx+1 ) ) ) / ( dx * dx ) + &
	365	2.0 * ( 1.0 - COS( ( 2.0 * pi * j ) / &
	366	FLOAT( ny+1 ) ) ) / ( dy * dy )
	367	ENDIF
	368	DO k = 0,nz-1
	369	a = -1.0 * ddzu(k+2) * ddzw(k+1)
	370	c = -1.0 * ddzu(k+1) * ddzw(k+1)
	371	tri(1,i,k) = a + c - ll(i)
	372	ENDDO
	373	ENDDO
	374	#else
	375	DO i = 0, nnxh
	376	ll(i) = 2.0 * ( 1.0 - COS( ( 2.0 * pi * i ) / FLOAT( nx+1 ) ) ) / &
	377	( dx * dx ) + &
	378	2.0 * ( 1.0 - COS( ( 2.0 * pi * j ) / FLOAT( ny+1 ) ) ) / &
	379	( dy * dy )
	380	DO k = 0, nz-1
	381	a = -1.0 * ddzu(k+2) * ddzw(k+1)
	382	c = -1.0 * ddzu(k+1) * ddzw(k+1)
	383	tri(1,i,k) = a + c - ll(i)
	384	IF ( i >= 1 .and. i < nnxh ) THEN
	385	tri(1,nx+1-i,k) = tri(1,i,k)
	386	ENDIF
	387	ENDDO
	388	ENDDO
	389	#endif
	390	IF ( ibc_p_b == 1 .OR. ibc_p_b == 2 ) THEN
	391	DO i = nxl_z, nxr_z
	392	tri(1,i,0) = tri(1,i,0) + tri(2,i,0)
	393	ENDDO
	394	ENDIF
	395	IF ( ibc_p_t == 1 ) THEN
	396	DO i = nxl_z, nxr_z
	397	tri(1,i,nz-1) = tri(1,i,nz-1) + tri(3,i,nz-1)
	398	ENDDO
	399	ENDIF
	400
	401	END SUBROUTINE maketri
	402
	403
	404	SUBROUTINE substi( ar, ar1, tri, j )
	405
	406	!------------------------------------------------------------------------------!
	407	! Substitution (Forward and Backward) (Thomas algorithm)
	408	!------------------------------------------------------------------------------!
	409
[76]	410	USE control_parameters
	411
[1]	412	IMPLICIT NONE
	413
	414	INTEGER :: i, j, k
	415	REAL :: ar1(nxl_z:nxr_z,0:nz-1)
	416	REAL :: tri(5,nxl_z:nxr_z,0:nz-1)
	417	#if defined( __parallel )
	418	REAL :: ar(nxl_z:nxr_za,nys_z:nyn_za,1:nza)
	419	#else
	420	REAL :: ar(1:nz,nys_z:nyn_z,nxl_z:nxr_z)
	421	#endif
	422
	423	!
	424	!-- Forward substitution.
	425	DO i = nxl_z, nxr_z
	426	#if defined( __parallel )
	427	ar1(i,0) = ar(i,j,1)
	428	#else
	429	ar1(i,0) = ar(1,j,i)
	430	#endif
	431	ENDDO
	432	DO k = 1, nz - 1
	433	DO i = nxl_z, nxr_z
	434	#if defined( __parallel )
	435	ar1(i,k) = ar(i,j,k+1) - tri(5,i,k) * ar1(i,k-1)
	436	#else
	437	ar1(i,k) = ar(k+1,j,i) - tri(5,i,k) * ar1(i,k-1)
	438	#endif
	439	ENDDO
	440	ENDDO
	441
	442	!
	443	!-- Backward substitution.
	444	DO i = nxl_z, nxr_z
	445	#if defined( __parallel )
	446	ar(i,j,nz) = ar1(i,nz-1) / tri(4,i,nz-1)
	447	#else
	448	ar(nz,j,i) = ar1(i,nz-1) / tri(4,i,nz-1)
	449	#endif
	450	ENDDO
	451	DO k = nz-2, 0, -1
	452	DO i = nxl_z, nxr_z
	453	#if defined( __parallel )
	454	ar(i,j,k+1) = ( ar1(i,k) - tri(3,i,k) * ar(i,j,k+2) ) &
	455	/ tri(4,i,k)
	456	#else
	457	ar(k+1,j,i) = ( ar1(i,k) - tri(3,i,k) * ar(k+2,j,i) ) &
	458	/ tri(4,i,k)
	459	#endif
	460	ENDDO
	461	ENDDO
	462
[76]	463	!
	464	!-- Indices i=0, j=0 correspond to horizontally averaged pressure.
	465	!-- The respective values of ar should be zero at all k-levels if
	466	!-- acceleration of horizontally averaged vertical velocity is zero.
	467	IF ( ibc_p_b == 1 .AND. ibc_p_t == 1 ) THEN
	468	IF ( j == 0 .AND. nxl_z == 0 ) THEN
	469	#if defined( __parallel )
	470	DO k = 1, nz
	471	ar(nxl_z,j,k) = 0.0
	472	ENDDO
	473	#else
	474	DO k = 1, nz
	475	ar(k,j,nxl_z) = 0.0
	476	ENDDO
	477	#endif
	478	ENDIF
	479	ENDIF
	480
[1]	481	END SUBROUTINE substi
	482
	483
	484	SUBROUTINE split( tri )
	485
	486	!------------------------------------------------------------------------------!
	487	! Splitting of the tridiagonal matrix (Thomas algorithm)
	488	!------------------------------------------------------------------------------!
	489
	490	IMPLICIT NONE
	491
	492	INTEGER :: i, k
	493	REAL :: tri(5,nxl_z:nxr_z,0:nz-1)
	494
	495	!
	496	!-- Splitting.
	497	DO i = nxl_z, nxr_z
	498	tri(4,i,0) = tri(1,i,0)
	499	ENDDO
	500	DO k = 1, nz-1
	501	DO i = nxl_z, nxr_z
	502	tri(5,i,k) = tri(2,i,k) / tri(4,i,k-1)
	503	tri(4,i,k) = tri(1,i,k) - tri(3,i,k-1) * tri(5,i,k)
	504	ENDDO
	505	ENDDO
	506
	507	END SUBROUTINE split
	508
	509	END SUBROUTINE tridia
	510
	511
	512	#if defined( __parallel )
	513	SUBROUTINE fftxp( ar, direction )
	514
	515	!------------------------------------------------------------------------------!
	516	! Fourier-transformation along x-direction Parallelized version
	517	!------------------------------------------------------------------------------!
	518
	519	IMPLICIT NONE
	520
	521	CHARACTER (LEN=*) :: direction
	522	INTEGER :: j, k
	523	REAL :: ar(0:nxa,nys_x:nyn_xa,nzb_x:nzt_xa)
	524
	525	!
	526	!-- Performing the fft with one of the methods implemented
	527	DO k = nzb_x, nzt_x
	528	DO j = nys_x, nyn_x
	529	CALL fft_x( ar(0:nx,j,k), direction )
	530	ENDDO
	531	ENDDO
	532
	533	END SUBROUTINE fftxp
	534
	535	#else
	536	SUBROUTINE fftx( ar, direction )
	537
	538	!------------------------------------------------------------------------------!
	539	! Fourier-transformation along x-direction Non parallel version
	540	!------------------------------------------------------------------------------!
	541
	542	IMPLICIT NONE
	543
	544	CHARACTER (LEN=*) :: direction
	545	INTEGER :: i, j, k
	546	REAL :: ar(1:nz,0:ny,0:nx)
	547
	548	!
	549	!-- Performing the fft with one of the methods implemented
	550	DO k = 1, nz
	551	DO j = 0, ny
	552	CALL fft_x( ar(k,j,0:nx), direction )
	553	ENDDO
	554	ENDDO
	555
	556	END SUBROUTINE fftx
	557	#endif
	558
	559
	560	#if defined( __parallel )
	561	SUBROUTINE fftyp( ar, direction )
	562
	563	!------------------------------------------------------------------------------!
	564	! Fourier-transformation along y-direction Parallelized version
	565	!------------------------------------------------------------------------------!
	566
	567	IMPLICIT NONE
	568
	569	CHARACTER (LEN=*) :: direction
	570	INTEGER :: i, k
	571	REAL :: ar(0:nya,nxl_y:nxr_ya,nzb_y:nzt_ya)
	572
	573	!
	574	!-- Performing the fft with one of the methods implemented
	575	DO k = nzb_y, nzt_y
	576	DO i = nxl_y, nxr_y
	577	CALL fft_y( ar(0:ny,i,k), direction )
	578	ENDDO
	579	ENDDO
	580
	581	END SUBROUTINE fftyp
	582
	583	#else
	584	SUBROUTINE ffty( ar, direction )
	585
	586	!------------------------------------------------------------------------------!
	587	! Fourier-transformation along y-direction Non parallel version
	588	!------------------------------------------------------------------------------!
	589
	590	IMPLICIT NONE
	591
	592	CHARACTER (LEN=*) :: direction
	593	INTEGER :: i, k
	594	REAL :: ar(1:nz,0:ny,0:nx)
	595
	596	!
	597	!-- Performing the fft with one of the methods implemented
	598	DO k = 1, nz
	599	DO i = 0, nx
	600	CALL fft_y( ar(k,0:ny,i), direction )
	601	ENDDO
	602	ENDDO
	603
	604	END SUBROUTINE ffty
	605	#endif
	606
	607	#if defined( __parallel )
	608	SUBROUTINE ffty_tr_yx( f_in, work, f_out )
	609
	610	!------------------------------------------------------------------------------!
	611	! Fourier-transformation along y with subsequent transposition y --> x for
	612	! a 1d-decomposition along x
	613	!
	614	! ATTENTION: The performance of this routine is much faster on the NEC-SX6,
	615	! if the first index of work_ffty_vec is odd. Otherwise
	616	! memory bank conflicts may occur (especially if the index is a
	617	! multiple of 128). That's why work_ffty_vec is dimensioned as
	618	! 0:ny+1.
	619	! Of course, this will not work if users are using an odd number
	620	! of gridpoints along y.
	621	!------------------------------------------------------------------------------!
	622
	623	USE control_parameters
	624	USE cpulog
	625	USE indices
	626	USE interfaces
	627	USE pegrid
	628	USE transpose_indices
	629
	630	IMPLICIT NONE
	631
	632	INTEGER :: i, iend, iouter, ir, j, k
	633	INTEGER, PARAMETER :: stridex = 4
	634
	635	REAL, DIMENSION(0:ny,stridex) :: work_ffty
	636	#if defined( __nec )
	637	REAL, DIMENSION(0:ny+1,1:nz,nxl:nxr) :: work_ffty_vec
	638	#endif
	639	REAL, DIMENSION(1:nza,0:nya,nxl:nxra) :: f_in
	640	REAL, DIMENSION(nnx,1:nza,nys_x:nyn_xa,pdims(1)) :: f_out
	641	REAL, DIMENSION(nxl:nxra,1:nza,0:nya) :: work
	642
	643	!
	644	!-- Carry out the FFT along y, where all data are present due to the
	645	!-- 1d-decomposition along x. Resort the data in a way that x becomes
	646	!-- the first index.
	647	CALL cpu_log( log_point_s(7), 'fft_y', 'start' )
	648
	649	IF ( host(1:3) == 'nec' ) THEN
	650	#if defined( __nec )
	651	!
	652	!-- Code optimized for vector processors
[85]	653	!$OMP PARALLEL PRIVATE ( i, j, k )
[1]	654	!$OMP DO
	655	DO i = nxl, nxr
	656
	657	DO j = 0, ny
	658	DO k = 1, nz
	659	work_ffty_vec(j,k,i) = f_in(k,j,i)
	660	ENDDO
	661	ENDDO
	662
	663	CALL fft_y_m( work_ffty_vec(:,:,i), ny+1, 'forward' )
	664
	665	ENDDO
	666
	667	!$OMP DO
	668	DO k = 1, nz
	669	DO j = 0, ny
	670	DO i = nxl, nxr
	671	work(i,k,j) = work_ffty_vec(j,k,i)
	672	ENDDO
	673	ENDDO
	674	ENDDO
	675	!$OMP END PARALLEL
	676	#endif
	677
	678	ELSE
	679
	680	!
	681	!-- Cache optimized code.
	682	!-- The i-(x-)direction is split into a strided outer loop and an inner
	683	!-- loop for better cache performance
	684	!$OMP PARALLEL PRIVATE (i,iend,iouter,ir,j,k,work_ffty)
	685	!$OMP DO
	686	DO iouter = nxl, nxr, stridex
	687
	688	iend = MIN( iouter+stridex-1, nxr ) ! Upper bound for inner i loop
	689
	690	DO k = 1, nz
	691
	692	DO i = iouter, iend
	693
	694	ir = i-iouter+1 ! counter within a stride
	695	DO j = 0, ny
	696	work_ffty(j,ir) = f_in(k,j,i)
	697	ENDDO
	698	!
	699	!-- FFT along y
	700	CALL fft_y( work_ffty(:,ir), 'forward' )
	701
	702	ENDDO
	703
	704	!
	705	!-- Resort
	706	DO j = 0, ny
	707	DO i = iouter, iend
	708	work(i,k,j) = work_ffty(j,i-iouter+1)
	709	ENDDO
	710	ENDDO
	711
	712	ENDDO
	713
	714	ENDDO
	715	!$OMP END PARALLEL
	716
	717	ENDIF
	718	CALL cpu_log( log_point_s(7), 'fft_y', 'pause' )
	719
	720	!
	721	!-- Transpose array
	722	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
[622]	723	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
[1]	724	CALL MPI_ALLTOALL( work(nxl,1,0), sendrecvcount_xy, MPI_REAL, &
	725	f_out(1,1,nys_x,1), sendrecvcount_xy, MPI_REAL, &
	726	comm1dx, ierr )
	727	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
	728
	729	END SUBROUTINE ffty_tr_yx
	730
	731
	732	SUBROUTINE tr_xy_ffty( f_in, work, f_out )
	733
	734	!------------------------------------------------------------------------------!
	735	! Transposition x --> y with a subsequent backward Fourier transformation for
	736	! a 1d-decomposition along x
	737	!------------------------------------------------------------------------------!
	738
	739	USE control_parameters
	740	USE cpulog
	741	USE indices
	742	USE interfaces
	743	USE pegrid
	744	USE transpose_indices
	745
	746	IMPLICIT NONE
	747
	748	INTEGER :: i, iend, iouter, ir, j, k
	749	INTEGER, PARAMETER :: stridex = 4
	750
	751	REAL, DIMENSION(0:ny,stridex) :: work_ffty
	752	#if defined( __nec )
	753	REAL, DIMENSION(0:ny+1,1:nz,nxl:nxr) :: work_ffty_vec
	754	#endif
	755	REAL, DIMENSION(nnx,1:nza,nys_x:nyn_xa,pdims(1)) :: f_in
	756	REAL, DIMENSION(1:nza,0:nya,nxl:nxra) :: f_out
	757	REAL, DIMENSION(nxl:nxra,1:nza,0:nya) :: work
	758
	759	!
	760	!-- Transpose array
	761	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
[622]	762	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
[1]	763	CALL MPI_ALLTOALL( f_in(1,1,nys_x,1), sendrecvcount_xy, MPI_REAL, &
	764	work(nxl,1,0), sendrecvcount_xy, MPI_REAL, &
	765	comm1dx, ierr )
	766	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
	767
	768	!
	769	!-- Resort the data in a way that y becomes the first index and carry out the
	770	!-- backward fft along y.
	771	CALL cpu_log( log_point_s(7), 'fft_y', 'continue' )
	772
	773	IF ( host(1:3) == 'nec' ) THEN
	774	#if defined( __nec )
	775	!
	776	!-- Code optimized for vector processors
[85]	777	!$OMP PARALLEL PRIVATE ( i, j, k )
[1]	778	!$OMP DO
	779	DO k = 1, nz
	780	DO j = 0, ny
	781	DO i = nxl, nxr
	782	work_ffty_vec(j,k,i) = work(i,k,j)
	783	ENDDO
	784	ENDDO
	785	ENDDO
	786
	787	!$OMP DO
	788	DO i = nxl, nxr
	789
	790	CALL fft_y_m( work_ffty_vec(:,:,i), ny+1, 'backward' )
	791
	792	DO j = 0, ny
	793	DO k = 1, nz
	794	f_out(k,j,i) = work_ffty_vec(j,k,i)
	795	ENDDO
	796	ENDDO
	797
	798	ENDDO
	799	!$OMP END PARALLEL
	800	#endif
	801
	802	ELSE
	803
	804	!
	805	!-- Cache optimized code.
	806	!-- The i-(x-)direction is split into a strided outer loop and an inner
	807	!-- loop for better cache performance
	808	!$OMP PARALLEL PRIVATE ( i, iend, iouter, ir, j, k, work_ffty )
	809	!$OMP DO
	810	DO iouter = nxl, nxr, stridex
	811
	812	iend = MIN( iouter+stridex-1, nxr ) ! Upper bound for inner i loop
	813
	814	DO k = 1, nz
	815	!
	816	!-- Resort
	817	DO j = 0, ny
	818	DO i = iouter, iend
	819	work_ffty(j,i-iouter+1) = work(i,k,j)
	820	ENDDO
	821	ENDDO
	822
	823	DO i = iouter, iend
	824
	825	!
	826	!-- FFT along y
	827	ir = i-iouter+1 ! counter within a stride
	828	CALL fft_y( work_ffty(:,ir), 'backward' )
	829
	830	DO j = 0, ny
	831	f_out(k,j,i) = work_ffty(j,ir)
	832	ENDDO
	833	ENDDO
	834
	835	ENDDO
	836
	837	ENDDO
	838	!$OMP END PARALLEL
	839
	840	ENDIF
	841
	842	CALL cpu_log( log_point_s(7), 'fft_y', 'stop' )
	843
	844	END SUBROUTINE tr_xy_ffty
	845
	846
	847	SUBROUTINE fftx_tri_fftx( ar )
	848
	849	!------------------------------------------------------------------------------!
	850	! FFT along x, solution of the tridiagonal system and backward FFT for
	851	! a 1d-decomposition along x
	852	!
	853	! WARNING: this subroutine may still not work for hybrid parallelization
	854	! with OpenMP (for possible necessary changes see the original
	855	! routine poisfft_hybrid, developed by Klaus Ketelsen, May 2002)
	856	!------------------------------------------------------------------------------!
	857
	858	USE control_parameters
	859	USE cpulog
	860	USE grid_variables
	861	USE indices
	862	USE interfaces
	863	USE pegrid
	864	USE transpose_indices
	865
	866	IMPLICIT NONE
	867
	868	character(len=3) :: myth_char
	869
	870	INTEGER :: i, j, k, m, n, omp_get_thread_num, tn
	871
	872	REAL, DIMENSION(0:nx) :: work_fftx
	873	REAL, DIMENSION(0:nx,1:nz) :: work_trix
	874	REAL, DIMENSION(nnx,1:nza,nys_x:nyn_xa,pdims(1)) :: ar
	875	REAL, DIMENSION(:,:,:,:), ALLOCATABLE :: tri
	876
	877
	878	CALL cpu_log( log_point_s(33), 'fft_x + tridia', 'start' )
	879
	880	ALLOCATE( tri(5,0:nx,0:nz-1,0:threads_per_task-1) )
	881
	882	tn = 0 ! Default thread number in case of one thread
	883	!$OMP PARALLEL DO PRIVATE ( i, j, k, m, n, tn, work_fftx, work_trix )
	884	DO j = nys_x, nyn_x
	885
	886	!$ tn = omp_get_thread_num()
	887
	888	IF ( host(1:3) == 'nec' ) THEN
	889	!
	890	!-- Code optimized for vector processors
	891	DO k = 1, nz
	892
	893	m = 0
	894	DO n = 1, pdims(1)
	895	DO i = 1, nnx_pe( n-1 ) ! WARN: pcoord(i) should be used!!
	896	work_trix(m,k) = ar(i,k,j,n)
	897	m = m + 1
	898	ENDDO
	899	ENDDO
	900
	901	ENDDO
	902
	903	CALL fft_x_m( work_trix, 'forward' )
	904
	905	ELSE
	906	!
	907	!-- Cache optimized code
	908	DO k = 1, nz
	909
	910	m = 0
	911	DO n = 1, pdims(1)
	912	DO i = 1, nnx_pe( n-1 ) ! WARN: pcoord(i) should be used!!
	913	work_fftx(m) = ar(i,k,j,n)
	914	m = m + 1
	915	ENDDO
	916	ENDDO
	917
	918	CALL fft_x( work_fftx, 'forward' )
	919
	920	DO i = 0, nx
	921	work_trix(i,k) = work_fftx(i)
	922	ENDDO
	923
	924	ENDDO
	925
	926	ENDIF
	927
	928	!
	929	!-- Solve the linear equation system
	930	CALL tridia_1dd( ddx2, ddy2, nx, ny, j, work_trix, tri(:,:,:,tn) )
	931
	932	IF ( host(1:3) == 'nec' ) THEN
	933	!
	934	!-- Code optimized for vector processors
	935	CALL fft_x_m( work_trix, 'backward' )
	936
	937	DO k = 1, nz
	938
	939	m = 0
	940	DO n = 1, pdims(1)
	941	DO i = 1, nnx_pe( n-1 ) ! WARN: pcoord(i) should be used!!
	942	ar(i,k,j,n) = work_trix(m,k)
	943	m = m + 1
	944	ENDDO
	945	ENDDO
	946
	947	ENDDO
	948
	949	ELSE
	950	!
	951	!-- Cache optimized code
	952	DO k = 1, nz
	953
	954	DO i = 0, nx
	955	work_fftx(i) = work_trix(i,k)
	956	ENDDO
	957
	958	CALL fft_x( work_fftx, 'backward' )
	959
	960	m = 0
	961	DO n = 1, pdims(1)
	962	DO i = 1, nnx_pe( n-1 ) ! WARN: pcoord(i) should be used!!
	963	ar(i,k,j,n) = work_fftx(m)
	964	m = m + 1
	965	ENDDO
	966	ENDDO
	967
	968	ENDDO
	969
	970	ENDIF
	971
	972	ENDDO
	973
	974	DEALLOCATE( tri )
	975
	976	CALL cpu_log( log_point_s(33), 'fft_x + tridia', 'stop' )
	977
	978	END SUBROUTINE fftx_tri_fftx
	979
	980
	981	SUBROUTINE fftx_tr_xy( f_in, work, f_out )
	982
	983	!------------------------------------------------------------------------------!
	984	! Fourier-transformation along x with subsequent transposition x --> y for
	985	! a 1d-decomposition along y
	986	!
	987	! ATTENTION: The NEC-branch of this routine may significantly profit from
	988	! further optimizations. So far, performance is much worse than
	989	! for routine ffty_tr_yx (more than three times slower).
	990	!------------------------------------------------------------------------------!
	991
	992	USE control_parameters
	993	USE cpulog
	994	USE indices
	995	USE interfaces
	996	USE pegrid
	997	USE transpose_indices
	998
	999	IMPLICIT NONE
	1000
	1001	INTEGER :: i, j, k
	1002
	1003	REAL, DIMENSION(0:nx,1:nz,nys:nyn) :: work_fftx
	1004	REAL, DIMENSION(1:nza,nys:nyna,0:nxa) :: f_in
	1005	REAL, DIMENSION(nny,1:nza,nxl_y:nxr_ya,pdims(2)) :: f_out
	1006	REAL, DIMENSION(nys:nyna,1:nza,0:nxa) :: work
	1007
	1008	!
	1009	!-- Carry out the FFT along x, where all data are present due to the
	1010	!-- 1d-decomposition along y. Resort the data in a way that y becomes
	1011	!-- the first index.
	1012	CALL cpu_log( log_point_s(4), 'fft_x', 'start' )
	1013
	1014	IF ( host(1:3) == 'nec' ) THEN
	1015	!
	1016	!-- Code for vector processors
[85]	1017	!$OMP PARALLEL PRIVATE ( i, j, k )
[1]	1018	!$OMP DO
	1019	DO i = 0, nx
	1020
	1021	DO j = nys, nyn
	1022	DO k = 1, nz
	1023	work_fftx(i,k,j) = f_in(k,j,i)
	1024	ENDDO
	1025	ENDDO
	1026
	1027	ENDDO
	1028
	1029	!$OMP DO
	1030	DO j = nys, nyn
	1031
	1032	CALL fft_x_m( work_fftx(:,:,j), 'forward' )
	1033
	1034	DO k = 1, nz
	1035	DO i = 0, nx
	1036	work(j,k,i) = work_fftx(i,k,j)
	1037	ENDDO
	1038	ENDDO
	1039
	1040	ENDDO
	1041	!$OMP END PARALLEL
	1042
	1043	ELSE
	1044
	1045	!
	1046	!-- Cache optimized code (there might be still a potential for better
	1047	!-- optimization).
	1048	!$OMP PARALLEL PRIVATE (i,j,k,work_fftx)
	1049	!$OMP DO
	1050	DO i = 0, nx
	1051
	1052	DO j = nys, nyn
	1053	DO k = 1, nz
	1054	work_fftx(i,k,j) = f_in(k,j,i)
	1055	ENDDO
	1056	ENDDO
	1057
	1058	ENDDO
	1059
	1060	!$OMP DO
	1061	DO j = nys, nyn
	1062	DO k = 1, nz
	1063
	1064	CALL fft_x( work_fftx(0:nx,k,j), 'forward' )
	1065
	1066	DO i = 0, nx
	1067	work(j,k,i) = work_fftx(i,k,j)
	1068	ENDDO
	1069	ENDDO
	1070
	1071	ENDDO
	1072	!$OMP END PARALLEL
	1073
	1074	ENDIF
	1075	CALL cpu_log( log_point_s(4), 'fft_x', 'pause' )
	1076
	1077	!
	1078	!-- Transpose array
	1079	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
[622]	1080	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
[1]	1081	CALL MPI_ALLTOALL( work(nys,1,0), sendrecvcount_xy, MPI_REAL, &
	1082	f_out(1,1,nxl_y,1), sendrecvcount_xy, MPI_REAL, &
	1083	comm1dy, ierr )
	1084	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
	1085
	1086	END SUBROUTINE fftx_tr_xy
	1087
	1088
	1089	SUBROUTINE tr_yx_fftx( f_in, work, f_out )
	1090
	1091	!------------------------------------------------------------------------------!
	1092	! Transposition y --> x with a subsequent backward Fourier transformation for
	1093	! a 1d-decomposition along x
	1094	!------------------------------------------------------------------------------!
	1095
	1096	USE control_parameters
	1097	USE cpulog
	1098	USE indices
	1099	USE interfaces
	1100	USE pegrid
	1101	USE transpose_indices
	1102
	1103	IMPLICIT NONE
	1104
	1105	INTEGER :: i, j, k
	1106
	1107	REAL, DIMENSION(0:nx,1:nz,nys:nyn) :: work_fftx
	1108	REAL, DIMENSION(nny,1:nza,nxl_y:nxr_ya,pdims(2)) :: f_in
	1109	REAL, DIMENSION(1:nza,nys:nyna,0:nxa) :: f_out
	1110	REAL, DIMENSION(nys:nyna,1:nza,0:nxa) :: work
	1111
	1112	!
	1113	!-- Transpose array
	1114	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
[622]	1115	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
[1]	1116	CALL MPI_ALLTOALL( f_in(1,1,nxl_y,1), sendrecvcount_xy, MPI_REAL, &
	1117	work(nys,1,0), sendrecvcount_xy, MPI_REAL, &
	1118	comm1dy, ierr )
	1119	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
	1120
	1121	!
	1122	!-- Carry out the FFT along x, where all data are present due to the
	1123	!-- 1d-decomposition along y. Resort the data in a way that y becomes
	1124	!-- the first index.
	1125	CALL cpu_log( log_point_s(4), 'fft_x', 'continue' )
	1126
	1127	IF ( host(1:3) == 'nec' ) THEN
	1128	!
	1129	!-- Code optimized for vector processors
[85]	1130	!$OMP PARALLEL PRIVATE ( i, j, k )
[1]	1131	!$OMP DO
	1132	DO j = nys, nyn
	1133
	1134	DO k = 1, nz
	1135	DO i = 0, nx
	1136	work_fftx(i,k,j) = work(j,k,i)
	1137	ENDDO
	1138	ENDDO
	1139
	1140	CALL fft_x_m( work_fftx(:,:,j), 'backward' )
	1141
	1142	ENDDO
	1143
	1144	!$OMP DO
	1145	DO i = 0, nx
	1146	DO j = nys, nyn
	1147	DO k = 1, nz
	1148	f_out(k,j,i) = work_fftx(i,k,j)
	1149	ENDDO
	1150	ENDDO
	1151	ENDDO
	1152	!$OMP END PARALLEL
	1153
	1154	ELSE
	1155
	1156	!
	1157	!-- Cache optimized code (there might be still a potential for better
	1158	!-- optimization).
	1159	!$OMP PARALLEL PRIVATE (i,j,k,work_fftx)
	1160	!$OMP DO
	1161	DO j = nys, nyn
	1162	DO k = 1, nz
	1163
	1164	DO i = 0, nx
	1165	work_fftx(i,k,j) = work(j,k,i)
	1166	ENDDO
	1167
	1168	CALL fft_x( work_fftx(0:nx,k,j), 'backward' )
	1169
	1170	ENDDO
	1171	ENDDO
	1172
	1173	!$OMP DO
	1174	DO i = 0, nx
	1175	DO j = nys, nyn
	1176	DO k = 1, nz
	1177	f_out(k,j,i) = work_fftx(i,k,j)
	1178	ENDDO
	1179	ENDDO
	1180	ENDDO
	1181	!$OMP END PARALLEL
	1182
	1183	ENDIF
	1184	CALL cpu_log( log_point_s(4), 'fft_x', 'stop' )
	1185
	1186	END SUBROUTINE tr_yx_fftx
	1187
	1188
	1189	SUBROUTINE ffty_tri_ffty( ar )
	1190
	1191	!------------------------------------------------------------------------------!
	1192	! FFT along y, solution of the tridiagonal system and backward FFT for
	1193	! a 1d-decomposition along y
	1194	!
	1195	! WARNING: this subroutine may still not work for hybrid parallelization
	1196	! with OpenMP (for possible necessary changes see the original
	1197	! routine poisfft_hybrid, developed by Klaus Ketelsen, May 2002)
	1198	!------------------------------------------------------------------------------!
	1199
	1200	USE control_parameters
	1201	USE cpulog
	1202	USE grid_variables
	1203	USE indices
	1204	USE interfaces
	1205	USE pegrid
	1206	USE transpose_indices
	1207
	1208	IMPLICIT NONE
	1209
	1210	INTEGER :: i, j, k, m, n, omp_get_thread_num, tn
	1211
	1212	REAL, DIMENSION(0:ny) :: work_ffty
	1213	REAL, DIMENSION(0:ny,1:nz) :: work_triy
	1214	REAL, DIMENSION(nny,1:nza,nxl_y:nxr_ya,pdims(2)) :: ar
	1215	REAL, DIMENSION(:,:,:,:), ALLOCATABLE :: tri
	1216
	1217
	1218	CALL cpu_log( log_point_s(39), 'fft_y + tridia', 'start' )
	1219
	1220	ALLOCATE( tri(5,0:ny,0:nz-1,0:threads_per_task-1) )
	1221
	1222	tn = 0 ! Default thread number in case of one thread
	1223	!$OMP PARALLEL PRIVATE ( i, j, k, m, n, tn, work_ffty, work_triy )
	1224	!$OMP DO
	1225	DO i = nxl_y, nxr_y
	1226
	1227	!$ tn = omp_get_thread_num()
	1228
	1229	IF ( host(1:3) == 'nec' ) THEN
	1230	!
	1231	!-- Code optimized for vector processors
	1232	DO k = 1, nz
	1233
	1234	m = 0
	1235	DO n = 1, pdims(2)
	1236	DO j = 1, nny_pe( n-1 ) ! WARN: pcoord(j) should be used!!
	1237	work_triy(m,k) = ar(j,k,i,n)
	1238	m = m + 1
	1239	ENDDO
	1240	ENDDO
	1241
	1242	ENDDO
	1243
	1244	CALL fft_y_m( work_triy, ny, 'forward' )
	1245
	1246	ELSE
	1247	!
	1248	!-- Cache optimized code
	1249	DO k = 1, nz
	1250
	1251	m = 0
	1252	DO n = 1, pdims(2)
	1253	DO j = 1, nny_pe( n-1 ) ! WARN: pcoord(j) should be used!!
	1254	work_ffty(m) = ar(j,k,i,n)
	1255	m = m + 1
	1256	ENDDO
	1257	ENDDO
	1258
	1259	CALL fft_y( work_ffty, 'forward' )
	1260
	1261	DO j = 0, ny
	1262	work_triy(j,k) = work_ffty(j)
	1263	ENDDO
	1264
	1265	ENDDO
	1266
	1267	ENDIF
	1268
	1269	!
	1270	!-- Solve the linear equation system
	1271	CALL tridia_1dd( ddy2, ddx2, ny, nx, i, work_triy, tri(:,:,:,tn) )
	1272
	1273	IF ( host(1:3) == 'nec' ) THEN
	1274	!
	1275	!-- Code optimized for vector processors
	1276	CALL fft_y_m( work_triy, ny, 'backward' )
	1277
	1278	DO k = 1, nz
	1279
	1280	m = 0
	1281	DO n = 1, pdims(2)
	1282	DO j = 1, nny_pe( n-1 ) ! WARN: pcoord(j) should be used!!
	1283	ar(j,k,i,n) = work_triy(m,k)
	1284	m = m + 1
	1285	ENDDO
	1286	ENDDO
	1287
	1288	ENDDO
	1289
	1290	ELSE
	1291	!
	1292	!-- Cache optimized code
	1293	DO k = 1, nz
	1294
	1295	DO j = 0, ny
	1296	work_ffty(j) = work_triy(j,k)
	1297	ENDDO
	1298
	1299	CALL fft_y( work_ffty, 'backward' )
	1300
	1301	m = 0
	1302	DO n = 1, pdims(2)
	1303	DO j = 1, nny_pe( n-1 ) ! WARN: pcoord(j) should be used!!
	1304	ar(j,k,i,n) = work_ffty(m)
	1305	m = m + 1
	1306	ENDDO
	1307	ENDDO
	1308
	1309	ENDDO
	1310
	1311	ENDIF
	1312
	1313	ENDDO
	1314	!$OMP END PARALLEL
	1315
	1316	DEALLOCATE( tri )
	1317
	1318	CALL cpu_log( log_point_s(39), 'fft_y + tridia', 'stop' )
	1319
	1320	END SUBROUTINE ffty_tri_ffty
	1321
	1322
	1323	SUBROUTINE tridia_1dd( ddx2, ddy2, nx, ny, j, ar, tri )
	1324
	1325	!------------------------------------------------------------------------------!
	1326	! Solves the linear system of equations for a 1d-decomposition along x (see
	1327	! tridia)
	1328	!
	1329	! Attention: when using the intel compiler, array tri must be passed as an
	1330	! argument to the contained subroutines. Otherwise addres faults
	1331	! will occur.
	1332	! On NEC, tri should not be passed (except for routine substi_1dd)
	1333	! because this causes very bad performance.
	1334	!------------------------------------------------------------------------------!
	1335
	1336	USE arrays_3d
	1337	USE control_parameters
	1338
	1339	USE pegrid
	1340
	1341	IMPLICIT NONE
	1342
	1343	INTEGER :: i, j, k, nnyh, nx, ny, omp_get_thread_num, tn
	1344
	1345	REAL :: ddx2, ddy2
	1346
	1347	REAL, DIMENSION(0:nx,1:nz) :: ar
	1348	REAL, DIMENSION(0:nx,0:nz-1) :: ar1
	1349	REAL, DIMENSION(5,0:nx,0:nz-1) :: tri
	1350
	1351
	1352	nnyh = ( ny + 1 ) / 2
	1353
	1354	!
	1355	!-- Define constant elements of the tridiagonal matrix.
	1356	!-- The compiler on SX6 does loop exchange. If 0:nx is a high power of 2,
	1357	!-- the exchanged loops create bank conflicts. The following directive
	1358	!-- prohibits loop exchange and the loops perform much better.
	1359	! tn = omp_get_thread_num()
	1360	! WRITE( 120+tn, * ) '+++ id=',myid,' nx=',nx,' thread=', omp_get_thread_num()
[82]	1361	! CALL local_flush( 120+tn )
[1]	1362	!CDIR NOLOOPCHG
	1363	DO k = 0, nz-1
	1364	DO i = 0,nx
	1365	tri(2,i,k) = ddzu(k+1) * ddzw(k+1)
	1366	tri(3,i,k) = ddzu(k+2) * ddzw(k+1)
	1367	ENDDO
	1368	ENDDO
	1369	! WRITE( 120+tn, * ) '+++ id=',myid,' end of first tridia loop thread=', omp_get_thread_num()
[82]	1370	! CALL local_flush( 120+tn )
[1]	1371
	1372	IF ( j <= nnyh ) THEN
[377]	1373	#if defined( __lc )
[1]	1374	CALL maketri_1dd( j, tri )
	1375	#else
	1376	CALL maketri_1dd( j )
	1377	#endif
	1378	ELSE
[377]	1379	#if defined( __lc )
[1]	1380	CALL maketri_1dd( ny+1-j, tri )
	1381	#else
	1382	CALL maketri_1dd( ny+1-j )
	1383	#endif
	1384	ENDIF
[377]	1385	#if defined( __lc )
[1]	1386	CALL split_1dd( tri )
	1387	#else
	1388	CALL split_1dd
	1389	#endif
	1390	CALL substi_1dd( ar, tri )
	1391
	1392	CONTAINS
	1393
[377]	1394	#if defined( __lc )
[1]	1395	SUBROUTINE maketri_1dd( j, tri )
	1396	#else
	1397	SUBROUTINE maketri_1dd( j )
	1398	#endif
	1399
	1400	!------------------------------------------------------------------------------!
	1401	! computes the i- and j-dependent component of the matrix
	1402	!------------------------------------------------------------------------------!
	1403
	1404	USE constants
	1405
	1406	IMPLICIT NONE
	1407
	1408	INTEGER :: i, j, k, nnxh
	1409	REAL :: a, c
	1410
	1411	REAL, DIMENSION(0:nx) :: l
	1412
[377]	1413	#if defined( __lc )
[1]	1414	REAL, DIMENSION(5,0:nx,0:nz-1) :: tri
	1415	#endif
	1416
	1417
	1418	nnxh = ( nx + 1 ) / 2
	1419	!
	1420	!-- Provide the tridiagonal matrix for solution of the Poisson equation in
	1421	!-- Fourier space. The coefficients are computed following the method of
	1422	!-- Schmidt et al. (DFVLR-Mitteilung 84-15), which departs from Stephan
	1423	!-- Siano's original version by discretizing the Poisson equation,
	1424	!-- before it is Fourier-transformed
	1425	DO i = 0, nx
[128]	1426	IF ( i >= 0 .AND. i <= nnxh ) THEN
[1]	1427	l(i) = 2.0 * ( 1.0 - COS( ( 2.0 * pi * i ) / &
	1428	FLOAT( nx+1 ) ) ) * ddx2 + &
	1429	2.0 * ( 1.0 - COS( ( 2.0 * pi * j ) / &
	1430	FLOAT( ny+1 ) ) ) * ddy2
	1431	ELSE
	1432	l(i) = 2.0 * ( 1.0 - COS( ( 2.0 * pi * ( nx+1-i ) ) / &
	1433	FLOAT( nx+1 ) ) ) * ddx2 + &
	1434	2.0 * ( 1.0 - COS( ( 2.0 * pi * j ) / &
	1435	FLOAT( ny+1 ) ) ) * ddy2
	1436	ENDIF
	1437	ENDDO
	1438
	1439	DO k = 0, nz-1
	1440	DO i = 0, nx
	1441	a = -1.0 * ddzu(k+2) * ddzw(k+1)
	1442	c = -1.0 * ddzu(k+1) * ddzw(k+1)
	1443	tri(1,i,k) = a + c - l(i)
	1444	ENDDO
	1445	ENDDO
	1446	IF ( ibc_p_b == 1 .OR. ibc_p_b == 2 ) THEN
	1447	DO i = 0, nx
	1448	tri(1,i,0) = tri(1,i,0) + tri(2,i,0)
	1449	ENDDO
	1450	ENDIF
	1451	IF ( ibc_p_t == 1 ) THEN
	1452	DO i = 0, nx
	1453	tri(1,i,nz-1) = tri(1,i,nz-1) + tri(3,i,nz-1)
	1454	ENDDO
	1455	ENDIF
	1456
	1457	END SUBROUTINE maketri_1dd
	1458
	1459
[377]	1460	#if defined( __lc )
[1]	1461	SUBROUTINE split_1dd( tri )
	1462	#else
	1463	SUBROUTINE split_1dd
	1464	#endif
	1465
	1466	!------------------------------------------------------------------------------!
	1467	! Splitting of the tridiagonal matrix (Thomas algorithm)
	1468	!------------------------------------------------------------------------------!
	1469
	1470	IMPLICIT NONE
	1471
	1472	INTEGER :: i, k
	1473
[377]	1474	#if defined( __lc )
[1]	1475	REAL, DIMENSION(5,0:nx,0:nz-1) :: tri
	1476	#endif
	1477
	1478
	1479	!
	1480	!-- Splitting
	1481	DO i = 0, nx
	1482	tri(4,i,0) = tri(1,i,0)
	1483	ENDDO
	1484	DO k = 1, nz-1
	1485	DO i = 0, nx
	1486	tri(5,i,k) = tri(2,i,k) / tri(4,i,k-1)
	1487	tri(4,i,k) = tri(1,i,k) - tri(3,i,k-1) * tri(5,i,k)
	1488	ENDDO
	1489	ENDDO
	1490
	1491	END SUBROUTINE split_1dd
	1492
	1493
	1494	SUBROUTINE substi_1dd( ar, tri )
	1495
	1496	!------------------------------------------------------------------------------!
	1497	! Substitution (Forward and Backward) (Thomas algorithm)
	1498	!------------------------------------------------------------------------------!
	1499
	1500	IMPLICIT NONE
	1501
[76]	1502	INTEGER :: i, k
[1]	1503
	1504	REAL, DIMENSION(0:nx,nz) :: ar
	1505	REAL, DIMENSION(0:nx,0:nz-1) :: ar1
	1506	REAL, DIMENSION(5,0:nx,0:nz-1) :: tri
	1507
	1508	!
	1509	!-- Forward substitution
	1510	DO i = 0, nx
	1511	ar1(i,0) = ar(i,1)
	1512	ENDDO
	1513	DO k = 1, nz-1
	1514	DO i = 0, nx
	1515	ar1(i,k) = ar(i,k+1) - tri(5,i,k) * ar1(i,k-1)
	1516	ENDDO
	1517	ENDDO
	1518
	1519	!
	1520	!-- Backward substitution
	1521	DO i = 0, nx
	1522	ar(i,nz) = ar1(i,nz-1) / tri(4,i,nz-1)
	1523	ENDDO
	1524	DO k = nz-2, 0, -1
	1525	DO i = 0, nx
	1526	ar(i,k+1) = ( ar1(i,k) - tri(3,i,k) * ar(i,k+2) ) &
	1527	/ tri(4,i,k)
	1528	ENDDO
	1529	ENDDO
	1530
[76]	1531	!
	1532	!-- Indices i=0, j=0 correspond to horizontally averaged pressure.
	1533	!-- The respective values of ar should be zero at all k-levels if
	1534	!-- acceleration of horizontally averaged vertical velocity is zero.
	1535	IF ( ibc_p_b == 1 .AND. ibc_p_t == 1 ) THEN
	1536	IF ( j == 0 ) THEN
	1537	DO k = 1, nz
	1538	ar(0,k) = 0.0
	1539	ENDDO
	1540	ENDIF
	1541	ENDIF
	1542
[1]	1543	END SUBROUTINE substi_1dd
	1544
	1545	END SUBROUTINE tridia_1dd
	1546
	1547	#endif
	1548
	1549	END MODULE poisfft_mod

Note: See TracBrowser for help on using the repository browser.

Download in other formats:

| Impressum | ©Leibniz Universität Hannover |