Home

Context Navigation

source: palm/trunk/SOURCE/poisfft.f90 @ 757

Last change on this file since 757 was 697, checked in by raasch, 14 years ago
last commit documented
Property svn:keywords set to `Id`
File size: 45.6 KB

Rev	Line
[1]	1	MODULE poisfft_mod
	2
	3	!------------------------------------------------------------------------------!
[484]	4	! Current revisions:
[1]	5	! -----------------
[697]	6	!
[1]	7	!
	8	! Former revisions:
	9	! -----------------
[3]	10	! $Id: poisfft.f90 697 2011-03-18 07:10:52Z helmke $
[77]	11	!
[697]	12	! 696 2011-03-18 07:03:49Z raasch
	13	! work_fftx removed from PRIVATE clauses in fftx_tr_xy and tr_yx_fftx
	14	!
[684]	15	! 683 2011-02-09 14:25:15Z raasch
	16	! openMP parallelization for 2d-domain-decomposition
	17	!
[668]	18	! 667 2010-12-23 12:06:00Z suehring/gryschka
	19	! ddzu replaced by ddzu_pres due to changes in zu(0)
	20	!
[623]	21	! 622 2010-12-10 08:08:13Z raasch
	22	! optional barriers included in order to speed up collective operations
	23	!
[392]	24	! 377 2009-09-04 11:09:00Z raasch
	25	! __lcmuk changed to __lc to avoid problems with Intel compiler on sgi-ice
	26	!
[198]	27	! 164 2008-05-15 08:46:15Z raasch
	28	! Arguments removed from transpose routines
	29	!
[139]	30	! 128 2007-10-26 13:11:14Z raasch
	31	! Bugfix: wavenumber calculation for even nx in routines maketri
	32	!
[90]	33	! 85 2007-05-11 09:35:14Z raasch
	34	! Bugfix: work_fft*_vec removed from some PRIVATE-declarations
	35	!
[77]	36	! 76 2007-03-29 00:58:32Z raasch
	37	! Tridiagonal coefficients adjusted for Neumann boundary conditions both at
	38	! the bottom and the top.
	39	!
[3]	40	! RCS Log replaced by Id keyword, revision history cleaned up
	41	!
[1]	42	! Revision 1.24 2006/08/04 15:00:24 raasch
	43	! Default setting of the thread number tn in case of not using OpenMP
	44	!
	45	! Revision 1.23 2006/02/23 12:48:38 raasch
	46	! Additional compiler directive in routine tridia_1dd for preventing loop
	47	! exchange on NEC-SX6
	48	!
	49	! Revision 1.20 2004/04/30 12:38:09 raasch
	50	! Parts of former poisfft_hybrid moved to this subroutine,
	51	! former subroutine changed to a module, renaming of FFT-subroutines and
	52	! -module, FFTs completely substituted by calls of fft_x and fft_y,
	53	! NAG fft used in the non-parallel case completely removed, l in maketri
	54	! is now a 1d-array, variables passed by modules instead of using parameter
	55	! lists, enlarged transposition arrays introduced
	56	!
	57	! Revision 1.1 1997/07/24 11:24:14 raasch
	58	! Initial revision
	59	!
	60	!
	61	! Description:
	62	! ------------
	63	! See below.
	64	!------------------------------------------------------------------------------!
	65
	66	!--------------------------------------------------------------------------!
	67	! poisfft !
	68	! !
	69	! Original version: Stephan Siano (pois3d) !
	70	! !
	71	! Institute of Meteorology and Climatology, University of Hannover !
	72	! Germany !
	73	! !
	74	! Version as of July 23,1996 !
	75	! !
	76	! !
	77	! Version for parallel computers: Siegfried Raasch !
	78	! !
	79	! Version as of July 03,1997 !
	80	! !
	81	! Solves the Poisson equation with a 2D spectral method !
	82	! d^2 p / dx^2 + d^2 p / dy^2 + d^2 p / dz^2 = s !
	83	! !
	84	! Input: !
	85	! real ar contains in the (nnx,nny,nnz) elements, !
	86	! starting from the element (1,nys,nxl), the !
	87	! values for s !
	88	! real work Temporary array !
	89	! !
	90	! Output: !
	91	! real ar contains the solution for p !
	92	!--------------------------------------------------------------------------!
	93
	94	USE fft_xy
	95	USE indices
	96	USE transpose_indices
	97
	98	IMPLICIT NONE
	99
	100	PRIVATE
	101	PUBLIC poisfft, poisfft_init
	102
	103	INTERFACE poisfft
	104	MODULE PROCEDURE poisfft
	105	END INTERFACE poisfft
	106
	107	INTERFACE poisfft_init
	108	MODULE PROCEDURE poisfft_init
	109	END INTERFACE poisfft_init
	110
	111	CONTAINS
	112
	113	SUBROUTINE poisfft_init
	114
	115	CALL fft_init
	116
	117	END SUBROUTINE poisfft_init
	118
	119
	120	SUBROUTINE poisfft( ar, work )
	121
	122	USE cpulog
	123	USE interfaces
	124	USE pegrid
	125
	126	IMPLICIT NONE
	127
	128	REAL, DIMENSION(1:nza,nys:nyna,nxl:nxra) :: ar, work
	129
	130
	131	CALL cpu_log( log_point_s(3), 'poisfft', 'start' )
	132
	133	!
	134	!-- Two-dimensional Fourier Transformation in x- and y-direction.
	135	#if defined( __parallel )
	136	IF ( pdims(2) == 1 ) THEN
	137
	138	!
	139	!-- 1d-domain-decomposition along x:
	140	!-- FFT along y and transposition y --> x
	141	CALL ffty_tr_yx( ar, work, ar )
	142
	143	!
	144	!-- FFT along x, solving the tridiagonal system and backward FFT
	145	CALL fftx_tri_fftx( ar )
	146
	147	!
	148	!-- Transposition x --> y and backward FFT along y
	149	CALL tr_xy_ffty( ar, work, ar )
	150
	151	ELSEIF ( pdims(1) == 1 ) THEN
	152
	153	!
	154	!-- 1d-domain-decomposition along y:
	155	!-- FFT along x and transposition x --> y
	156	CALL fftx_tr_xy( ar, work, ar )
	157
	158	!
	159	!-- FFT along y, solving the tridiagonal system and backward FFT
	160	CALL ffty_tri_ffty( ar )
	161
	162	!
	163	!-- Transposition y --> x and backward FFT along x
	164	CALL tr_yx_fftx( ar, work, ar )
	165
	166	ELSE
	167
	168	!
	169	!-- 2d-domain-decomposition
	170	!-- Transposition z --> x
	171	CALL cpu_log( log_point_s(5), 'transpo forward', 'start' )
[164]	172	CALL transpose_zx( ar, work, ar )
[1]	173	CALL cpu_log( log_point_s(5), 'transpo forward', 'pause' )
	174
	175	CALL cpu_log( log_point_s(4), 'fft_x', 'start' )
	176	CALL fftxp( ar, 'forward' )
	177	CALL cpu_log( log_point_s(4), 'fft_x', 'pause' )
	178
	179	!
	180	!-- Transposition x --> y
	181	CALL cpu_log( log_point_s(5), 'transpo forward', 'continue' )
[164]	182	CALL transpose_xy( ar, work, ar )
[1]	183	CALL cpu_log( log_point_s(5), 'transpo forward', 'pause' )
	184
	185	CALL cpu_log( log_point_s(7), 'fft_y', 'start' )
	186	CALL fftyp( ar, 'forward' )
	187	CALL cpu_log( log_point_s(7), 'fft_y', 'pause' )
	188
	189	!
	190	!-- Transposition y --> z
	191	CALL cpu_log( log_point_s(5), 'transpo forward', 'continue' )
[164]	192	CALL transpose_yz( ar, work, ar )
[1]	193	CALL cpu_log( log_point_s(5), 'transpo forward', 'stop' )
	194
	195	!
	196	!-- Solve the Poisson equation in z-direction in cartesian space.
	197	CALL cpu_log( log_point_s(6), 'tridia', 'start' )
	198	CALL tridia( ar )
	199	CALL cpu_log( log_point_s(6), 'tridia', 'stop' )
	200
	201	!
	202	!-- Inverse Fourier Transformation
	203	!-- Transposition z --> y
	204	CALL cpu_log( log_point_s(8), 'transpo invers', 'start' )
[164]	205	CALL transpose_zy( ar, work, ar )
[1]	206	CALL cpu_log( log_point_s(8), 'transpo invers', 'pause' )
	207
	208	CALL cpu_log( log_point_s(7), 'fft_y', 'continue' )
	209	CALL fftyp( ar, 'backward' )
	210	CALL cpu_log( log_point_s(7), 'fft_y', 'stop' )
	211
	212	!
	213	!-- Transposition y --> x
	214	CALL cpu_log( log_point_s(8), 'transpo invers', 'continue' )
[164]	215	CALL transpose_yx( ar, work, ar )
[1]	216	CALL cpu_log( log_point_s(8), 'transpo invers', 'pause' )
	217
	218	CALL cpu_log( log_point_s(4), 'fft_x', 'continue' )
	219	CALL fftxp( ar, 'backward' )
	220	CALL cpu_log( log_point_s(4), 'fft_x', 'stop' )
	221
	222	!
	223	!-- Transposition x --> z
	224	CALL cpu_log( log_point_s(8), 'transpo invers', 'continue' )
[164]	225	CALL transpose_xz( ar, work, ar )
[1]	226	CALL cpu_log( log_point_s(8), 'transpo invers', 'stop' )
	227
	228	ENDIF
	229
	230	#else
	231
	232	!
	233	!-- Two-dimensional Fourier Transformation along x- and y-direction.
	234	CALL cpu_log( log_point_s(4), 'fft_x', 'start' )
	235	CALL fftx( ar, 'forward' )
	236	CALL cpu_log( log_point_s(4), 'fft_x', 'pause' )
	237	CALL cpu_log( log_point_s(7), 'fft_y', 'start' )
	238	CALL ffty( ar, 'forward' )
	239	CALL cpu_log( log_point_s(7), 'fft_y', 'pause' )
	240
	241	!
	242	!-- Solve the Poisson equation in z-direction in cartesian space.
	243	CALL cpu_log( log_point_s(6), 'tridia', 'start' )
	244	CALL tridia( ar )
	245	CALL cpu_log( log_point_s(6), 'tridia', 'stop' )
	246
	247	!
	248	!-- Inverse Fourier Transformation.
	249	CALL cpu_log( log_point_s(7), 'fft_y', 'continue' )
	250	CALL ffty( ar, 'backward' )
	251	CALL cpu_log( log_point_s(7), 'fft_y', 'stop' )
	252	CALL cpu_log( log_point_s(4), 'fft_x', 'continue' )
	253	CALL fftx( ar, 'backward' )
	254	CALL cpu_log( log_point_s(4), 'fft_x', 'stop' )
	255
	256	#endif
	257
	258	CALL cpu_log( log_point_s(3), 'poisfft', 'stop' )
	259
	260	END SUBROUTINE poisfft
	261
	262
	263
	264	SUBROUTINE tridia( ar )
	265
	266	!------------------------------------------------------------------------------!
	267	! solves the linear system of equations:
	268	!
	269	! -(4 pi^2(i^2/(dx^2nnx^2)+j^2/(dy^2nny^2))+
	270	! 1/(dzu(k)dzw(k))+1/(dzu(k-1)dzw(k)))*p(i,j,k)+
	271	! 1/(dzu(k)dzw(k))p(i,j,k+1)+1/(dzu(k-1)dzw(k))p(i,j,k-1)=d(i,j,k)
	272	!
	273	! by using the Thomas algorithm
	274	!------------------------------------------------------------------------------!
	275
	276	USE arrays_3d
	277
	278	IMPLICIT NONE
	279
	280	INTEGER :: i, j, k, nnyh
	281
	282	REAL, DIMENSION(nxl_z:nxr_z,0:nz-1) :: ar1
	283	REAL, DIMENSION(5,nxl_z:nxr_z,0:nz-1) :: tri
	284
	285	#if defined( __parallel )
	286	REAL :: ar(nxl_z:nxr_za,nys_z:nyn_za,1:nza)
	287	#else
	288	REAL :: ar(1:nz,nys_z:nyn_z,nxl_z:nxr_z)
	289	#endif
	290
	291
	292	nnyh = (ny+1) / 2
	293
	294	!
	295	!-- Define constant elements of the tridiagonal matrix.
[683]	296	!$OMP PARALLEL PRIVATE ( k, i )
	297	!$OMP DO
[1]	298	DO k = 0, nz-1
	299	DO i = nxl_z, nxr_z
[667]	300	tri(2,i,k) = ddzu_pres(k+1) * ddzw(k+1)
	301	tri(3,i,k) = ddzu_pres(k+2) * ddzw(k+1)
[1]	302	ENDDO
	303	ENDDO
[683]	304	!$OMP END PARALLEL
[1]	305
	306	#if defined( __parallel )
	307	!
	308	!-- Repeat for all y-levels.
[683]	309	!$OMP PARALLEL FIRSTPRIVATE( tri ) PRIVATE ( ar1, j )
	310	!$OMP DO
[1]	311	DO j = nys_z, nyn_z
	312	IF ( j <= nnyh ) THEN
	313	CALL maketri( tri, j )
	314	ELSE
	315	CALL maketri( tri, ny+1-j )
	316	ENDIF
	317	CALL split( tri )
	318	CALL substi( ar, ar1, tri, j )
	319	ENDDO
[683]	320	!$OMP END PARALLEL
[1]	321	#else
	322	!
	323	!-- First y-level.
	324	CALL maketri( tri, nys_z )
	325	CALL split( tri )
	326	CALL substi( ar, ar1, tri, 0 )
	327
	328	!
	329	!-- Further y-levels.
	330	DO j = 1, nnyh - 1
	331	CALL maketri( tri, j )
	332	CALL split( tri )
	333	CALL substi( ar, ar1, tri, j )
	334	CALL substi( ar, ar1, tri, ny+1-j )
	335	ENDDO
	336	CALL maketri( tri, nnyh )
	337	CALL split( tri )
	338	CALL substi( ar, ar1, tri, nnyh+nys )
	339	#endif
	340
	341	CONTAINS
	342
	343	SUBROUTINE maketri( tri, j )
	344
	345	!------------------------------------------------------------------------------!
	346	! Computes the i- and j-dependent component of the matrix
	347	!------------------------------------------------------------------------------!
	348
	349	USE arrays_3d
	350	USE constants
	351	USE control_parameters
	352	USE grid_variables
	353
	354	IMPLICIT NONE
	355
	356	INTEGER :: i, j, k, nnxh
	357	REAL :: a, c
	358	REAL :: ll(nxl_z:nxr_z)
	359	REAL :: tri(5,nxl_z:nxr_z,0:nz-1)
	360
	361
	362	nnxh = ( nx + 1 ) / 2
	363
	364	!
	365	!-- Provide the tridiagonal matrix for solution of the Poisson equation in
	366	!-- Fourier space. The coefficients are computed following the method of
	367	!-- Schmidt et al. (DFVLR-Mitteilung 84-15), which departs from Stephan
	368	!-- Siano's original version by discretizing the Poisson equation,
	369	!-- before it is Fourier-transformed
	370	#if defined( __parallel )
	371	DO i = nxl_z, nxr_z
[128]	372	IF ( i >= 0 .AND. i <= nnxh ) THEN
[1]	373	ll(i) = 2.0 * ( 1.0 - COS( ( 2.0 * pi * i ) / &
	374	FLOAT( nx+1 ) ) ) / ( dx * dx ) + &
	375	2.0 * ( 1.0 - COS( ( 2.0 * pi * j ) / &
	376	FLOAT( ny+1 ) ) ) / ( dy * dy )
	377	ELSE
	378	ll(i) = 2.0 * ( 1.0 - COS( ( 2.0 * pi * ( nx+1-i ) ) / &
	379	FLOAT( nx+1 ) ) ) / ( dx * dx ) + &
	380	2.0 * ( 1.0 - COS( ( 2.0 * pi * j ) / &
	381	FLOAT( ny+1 ) ) ) / ( dy * dy )
	382	ENDIF
	383	DO k = 0,nz-1
[667]	384	a = -1.0 * ddzu_pres(k+2) * ddzw(k+1)
	385	c = -1.0 * ddzu_pres(k+1) * ddzw(k+1)
[1]	386	tri(1,i,k) = a + c - ll(i)
	387	ENDDO
	388	ENDDO
	389	#else
	390	DO i = 0, nnxh
	391	ll(i) = 2.0 * ( 1.0 - COS( ( 2.0 * pi * i ) / FLOAT( nx+1 ) ) ) / &
	392	( dx * dx ) + &
	393	2.0 * ( 1.0 - COS( ( 2.0 * pi * j ) / FLOAT( ny+1 ) ) ) / &
	394	( dy * dy )
	395	DO k = 0, nz-1
[667]	396	a = -1.0 * ddzu_pres(k+2) * ddzw(k+1)
	397	c = -1.0 * ddzu_pres(k+1) * ddzw(k+1)
[1]	398	tri(1,i,k) = a + c - ll(i)
	399	IF ( i >= 1 .and. i < nnxh ) THEN
	400	tri(1,nx+1-i,k) = tri(1,i,k)
	401	ENDIF
	402	ENDDO
	403	ENDDO
	404	#endif
	405	IF ( ibc_p_b == 1 .OR. ibc_p_b == 2 ) THEN
	406	DO i = nxl_z, nxr_z
	407	tri(1,i,0) = tri(1,i,0) + tri(2,i,0)
	408	ENDDO
	409	ENDIF
	410	IF ( ibc_p_t == 1 ) THEN
	411	DO i = nxl_z, nxr_z
	412	tri(1,i,nz-1) = tri(1,i,nz-1) + tri(3,i,nz-1)
	413	ENDDO
	414	ENDIF
	415
	416	END SUBROUTINE maketri
	417
	418
	419	SUBROUTINE substi( ar, ar1, tri, j )
	420
	421	!------------------------------------------------------------------------------!
	422	! Substitution (Forward and Backward) (Thomas algorithm)
	423	!------------------------------------------------------------------------------!
	424
[76]	425	USE control_parameters
	426
[1]	427	IMPLICIT NONE
	428
	429	INTEGER :: i, j, k
	430	REAL :: ar1(nxl_z:nxr_z,0:nz-1)
	431	REAL :: tri(5,nxl_z:nxr_z,0:nz-1)
	432	#if defined( __parallel )
	433	REAL :: ar(nxl_z:nxr_za,nys_z:nyn_za,1:nza)
	434	#else
	435	REAL :: ar(1:nz,nys_z:nyn_z,nxl_z:nxr_z)
	436	#endif
	437
	438	!
	439	!-- Forward substitution.
	440	DO i = nxl_z, nxr_z
	441	#if defined( __parallel )
	442	ar1(i,0) = ar(i,j,1)
	443	#else
	444	ar1(i,0) = ar(1,j,i)
	445	#endif
	446	ENDDO
	447	DO k = 1, nz - 1
	448	DO i = nxl_z, nxr_z
	449	#if defined( __parallel )
	450	ar1(i,k) = ar(i,j,k+1) - tri(5,i,k) * ar1(i,k-1)
	451	#else
	452	ar1(i,k) = ar(k+1,j,i) - tri(5,i,k) * ar1(i,k-1)
	453	#endif
	454	ENDDO
	455	ENDDO
	456
	457	!
	458	!-- Backward substitution.
	459	DO i = nxl_z, nxr_z
	460	#if defined( __parallel )
	461	ar(i,j,nz) = ar1(i,nz-1) / tri(4,i,nz-1)
	462	#else
	463	ar(nz,j,i) = ar1(i,nz-1) / tri(4,i,nz-1)
	464	#endif
	465	ENDDO
	466	DO k = nz-2, 0, -1
	467	DO i = nxl_z, nxr_z
	468	#if defined( __parallel )
	469	ar(i,j,k+1) = ( ar1(i,k) - tri(3,i,k) * ar(i,j,k+2) ) &
	470	/ tri(4,i,k)
	471	#else
	472	ar(k+1,j,i) = ( ar1(i,k) - tri(3,i,k) * ar(k+2,j,i) ) &
	473	/ tri(4,i,k)
	474	#endif
	475	ENDDO
	476	ENDDO
	477
[76]	478	!
	479	!-- Indices i=0, j=0 correspond to horizontally averaged pressure.
	480	!-- The respective values of ar should be zero at all k-levels if
	481	!-- acceleration of horizontally averaged vertical velocity is zero.
	482	IF ( ibc_p_b == 1 .AND. ibc_p_t == 1 ) THEN
	483	IF ( j == 0 .AND. nxl_z == 0 ) THEN
	484	#if defined( __parallel )
	485	DO k = 1, nz
	486	ar(nxl_z,j,k) = 0.0
	487	ENDDO
	488	#else
	489	DO k = 1, nz
	490	ar(k,j,nxl_z) = 0.0
	491	ENDDO
	492	#endif
	493	ENDIF
	494	ENDIF
	495
[1]	496	END SUBROUTINE substi
	497
	498
	499	SUBROUTINE split( tri )
	500
	501	!------------------------------------------------------------------------------!
	502	! Splitting of the tridiagonal matrix (Thomas algorithm)
	503	!------------------------------------------------------------------------------!
	504
	505	IMPLICIT NONE
	506
	507	INTEGER :: i, k
	508	REAL :: tri(5,nxl_z:nxr_z,0:nz-1)
	509
	510	!
	511	!-- Splitting.
	512	DO i = nxl_z, nxr_z
	513	tri(4,i,0) = tri(1,i,0)
	514	ENDDO
	515	DO k = 1, nz-1
	516	DO i = nxl_z, nxr_z
	517	tri(5,i,k) = tri(2,i,k) / tri(4,i,k-1)
	518	tri(4,i,k) = tri(1,i,k) - tri(3,i,k-1) * tri(5,i,k)
	519	ENDDO
	520	ENDDO
	521
	522	END SUBROUTINE split
	523
	524	END SUBROUTINE tridia
	525
	526
	527	#if defined( __parallel )
	528	SUBROUTINE fftxp( ar, direction )
	529
	530	!------------------------------------------------------------------------------!
	531	! Fourier-transformation along x-direction Parallelized version
	532	!------------------------------------------------------------------------------!
	533
	534	IMPLICIT NONE
	535
	536	CHARACTER (LEN=*) :: direction
	537	INTEGER :: j, k
	538	REAL :: ar(0:nxa,nys_x:nyn_xa,nzb_x:nzt_xa)
	539
	540	!
	541	!-- Performing the fft with one of the methods implemented
[683]	542	!$OMP PARALLEL PRIVATE ( j, k )
	543	!$OMP DO
[1]	544	DO k = nzb_x, nzt_x
	545	DO j = nys_x, nyn_x
	546	CALL fft_x( ar(0:nx,j,k), direction )
	547	ENDDO
	548	ENDDO
[683]	549	!$OMP END PARALLEL
[1]	550
	551	END SUBROUTINE fftxp
	552
	553	#else
	554	SUBROUTINE fftx( ar, direction )
	555
	556	!------------------------------------------------------------------------------!
	557	! Fourier-transformation along x-direction Non parallel version
	558	!------------------------------------------------------------------------------!
	559
	560	IMPLICIT NONE
	561
	562	CHARACTER (LEN=*) :: direction
	563	INTEGER :: i, j, k
	564	REAL :: ar(1:nz,0:ny,0:nx)
	565
	566	!
	567	!-- Performing the fft with one of the methods implemented
[683]	568	!$OMP PARALLEL PRIVATE ( j, k )
	569	!$OMP DO
[1]	570	DO k = 1, nz
	571	DO j = 0, ny
	572	CALL fft_x( ar(k,j,0:nx), direction )
	573	ENDDO
	574	ENDDO
[683]	575	!$OMP END PARALLEL
[1]	576
	577	END SUBROUTINE fftx
	578	#endif
	579
	580
	581	#if defined( __parallel )
	582	SUBROUTINE fftyp( ar, direction )
	583
	584	!------------------------------------------------------------------------------!
	585	! Fourier-transformation along y-direction Parallelized version
	586	!------------------------------------------------------------------------------!
	587
	588	IMPLICIT NONE
	589
	590	CHARACTER (LEN=*) :: direction
	591	INTEGER :: i, k
	592	REAL :: ar(0:nya,nxl_y:nxr_ya,nzb_y:nzt_ya)
	593
	594	!
	595	!-- Performing the fft with one of the methods implemented
[683]	596	!$OMP PARALLEL PRIVATE ( i, k )
	597	!$OMP DO
[1]	598	DO k = nzb_y, nzt_y
	599	DO i = nxl_y, nxr_y
	600	CALL fft_y( ar(0:ny,i,k), direction )
	601	ENDDO
	602	ENDDO
[683]	603	!$OMP END PARALLEL
[1]	604
	605	END SUBROUTINE fftyp
	606
	607	#else
	608	SUBROUTINE ffty( ar, direction )
	609
	610	!------------------------------------------------------------------------------!
	611	! Fourier-transformation along y-direction Non parallel version
	612	!------------------------------------------------------------------------------!
	613
	614	IMPLICIT NONE
	615
	616	CHARACTER (LEN=*) :: direction
	617	INTEGER :: i, k
	618	REAL :: ar(1:nz,0:ny,0:nx)
	619
	620	!
	621	!-- Performing the fft with one of the methods implemented
[683]	622	!$OMP PARALLEL PRIVATE ( i, k )
	623	!$OMP DO
[1]	624	DO k = 1, nz
	625	DO i = 0, nx
	626	CALL fft_y( ar(k,0:ny,i), direction )
	627	ENDDO
	628	ENDDO
[683]	629	!$OMP END PARALLEL
[1]	630
	631	END SUBROUTINE ffty
	632	#endif
	633
	634	#if defined( __parallel )
	635	SUBROUTINE ffty_tr_yx( f_in, work, f_out )
	636
	637	!------------------------------------------------------------------------------!
	638	! Fourier-transformation along y with subsequent transposition y --> x for
	639	! a 1d-decomposition along x
	640	!
	641	! ATTENTION: The performance of this routine is much faster on the NEC-SX6,
	642	! if the first index of work_ffty_vec is odd. Otherwise
	643	! memory bank conflicts may occur (especially if the index is a
	644	! multiple of 128). That's why work_ffty_vec is dimensioned as
	645	! 0:ny+1.
	646	! Of course, this will not work if users are using an odd number
	647	! of gridpoints along y.
	648	!------------------------------------------------------------------------------!
	649
	650	USE control_parameters
	651	USE cpulog
	652	USE indices
	653	USE interfaces
	654	USE pegrid
	655	USE transpose_indices
	656
	657	IMPLICIT NONE
	658
	659	INTEGER :: i, iend, iouter, ir, j, k
	660	INTEGER, PARAMETER :: stridex = 4
	661
	662	REAL, DIMENSION(0:ny,stridex) :: work_ffty
	663	#if defined( __nec )
	664	REAL, DIMENSION(0:ny+1,1:nz,nxl:nxr) :: work_ffty_vec
	665	#endif
	666	REAL, DIMENSION(1:nza,0:nya,nxl:nxra) :: f_in
	667	REAL, DIMENSION(nnx,1:nza,nys_x:nyn_xa,pdims(1)) :: f_out
	668	REAL, DIMENSION(nxl:nxra,1:nza,0:nya) :: work
	669
	670	!
	671	!-- Carry out the FFT along y, where all data are present due to the
	672	!-- 1d-decomposition along x. Resort the data in a way that x becomes
	673	!-- the first index.
	674	CALL cpu_log( log_point_s(7), 'fft_y', 'start' )
	675
	676	IF ( host(1:3) == 'nec' ) THEN
	677	#if defined( __nec )
	678	!
	679	!-- Code optimized for vector processors
[85]	680	!$OMP PARALLEL PRIVATE ( i, j, k )
[1]	681	!$OMP DO
	682	DO i = nxl, nxr
	683
	684	DO j = 0, ny
	685	DO k = 1, nz
	686	work_ffty_vec(j,k,i) = f_in(k,j,i)
	687	ENDDO
	688	ENDDO
	689
	690	CALL fft_y_m( work_ffty_vec(:,:,i), ny+1, 'forward' )
	691
	692	ENDDO
	693
	694	!$OMP DO
	695	DO k = 1, nz
	696	DO j = 0, ny
	697	DO i = nxl, nxr
	698	work(i,k,j) = work_ffty_vec(j,k,i)
	699	ENDDO
	700	ENDDO
	701	ENDDO
	702	!$OMP END PARALLEL
	703	#endif
	704
	705	ELSE
	706
	707	!
	708	!-- Cache optimized code.
	709	!-- The i-(x-)direction is split into a strided outer loop and an inner
	710	!-- loop for better cache performance
	711	!$OMP PARALLEL PRIVATE (i,iend,iouter,ir,j,k,work_ffty)
	712	!$OMP DO
	713	DO iouter = nxl, nxr, stridex
	714
	715	iend = MIN( iouter+stridex-1, nxr ) ! Upper bound for inner i loop
	716
	717	DO k = 1, nz
	718
	719	DO i = iouter, iend
	720
	721	ir = i-iouter+1 ! counter within a stride
	722	DO j = 0, ny
	723	work_ffty(j,ir) = f_in(k,j,i)
	724	ENDDO
	725	!
	726	!-- FFT along y
	727	CALL fft_y( work_ffty(:,ir), 'forward' )
	728
	729	ENDDO
	730
	731	!
	732	!-- Resort
	733	DO j = 0, ny
	734	DO i = iouter, iend
	735	work(i,k,j) = work_ffty(j,i-iouter+1)
	736	ENDDO
	737	ENDDO
	738
	739	ENDDO
	740
	741	ENDDO
	742	!$OMP END PARALLEL
	743
	744	ENDIF
	745	CALL cpu_log( log_point_s(7), 'fft_y', 'pause' )
	746
	747	!
	748	!-- Transpose array
	749	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
[622]	750	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
[1]	751	CALL MPI_ALLTOALL( work(nxl,1,0), sendrecvcount_xy, MPI_REAL, &
	752	f_out(1,1,nys_x,1), sendrecvcount_xy, MPI_REAL, &
	753	comm1dx, ierr )
	754	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
	755
	756	END SUBROUTINE ffty_tr_yx
	757
	758
	759	SUBROUTINE tr_xy_ffty( f_in, work, f_out )
	760
	761	!------------------------------------------------------------------------------!
	762	! Transposition x --> y with a subsequent backward Fourier transformation for
	763	! a 1d-decomposition along x
	764	!------------------------------------------------------------------------------!
	765
	766	USE control_parameters
	767	USE cpulog
	768	USE indices
	769	USE interfaces
	770	USE pegrid
	771	USE transpose_indices
	772
	773	IMPLICIT NONE
	774
	775	INTEGER :: i, iend, iouter, ir, j, k
	776	INTEGER, PARAMETER :: stridex = 4
	777
	778	REAL, DIMENSION(0:ny,stridex) :: work_ffty
	779	#if defined( __nec )
	780	REAL, DIMENSION(0:ny+1,1:nz,nxl:nxr) :: work_ffty_vec
	781	#endif
	782	REAL, DIMENSION(nnx,1:nza,nys_x:nyn_xa,pdims(1)) :: f_in
	783	REAL, DIMENSION(1:nza,0:nya,nxl:nxra) :: f_out
	784	REAL, DIMENSION(nxl:nxra,1:nza,0:nya) :: work
	785
	786	!
	787	!-- Transpose array
	788	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
[622]	789	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
[1]	790	CALL MPI_ALLTOALL( f_in(1,1,nys_x,1), sendrecvcount_xy, MPI_REAL, &
	791	work(nxl,1,0), sendrecvcount_xy, MPI_REAL, &
	792	comm1dx, ierr )
	793	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
	794
	795	!
	796	!-- Resort the data in a way that y becomes the first index and carry out the
	797	!-- backward fft along y.
	798	CALL cpu_log( log_point_s(7), 'fft_y', 'continue' )
	799
	800	IF ( host(1:3) == 'nec' ) THEN
	801	#if defined( __nec )
	802	!
	803	!-- Code optimized for vector processors
[85]	804	!$OMP PARALLEL PRIVATE ( i, j, k )
[1]	805	!$OMP DO
	806	DO k = 1, nz
	807	DO j = 0, ny
	808	DO i = nxl, nxr
	809	work_ffty_vec(j,k,i) = work(i,k,j)
	810	ENDDO
	811	ENDDO
	812	ENDDO
	813
	814	!$OMP DO
	815	DO i = nxl, nxr
	816
	817	CALL fft_y_m( work_ffty_vec(:,:,i), ny+1, 'backward' )
	818
	819	DO j = 0, ny
	820	DO k = 1, nz
	821	f_out(k,j,i) = work_ffty_vec(j,k,i)
	822	ENDDO
	823	ENDDO
	824
	825	ENDDO
	826	!$OMP END PARALLEL
	827	#endif
	828
	829	ELSE
	830
	831	!
	832	!-- Cache optimized code.
	833	!-- The i-(x-)direction is split into a strided outer loop and an inner
	834	!-- loop for better cache performance
	835	!$OMP PARALLEL PRIVATE ( i, iend, iouter, ir, j, k, work_ffty )
	836	!$OMP DO
	837	DO iouter = nxl, nxr, stridex
	838
	839	iend = MIN( iouter+stridex-1, nxr ) ! Upper bound for inner i loop
	840
	841	DO k = 1, nz
	842	!
	843	!-- Resort
	844	DO j = 0, ny
	845	DO i = iouter, iend
	846	work_ffty(j,i-iouter+1) = work(i,k,j)
	847	ENDDO
	848	ENDDO
	849
	850	DO i = iouter, iend
	851
	852	!
	853	!-- FFT along y
	854	ir = i-iouter+1 ! counter within a stride
	855	CALL fft_y( work_ffty(:,ir), 'backward' )
	856
	857	DO j = 0, ny
	858	f_out(k,j,i) = work_ffty(j,ir)
	859	ENDDO
	860	ENDDO
	861
	862	ENDDO
	863
	864	ENDDO
	865	!$OMP END PARALLEL
	866
	867	ENDIF
	868
	869	CALL cpu_log( log_point_s(7), 'fft_y', 'stop' )
	870
	871	END SUBROUTINE tr_xy_ffty
	872
	873
	874	SUBROUTINE fftx_tri_fftx( ar )
	875
	876	!------------------------------------------------------------------------------!
	877	! FFT along x, solution of the tridiagonal system and backward FFT for
	878	! a 1d-decomposition along x
	879	!
	880	! WARNING: this subroutine may still not work for hybrid parallelization
	881	! with OpenMP (for possible necessary changes see the original
	882	! routine poisfft_hybrid, developed by Klaus Ketelsen, May 2002)
	883	!------------------------------------------------------------------------------!
	884
	885	USE control_parameters
	886	USE cpulog
	887	USE grid_variables
	888	USE indices
	889	USE interfaces
	890	USE pegrid
	891	USE transpose_indices
	892
	893	IMPLICIT NONE
	894
	895	character(len=3) :: myth_char
	896
	897	INTEGER :: i, j, k, m, n, omp_get_thread_num, tn
	898
	899	REAL, DIMENSION(0:nx) :: work_fftx
	900	REAL, DIMENSION(0:nx,1:nz) :: work_trix
	901	REAL, DIMENSION(nnx,1:nza,nys_x:nyn_xa,pdims(1)) :: ar
	902	REAL, DIMENSION(:,:,:,:), ALLOCATABLE :: tri
	903
	904
	905	CALL cpu_log( log_point_s(33), 'fft_x + tridia', 'start' )
	906
	907	ALLOCATE( tri(5,0:nx,0:nz-1,0:threads_per_task-1) )
	908
	909	tn = 0 ! Default thread number in case of one thread
	910	!$OMP PARALLEL DO PRIVATE ( i, j, k, m, n, tn, work_fftx, work_trix )
	911	DO j = nys_x, nyn_x
	912
	913	!$ tn = omp_get_thread_num()
	914
	915	IF ( host(1:3) == 'nec' ) THEN
	916	!
	917	!-- Code optimized for vector processors
	918	DO k = 1, nz
	919
	920	m = 0
	921	DO n = 1, pdims(1)
	922	DO i = 1, nnx_pe( n-1 ) ! WARN: pcoord(i) should be used!!
	923	work_trix(m,k) = ar(i,k,j,n)
	924	m = m + 1
	925	ENDDO
	926	ENDDO
	927
	928	ENDDO
	929
	930	CALL fft_x_m( work_trix, 'forward' )
	931
	932	ELSE
	933	!
	934	!-- Cache optimized code
	935	DO k = 1, nz
	936
	937	m = 0
	938	DO n = 1, pdims(1)
	939	DO i = 1, nnx_pe( n-1 ) ! WARN: pcoord(i) should be used!!
	940	work_fftx(m) = ar(i,k,j,n)
	941	m = m + 1
	942	ENDDO
	943	ENDDO
	944
	945	CALL fft_x( work_fftx, 'forward' )
	946
	947	DO i = 0, nx
	948	work_trix(i,k) = work_fftx(i)
	949	ENDDO
	950
	951	ENDDO
	952
	953	ENDIF
	954
	955	!
	956	!-- Solve the linear equation system
	957	CALL tridia_1dd( ddx2, ddy2, nx, ny, j, work_trix, tri(:,:,:,tn) )
	958
	959	IF ( host(1:3) == 'nec' ) THEN
	960	!
	961	!-- Code optimized for vector processors
	962	CALL fft_x_m( work_trix, 'backward' )
	963
	964	DO k = 1, nz
	965
	966	m = 0
	967	DO n = 1, pdims(1)
	968	DO i = 1, nnx_pe( n-1 ) ! WARN: pcoord(i) should be used!!
	969	ar(i,k,j,n) = work_trix(m,k)
	970	m = m + 1
	971	ENDDO
	972	ENDDO
	973
	974	ENDDO
	975
	976	ELSE
	977	!
	978	!-- Cache optimized code
	979	DO k = 1, nz
	980
	981	DO i = 0, nx
	982	work_fftx(i) = work_trix(i,k)
	983	ENDDO
	984
	985	CALL fft_x( work_fftx, 'backward' )
	986
	987	m = 0
	988	DO n = 1, pdims(1)
	989	DO i = 1, nnx_pe( n-1 ) ! WARN: pcoord(i) should be used!!
	990	ar(i,k,j,n) = work_fftx(m)
	991	m = m + 1
	992	ENDDO
	993	ENDDO
	994
	995	ENDDO
	996
	997	ENDIF
	998
	999	ENDDO
	1000
	1001	DEALLOCATE( tri )
	1002
	1003	CALL cpu_log( log_point_s(33), 'fft_x + tridia', 'stop' )
	1004
	1005	END SUBROUTINE fftx_tri_fftx
	1006
	1007
	1008	SUBROUTINE fftx_tr_xy( f_in, work, f_out )
	1009
	1010	!------------------------------------------------------------------------------!
	1011	! Fourier-transformation along x with subsequent transposition x --> y for
	1012	! a 1d-decomposition along y
	1013	!
	1014	! ATTENTION: The NEC-branch of this routine may significantly profit from
	1015	! further optimizations. So far, performance is much worse than
	1016	! for routine ffty_tr_yx (more than three times slower).
	1017	!------------------------------------------------------------------------------!
	1018
	1019	USE control_parameters
	1020	USE cpulog
	1021	USE indices
	1022	USE interfaces
	1023	USE pegrid
	1024	USE transpose_indices
	1025
	1026	IMPLICIT NONE
	1027
	1028	INTEGER :: i, j, k
	1029
	1030	REAL, DIMENSION(0:nx,1:nz,nys:nyn) :: work_fftx
	1031	REAL, DIMENSION(1:nza,nys:nyna,0:nxa) :: f_in
	1032	REAL, DIMENSION(nny,1:nza,nxl_y:nxr_ya,pdims(2)) :: f_out
	1033	REAL, DIMENSION(nys:nyna,1:nza,0:nxa) :: work
	1034
	1035	!
	1036	!-- Carry out the FFT along x, where all data are present due to the
	1037	!-- 1d-decomposition along y. Resort the data in a way that y becomes
	1038	!-- the first index.
	1039	CALL cpu_log( log_point_s(4), 'fft_x', 'start' )
	1040
	1041	IF ( host(1:3) == 'nec' ) THEN
	1042	!
	1043	!-- Code for vector processors
[85]	1044	!$OMP PARALLEL PRIVATE ( i, j, k )
[1]	1045	!$OMP DO
	1046	DO i = 0, nx
	1047
	1048	DO j = nys, nyn
	1049	DO k = 1, nz
	1050	work_fftx(i,k,j) = f_in(k,j,i)
	1051	ENDDO
	1052	ENDDO
	1053
	1054	ENDDO
	1055
	1056	!$OMP DO
	1057	DO j = nys, nyn
	1058
	1059	CALL fft_x_m( work_fftx(:,:,j), 'forward' )
	1060
	1061	DO k = 1, nz
	1062	DO i = 0, nx
	1063	work(j,k,i) = work_fftx(i,k,j)
	1064	ENDDO
	1065	ENDDO
	1066
	1067	ENDDO
	1068	!$OMP END PARALLEL
	1069
	1070	ELSE
	1071
	1072	!
	1073	!-- Cache optimized code (there might be still a potential for better
	1074	!-- optimization).
[696]	1075	!$OMP PARALLEL PRIVATE (i,j,k)
[1]	1076	!$OMP DO
	1077	DO i = 0, nx
	1078
	1079	DO j = nys, nyn
	1080	DO k = 1, nz
	1081	work_fftx(i,k,j) = f_in(k,j,i)
	1082	ENDDO
	1083	ENDDO
	1084
	1085	ENDDO
	1086
	1087	!$OMP DO
	1088	DO j = nys, nyn
	1089	DO k = 1, nz
	1090
	1091	CALL fft_x( work_fftx(0:nx,k,j), 'forward' )
	1092
	1093	DO i = 0, nx
	1094	work(j,k,i) = work_fftx(i,k,j)
	1095	ENDDO
	1096	ENDDO
	1097
	1098	ENDDO
	1099	!$OMP END PARALLEL
	1100
	1101	ENDIF
	1102	CALL cpu_log( log_point_s(4), 'fft_x', 'pause' )
	1103
	1104	!
	1105	!-- Transpose array
	1106	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
[622]	1107	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
[1]	1108	CALL MPI_ALLTOALL( work(nys,1,0), sendrecvcount_xy, MPI_REAL, &
	1109	f_out(1,1,nxl_y,1), sendrecvcount_xy, MPI_REAL, &
	1110	comm1dy, ierr )
	1111	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
	1112
	1113	END SUBROUTINE fftx_tr_xy
	1114
	1115
	1116	SUBROUTINE tr_yx_fftx( f_in, work, f_out )
	1117
	1118	!------------------------------------------------------------------------------!
	1119	! Transposition y --> x with a subsequent backward Fourier transformation for
	1120	! a 1d-decomposition along x
	1121	!------------------------------------------------------------------------------!
	1122
	1123	USE control_parameters
	1124	USE cpulog
	1125	USE indices
	1126	USE interfaces
	1127	USE pegrid
	1128	USE transpose_indices
	1129
	1130	IMPLICIT NONE
	1131
	1132	INTEGER :: i, j, k
	1133
	1134	REAL, DIMENSION(0:nx,1:nz,nys:nyn) :: work_fftx
	1135	REAL, DIMENSION(nny,1:nza,nxl_y:nxr_ya,pdims(2)) :: f_in
	1136	REAL, DIMENSION(1:nza,nys:nyna,0:nxa) :: f_out
	1137	REAL, DIMENSION(nys:nyna,1:nza,0:nxa) :: work
	1138
	1139	!
	1140	!-- Transpose array
	1141	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
[622]	1142	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
[1]	1143	CALL MPI_ALLTOALL( f_in(1,1,nxl_y,1), sendrecvcount_xy, MPI_REAL, &
	1144	work(nys,1,0), sendrecvcount_xy, MPI_REAL, &
	1145	comm1dy, ierr )
	1146	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
	1147
	1148	!
	1149	!-- Carry out the FFT along x, where all data are present due to the
	1150	!-- 1d-decomposition along y. Resort the data in a way that y becomes
	1151	!-- the first index.
	1152	CALL cpu_log( log_point_s(4), 'fft_x', 'continue' )
	1153
	1154	IF ( host(1:3) == 'nec' ) THEN
	1155	!
	1156	!-- Code optimized for vector processors
[85]	1157	!$OMP PARALLEL PRIVATE ( i, j, k )
[1]	1158	!$OMP DO
	1159	DO j = nys, nyn
	1160
	1161	DO k = 1, nz
	1162	DO i = 0, nx
	1163	work_fftx(i,k,j) = work(j,k,i)
	1164	ENDDO
	1165	ENDDO
	1166
	1167	CALL fft_x_m( work_fftx(:,:,j), 'backward' )
	1168
	1169	ENDDO
	1170
	1171	!$OMP DO
	1172	DO i = 0, nx
	1173	DO j = nys, nyn
	1174	DO k = 1, nz
	1175	f_out(k,j,i) = work_fftx(i,k,j)
	1176	ENDDO
	1177	ENDDO
	1178	ENDDO
	1179	!$OMP END PARALLEL
	1180
	1181	ELSE
	1182
	1183	!
	1184	!-- Cache optimized code (there might be still a potential for better
	1185	!-- optimization).
[696]	1186	!$OMP PARALLEL PRIVATE (i,j,k)
[1]	1187	!$OMP DO
	1188	DO j = nys, nyn
	1189	DO k = 1, nz
	1190
	1191	DO i = 0, nx
	1192	work_fftx(i,k,j) = work(j,k,i)
	1193	ENDDO
	1194
	1195	CALL fft_x( work_fftx(0:nx,k,j), 'backward' )
	1196
	1197	ENDDO
	1198	ENDDO
	1199
	1200	!$OMP DO
	1201	DO i = 0, nx
	1202	DO j = nys, nyn
	1203	DO k = 1, nz
	1204	f_out(k,j,i) = work_fftx(i,k,j)
	1205	ENDDO
	1206	ENDDO
	1207	ENDDO
	1208	!$OMP END PARALLEL
	1209
	1210	ENDIF
	1211	CALL cpu_log( log_point_s(4), 'fft_x', 'stop' )
	1212
	1213	END SUBROUTINE tr_yx_fftx
	1214
	1215
	1216	SUBROUTINE ffty_tri_ffty( ar )
	1217
	1218	!------------------------------------------------------------------------------!
	1219	! FFT along y, solution of the tridiagonal system and backward FFT for
	1220	! a 1d-decomposition along y
	1221	!
	1222	! WARNING: this subroutine may still not work for hybrid parallelization
	1223	! with OpenMP (for possible necessary changes see the original
	1224	! routine poisfft_hybrid, developed by Klaus Ketelsen, May 2002)
	1225	!------------------------------------------------------------------------------!
	1226
	1227	USE control_parameters
	1228	USE cpulog
	1229	USE grid_variables
	1230	USE indices
	1231	USE interfaces
	1232	USE pegrid
	1233	USE transpose_indices
	1234
	1235	IMPLICIT NONE
	1236
	1237	INTEGER :: i, j, k, m, n, omp_get_thread_num, tn
	1238
	1239	REAL, DIMENSION(0:ny) :: work_ffty
	1240	REAL, DIMENSION(0:ny,1:nz) :: work_triy
	1241	REAL, DIMENSION(nny,1:nza,nxl_y:nxr_ya,pdims(2)) :: ar
	1242	REAL, DIMENSION(:,:,:,:), ALLOCATABLE :: tri
	1243
	1244
	1245	CALL cpu_log( log_point_s(39), 'fft_y + tridia', 'start' )
	1246
	1247	ALLOCATE( tri(5,0:ny,0:nz-1,0:threads_per_task-1) )
	1248
	1249	tn = 0 ! Default thread number in case of one thread
[696]	1250	!$OMP PARALLEL DO PRIVATE ( i, j, k, m, n, tn, work_ffty, work_triy )
[1]	1251	DO i = nxl_y, nxr_y
	1252
	1253	!$ tn = omp_get_thread_num()
	1254
	1255	IF ( host(1:3) == 'nec' ) THEN
	1256	!
	1257	!-- Code optimized for vector processors
	1258	DO k = 1, nz
	1259
	1260	m = 0
	1261	DO n = 1, pdims(2)
	1262	DO j = 1, nny_pe( n-1 ) ! WARN: pcoord(j) should be used!!
	1263	work_triy(m,k) = ar(j,k,i,n)
	1264	m = m + 1
	1265	ENDDO
	1266	ENDDO
	1267
	1268	ENDDO
	1269
	1270	CALL fft_y_m( work_triy, ny, 'forward' )
	1271
	1272	ELSE
	1273	!
	1274	!-- Cache optimized code
	1275	DO k = 1, nz
	1276
	1277	m = 0
	1278	DO n = 1, pdims(2)
	1279	DO j = 1, nny_pe( n-1 ) ! WARN: pcoord(j) should be used!!
	1280	work_ffty(m) = ar(j,k,i,n)
	1281	m = m + 1
	1282	ENDDO
	1283	ENDDO
	1284
	1285	CALL fft_y( work_ffty, 'forward' )
	1286
	1287	DO j = 0, ny
	1288	work_triy(j,k) = work_ffty(j)
	1289	ENDDO
	1290
	1291	ENDDO
	1292
	1293	ENDIF
	1294
	1295	!
	1296	!-- Solve the linear equation system
	1297	CALL tridia_1dd( ddy2, ddx2, ny, nx, i, work_triy, tri(:,:,:,tn) )
	1298
	1299	IF ( host(1:3) == 'nec' ) THEN
	1300	!
	1301	!-- Code optimized for vector processors
	1302	CALL fft_y_m( work_triy, ny, 'backward' )
	1303
	1304	DO k = 1, nz
	1305
	1306	m = 0
	1307	DO n = 1, pdims(2)
	1308	DO j = 1, nny_pe( n-1 ) ! WARN: pcoord(j) should be used!!
	1309	ar(j,k,i,n) = work_triy(m,k)
	1310	m = m + 1
	1311	ENDDO
	1312	ENDDO
	1313
	1314	ENDDO
	1315
	1316	ELSE
	1317	!
	1318	!-- Cache optimized code
	1319	DO k = 1, nz
	1320
	1321	DO j = 0, ny
	1322	work_ffty(j) = work_triy(j,k)
	1323	ENDDO
	1324
	1325	CALL fft_y( work_ffty, 'backward' )
	1326
	1327	m = 0
	1328	DO n = 1, pdims(2)
	1329	DO j = 1, nny_pe( n-1 ) ! WARN: pcoord(j) should be used!!
	1330	ar(j,k,i,n) = work_ffty(m)
	1331	m = m + 1
	1332	ENDDO
	1333	ENDDO
	1334
	1335	ENDDO
	1336
	1337	ENDIF
	1338
	1339	ENDDO
	1340
	1341	DEALLOCATE( tri )
	1342
	1343	CALL cpu_log( log_point_s(39), 'fft_y + tridia', 'stop' )
	1344
	1345	END SUBROUTINE ffty_tri_ffty
	1346
	1347
	1348	SUBROUTINE tridia_1dd( ddx2, ddy2, nx, ny, j, ar, tri )
	1349
	1350	!------------------------------------------------------------------------------!
	1351	! Solves the linear system of equations for a 1d-decomposition along x (see
	1352	! tridia)
	1353	!
	1354	! Attention: when using the intel compiler, array tri must be passed as an
	1355	! argument to the contained subroutines. Otherwise addres faults
	1356	! will occur.
	1357	! On NEC, tri should not be passed (except for routine substi_1dd)
	1358	! because this causes very bad performance.
	1359	!------------------------------------------------------------------------------!
	1360
	1361	USE arrays_3d
	1362	USE control_parameters
	1363
	1364	USE pegrid
	1365
	1366	IMPLICIT NONE
	1367
	1368	INTEGER :: i, j, k, nnyh, nx, ny, omp_get_thread_num, tn
	1369
	1370	REAL :: ddx2, ddy2
	1371
	1372	REAL, DIMENSION(0:nx,1:nz) :: ar
	1373	REAL, DIMENSION(0:nx,0:nz-1) :: ar1
	1374	REAL, DIMENSION(5,0:nx,0:nz-1) :: tri
	1375
	1376
	1377	nnyh = ( ny + 1 ) / 2
	1378
	1379	!
	1380	!-- Define constant elements of the tridiagonal matrix.
	1381	!-- The compiler on SX6 does loop exchange. If 0:nx is a high power of 2,
	1382	!-- the exchanged loops create bank conflicts. The following directive
	1383	!-- prohibits loop exchange and the loops perform much better.
	1384	! tn = omp_get_thread_num()
	1385	! WRITE( 120+tn, * ) '+++ id=',myid,' nx=',nx,' thread=', omp_get_thread_num()
[82]	1386	! CALL local_flush( 120+tn )
[1]	1387	!CDIR NOLOOPCHG
	1388	DO k = 0, nz-1
	1389	DO i = 0,nx
[667]	1390	tri(2,i,k) = ddzu_pres(k+1) * ddzw(k+1)
	1391	tri(3,i,k) = ddzu_pres(k+2) * ddzw(k+1)
[1]	1392	ENDDO
	1393	ENDDO
	1394	! WRITE( 120+tn, * ) '+++ id=',myid,' end of first tridia loop thread=', omp_get_thread_num()
[82]	1395	! CALL local_flush( 120+tn )
[1]	1396
	1397	IF ( j <= nnyh ) THEN
[377]	1398	#if defined( __lc )
[1]	1399	CALL maketri_1dd( j, tri )
	1400	#else
	1401	CALL maketri_1dd( j )
	1402	#endif
	1403	ELSE
[377]	1404	#if defined( __lc )
[1]	1405	CALL maketri_1dd( ny+1-j, tri )
	1406	#else
	1407	CALL maketri_1dd( ny+1-j )
	1408	#endif
	1409	ENDIF
[377]	1410	#if defined( __lc )
[1]	1411	CALL split_1dd( tri )
	1412	#else
	1413	CALL split_1dd
	1414	#endif
	1415	CALL substi_1dd( ar, tri )
	1416
	1417	CONTAINS
	1418
[377]	1419	#if defined( __lc )
[1]	1420	SUBROUTINE maketri_1dd( j, tri )
	1421	#else
	1422	SUBROUTINE maketri_1dd( j )
	1423	#endif
	1424
	1425	!------------------------------------------------------------------------------!
	1426	! computes the i- and j-dependent component of the matrix
	1427	!------------------------------------------------------------------------------!
	1428
	1429	USE constants
	1430
	1431	IMPLICIT NONE
	1432
	1433	INTEGER :: i, j, k, nnxh
	1434	REAL :: a, c
	1435
	1436	REAL, DIMENSION(0:nx) :: l
	1437
[377]	1438	#if defined( __lc )
[1]	1439	REAL, DIMENSION(5,0:nx,0:nz-1) :: tri
	1440	#endif
	1441
	1442
	1443	nnxh = ( nx + 1 ) / 2
	1444	!
	1445	!-- Provide the tridiagonal matrix for solution of the Poisson equation in
	1446	!-- Fourier space. The coefficients are computed following the method of
	1447	!-- Schmidt et al. (DFVLR-Mitteilung 84-15), which departs from Stephan
	1448	!-- Siano's original version by discretizing the Poisson equation,
	1449	!-- before it is Fourier-transformed
	1450	DO i = 0, nx
[128]	1451	IF ( i >= 0 .AND. i <= nnxh ) THEN
[1]	1452	l(i) = 2.0 * ( 1.0 - COS( ( 2.0 * pi * i ) / &
	1453	FLOAT( nx+1 ) ) ) * ddx2 + &
	1454	2.0 * ( 1.0 - COS( ( 2.0 * pi * j ) / &
	1455	FLOAT( ny+1 ) ) ) * ddy2
	1456	ELSE
	1457	l(i) = 2.0 * ( 1.0 - COS( ( 2.0 * pi * ( nx+1-i ) ) / &
	1458	FLOAT( nx+1 ) ) ) * ddx2 + &
	1459	2.0 * ( 1.0 - COS( ( 2.0 * pi * j ) / &
	1460	FLOAT( ny+1 ) ) ) * ddy2
	1461	ENDIF
	1462	ENDDO
	1463
	1464	DO k = 0, nz-1
	1465	DO i = 0, nx
[667]	1466	a = -1.0 * ddzu_pres(k+2) * ddzw(k+1)
	1467	c = -1.0 * ddzu_pres(k+1) * ddzw(k+1)
[1]	1468	tri(1,i,k) = a + c - l(i)
	1469	ENDDO
	1470	ENDDO
	1471	IF ( ibc_p_b == 1 .OR. ibc_p_b == 2 ) THEN
	1472	DO i = 0, nx
	1473	tri(1,i,0) = tri(1,i,0) + tri(2,i,0)
	1474	ENDDO
	1475	ENDIF
	1476	IF ( ibc_p_t == 1 ) THEN
	1477	DO i = 0, nx
	1478	tri(1,i,nz-1) = tri(1,i,nz-1) + tri(3,i,nz-1)
	1479	ENDDO
	1480	ENDIF
	1481
	1482	END SUBROUTINE maketri_1dd
	1483
	1484
[377]	1485	#if defined( __lc )
[1]	1486	SUBROUTINE split_1dd( tri )
	1487	#else
	1488	SUBROUTINE split_1dd
	1489	#endif
	1490
	1491	!------------------------------------------------------------------------------!
	1492	! Splitting of the tridiagonal matrix (Thomas algorithm)
	1493	!------------------------------------------------------------------------------!
	1494
	1495	IMPLICIT NONE
	1496
	1497	INTEGER :: i, k
	1498
[377]	1499	#if defined( __lc )
[1]	1500	REAL, DIMENSION(5,0:nx,0:nz-1) :: tri
	1501	#endif
	1502
	1503
	1504	!
	1505	!-- Splitting
	1506	DO i = 0, nx
	1507	tri(4,i,0) = tri(1,i,0)
	1508	ENDDO
	1509	DO k = 1, nz-1
	1510	DO i = 0, nx
	1511	tri(5,i,k) = tri(2,i,k) / tri(4,i,k-1)
	1512	tri(4,i,k) = tri(1,i,k) - tri(3,i,k-1) * tri(5,i,k)
	1513	ENDDO
	1514	ENDDO
	1515
	1516	END SUBROUTINE split_1dd
	1517
	1518
	1519	SUBROUTINE substi_1dd( ar, tri )
	1520
	1521	!------------------------------------------------------------------------------!
	1522	! Substitution (Forward and Backward) (Thomas algorithm)
	1523	!------------------------------------------------------------------------------!
	1524
	1525	IMPLICIT NONE
	1526
[76]	1527	INTEGER :: i, k
[1]	1528
	1529	REAL, DIMENSION(0:nx,nz) :: ar
	1530	REAL, DIMENSION(0:nx,0:nz-1) :: ar1
	1531	REAL, DIMENSION(5,0:nx,0:nz-1) :: tri
	1532
	1533	!
	1534	!-- Forward substitution
	1535	DO i = 0, nx
	1536	ar1(i,0) = ar(i,1)
	1537	ENDDO
	1538	DO k = 1, nz-1
	1539	DO i = 0, nx
	1540	ar1(i,k) = ar(i,k+1) - tri(5,i,k) * ar1(i,k-1)
	1541	ENDDO
	1542	ENDDO
	1543
	1544	!
	1545	!-- Backward substitution
	1546	DO i = 0, nx
	1547	ar(i,nz) = ar1(i,nz-1) / tri(4,i,nz-1)
	1548	ENDDO
	1549	DO k = nz-2, 0, -1
	1550	DO i = 0, nx
	1551	ar(i,k+1) = ( ar1(i,k) - tri(3,i,k) * ar(i,k+2) ) &
	1552	/ tri(4,i,k)
	1553	ENDDO
	1554	ENDDO
	1555
[76]	1556	!
	1557	!-- Indices i=0, j=0 correspond to horizontally averaged pressure.
	1558	!-- The respective values of ar should be zero at all k-levels if
	1559	!-- acceleration of horizontally averaged vertical velocity is zero.
	1560	IF ( ibc_p_b == 1 .AND. ibc_p_t == 1 ) THEN
	1561	IF ( j == 0 ) THEN
	1562	DO k = 1, nz
	1563	ar(0,k) = 0.0
	1564	ENDDO
	1565	ENDIF
	1566	ENDIF
	1567
[1]	1568	END SUBROUTINE substi_1dd
	1569
	1570	END SUBROUTINE tridia_1dd
	1571
	1572	#endif
	1573
	1574	END MODULE poisfft_mod

Note: See TracBrowser for help on using the repository browser.

Download in other formats:

| Impressum | ©Leibniz Universität Hannover |