Home

Context Navigation

source: palm/trunk/SOURCE/poisfft.f90 @ 874

Last change on this file since 874 was 810, checked in by maronga, 13 years ago
last commit documented
Property svn:keywords set to `Id`
File size: 46.5 KB

Rev	Line
[1]	1	MODULE poisfft_mod
	2
	3	!------------------------------------------------------------------------------!
[484]	4	! Current revisions:
[1]	5	! -----------------
	6	!
[810]	7	!
[1]	8	! Former revisions:
	9	! -----------------
[3]	10	! $Id: poisfft.f90 810 2012-01-30 13:40:12Z heinze $
[77]	11	!
[810]	12	! 809 2012-01-30 13:32:58Z maronga
	13	! Bugfix: replaced .AND. and .NOT. with && and ! in the preprocessor directives
	14	!
[808]	15	! 807 2012-01-25 11:53:51Z maronga
	16	! New cpp directive "__check" implemented which is used by check_namelist_files
	17	! (most of the code is unneeded by check_namelist_files).
	18	!
[764]	19	! 763 2011-10-06 09:32:09Z suehring
	20	! Comment added concerning the last change.
	21	!
[762]	22	! 761 2011-10-05 17:58:52Z suehring
	23	! Bugfix: Avoid divisions by zero in case of using a 'neumann' bc for the
	24	! pressure at the top of the model domain.
	25	!
[697]	26	! 696 2011-03-18 07:03:49Z raasch
	27	! work_fftx removed from PRIVATE clauses in fftx_tr_xy and tr_yx_fftx
	28	!
[684]	29	! 683 2011-02-09 14:25:15Z raasch
	30	! openMP parallelization for 2d-domain-decomposition
	31	!
[668]	32	! 667 2010-12-23 12:06:00Z suehring/gryschka
	33	! ddzu replaced by ddzu_pres due to changes in zu(0)
	34	!
[623]	35	! 622 2010-12-10 08:08:13Z raasch
	36	! optional barriers included in order to speed up collective operations
	37	!
[392]	38	! 377 2009-09-04 11:09:00Z raasch
	39	! __lcmuk changed to __lc to avoid problems with Intel compiler on sgi-ice
	40	!
[198]	41	! 164 2008-05-15 08:46:15Z raasch
	42	! Arguments removed from transpose routines
	43	!
[139]	44	! 128 2007-10-26 13:11:14Z raasch
	45	! Bugfix: wavenumber calculation for even nx in routines maketri
	46	!
[90]	47	! 85 2007-05-11 09:35:14Z raasch
	48	! Bugfix: work_fft*_vec removed from some PRIVATE-declarations
	49	!
[77]	50	! 76 2007-03-29 00:58:32Z raasch
	51	! Tridiagonal coefficients adjusted for Neumann boundary conditions both at
	52	! the bottom and the top.
	53	!
[3]	54	! RCS Log replaced by Id keyword, revision history cleaned up
	55	!
[1]	56	! Revision 1.24 2006/08/04 15:00:24 raasch
	57	! Default setting of the thread number tn in case of not using OpenMP
	58	!
	59	! Revision 1.23 2006/02/23 12:48:38 raasch
	60	! Additional compiler directive in routine tridia_1dd for preventing loop
	61	! exchange on NEC-SX6
	62	!
	63	! Revision 1.20 2004/04/30 12:38:09 raasch
	64	! Parts of former poisfft_hybrid moved to this subroutine,
	65	! former subroutine changed to a module, renaming of FFT-subroutines and
	66	! -module, FFTs completely substituted by calls of fft_x and fft_y,
	67	! NAG fft used in the non-parallel case completely removed, l in maketri
	68	! is now a 1d-array, variables passed by modules instead of using parameter
	69	! lists, enlarged transposition arrays introduced
	70	!
	71	! Revision 1.1 1997/07/24 11:24:14 raasch
	72	! Initial revision
	73	!
	74	!
	75	! Description:
	76	! ------------
	77	! See below.
	78	!------------------------------------------------------------------------------!
	79
	80	!--------------------------------------------------------------------------!
	81	! poisfft !
	82	! !
	83	! Original version: Stephan Siano (pois3d) !
	84	! !
	85	! Institute of Meteorology and Climatology, University of Hannover !
	86	! Germany !
	87	! !
	88	! Version as of July 23,1996 !
	89	! !
	90	! !
	91	! Version for parallel computers: Siegfried Raasch !
	92	! !
	93	! Version as of July 03,1997 !
	94	! !
	95	! Solves the Poisson equation with a 2D spectral method !
	96	! d^2 p / dx^2 + d^2 p / dy^2 + d^2 p / dz^2 = s !
	97	! !
	98	! Input: !
	99	! real ar contains in the (nnx,nny,nnz) elements, !
	100	! starting from the element (1,nys,nxl), the !
	101	! values for s !
	102	! real work Temporary array !
	103	! !
	104	! Output: !
	105	! real ar contains the solution for p !
	106	!--------------------------------------------------------------------------!
	107
	108	USE fft_xy
	109	USE indices
	110	USE transpose_indices
	111
	112	IMPLICIT NONE
	113
	114	PRIVATE
[807]	115
[809]	116	#if ! defined ( __check )
[1]	117	PUBLIC poisfft, poisfft_init
	118
	119	INTERFACE poisfft
	120	MODULE PROCEDURE poisfft
	121	END INTERFACE poisfft
	122
	123	INTERFACE poisfft_init
	124	MODULE PROCEDURE poisfft_init
	125	END INTERFACE poisfft_init
[807]	126	#else
	127	PUBLIC poisfft_init
[1]	128
[807]	129	INTERFACE poisfft_init
	130	MODULE PROCEDURE poisfft_init
	131	END INTERFACE poisfft_init
	132	#endif
	133
[1]	134	CONTAINS
	135
	subroutine poisfft_init
	!
	!-- Initialization hook of the Poisson solver. The only action taken is the
	!-- call of fft_init (presumably supplied by module fft_xy -- the providing
	!-- module is not visible from here).

	   call fft_init

	end subroutine poisfft_init
	141
[809]	142	#if ! defined ( __check )
subroutine poisfft( ar, work )
!
!-- Driver of the FFT based Poisson solver.
!--   ar   : on entry the right hand side s, on exit the solution p
!--          (see the module header)
!--   work : scratch array handed on to the transposition routines
!-- With __parallel one of three strategies is selected from the processor
!-- grid pdims; without __parallel the serial sequence
!-- fft_x, fft_y, tridia, fft_y, fft_x is executed.
!-- All phases are bracketed by cpu_log calls for time measurement.

   use cpulog
   use interfaces
   use pegrid

   implicit none

   real, dimension(1:nza,nys:nyna,nxl:nxra) ::  ar, work


   call cpu_log( log_point_s(3), 'poisfft', 'start' )

#if defined( __parallel )
   if ( pdims(2) == 1 )  then
!
!--   1d-domain-decomposition along x.
!--   (a) FFT along y, then transposition y --> x
      call ffty_tr_yx( ar, work, ar )
!
!--   (b) FFT along x, tridiagonal solve, backward FFT along x
      call fftx_tri_fftx( ar )
!
!--   (c) transposition x --> y, then backward FFT along y
      call tr_xy_ffty( ar, work, ar )

   elseif ( pdims(1) == 1 )  then
!
!--   1d-domain-decomposition along y.
!--   (a) FFT along x, then transposition x --> y
      call fftx_tr_xy( ar, work, ar )
!
!--   (b) FFT along y, tridiagonal solve, backward FFT along y
      call ffty_tri_ffty( ar )
!
!--   (c) transposition y --> x, then backward FFT along x
      call tr_yx_fftx( ar, work, ar )

   else
!
!--   2d-domain-decomposition.
!--   Forward part: z --> x, FFT along x
      call cpu_log( log_point_s(5), 'transpo forward', 'start' )
      call transpose_zx( ar, work, ar )
      call cpu_log( log_point_s(5), 'transpo forward', 'pause' )

      call cpu_log( log_point_s(4), 'fft_x', 'start' )
      call fftxp( ar, 'forward' )
      call cpu_log( log_point_s(4), 'fft_x', 'pause' )
!
!--   x --> y, FFT along y
      call cpu_log( log_point_s(5), 'transpo forward', 'continue' )
      call transpose_xy( ar, work, ar )
      call cpu_log( log_point_s(5), 'transpo forward', 'pause' )

      call cpu_log( log_point_s(7), 'fft_y', 'start' )
      call fftyp( ar, 'forward' )
      call cpu_log( log_point_s(7), 'fft_y', 'pause' )
!
!--   y --> z
      call cpu_log( log_point_s(5), 'transpo forward', 'continue' )
      call transpose_yz( ar, work, ar )
      call cpu_log( log_point_s(5), 'transpo forward', 'stop' )
!
!--   Tridiagonal solve along z
      call cpu_log( log_point_s(6), 'tridia', 'start' )
      call tridia( ar )
      call cpu_log( log_point_s(6), 'tridia', 'stop' )
!
!--   Backward part: z --> y, backward FFT along y
      call cpu_log( log_point_s(8), 'transpo invers', 'start' )
      call transpose_zy( ar, work, ar )
      call cpu_log( log_point_s(8), 'transpo invers', 'pause' )

      call cpu_log( log_point_s(7), 'fft_y', 'continue' )
      call fftyp( ar, 'backward' )
      call cpu_log( log_point_s(7), 'fft_y', 'stop' )
!
!--   y --> x, backward FFT along x
      call cpu_log( log_point_s(8), 'transpo invers', 'continue' )
      call transpose_yx( ar, work, ar )
      call cpu_log( log_point_s(8), 'transpo invers', 'pause' )

      call cpu_log( log_point_s(4), 'fft_x', 'continue' )
      call fftxp( ar, 'backward' )
      call cpu_log( log_point_s(4), 'fft_x', 'stop' )
!
!--   x --> z
      call cpu_log( log_point_s(8), 'transpo invers', 'continue' )
      call transpose_xz( ar, work, ar )
      call cpu_log( log_point_s(8), 'transpo invers', 'stop' )

   endif

#else
!
!-- Serial case: forward transforms along x and y
   call cpu_log( log_point_s(4), 'fft_x', 'start' )
   call fftx( ar, 'forward' )
   call cpu_log( log_point_s(4), 'fft_x', 'pause' )
   call cpu_log( log_point_s(7), 'fft_y', 'start' )
   call ffty( ar, 'forward' )
   call cpu_log( log_point_s(7), 'fft_y', 'pause' )
!
!-- Tridiagonal solve along z
   call cpu_log( log_point_s(6), 'tridia', 'start' )
   call tridia( ar )
   call cpu_log( log_point_s(6), 'tridia', 'stop' )
!
!-- Backward transforms along y and x
   call cpu_log( log_point_s(7), 'fft_y', 'continue' )
   call ffty( ar, 'backward' )
   call cpu_log( log_point_s(7), 'fft_y', 'stop' )
   call cpu_log( log_point_s(4), 'fft_x', 'continue' )
   call fftx( ar, 'backward' )
   call cpu_log( log_point_s(4), 'fft_x', 'stop' )

#endif

   call cpu_log( log_point_s(3), 'poisfft', 'stop' )

end subroutine poisfft
	284
	285
	286
	subroutine tridia( ar )
	!
	!---------------------------------------------------------------------------!
	! Solves, for every horizontal wavenumber pair (i,j) held in ar, the linear
	! system of equations
	!
	! -(4 pi^2(i^2/(dx^2nnx^2)+j^2/(dy^2nny^2))+
	! 1/(dzu(k)dzw(k))+1/(dzu(k-1)dzw(k)))*p(i,j,k)+
	! 1/(dzu(k)dzw(k))p(i,j,k+1)+1/(dzu(k-1)dzw(k))p(i,j,k-1)=d(i,j,k)
	!
	! with the Thomas algorithm. ar contains the right hand side on entry and
	! is overwritten by the solution.
	!
	! Storage of the work array tri(n,i,k):
	!   n = 1 : main diagonal            (set in maketri for each j)
	!   n = 2 : ddzu_pres(k+1)*ddzw(k+1) (independent of i and j)
	!   n = 3 : ddzu_pres(k+2)*ddzw(k+1) (independent of i and j)
	!   n = 4 : pivots of the factorization   (set in split)
	!   n = 5 : elimination factors           (set in split)
	!
	! Serial branch: the wavenumbers j and ny+1-j share one matrix, therefore
	! the matrix is built once and applied to both. For even ny the index
	! ny+1-nnyh differs from nnyh and needs its own substitution; formerly this
	! index was skipped and ar was left unsolved there.
	!---------------------------------------------------------------------------!

	   use arrays_3d

	   implicit none

	   integer ::  i, j, k, nnyh

	   real, dimension(nxl_z:nxr_z,0:nz-1)   ::  ar1
	   real, dimension(5,nxl_z:nxr_z,0:nz-1) ::  tri

#if defined( __parallel )
	   real ::  ar(nxl_z:nxr_za,nys_z:nyn_za,1:nza)
#else
	   real ::  ar(1:nz,nys_z:nyn_z,nxl_z:nxr_z)
#endif


	   nnyh = ( ny + 1 ) / 2

	!
	!-- Elements of the matrix which depend on k only.
	   !$OMP PARALLEL PRIVATE ( k, i )
	   !$OMP DO
	   do  k = 0, nz-1
	      do  i = nxl_z, nxr_z
	         tri(2,i,k) = ddzu_pres(k+1) * ddzw(k+1)
	         tri(3,i,k) = ddzu_pres(k+2) * ddzw(k+1)
	      enddo
	   enddo
	   !$OMP END PARALLEL

#if defined( __parallel )
	!
	!-- Loop over all local y-levels. Each thread works on its own copy of tri
	!-- (FIRSTPRIVATE keeps the k-dependent elements set above) and of ar1.
	   !$OMP PARALLEL FIRSTPRIVATE( tri ) PRIVATE ( ar1, j )
	   !$OMP DO
	   do  j = nys_z, nyn_z
	      if ( j <= nnyh )  then
	         call maketri( tri, j )
	      else
	         call maketri( tri, ny+1-j )
	      endif
	      call split( tri )
	      call substi( ar, ar1, tri, j )
	   enddo
	   !$OMP END PARALLEL
#else
	!
	!-- First y-level.
	   call maketri( tri, nys_z )
	   call split( tri )
	   call substi( ar, ar1, tri, 0 )

	!
	!-- Further y-levels; j and ny+1-j use the same matrix.
	   do  j = 1, nnyh - 1
	      call maketri( tri, j )
	      call split( tri )
	      call substi( ar, ar1, tri, j )
	      call substi( ar, ar1, tri, ny+1-j )
	   enddo
	   call maketri( tri, nnyh )
	   call split( tri )
	   call substi( ar, ar1, tri, nnyh+nys )
	!
	!-- Even ny: the mirrored index of nnyh is a separate level which must be
	!-- solved as well. nnyh >= 1 guarantees ny+1-nnyh <= ny.
	   if ( nnyh >= 1  .and.  ny+1-nnyh /= nnyh )  then
	      call substi( ar, ar1, tri, ny+1-nnyh )
	   endif
#endif

	contains

	   subroutine maketri( tri, j )
	   !
	   !------------------------------------------------------------------------!
	   ! Computes the i- and j-dependent main diagonal tri(1,:,:) for the
	   ! y-wavenumber j and applies the adjustments selected by ibc_p_b and
	   ! ibc_p_t to the first and the last row.
	   !------------------------------------------------------------------------!

	      use arrays_3d
	      use constants
	      use control_parameters
	      use grid_variables

	      implicit none

	      integer ::  i, j, k, nnxh
	      real    ::  a, c
	      real    ::  ll(nxl_z:nxr_z)
	      real    ::  tri(5,nxl_z:nxr_z,0:nz-1)


	      nnxh = ( nx + 1 ) / 2

	   !
	   !-- Provide the tridiagonal matrix for solution of the Poisson equation
	   !-- in Fourier space. The coefficients are computed following the method
	   !-- of Schmidt et al. (DFVLR-Mitteilung 84-15), which departs from
	   !-- Stephan Siano's original version by discretizing the Poisson
	   !-- equation before it is Fourier-transformed.
#if defined( __parallel )
	      do  i = nxl_z, nxr_z
	         if ( i >= 0  .and.  i <= nnxh )  then
	            ll(i) = 2.0 * ( 1.0 - cos( ( 2.0 * pi * i ) / &
	                                       float( nx+1 ) ) ) / ( dx * dx ) + &
	                    2.0 * ( 1.0 - cos( ( 2.0 * pi * j ) / &
	                                       float( ny+1 ) ) ) / ( dy * dy )
	         else
	            ll(i) = 2.0 * ( 1.0 - cos( ( 2.0 * pi * ( nx+1-i ) ) / &
	                                       float( nx+1 ) ) ) / ( dx * dx ) + &
	                    2.0 * ( 1.0 - cos( ( 2.0 * pi * j ) / &
	                                       float( ny+1 ) ) ) / ( dy * dy )
	         endif
	         do  k = 0, nz-1
	            a = -1.0 * ddzu_pres(k+2) * ddzw(k+1)
	            c = -1.0 * ddzu_pres(k+1) * ddzw(k+1)
	            tri(1,i,k) = a + c - ll(i)
	         enddo
	      enddo
#else
	      do  i = 0, nnxh
	         ll(i) = 2.0 * ( 1.0 - cos( ( 2.0 * pi * i ) / float( nx+1 ) ) ) / &
	                 ( dx * dx ) + &
	                 2.0 * ( 1.0 - cos( ( 2.0 * pi * j ) / float( ny+1 ) ) ) / &
	                 ( dy * dy )
	         do  k = 0, nz-1
	            a = -1.0 * ddzu_pres(k+2) * ddzw(k+1)
	            c = -1.0 * ddzu_pres(k+1) * ddzw(k+1)
	            tri(1,i,k) = a + c - ll(i)
	   !
	   !--      Copy to the mirrored wavenumber. i = nnxh is included: for odd
	   !--      nx this is a harmless self-assignment, for even nx it fills
	   !--      index nx/2+1, which would otherwise remain undefined.
	            if ( i >= 1  .and.  i <= nnxh )  then
	               tri(1,nx+1-i,k) = tri(1,i,k)
	            endif
	         enddo
	      enddo
#endif
	   !
	   !-- Adjustment of the first row for ibc_p_b = 1 or 2
	      if ( ibc_p_b == 1  .or.  ibc_p_b == 2 )  then
	         do  i = nxl_z, nxr_z
	            tri(1,i,0) = tri(1,i,0) + tri(2,i,0)
	         enddo
	      endif
	   !
	   !-- Adjustment of the last row for ibc_p_t = 1
	      if ( ibc_p_t == 1 )  then
	         do  i = nxl_z, nxr_z
	            tri(1,i,nz-1) = tri(1,i,nz-1) + tri(3,i,nz-1)
	         enddo
	      endif

	   end subroutine maketri


	   subroutine substi( ar, ar1, tri, j )
	   !
	   !------------------------------------------------------------------------!
	   ! Forward and backward substitution (Thomas algorithm) for y-level j.
	   ! The right hand side is read from ar, the solution is written back to
	   ! ar; ar1 holds the intermediate result of the forward sweep.
	   !------------------------------------------------------------------------!

	      use control_parameters

	      implicit none

	      integer ::  i, j, k
	      real    ::  ar1(nxl_z:nxr_z,0:nz-1)
	      real    ::  tri(5,nxl_z:nxr_z,0:nz-1)
#if defined( __parallel )
	      real    ::  ar(nxl_z:nxr_za,nys_z:nyn_za,1:nza)
#else
	      real    ::  ar(1:nz,nys_z:nyn_z,nxl_z:nxr_z)
#endif

	   !
	   !-- Forward substitution.
	      do  i = nxl_z, nxr_z
#if defined( __parallel )
	         ar1(i,0) = ar(i,j,1)
#else
	         ar1(i,0) = ar(1,j,i)
#endif
	      enddo
	      do  k = 1, nz - 1
	         do  i = nxl_z, nxr_z
#if defined( __parallel )
	            ar1(i,k) = ar(i,j,k+1) - tri(5,i,k) * ar1(i,k-1)
#else
	            ar1(i,k) = ar(k+1,j,i) - tri(5,i,k) * ar1(i,k-1)
#endif
	         enddo
	      enddo

	   !
	   !-- Backward substitution.
	      do  i = nxl_z, nxr_z
#if defined( __parallel )
	         ar(i,j,nz) = ar1(i,nz-1) / tri(4,i,nz-1)
#else
	         ar(nz,j,i) = ar1(i,nz-1) / tri(4,i,nz-1)
#endif
	      enddo
	      do  k = nz-2, 0, -1
	         do  i = nxl_z, nxr_z
#if defined( __parallel )
	            ar(i,j,k+1) = ( ar1(i,k) - tri(3,i,k) * ar(i,j,k+2) ) &
	                          / tri(4,i,k)
#else
	            ar(k+1,j,i) = ( ar1(i,k) - tri(3,i,k) * ar(k+2,j,i) ) &
	                          / tri(4,i,k)
#endif
	         enddo
	      enddo

	   !
	   !-- Indices i=0, j=0 correspond to horizontally averaged pressure.
	   !-- The respective values of ar should be zero at all k-levels if
	   !-- acceleration of horizontally averaged vertical velocity is zero.
	      if ( ibc_p_b == 1  .and.  ibc_p_t == 1 )  then
	         if ( j == 0  .and.  nxl_z == 0 )  then
#if defined( __parallel )
	            do  k = 1, nz
	               ar(nxl_z,j,k) = 0.0
	            enddo
#else
	            do  k = 1, nz
	               ar(k,j,nxl_z) = 0.0
	            enddo
#endif
	         endif
	      endif

	   end subroutine substi


	   subroutine split( tri )
	   !
	   !------------------------------------------------------------------------!
	   ! Splitting of the tridiagonal matrix (Thomas algorithm): computes the
	   ! pivots tri(4,:,:) and the elimination factors tri(5,:,:) from
	   ! tri(1:3,:,:).
	   !------------------------------------------------------------------------!

	      implicit none

	      integer ::  i, k
	      real    ::  tri(5,nxl_z:nxr_z,0:nz-1)

	   !
	   !-- First row: the pivot equals the main diagonal element.
	      do  i = nxl_z, nxr_z
	         tri(4,i,0) = tri(1,i,0)
	      enddo
	   !
	   !-- Remaining rows; row k depends on the pivot of row k-1.
	      do  k = 1, nz-1
	         do  i = nxl_z, nxr_z
	            tri(5,i,k) = tri(2,i,k) / tri(4,i,k-1)
	            tri(4,i,k) = tri(1,i,k) - tri(3,i,k-1) * tri(5,i,k)
	         enddo
	      enddo

	   end subroutine split

	end subroutine tridia
	548
	549
	550	#if defined( __parallel )
	subroutine fftxp( ar, direction )
	!
	!---------------------------------------------------------------------------!
	! Fourier transformation along x, parallelized version.
	! Calls fft_x for every (y,z) line 0:nx of ar; direction is handed through
	! to fft_x unchanged (the callers in this file pass 'forward'/'backward').
	!---------------------------------------------------------------------------!

	   implicit none

	   character (len=*) ::  direction
	   integer           ::  jy, kz
	   real              ::  ar(0:nxa,nys_x:nyn_xa,nzb_x:nzt_xa)

	!
	!-- One fft_x call per line; the k-loop is distributed among the threads.
	   !$OMP PARALLEL DO PRIVATE ( jy, kz )
	   do  kz = nzb_x, nzt_x
	      do  jy = nys_x, nyn_x
	         call fft_x( ar(0:nx,jy,kz), direction )
	      enddo
	   enddo
	   !$OMP END PARALLEL DO

	end subroutine fftxp
	575
	576	#else
	subroutine fftx( ar, direction )
	!
	!---------------------------------------------------------------------------!
	! Fourier transformation along x, non-parallel version.
	! Calls fft_x for every line ar(k,j,0:nx); direction is handed through to
	! fft_x unchanged (the callers in this file pass 'forward'/'backward').
	! Since x is the last index of ar, each line is a strided array section.
	!---------------------------------------------------------------------------!

	   implicit none

	   character (len=*) ::  direction
	   integer           ::  j, k
	   real              ::  ar(1:nz,0:ny,0:nx)

	!
	!-- Performing the fft with one of the methods implemented
	   !$OMP PARALLEL PRIVATE ( j, k )
	   !$OMP DO
	   do  k = 1, nz
	      do  j = 0, ny
	         call fft_x( ar(k,j,0:nx), direction )
	      enddo
	   enddo
	   !$OMP END PARALLEL

	end subroutine fftx
	601	#endif
	602
	603
	604	#if defined( __parallel )
	subroutine fftyp( ar, direction )
	!
	!---------------------------------------------------------------------------!
	! Fourier transformation along y, parallelized version.
	! Calls fft_y for every (x,z) line 0:ny of ar; direction is handed through
	! to fft_y unchanged (the callers in this file pass 'forward'/'backward').
	!---------------------------------------------------------------------------!

	   implicit none

	   character (len=*) ::  direction
	   integer           ::  ix, kz
	   real              ::  ar(0:nya,nxl_y:nxr_ya,nzb_y:nzt_ya)

	!
	!-- One fft_y call per line; the k-loop is distributed among the threads.
	   !$OMP PARALLEL DO PRIVATE ( ix, kz )
	   do  kz = nzb_y, nzt_y
	      do  ix = nxl_y, nxr_y
	         call fft_y( ar(0:ny,ix,kz), direction )
	      enddo
	   enddo
	   !$OMP END PARALLEL DO

	end subroutine fftyp
	629
	630	#else
	subroutine ffty( ar, direction )
	!
	!---------------------------------------------------------------------------!
	! Fourier transformation along y, non-parallel version.
	! Calls fft_y for every line ar(k,0:ny,i); direction is handed through to
	! fft_y unchanged (the callers in this file pass 'forward'/'backward').
	!---------------------------------------------------------------------------!

	   implicit none

	   character (len=*) ::  direction
	   integer           ::  ix, kz
	   real              ::  ar(1:nz,0:ny,0:nx)

	!
	!-- One fft_y call per line; the k-loop is distributed among the threads.
	   !$OMP PARALLEL DO PRIVATE ( ix, kz )
	   do  kz = 1, nz
	      do  ix = 0, nx
	         call fft_y( ar(kz,0:ny,ix), direction )
	      enddo
	   enddo
	   !$OMP END PARALLEL DO

	end subroutine ffty
	655	#endif
	656
	657	#if defined( __parallel )
	subroutine ffty_tr_yx( f_in, work, f_out )
	!
	!---------------------------------------------------------------------------!
	! Fourier transformation along y with subsequent transposition y --> x for
	! a 1d-decomposition along x.
	!   f_in  : data to be transformed
	!   work  : resorted, transformed data (x is the first index); send buffer
	!           of the MPI_ALLTOALL
	!   f_out : receive buffer of the MPI_ALLTOALL
	!
	! ATTENTION: This routine is much faster on the NEC-SX6 if the first
	!            dimension of work_ffty_vec is odd. Otherwise memory bank
	!            conflicts may occur (especially if the dimension is a multiple
	!            of 128). That is why work_ffty_vec is dimensioned as 0:ny+1.
	!            Of course, this does not help if an odd number of gridpoints
	!            along y is used.
	!---------------------------------------------------------------------------!

	   use control_parameters
	   use cpulog
	   use indices
	   use interfaces
	   use pegrid
	   use transpose_indices

	   implicit none

	   integer            ::  i, i0, i1, islot, j, k
	   integer, parameter ::  stridex = 4

	   real, dimension(0:ny,stridex)                    ::  work_ffty
#if defined( __nec )
	   real, dimension(0:ny+1,1:nz,nxl:nxr)             ::  work_ffty_vec
#endif
	   real, dimension(1:nza,0:nya,nxl:nxra)            ::  f_in
	   real, dimension(nnx,1:nza,nys_x:nyn_xa,pdims(1)) ::  f_out
	   real, dimension(nxl:nxra,1:nza,0:nya)            ::  work

	!
	!-- FFT along y (all y-data are local because of the 1d-decomposition
	!-- along x) and resorting so that x becomes the first index.
	   call cpu_log( log_point_s(7), 'fft_y', 'start' )

	   if ( host(1:3) == 'nec' )  then
#if defined( __nec )
	!
	!--   Variant for vector processors: transform complete (y,z) planes
	      !$OMP PARALLEL PRIVATE ( i, j, k )
	      !$OMP DO
	      do  i = nxl, nxr

	         do  j = 0, ny
	            do  k = 1, nz
	               work_ffty_vec(j,k,i) = f_in(k,j,i)
	            enddo
	         enddo

	         call fft_y_m( work_ffty_vec(:,:,i), ny+1, 'forward' )

	      enddo

	      !$OMP DO
	      do  k = 1, nz
	         do  j = 0, ny
	            do  i = nxl, nxr
	               work(i,k,j) = work_ffty_vec(j,k,i)
	            enddo
	         enddo
	      enddo
	      !$OMP END PARALLEL
#endif

	   else
	!
	!--   Variant for cache architectures: x is processed in chunks of
	!--   stridex columns, i0 and i1 being the first and last column of a
	!--   chunk and islot the position of a column within the chunk.
	      !$OMP PARALLEL PRIVATE (i,i1,i0,islot,j,k,work_ffty)
	      !$OMP DO
	      do  i0 = nxl, nxr, stridex

	         i1 = min( i0+stridex-1, nxr )

	         do  k = 1, nz

	            do  i = i0, i1

	               islot = i-i0+1
	               do  j = 0, ny
	                  work_ffty(j,islot) = f_in(k,j,i)
	               enddo
	!
	!--            Transform this column along y
	               call fft_y( work_ffty(:,islot), 'forward' )

	            enddo
	!
	!--         Store the chunk with x as first index
	            do  j = 0, ny
	               do  i = i0, i1
	                  work(i,k,j) = work_ffty(j,i-i0+1)
	               enddo
	            enddo

	         enddo

	      enddo
	      !$OMP END PARALLEL

	   endif
	   call cpu_log( log_point_s(7), 'fft_y', 'pause' )

	!
	!-- Transposition; the barrier is executed only if collective_wait is set.
	   call cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
	   if ( collective_wait )  call MPI_BARRIER( comm2d, ierr )
	   call MPI_ALLTOALL( work(nxl,1,0),      sendrecvcount_xy, MPI_REAL, &
	                      f_out(1,1,nys_x,1), sendrecvcount_xy, MPI_REAL, &
	                      comm1dx, ierr )
	   call cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )

	end subroutine ffty_tr_yx
	780
	781
	subroutine tr_xy_ffty( f_in, work, f_out )
	!
	!---------------------------------------------------------------------------!
	! Transposition x --> y with a subsequent backward Fourier transformation
	! along y for a 1d-decomposition along x.
	!   f_in  : send buffer of the MPI_ALLTOALL
	!   work  : receive buffer of the MPI_ALLTOALL (x is the first index)
	!   f_out : transformed data
	!---------------------------------------------------------------------------!

	   use control_parameters
	   use cpulog
	   use indices
	   use interfaces
	   use pegrid
	   use transpose_indices

	   implicit none

	   integer            ::  i, i0, i1, islot, j, k
	   integer, parameter ::  stridex = 4

	   real, dimension(0:ny,stridex)                    ::  work_ffty
#if defined( __nec )
	   real, dimension(0:ny+1,1:nz,nxl:nxr)             ::  work_ffty_vec
#endif
	   real, dimension(nnx,1:nza,nys_x:nyn_xa,pdims(1)) ::  f_in
	   real, dimension(1:nza,0:nya,nxl:nxra)            ::  f_out
	   real, dimension(nxl:nxra,1:nza,0:nya)            ::  work

	!
	!-- Transposition; the barrier is executed only if collective_wait is set.
	   call cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
	   if ( collective_wait )  call MPI_BARRIER( comm2d, ierr )
	   call MPI_ALLTOALL( f_in(1,1,nys_x,1), sendrecvcount_xy, MPI_REAL, &
	                      work(nxl,1,0),     sendrecvcount_xy, MPI_REAL, &
	                      comm1dx, ierr )
	   call cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )

	!
	!-- Resort so that y becomes the first index, then backward FFT along y.
	   call cpu_log( log_point_s(7), 'fft_y', 'continue' )

	   if ( host(1:3) == 'nec' )  then
#if defined( __nec )
	!
	!--   Variant for vector processors: transform complete (y,z) planes
	      !$OMP PARALLEL PRIVATE ( i, j, k )
	      !$OMP DO
	      do  k = 1, nz
	         do  j = 0, ny
	            do  i = nxl, nxr
	               work_ffty_vec(j,k,i) = work(i,k,j)
	            enddo
	         enddo
	      enddo

	      !$OMP DO
	      do  i = nxl, nxr

	         call fft_y_m( work_ffty_vec(:,:,i), ny+1, 'backward' )

	         do  j = 0, ny
	            do  k = 1, nz
	               f_out(k,j,i) = work_ffty_vec(j,k,i)
	            enddo
	         enddo

	      enddo
	      !$OMP END PARALLEL
#endif

	   else
	!
	!--   Variant for cache architectures: x is processed in chunks of
	!--   stridex columns, i0 and i1 being the first and last column of a
	!--   chunk and islot the position of a column within the chunk.
	      !$OMP PARALLEL PRIVATE ( i, i1, i0, islot, j, k, work_ffty )
	      !$OMP DO
	      do  i0 = nxl, nxr, stridex

	         i1 = min( i0+stridex-1, nxr )

	         do  k = 1, nz
	!
	!--         Fetch the chunk with y as first index
	            do  j = 0, ny
	               do  i = i0, i1
	                  work_ffty(j,i-i0+1) = work(i,k,j)
	               enddo
	            enddo

	            do  i = i0, i1
	!
	!--            Transform this column along y and store the result
	               islot = i-i0+1
	               call fft_y( work_ffty(:,islot), 'backward' )

	               do  j = 0, ny
	                  f_out(k,j,i) = work_ffty(j,islot)
	               enddo
	            enddo

	         enddo

	      enddo
	      !$OMP END PARALLEL

	   endif

	   call cpu_log( log_point_s(7), 'fft_y', 'stop' )

	end subroutine tr_xy_ffty
	895
	896
	subroutine fftx_tri_fftx( ar )
	!
	!---------------------------------------------------------------------------!
	! FFT along x, solution of the tridiagonal system and backward FFT for
	! a 1d-decomposition along x. ar is transformed in place.
	!
	! For every local y-index j the data of all processor slabs n are gathered
	! into contiguous x-lines (work_trix), transformed, passed to tridia_1dd
	! and, after the backward transformation, scattered back into ar.
	!
	! tri is allocated with one slice per thread (0:threads_per_task-1); tn
	! selects the slice. tn is 0 unless the code is compiled with OpenMP, in
	! which case the "!$" line assigns the thread number.
	!
	! WARNING: this subroutine may still not work for hybrid parallelization
	!          with OpenMP (for possible necessary changes see the original
	!          routine poisfft_hybrid, developed by Klaus Ketelsen, May 2002)
	!---------------------------------------------------------------------------!

	   use control_parameters
	   use cpulog
	   use grid_variables
	   use indices
	   use interfaces
	   use pegrid
	   use transpose_indices

	   implicit none

	   integer ::  i, j, k, m, n, omp_get_thread_num, tn

	   real, dimension(0:nx)                            ::  work_fftx
	   real, dimension(0:nx,1:nz)                       ::  work_trix
	   real, dimension(nnx,1:nza,nys_x:nyn_xa,pdims(1)) ::  ar
	   real, dimension(:,:,:,:), allocatable            ::  tri


	   call cpu_log( log_point_s(33), 'fft_x + tridia', 'start' )

	   allocate( tri(5,0:nx,0:nz-1,0:threads_per_task-1) )

	   tn = 0              ! Default thread number in case of one thread
	   !$OMP PARALLEL DO PRIVATE ( i, j, k, m, n, tn, work_fftx, work_trix )
	   do  j = nys_x, nyn_x

	      !$ tn = omp_get_thread_num()

	      if ( host(1:3) == 'nec' )  then
	!
	!--      Code optimized for vector processors: gather all x-lines of this
	!--      j first, then transform them with a single call
	         do  k = 1, nz

	            m = 0
	            do  n = 1, pdims(1)
	               do  i = 1, nnx_pe( n-1 ) ! WARN: pcoord(i) should be used!!
	                  work_trix(m,k) = ar(i,k,j,n)
	                  m = m + 1
	               enddo
	            enddo

	         enddo

	         call fft_x_m( work_trix, 'forward' )

	      else
	!
	!--      Cache optimized code: gather and transform line by line
	         do  k = 1, nz

	            m = 0
	            do  n = 1, pdims(1)
	               do  i = 1, nnx_pe( n-1 ) ! WARN: pcoord(i) should be used!!
	                  work_fftx(m) = ar(i,k,j,n)
	                  m = m + 1
	               enddo
	            enddo

	            call fft_x( work_fftx, 'forward' )

	            do  i = 0, nx
	               work_trix(i,k) = work_fftx(i)
	            enddo

	         enddo

	      endif

	!
	!--   Solve the linear equation system
	      call tridia_1dd( ddx2, ddy2, nx, ny, j, work_trix, tri(:,:,:,tn) )

	      if ( host(1:3) == 'nec' )  then
	!
	!--      Code optimized for vector processors
	         call fft_x_m( work_trix, 'backward' )

	         do  k = 1, nz

	            m = 0
	            do  n = 1, pdims(1)
	               do  i = 1, nnx_pe( n-1 ) ! WARN: pcoord(i) should be used!!
	                  ar(i,k,j,n) = work_trix(m,k)
	                  m = m + 1
	               enddo
	            enddo

	         enddo

	      else
	!
	!--      Cache optimized code
	         do  k = 1, nz

	            do  i = 0, nx
	               work_fftx(i) = work_trix(i,k)
	            enddo

	            call fft_x( work_fftx, 'backward' )

	            m = 0
	            do  n = 1, pdims(1)
	               do  i = 1, nnx_pe( n-1 ) ! WARN: pcoord(i) should be used!!
	                  ar(i,k,j,n) = work_fftx(m)
	                  m = m + 1
	               enddo
	            enddo

	         enddo

	      endif

	   enddo

	   deallocate( tri )

	   call cpu_log( log_point_s(33), 'fft_x + tridia', 'stop' )

	end subroutine fftx_tri_fftx
	1029
	1030
	subroutine fftx_tr_xy( f_in, work, f_out )
	!
	!---------------------------------------------------------------------------!
	! Fourier transformation along x with subsequent transposition x --> y for
	! a 1d-decomposition along y.
	!   f_in  : data to be transformed
	!   work  : resorted, transformed data (y is the first index); send buffer
	!           of the MPI_ALLTOALL
	!   f_out : receive buffer of the MPI_ALLTOALL
	!
	! ATTENTION: The NEC-branch of this routine may significantly profit from
	!            further optimizations. So far, performance is much worse than
	!            for routine ffty_tr_yx (more than three times slower).
	!---------------------------------------------------------------------------!

	   use control_parameters
	   use cpulog
	   use indices
	   use interfaces
	   use pegrid
	   use transpose_indices

	   implicit none

	   integer ::  i, j, k

	   real, dimension(0:nx,1:nz,nys:nyn)               ::  work_fftx
	   real, dimension(1:nza,nys:nyna,0:nxa)            ::  f_in
	   real, dimension(nny,1:nza,nxl_y:nxr_ya,pdims(2)) ::  f_out
	   real, dimension(nys:nyna,1:nza,0:nxa)            ::  work

	!
	!-- FFT along x (all x-data are local because of the 1d-decomposition
	!-- along y) and resorting so that y becomes the first index.
	   call cpu_log( log_point_s(4), 'fft_x', 'start' )

	   if ( host(1:3) == 'nec' )  then
	!
	!--   Variant for vector processors: copy with x as first index, then
	!--   transform one complete (x,z) plane per j
	      !$OMP PARALLEL PRIVATE ( i, j, k )
	      !$OMP DO
	      do  i = 0, nx

	         do  j = nys, nyn
	            do  k = 1, nz
	               work_fftx(i,k,j) = f_in(k,j,i)
	            enddo
	         enddo

	      enddo

	      !$OMP DO
	      do  j = nys, nyn

	         call fft_x_m( work_fftx(:,:,j), 'forward' )

	         do  k = 1, nz
	            do  i = 0, nx
	               work(j,k,i) = work_fftx(i,k,j)
	            enddo
	         enddo

	      enddo
	      !$OMP END PARALLEL

	   else
	!
	!--   Variant for cache architectures (there might still be potential for
	!--   better optimization): same copy, but one fft_x call per x-line
	      !$OMP PARALLEL PRIVATE (i,j,k)
	      !$OMP DO
	      do  i = 0, nx

	         do  j = nys, nyn
	            do  k = 1, nz
	               work_fftx(i,k,j) = f_in(k,j,i)
	            enddo
	         enddo

	      enddo

	      !$OMP DO
	      do  j = nys, nyn
	         do  k = 1, nz

	            call fft_x( work_fftx(0:nx,k,j), 'forward' )

	            do  i = 0, nx
	               work(j,k,i) = work_fftx(i,k,j)
	            enddo
	         enddo

	      enddo
	      !$OMP END PARALLEL

	   endif
	   call cpu_log( log_point_s(4), 'fft_x', 'pause' )

	!
	!-- Transposition; the barrier is executed only if collective_wait is set.
	   call cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
	   if ( collective_wait )  call MPI_BARRIER( comm2d, ierr )
	   call MPI_ALLTOALL( work(nys,1,0),      sendrecvcount_xy, MPI_REAL, &
	                      f_out(1,1,nxl_y,1), sendrecvcount_xy, MPI_REAL, &
	                      comm1dy, ierr )
	   call cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )

	end subroutine fftx_tr_xy
	1137
	1138
	SUBROUTINE tr_yx_fftx( f_in, work, f_out )

	!------------------------------------------------------------------------------!
	! Transposition y --> x (MPI_ALLTOALL on communicator comm1dy) with a
	! subsequent backward Fourier transformation along x.
	!
	! f_in  : send buffer of the transposition
	! work  : receive buffer of the transposition, index order (j,k,i)
	! f_out : backward transformed data, index order (k,j,i)
	!
	! The local buffer work_fftx holds the complete x-range 0:nx for the
	! y-range nys:nyn.
	! NOTE(review): the former header named a "1d-decomposition along x", the
	! array bounds used here rather suggest a decomposition along y - confirm.
	!------------------------------------------------------------------------------!

	   USE control_parameters
	   USE cpulog
	   USE indices
	   USE interfaces
	   USE pegrid
	   USE transpose_indices

	   IMPLICIT NONE

	   INTEGER ::  i, j, k

	   REAL ::  work_fftx(0:nx,1:nz,nys:nyn)
	   REAL ::  f_in(nny,1:nza,nxl_y:nxr_ya,pdims(2))
	   REAL ::  f_out(1:nza,nys:nyna,0:nxa)
	   REAL ::  work(nys:nyna,1:nza,0:nxa)

	   !
	   !-- Transpose array (optionally synchronize all PEs before the collective
	   !-- operation)
	   CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
	   IF ( collective_wait )  THEN
	      CALL MPI_BARRIER( comm2d, ierr )
	   ENDIF
	   CALL MPI_ALLTOALL( f_in(1,1,nxl_y,1), sendrecvcount_xy, MPI_REAL, &
	                      work(nys,1,0),     sendrecvcount_xy, MPI_REAL, &
	                      comm1dy, ierr )
	   CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )

	   !
	   !-- Resort the transposed data in a way that x becomes the first index,
	   !-- carry out the backward FFT along x and store the result in f_out
	   !-- with k as the first index.
	   CALL cpu_log( log_point_s(4), 'fft_x', 'continue' )

	   SELECT CASE ( host(1:3) )

	      CASE ( 'nec' )
	         !
	         !-- Code optimized for vector processors: all k-levels of one j are
	         !-- transformed by a single call of fft_x_m
!$OMP PARALLEL PRIVATE ( i, j, k )
!$OMP DO
	         DO  j = nys, nyn

	            DO  k = 1, nz
	               work_fftx(0:nx,k,j) = work(j,k,0:nx)
	            ENDDO

	            CALL fft_x_m( work_fftx(:,:,j), 'backward' )

	         ENDDO

!$OMP DO
	         DO  i = 0, nx
	            DO  j = nys, nyn
	               f_out(1:nz,j,i) = work_fftx(i,1:nz,j)
	            ENDDO
	         ENDDO
!$OMP END PARALLEL

	      CASE DEFAULT
	         !
	         !-- Cache optimized code (there might be still a potential for better
	         !-- optimization): every x-line is transformed separately by fft_x
!$OMP PARALLEL PRIVATE (i,j,k)
!$OMP DO
	         DO  j = nys, nyn
	            DO  k = 1, nz

	               work_fftx(0:nx,k,j) = work(j,k,0:nx)

	               CALL fft_x( work_fftx(0:nx,k,j), 'backward' )

	            ENDDO
	         ENDDO

!$OMP DO
	         DO  i = 0, nx
	            DO  j = nys, nyn
	               f_out(1:nz,j,i) = work_fftx(i,1:nz,j)
	            ENDDO
	         ENDDO
!$OMP END PARALLEL

	   END SELECT

	   CALL cpu_log( log_point_s(4), 'fft_x', 'stop' )

	END SUBROUTINE tr_yx_fftx
	1237
	1238
	SUBROUTINE ffty_tri_ffty( ar )

	!------------------------------------------------------------------------------!
	! FFT along y, solution of the tridiagonal system and backward FFT for
	! a 1d-decomposition along y
	!
	! For every index i = nxl_y, ..., nxr_y the y-lines of ar, which are stored
	! in pdims(2) pieces (last dimension of ar), are gathered to complete lines
	! 0:ny, transformed forward, handed over to tridia_1dd, transformed backward
	! and scattered back to ar. ar is overwritten with the result.
	!
	! WARNING: this subroutine may still not work for hybrid parallelization
	!          with OpenMP (for possible necessary changes see the original
	!          routine poisfft_hybrid, developed by Klaus Ketelsen, May 2002)
	!------------------------------------------------------------------------------!

	   USE control_parameters
	   USE cpulog
	   USE grid_variables
	   USE indices
	   USE interfaces
	   USE pegrid
	   USE transpose_indices

	   IMPLICIT NONE

	   INTEGER ::  i, j, k, m, n, omp_get_thread_num, tn

	   LOGICAL ::  host_is_nec

	   REAL ::  work_ffty(0:ny)
	   REAL ::  work_triy(0:ny,1:nz)
	   REAL ::  ar(nny,1:nza,nxl_y:nxr_ya,pdims(2))

	   REAL, DIMENSION(:,:,:,:), ALLOCATABLE ::  tri


	   CALL cpu_log( log_point_s(39), 'fft_y + tridia', 'start' )

	   !
	   !-- One work array for the tridiagonal matrix per thread (last index)
	   ALLOCATE( tri(5,0:ny,0:nz-1,0:threads_per_task-1) )

	   !
	   !-- The host type decides between the vector and the cache optimized code
	   host_is_nec = ( host(1:3) == 'nec' )

	   !
	   !-- Thread number which is used if the code is compiled without OpenMP
	   tn = 0
!$OMP PARALLEL DO PRIVATE ( i, j, k, m, n, tn, work_ffty, work_triy )
	   DO  i = nxl_y, nxr_y

!$ tn = omp_get_thread_num()

	      !
	      !-- Gather the y-lines and carry out the forward FFT. m counts the
	      !-- position (0:ny) within the complete line.
	      IF ( host_is_nec )  THEN
	         !
	         !-- Code optimized for vector processors: all k-levels are
	         !-- transformed by one call of fft_y_m
	         DO  k = 1, nz

	            m = 0
	            DO  n = 1, pdims(2)
	               DO  j = 1, nny_pe( n-1 ) ! WARN: pcoord(j) should be used!!
	                  work_triy(m,k) = ar(j,k,i,n)
	                  m = m + 1
	               ENDDO
	            ENDDO

	         ENDDO

	         CALL fft_y_m( work_triy, ny, 'forward' )

	      ELSE
	         !
	         !-- Cache optimized code: every line is transformed separately
	         DO  k = 1, nz

	            m = 0
	            DO  n = 1, pdims(2)
	               DO  j = 1, nny_pe( n-1 ) ! WARN: pcoord(j) should be used!!
	                  work_ffty(m) = ar(j,k,i,n)
	                  m = m + 1
	               ENDDO
	            ENDDO

	            CALL fft_y( work_ffty, 'forward' )

	            work_triy(0:ny,k) = work_ffty(0:ny)

	         ENDDO

	      ENDIF

	      !
	      !-- Solve the linear equation system (the x- and y-arguments of
	      !-- tridia_1dd are interchanged here)
	      CALL tridia_1dd( ddy2, ddx2, ny, nx, i, work_triy, tri(:,:,:,tn) )

	      !
	      !-- Backward FFT and scattering of the result to ar
	      IF ( host_is_nec )  THEN
	         !
	         !-- Code optimized for vector processors
	         CALL fft_y_m( work_triy, ny, 'backward' )

	         DO  k = 1, nz

	            m = 0
	            DO  n = 1, pdims(2)
	               DO  j = 1, nny_pe( n-1 ) ! WARN: pcoord(j) should be used!!
	                  ar(j,k,i,n) = work_triy(m,k)
	                  m = m + 1
	               ENDDO
	            ENDDO

	         ENDDO

	      ELSE
	         !
	         !-- Cache optimized code
	         DO  k = 1, nz

	            work_ffty(0:ny) = work_triy(0:ny,k)

	            CALL fft_y( work_ffty, 'backward' )

	            m = 0
	            DO  n = 1, pdims(2)
	               DO  j = 1, nny_pe( n-1 ) ! WARN: pcoord(j) should be used!!
	                  ar(j,k,i,n) = work_ffty(m)
	                  m = m + 1
	               ENDDO
	            ENDDO

	         ENDDO

	      ENDIF

	   ENDDO

	   DEALLOCATE( tri )

	   CALL cpu_log( log_point_s(39), 'fft_y + tridia', 'stop' )

	END SUBROUTINE ffty_tri_ffty
	1369
	1370
	SUBROUTINE tridia_1dd( ddx2, ddy2, nx, ny, j, ar, tri )

	!------------------------------------------------------------------------------!
	! Solves the linear system of equations for a 1d-decomposition along x (see
	! tridia)
	!
	! The tridiagonal matrix is set up for all indices 0:nx of the first
	! horizontal direction and the single index j of the second one. It is split
	! and the right hand side ar is replaced by the solution (Thomas algorithm).
	! ffty_tri_ffty calls this routine with interchanged x- and y-arguments.
	!
	! Arguments:
	! ddx2, ddy2 : factors of the cosine terms of the first / second direction
	! nx, ny     : upper index bounds of the first / second direction
	! j          : index along the second direction
	! ar         : right hand side on entry, solution on exit
	! tri        : work array containing the matrix,
	!              tri(1,:,:) main diagonal,
	!              tri(2,:,:) sub-diagonal,
	!              tri(3,:,:) super-diagonal,
	!              tri(4,:,:) main diagonal after the splitting,
	!              tri(5,:,:) elimination factors of the splitting
	!
	! Attention: when using the intel compiler, array tri must be passed as an
	!            argument to the contained subroutines. Otherwise address faults
	!            will occur.
	!            On NEC, tri should not be passed (except for routine substi_1dd)
	!            because this causes very bad performance.
	!------------------------------------------------------------------------------!

	   USE arrays_3d
	   USE control_parameters

	   USE pegrid

	   IMPLICIT NONE

	   INTEGER, INTENT(IN) ::  j, nx, ny

	   INTEGER ::  i, k, nnyh

	   REAL, INTENT(IN) ::  ddx2, ddy2

	   REAL, DIMENSION(0:nx,1:nz),     INTENT(INOUT) ::  ar
	   REAL, DIMENSION(5,0:nx,0:nz-1), INTENT(INOUT) ::  tri


	   nnyh = ( ny + 1 ) / 2

	   !
	   !-- Define constant elements of the tridiagonal matrix.
	   !-- The compiler on SX6 does loop exchange. If 0:nx is a high power of 2,
	   !-- the exchanged loops create bank conflicts. The following directive
	   !-- prohibits loop exchange and the loops perform much better.
!CDIR NOLOOPCHG
	   DO  k = 0, nz-1
	      DO  i = 0,nx
	         tri(2,i,k) = ddzu_pres(k+1) * ddzw(k+1)
	         tri(3,i,k) = ddzu_pres(k+2) * ddzw(k+1)
	      ENDDO
	   ENDDO

	   !
	   !-- Indices j above nnyh are mapped to ny+1-j before the j-dependent part
	   !-- of the matrix is computed
	   IF ( j <= nnyh )  THEN
#if defined( __lc )
	      CALL maketri_1dd( j, tri )
#else
	      CALL maketri_1dd( j )
#endif
	   ELSE
#if defined( __lc )
	      CALL maketri_1dd( ny+1-j, tri )
#else
	      CALL maketri_1dd( ny+1-j )
#endif
	   ENDIF
#if defined( __lc )
	   CALL split_1dd( tri )
#else
	   CALL split_1dd
#endif
	   CALL substi_1dd( ar, tri )

	CONTAINS

#if defined( __lc )
	   SUBROUTINE maketri_1dd( j, tri )
#else
	   SUBROUTINE maketri_1dd( j )
#endif

	   !---------------------------------------------------------------------------!
	   ! computes the i- and j-dependent component of the matrix
	   ! (main diagonal tri(1,:,:)); tri(2,:,:) and tri(3,:,:) must already be set
	   !---------------------------------------------------------------------------!

	      USE constants

	      IMPLICIT NONE

	      INTEGER, INTENT(IN) ::  j

	      INTEGER ::  i, i_fold, k, nnxh

	      REAL ::  a, c, lambda_y

	      REAL, DIMENSION(0:nx) ::  l

#if defined( __lc )
	      REAL, DIMENSION(5,0:nx,0:nz-1), INTENT(INOUT) ::  tri
#endif


	      nnxh = ( nx + 1 ) / 2
	      !
	      !-- Provide the tridiagonal matrix for solution of the Poisson equation in
	      !-- Fourier space. The coefficients are computed following the method of
	      !-- Schmidt et al. (DFVLR-Mitteilung 84-15), which departs from Stephan
	      !-- Siano's original version by discretizing the Poisson equation,
	      !-- before it is Fourier-transformed.
	      !-- The contribution of the second direction does not depend on i and is
	      !-- therefore computed only once.
	      lambda_y = 2.0 * ( 1.0 - COS( ( 2.0 * pi * j ) / &
	                                    FLOAT( ny+1 ) ) ) * ddy2

	      DO  i = 0, nx
	         !
	         !-- Indices i above nnxh are mapped to nx+1-i
	         IF ( i <= nnxh )  THEN
	            i_fold = i
	         ELSE
	            i_fold = nx + 1 - i
	         ENDIF
	         l(i) = 2.0 * ( 1.0 - COS( ( 2.0 * pi * i_fold ) / &
	                                   FLOAT( nx+1 ) ) ) * ddx2 + &
	                lambda_y
	      ENDDO

	      DO  k = 0, nz-1
	         !
	         !-- a and c only depend on k
	         a = -1.0 * ddzu_pres(k+2) * ddzw(k+1)
	         c = -1.0 * ddzu_pres(k+1) * ddzw(k+1)
	         DO  i = 0, nx
	            tri(1,i,k) = a + c - l(i)
	         ENDDO
	      ENDDO
	      !
	      !-- Modification of the main diagonal at the lowest level for
	      !-- ibc_p_b = 1 or 2 and at the uppermost level for ibc_p_t = 1
	      !-- (presumably the Neumann boundary conditions for the pressure -
	      !-- confirm against the definition of ibc_p_b / ibc_p_t)
	      IF ( ibc_p_b == 1  .OR.  ibc_p_b == 2 )  THEN
	         DO  i = 0, nx
	            tri(1,i,0) = tri(1,i,0) + tri(2,i,0)
	         ENDDO
	      ENDIF
	      IF ( ibc_p_t == 1 )  THEN
	         DO  i = 0, nx
	            tri(1,i,nz-1) = tri(1,i,nz-1) + tri(3,i,nz-1)
	         ENDDO
	      ENDIF

	   END SUBROUTINE maketri_1dd


#if defined( __lc )
	   SUBROUTINE split_1dd( tri )
#else
	   SUBROUTINE split_1dd
#endif

	   !---------------------------------------------------------------------------!
	   ! Splitting of the tridiagonal matrix (Thomas algorithm)
	   ! Provides tri(4,:,:) for all levels and tri(5,:,:) for the levels 1:nz-1
	   !---------------------------------------------------------------------------!

	      IMPLICIT NONE

	      INTEGER ::  i, k

#if defined( __lc )
	      REAL, DIMENSION(5,0:nx,0:nz-1), INTENT(INOUT) ::  tri
#endif


	      !
	      !-- Splitting
	      DO  i = 0, nx
	         tri(4,i,0) = tri(1,i,0)
	      ENDDO
	      DO  k = 1, nz-1
	         DO  i = 0, nx
	            tri(5,i,k) = tri(2,i,k) / tri(4,i,k-1)
	            tri(4,i,k) = tri(1,i,k) - tri(3,i,k-1) * tri(5,i,k)
	         ENDDO
	      ENDDO

	   END SUBROUTINE split_1dd


	   SUBROUTINE substi_1dd( ar, tri )

	   !---------------------------------------------------------------------------!
	   ! Substitution (Forward and Backward) (Thomas algorithm)
	   ! ar contains the right hand side on entry and the solution on exit
	   !---------------------------------------------------------------------------!

	      IMPLICIT NONE

	      INTEGER ::  i, k

	      REAL, DIMENSION(0:nx,nz),       INTENT(INOUT) ::  ar
	      REAL, DIMENSION(5,0:nx,0:nz-1), INTENT(IN)    ::  tri

	      REAL, DIMENSION(0:nx,0:nz-1) ::  ar1

	      !
	      !-- Forward substitution
	      DO  i = 0, nx
	         ar1(i,0) = ar(i,1)
	      ENDDO
	      DO  k = 1, nz-1
	         DO  i = 0, nx
	            ar1(i,k) = ar(i,k+1) - tri(5,i,k) * ar1(i,k-1)
	         ENDDO
	      ENDDO

	      !
	      !-- Backward substitution
	      !-- Note, the add of 1.0E-20 in the denominator is due to avoid divisions
	      !-- by zero appearing if the pressure bc is set to neumann at the top of
	      !-- the model domain.
	      DO  i = 0, nx
	         ar(i,nz) = ar1(i,nz-1) / ( tri(4,i,nz-1) + 1.0E-20 )
	      ENDDO
	      DO  k = nz-2, 0, -1
	         DO  i = 0, nx
	            ar(i,k+1) = ( ar1(i,k) - tri(3,i,k) * ar(i,k+2) ) &
	                        / tri(4,i,k)
	         ENDDO
	      ENDDO

	      !
	      !-- Indices i=0, j=0 correspond to horizontally averaged pressure.
	      !-- The respective values of ar should be zero at all k-levels if
	      !-- acceleration of horizontally averaged vertical velocity is zero.
	      !-- (j is the dummy argument of the host routine tridia_1dd)
	      IF ( ibc_p_b == 1  .AND.  ibc_p_t == 1 )  THEN
	         IF ( j == 0 )  THEN
	            DO  k = 1, nz
	               ar(0,k) = 0.0
	            ENDDO
	         ENDIF
	      ENDIF

	   END SUBROUTINE substi_1dd

	END SUBROUTINE tridia_1dd
	1597
	1598	#endif
[807]	1599	#endif
[1]	1600	END MODULE poisfft_mod

Note: See TracBrowser for help on using the repository browser.

Download in other formats:

| Impressum | ©Leibniz Universität Hannover |