Home

Context Navigation

source: palm/trunk/SOURCE/poisfft.f90 @ 881

Last change on this file since 881 was 878, checked in by suehring, 13 years ago
last commit documented
Property svn:keywords set to `Id`
File size: 46.9 KB

Rev	Line
[1]	1	MODULE poisfft_mod
	2
	3	!------------------------------------------------------------------------------!
[484]	4	! Current revisions:
[1]	5	! -----------------
	6	!
	7	! Former revisions:
	8	! -----------------
[3]	9	! $Id: poisfft.f90 878 2012-04-03 11:23:30Z raasch $
[77]	10	!
[878]	11	! 877 2012-04-03 11:21:44Z suehring
	12	! Bugfix: Avoid divisions by zero in case of using a 'neumann' bc for the
	13	! pressure at the top of the model domain.
	14	!
[810]	15	! 809 2012-01-30 13:32:58Z maronga
	16	! Bugfix: replaced .AND. and .NOT. with && and ! in the preprocessor directives
	17	!
[808]	18	! 807 2012-01-25 11:53:51Z maronga
	19	! New cpp directive "__check" implemented which is used by check_namelist_files
	20	! (most of the code is unneeded by check_namelist_files).
	21	!
[764]	22	! 763 2011-10-06 09:32:09Z suehring
	23	! Comment added concerning the last change.
	24	!
[762]	25	! 761 2011-10-05 17:58:52Z suehring
	26	! Bugfix: Avoid divisions by zero in case of using a 'neumann' bc for the
	27	! pressure at the top of the model domain.
	28	!
[697]	29	! 696 2011-03-18 07:03:49Z raasch
	30	! work_fftx removed from PRIVATE clauses in fftx_tr_xy and tr_yx_fftx
	31	!
[684]	32	! 683 2011-02-09 14:25:15Z raasch
	33	! openMP parallelization for 2d-domain-decomposition
	34	!
[668]	35	! 667 2010-12-23 12:06:00Z suehring/gryschka
	36	! ddzu replaced by ddzu_pres due to changes in zu(0)
	37	!
[623]	38	! 622 2010-12-10 08:08:13Z raasch
	39	! optional barriers included in order to speed up collective operations
	40	!
[392]	41	! 377 2009-09-04 11:09:00Z raasch
	42	! __lcmuk changed to __lc to avoid problems with Intel compiler on sgi-ice
	43	!
[198]	44	! 164 2008-05-15 08:46:15Z raasch
	45	! Arguments removed from transpose routines
	46	!
[139]	47	! 128 2007-10-26 13:11:14Z raasch
	48	! Bugfix: wavenumber calculation for even nx in routines maketri
	49	!
[90]	50	! 85 2007-05-11 09:35:14Z raasch
	51	! Bugfix: work_fft*_vec removed from some PRIVATE-declarations
	52	!
[77]	53	! 76 2007-03-29 00:58:32Z raasch
	54	! Tridiagonal coefficients adjusted for Neumann boundary conditions both at
	55	! the bottom and the top.
	56	!
[3]	57	! RCS Log replace by Id keyword, revision history cleaned up
	58	!
[1]	59	! Revision 1.24 2006/08/04 15:00:24 raasch
	60	! Default setting of the thread number tn in case of not using OpenMP
	61	!
	62	! Revision 1.23 2006/02/23 12:48:38 raasch
	63	! Additional compiler directive in routine tridia_1dd for preventing loop
	64	! exchange on NEC-SX6
	65	!
	66	! Revision 1.20 2004/04/30 12:38:09 raasch
	67	! Parts of former poisfft_hybrid moved to this subroutine,
	68	! former subroutine changed to a module, renaming of FFT-subroutines and
	69	! -module, FFTs completely substituted by calls of fft_x and fft_y,
	70	! NAG fft used in the non-parallel case completely removed, l in maketri
	71	! is now a 1d-array, variables passed by modules instead of using parameter
	72	! lists, enlarged transposition arrays introduced
	73	!
	74	! Revision 1.1 1997/07/24 11:24:14 raasch
	75	! Initial revision
	76	!
	77	!
	78	! Description:
	79	! ------------
	80	! See below.
	81	!------------------------------------------------------------------------------!
	82
	83	!--------------------------------------------------------------------------!
	84	! poisfft !
	85	! !
	86	! Original version: Stephan Siano (pois3d) !
	87	! !
	88	! Institute of Meteorology and Climatology, University of Hannover !
	89	! Germany !
	90	! !
	91	! Version as of July 23,1996 !
	92	! !
	93	! !
	94	! Version for parallel computers: Siegfried Raasch !
	95	! !
	96	! Version as of July 03,1997 !
	97	! !
	98	! Solves the Poisson equation with a 2D spectral method !
	99	! d^2 p / dx^2 + d^2 p / dy^2 + d^2 p / dz^2 = s !
	100	! !
	101	! Input: !
	102	! real ar contains in the (nnx,nny,nnz) elements, !
	103	! starting from the element (1,nys,nxl), the !
	104	! values for s !
	105	! real work Temporary array !
	106	! !
	107	! Output: !
	108	! real ar contains the solution for p !
	109	!--------------------------------------------------------------------------!
	110
	111	USE fft_xy
	112	USE indices
	113	USE transpose_indices
	114
	115	IMPLICIT NONE
	116
	117	PRIVATE
[807]	118
[809]	119	#if ! defined ( __check )
[1]	120	PUBLIC poisfft, poisfft_init
	121
	122	INTERFACE poisfft
	123	MODULE PROCEDURE poisfft
	124	END INTERFACE poisfft
	125
	126	INTERFACE poisfft_init
	127	MODULE PROCEDURE poisfft_init
	128	END INTERFACE poisfft_init
[807]	129	#else
	130	PUBLIC poisfft_init
[1]	131
[807]	132	INTERFACE poisfft_init
	133	MODULE PROCEDURE poisfft_init
	134	END INTERFACE poisfft_init
	135	#endif
	136
[1]	137	CONTAINS
	138
	139	SUBROUTINE poisfft_init
	140
	141	CALL fft_init
	142
	143	END SUBROUTINE poisfft_init
	144
[809]	145	#if ! defined ( __check )
[1]	146	SUBROUTINE poisfft( ar, work )
	147
	148	USE cpulog
	149	USE interfaces
	150	USE pegrid
	151
	152	IMPLICIT NONE
	153
	154	REAL, DIMENSION(1:nza,nys:nyna,nxl:nxra) :: ar, work
	155
	156
	157	CALL cpu_log( log_point_s(3), 'poisfft', 'start' )
	158
	159	!
	160	!-- Two-dimensional Fourier Transformation in x- and y-direction.
	161	#if defined( __parallel )
	162	IF ( pdims(2) == 1 ) THEN
	163
	164	!
	165	!-- 1d-domain-decomposition along x:
	166	!-- FFT along y and transposition y --> x
	167	CALL ffty_tr_yx( ar, work, ar )
	168
	169	!
	170	!-- FFT along x, solving the tridiagonal system and backward FFT
	171	CALL fftx_tri_fftx( ar )
	172
	173	!
	174	!-- Transposition x --> y and backward FFT along y
	175	CALL tr_xy_ffty( ar, work, ar )
	176
	177	ELSEIF ( pdims(1) == 1 ) THEN
	178
	179	!
	180	!-- 1d-domain-decomposition along y:
	181	!-- FFT along x and transposition x --> y
	182	CALL fftx_tr_xy( ar, work, ar )
	183
	184	!
	185	!-- FFT along y, solving the tridiagonal system and backward FFT
	186	CALL ffty_tri_ffty( ar )
	187
	188	!
	189	!-- Transposition y --> x and backward FFT along x
	190	CALL tr_yx_fftx( ar, work, ar )
	191
	192	ELSE
	193
	194	!
	195	!-- 2d-domain-decomposition
	196	!-- Transposition z --> x
	197	CALL cpu_log( log_point_s(5), 'transpo forward', 'start' )
[164]	198	CALL transpose_zx( ar, work, ar )
[1]	199	CALL cpu_log( log_point_s(5), 'transpo forward', 'pause' )
	200
	201	CALL cpu_log( log_point_s(4), 'fft_x', 'start' )
	202	CALL fftxp( ar, 'forward' )
	203	CALL cpu_log( log_point_s(4), 'fft_x', 'pause' )
	204
	205	!
	206	!-- Transposition x --> y
	207	CALL cpu_log( log_point_s(5), 'transpo forward', 'continue' )
[164]	208	CALL transpose_xy( ar, work, ar )
[1]	209	CALL cpu_log( log_point_s(5), 'transpo forward', 'pause' )
	210
	211	CALL cpu_log( log_point_s(7), 'fft_y', 'start' )
	212	CALL fftyp( ar, 'forward' )
	213	CALL cpu_log( log_point_s(7), 'fft_y', 'pause' )
	214
	215	!
	216	!-- Transposition y --> z
	217	CALL cpu_log( log_point_s(5), 'transpo forward', 'continue' )
[164]	218	CALL transpose_yz( ar, work, ar )
[1]	219	CALL cpu_log( log_point_s(5), 'transpo forward', 'stop' )
	220
	221	!
	222	!-- Solve the Poisson equation in z-direction in cartesian space.
	223	CALL cpu_log( log_point_s(6), 'tridia', 'start' )
	224	CALL tridia( ar )
	225	CALL cpu_log( log_point_s(6), 'tridia', 'stop' )
	226
	227	!
	228	!-- Inverse Fourier Transformation
	229	!-- Transposition z --> y
	230	CALL cpu_log( log_point_s(8), 'transpo invers', 'start' )
[164]	231	CALL transpose_zy( ar, work, ar )
[1]	232	CALL cpu_log( log_point_s(8), 'transpo invers', 'pause' )
	233
	234	CALL cpu_log( log_point_s(7), 'fft_y', 'continue' )
	235	CALL fftyp( ar, 'backward' )
	236	CALL cpu_log( log_point_s(7), 'fft_y', 'stop' )
	237
	238	!
	239	!-- Transposition y --> x
	240	CALL cpu_log( log_point_s(8), 'transpo invers', 'continue' )
[164]	241	CALL transpose_yx( ar, work, ar )
[1]	242	CALL cpu_log( log_point_s(8), 'transpo invers', 'pause' )
	243
	244	CALL cpu_log( log_point_s(4), 'fft_x', 'continue' )
	245	CALL fftxp( ar, 'backward' )
	246	CALL cpu_log( log_point_s(4), 'fft_x', 'stop' )
	247
	248	!
	249	!-- Transposition x --> z
	250	CALL cpu_log( log_point_s(8), 'transpo invers', 'continue' )
[164]	251	CALL transpose_xz( ar, work, ar )
[1]	252	CALL cpu_log( log_point_s(8), 'transpo invers', 'stop' )
	253
	254	ENDIF
	255
	256	#else
	257
	258	!
	259	!-- Two-dimensional Fourier Transformation along x- and y-direction.
	260	CALL cpu_log( log_point_s(4), 'fft_x', 'start' )
	261	CALL fftx( ar, 'forward' )
	262	CALL cpu_log( log_point_s(4), 'fft_x', 'pause' )
	263	CALL cpu_log( log_point_s(7), 'fft_y', 'start' )
	264	CALL ffty( ar, 'forward' )
	265	CALL cpu_log( log_point_s(7), 'fft_y', 'pause' )
	266
	267	!
	268	!-- Solve the Poisson equation in z-direction in cartesian space.
	269	CALL cpu_log( log_point_s(6), 'tridia', 'start' )
	270	CALL tridia( ar )
	271	CALL cpu_log( log_point_s(6), 'tridia', 'stop' )
	272
	273	!
	274	!-- Inverse Fourier Transformation.
	275	CALL cpu_log( log_point_s(7), 'fft_y', 'continue' )
	276	CALL ffty( ar, 'backward' )
	277	CALL cpu_log( log_point_s(7), 'fft_y', 'stop' )
	278	CALL cpu_log( log_point_s(4), 'fft_x', 'continue' )
	279	CALL fftx( ar, 'backward' )
	280	CALL cpu_log( log_point_s(4), 'fft_x', 'stop' )
	281
	282	#endif
	283
	284	CALL cpu_log( log_point_s(3), 'poisfft', 'stop' )
	285
	286	END SUBROUTINE poisfft
	287
	288
	289
	290	SUBROUTINE tridia( ar )
	291
	292	!------------------------------------------------------------------------------!
	293	! solves the linear system of equations:
	294	!
	295	! -(4 pi^2(i^2/(dx^2nnx^2)+j^2/(dy^2nny^2))+
	296	! 1/(dzu(k)dzw(k))+1/(dzu(k-1)dzw(k)))*p(i,j,k)+
	297	! 1/(dzu(k)dzw(k))p(i,j,k+1)+1/(dzu(k-1)dzw(k))p(i,j,k-1)=d(i,j,k)
	298	!
	299	! by using the Thomas algorithm
	300	!------------------------------------------------------------------------------!
	301
	302	USE arrays_3d
	303
	304	IMPLICIT NONE
	305
	306	INTEGER :: i, j, k, nnyh
	307
	308	REAL, DIMENSION(nxl_z:nxr_z,0:nz-1) :: ar1
	309	REAL, DIMENSION(5,nxl_z:nxr_z,0:nz-1) :: tri
	310
	311	#if defined( __parallel )
	312	REAL :: ar(nxl_z:nxr_za,nys_z:nyn_za,1:nza)
	313	#else
	314	REAL :: ar(1:nz,nys_z:nyn_z,nxl_z:nxr_z)
	315	#endif
	316
	317
	318	nnyh = (ny+1) / 2
	319
	320	!
	321	!-- Define constant elements of the tridiagonal matrix.
[683]	322	!$OMP PARALLEL PRIVATE ( k, i )
	323	!$OMP DO
[1]	324	DO k = 0, nz-1
	325	DO i = nxl_z, nxr_z
[667]	326	tri(2,i,k) = ddzu_pres(k+1) * ddzw(k+1)
	327	tri(3,i,k) = ddzu_pres(k+2) * ddzw(k+1)
[1]	328	ENDDO
	329	ENDDO
[683]	330	!$OMP END PARALLEL
[1]	331
	332	#if defined( __parallel )
	333	!
	334	!-- Repeat for all y-levels.
[683]	335	!$OMP PARALLEL FIRSTPRIVATE( tri ) PRIVATE ( ar1, j )
	336	!$OMP DO
[1]	337	DO j = nys_z, nyn_z
	338	IF ( j <= nnyh ) THEN
	339	CALL maketri( tri, j )
	340	ELSE
	341	CALL maketri( tri, ny+1-j )
	342	ENDIF
	343	CALL split( tri )
	344	CALL substi( ar, ar1, tri, j )
	345	ENDDO
[683]	346	!$OMP END PARALLEL
[1]	347	#else
	348	!
	349	!-- First y-level.
	350	CALL maketri( tri, nys_z )
	351	CALL split( tri )
	352	CALL substi( ar, ar1, tri, 0 )
	353
	354	!
	355	!-- Further y-levels.
	356	DO j = 1, nnyh - 1
	357	CALL maketri( tri, j )
	358	CALL split( tri )
	359	CALL substi( ar, ar1, tri, j )
	360	CALL substi( ar, ar1, tri, ny+1-j )
	361	ENDDO
	362	CALL maketri( tri, nnyh )
	363	CALL split( tri )
	364	CALL substi( ar, ar1, tri, nnyh+nys )
	365	#endif
	366
	367	CONTAINS
	368
	369	SUBROUTINE maketri( tri, j )
	370
	371	!------------------------------------------------------------------------------!
	372	! Computes the i- and j-dependent component of the matrix
	373	!------------------------------------------------------------------------------!
	374
	375	USE arrays_3d
	376	USE constants
	377	USE control_parameters
	378	USE grid_variables
	379
	380	IMPLICIT NONE
	381
	382	INTEGER :: i, j, k, nnxh
	383	REAL :: a, c
	384	REAL :: ll(nxl_z:nxr_z)
	385	REAL :: tri(5,nxl_z:nxr_z,0:nz-1)
	386
	387
	388	nnxh = ( nx + 1 ) / 2
	389
	390	!
	391	!-- Provide the tridiagonal matrix for solution of the Poisson equation in
	392	!-- Fourier space. The coefficients are computed following the method of
	393	!-- Schmidt et al. (DFVLR-Mitteilung 84-15), which departs from Stephan
	394	!-- Siano's original version by discretizing the Poisson equation,
	395	!-- before it is Fourier-transformed
	396	#if defined( __parallel )
	397	DO i = nxl_z, nxr_z
[128]	398	IF ( i >= 0 .AND. i <= nnxh ) THEN
[1]	399	ll(i) = 2.0 * ( 1.0 - COS( ( 2.0 * pi * i ) / &
	400	FLOAT( nx+1 ) ) ) / ( dx * dx ) + &
	401	2.0 * ( 1.0 - COS( ( 2.0 * pi * j ) / &
	402	FLOAT( ny+1 ) ) ) / ( dy * dy )
	403	ELSE
	404	ll(i) = 2.0 * ( 1.0 - COS( ( 2.0 * pi * ( nx+1-i ) ) / &
	405	FLOAT( nx+1 ) ) ) / ( dx * dx ) + &
	406	2.0 * ( 1.0 - COS( ( 2.0 * pi * j ) / &
	407	FLOAT( ny+1 ) ) ) / ( dy * dy )
	408	ENDIF
	409	DO k = 0,nz-1
[667]	410	a = -1.0 * ddzu_pres(k+2) * ddzw(k+1)
	411	c = -1.0 * ddzu_pres(k+1) * ddzw(k+1)
[1]	412	tri(1,i,k) = a + c - ll(i)
	413	ENDDO
	414	ENDDO
	415	#else
	416	DO i = 0, nnxh
	417	ll(i) = 2.0 * ( 1.0 - COS( ( 2.0 * pi * i ) / FLOAT( nx+1 ) ) ) / &
	418	( dx * dx ) + &
	419	2.0 * ( 1.0 - COS( ( 2.0 * pi * j ) / FLOAT( ny+1 ) ) ) / &
	420	( dy * dy )
	421	DO k = 0, nz-1
[667]	422	a = -1.0 * ddzu_pres(k+2) * ddzw(k+1)
	423	c = -1.0 * ddzu_pres(k+1) * ddzw(k+1)
[1]	424	tri(1,i,k) = a + c - ll(i)
	425	IF ( i >= 1 .and. i < nnxh ) THEN
	426	tri(1,nx+1-i,k) = tri(1,i,k)
	427	ENDIF
	428	ENDDO
	429	ENDDO
	430	#endif
	431	IF ( ibc_p_b == 1 .OR. ibc_p_b == 2 ) THEN
	432	DO i = nxl_z, nxr_z
	433	tri(1,i,0) = tri(1,i,0) + tri(2,i,0)
	434	ENDDO
	435	ENDIF
	436	IF ( ibc_p_t == 1 ) THEN
	437	DO i = nxl_z, nxr_z
	438	tri(1,i,nz-1) = tri(1,i,nz-1) + tri(3,i,nz-1)
	439	ENDDO
	440	ENDIF
	441
	442	END SUBROUTINE maketri
	443
	444
	445	SUBROUTINE substi( ar, ar1, tri, j )
	446
	447	!------------------------------------------------------------------------------!
	448	! Substitution (Forward and Backward) (Thomas algorithm)
	449	!------------------------------------------------------------------------------!
	450
[76]	451	USE control_parameters
	452
[1]	453	IMPLICIT NONE
	454
	455	INTEGER :: i, j, k
	456	REAL :: ar1(nxl_z:nxr_z,0:nz-1)
	457	REAL :: tri(5,nxl_z:nxr_z,0:nz-1)
	458	#if defined( __parallel )
	459	REAL :: ar(nxl_z:nxr_za,nys_z:nyn_za,1:nza)
	460	#else
	461	REAL :: ar(1:nz,nys_z:nyn_z,nxl_z:nxr_z)
	462	#endif
	463
	464	!
	465	!-- Forward substitution.
	466	DO i = nxl_z, nxr_z
	467	#if defined( __parallel )
	468	ar1(i,0) = ar(i,j,1)
	469	#else
	470	ar1(i,0) = ar(1,j,i)
	471	#endif
	472	ENDDO
	473	DO k = 1, nz - 1
	474	DO i = nxl_z, nxr_z
	475	#if defined( __parallel )
	476	ar1(i,k) = ar(i,j,k+1) - tri(5,i,k) * ar1(i,k-1)
	477	#else
	478	ar1(i,k) = ar(k+1,j,i) - tri(5,i,k) * ar1(i,k-1)
	479	#endif
	480	ENDDO
	481	ENDDO
	482
	483	!
[877]	484	!-- Backward substitution
	485	!-- Note, the 1.0E-20 in the denominator is due to avoid divisions
	486	!-- by zero appearing if the pressure bc is set to neumann at the top of
	487	!-- the model domain.
[1]	488	DO i = nxl_z, nxr_z
	489	#if defined( __parallel )
[877]	490	ar(i,j,nz) = ar1(i,nz-1) / ( tri(4,i,nz-1) + 1.0E-20 )
[1]	491	#else
[877]	492	ar(nz,j,i) = ar1(i,nz-1) / ( tri(4,i,nz-1) + 1.0E-20 )
[1]	493	#endif
	494	ENDDO
	495	DO k = nz-2, 0, -1
	496	DO i = nxl_z, nxr_z
	497	#if defined( __parallel )
	498	ar(i,j,k+1) = ( ar1(i,k) - tri(3,i,k) * ar(i,j,k+2) ) &
	499	/ tri(4,i,k)
	500	#else
	501	ar(k+1,j,i) = ( ar1(i,k) - tri(3,i,k) * ar(k+2,j,i) ) &
	502	/ tri(4,i,k)
	503	#endif
	504	ENDDO
	505	ENDDO
	506
[76]	507	!
	508	!-- Indices i=0, j=0 correspond to horizontally averaged pressure.
	509	!-- The respective values of ar should be zero at all k-levels if
	510	!-- acceleration of horizontally averaged vertical velocity is zero.
	511	IF ( ibc_p_b == 1 .AND. ibc_p_t == 1 ) THEN
	512	IF ( j == 0 .AND. nxl_z == 0 ) THEN
	513	#if defined( __parallel )
	514	DO k = 1, nz
	515	ar(nxl_z,j,k) = 0.0
	516	ENDDO
	517	#else
	518	DO k = 1, nz
	519	ar(k,j,nxl_z) = 0.0
	520	ENDDO
	521	#endif
	522	ENDIF
	523	ENDIF
	524
[1]	525	END SUBROUTINE substi
	526
	527
	528	SUBROUTINE split( tri )
	529
	530	!------------------------------------------------------------------------------!
	531	! Splitting of the tridiagonal matrix (Thomas algorithm)
	532	!------------------------------------------------------------------------------!
	533
	534	IMPLICIT NONE
	535
	536	INTEGER :: i, k
	537	REAL :: tri(5,nxl_z:nxr_z,0:nz-1)
	538
	539	!
	540	!-- Splitting.
	541	DO i = nxl_z, nxr_z
	542	tri(4,i,0) = tri(1,i,0)
	543	ENDDO
	544	DO k = 1, nz-1
	545	DO i = nxl_z, nxr_z
	546	tri(5,i,k) = tri(2,i,k) / tri(4,i,k-1)
	547	tri(4,i,k) = tri(1,i,k) - tri(3,i,k-1) * tri(5,i,k)
	548	ENDDO
	549	ENDDO
	550
	551	END SUBROUTINE split
	552
	553	END SUBROUTINE tridia
	554
	555
	556	#if defined( __parallel )
	557	SUBROUTINE fftxp( ar, direction )
	558
	559	!------------------------------------------------------------------------------!
	560	! Fourier-transformation along x-direction Parallelized version
	561	!------------------------------------------------------------------------------!
	562
	563	IMPLICIT NONE
	564
	565	CHARACTER (LEN=*) :: direction
	566	INTEGER :: j, k
	567	REAL :: ar(0:nxa,nys_x:nyn_xa,nzb_x:nzt_xa)
	568
	569	!
	570	!-- Performing the fft with one of the methods implemented
[683]	571	!$OMP PARALLEL PRIVATE ( j, k )
	572	!$OMP DO
[1]	573	DO k = nzb_x, nzt_x
	574	DO j = nys_x, nyn_x
	575	CALL fft_x( ar(0:nx,j,k), direction )
	576	ENDDO
	577	ENDDO
[683]	578	!$OMP END PARALLEL
[1]	579
	580	END SUBROUTINE fftxp
	581
	582	#else
	583	SUBROUTINE fftx( ar, direction )
	584
	585	!------------------------------------------------------------------------------!
	586	! Fourier-transformation along x-direction Non parallel version
	587	!------------------------------------------------------------------------------!
	588
	589	IMPLICIT NONE
	590
	591	CHARACTER (LEN=*) :: direction
	592	INTEGER :: i, j, k
	593	REAL :: ar(1:nz,0:ny,0:nx)
	594
	595	!
	596	!-- Performing the fft with one of the methods implemented
[683]	597	!$OMP PARALLEL PRIVATE ( j, k )
	598	!$OMP DO
[1]	599	DO k = 1, nz
	600	DO j = 0, ny
	601	CALL fft_x( ar(k,j,0:nx), direction )
	602	ENDDO
	603	ENDDO
[683]	604	!$OMP END PARALLEL
[1]	605
	606	END SUBROUTINE fftx
	607	#endif
	608
	609
	610	#if defined( __parallel )
	611	SUBROUTINE fftyp( ar, direction )
	612
	613	!------------------------------------------------------------------------------!
	614	! Fourier-transformation along y-direction Parallelized version
	615	!------------------------------------------------------------------------------!
	616
	617	IMPLICIT NONE
	618
	619	CHARACTER (LEN=*) :: direction
	620	INTEGER :: i, k
	621	REAL :: ar(0:nya,nxl_y:nxr_ya,nzb_y:nzt_ya)
	622
	623	!
	624	!-- Performing the fft with one of the methods implemented
[683]	625	!$OMP PARALLEL PRIVATE ( i, k )
	626	!$OMP DO
[1]	627	DO k = nzb_y, nzt_y
	628	DO i = nxl_y, nxr_y
	629	CALL fft_y( ar(0:ny,i,k), direction )
	630	ENDDO
	631	ENDDO
[683]	632	!$OMP END PARALLEL
[1]	633
	634	END SUBROUTINE fftyp
	635
	636	#else
	637	SUBROUTINE ffty( ar, direction )
	638
	639	!------------------------------------------------------------------------------!
	640	! Fourier-transformation along y-direction Non parallel version
	641	!------------------------------------------------------------------------------!
	642
	643	IMPLICIT NONE
	644
	645	CHARACTER (LEN=*) :: direction
	646	INTEGER :: i, k
	647	REAL :: ar(1:nz,0:ny,0:nx)
	648
	649	!
	650	!-- Performing the fft with one of the methods implemented
[683]	651	!$OMP PARALLEL PRIVATE ( i, k )
	652	!$OMP DO
[1]	653	DO k = 1, nz
	654	DO i = 0, nx
	655	CALL fft_y( ar(k,0:ny,i), direction )
	656	ENDDO
	657	ENDDO
[683]	658	!$OMP END PARALLEL
[1]	659
	660	END SUBROUTINE ffty
	661	#endif
	662
	663	#if defined( __parallel )
	664	SUBROUTINE ffty_tr_yx( f_in, work, f_out )
	665
	666	!------------------------------------------------------------------------------!
	667	! Fourier-transformation along y with subsequent transposition y --> x for
	668	! a 1d-decomposition along x
	669	!
	670	! ATTENTION: The performance of this routine is much faster on the NEC-SX6,
	671	! if the first index of work_ffty_vec is odd. Otherwise
	672	! memory bank conflicts may occur (especially if the index is a
	673	! multiple of 128). That's why work_ffty_vec is dimensioned as
	674	! 0:ny+1.
	675	! Of course, this will not work if users are using an odd number
	676	! of gridpoints along y.
	677	!------------------------------------------------------------------------------!
	678
	679	USE control_parameters
	680	USE cpulog
	681	USE indices
	682	USE interfaces
	683	USE pegrid
	684	USE transpose_indices
	685
	686	IMPLICIT NONE
	687
	688	INTEGER :: i, iend, iouter, ir, j, k
	689	INTEGER, PARAMETER :: stridex = 4
	690
	691	REAL, DIMENSION(0:ny,stridex) :: work_ffty
	692	#if defined( __nec )
	693	REAL, DIMENSION(0:ny+1,1:nz,nxl:nxr) :: work_ffty_vec
	694	#endif
	695	REAL, DIMENSION(1:nza,0:nya,nxl:nxra) :: f_in
	696	REAL, DIMENSION(nnx,1:nza,nys_x:nyn_xa,pdims(1)) :: f_out
	697	REAL, DIMENSION(nxl:nxra,1:nza,0:nya) :: work
	698
	699	!
	700	!-- Carry out the FFT along y, where all data are present due to the
	701	!-- 1d-decomposition along x. Resort the data in a way that x becomes
	702	!-- the first index.
	703	CALL cpu_log( log_point_s(7), 'fft_y', 'start' )
	704
	705	IF ( host(1:3) == 'nec' ) THEN
	706	#if defined( __nec )
	707	!
	708	!-- Code optimized for vector processors
[85]	709	!$OMP PARALLEL PRIVATE ( i, j, k )
[1]	710	!$OMP DO
	711	DO i = nxl, nxr
	712
	713	DO j = 0, ny
	714	DO k = 1, nz
	715	work_ffty_vec(j,k,i) = f_in(k,j,i)
	716	ENDDO
	717	ENDDO
	718
	719	CALL fft_y_m( work_ffty_vec(:,:,i), ny+1, 'forward' )
	720
	721	ENDDO
	722
	723	!$OMP DO
	724	DO k = 1, nz
	725	DO j = 0, ny
	726	DO i = nxl, nxr
	727	work(i,k,j) = work_ffty_vec(j,k,i)
	728	ENDDO
	729	ENDDO
	730	ENDDO
	731	!$OMP END PARALLEL
	732	#endif
	733
	734	ELSE
	735
	736	!
	737	!-- Cache optimized code.
	738	!-- The i-(x-)direction is split into a strided outer loop and an inner
	739	!-- loop for better cache performance
	740	!$OMP PARALLEL PRIVATE (i,iend,iouter,ir,j,k,work_ffty)
	741	!$OMP DO
	742	DO iouter = nxl, nxr, stridex
	743
	744	iend = MIN( iouter+stridex-1, nxr ) ! Upper bound for inner i loop
	745
	746	DO k = 1, nz
	747
	748	DO i = iouter, iend
	749
	750	ir = i-iouter+1 ! counter within a stride
	751	DO j = 0, ny
	752	work_ffty(j,ir) = f_in(k,j,i)
	753	ENDDO
	754	!
	755	!-- FFT along y
	756	CALL fft_y( work_ffty(:,ir), 'forward' )
	757
	758	ENDDO
	759
	760	!
	761	!-- Resort
	762	DO j = 0, ny
	763	DO i = iouter, iend
	764	work(i,k,j) = work_ffty(j,i-iouter+1)
	765	ENDDO
	766	ENDDO
	767
	768	ENDDO
	769
	770	ENDDO
	771	!$OMP END PARALLEL
	772
	773	ENDIF
	774	CALL cpu_log( log_point_s(7), 'fft_y', 'pause' )
	775
	776	!
	777	!-- Transpose array
	778	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
[622]	779	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
[1]	780	CALL MPI_ALLTOALL( work(nxl,1,0), sendrecvcount_xy, MPI_REAL, &
	781	f_out(1,1,nys_x,1), sendrecvcount_xy, MPI_REAL, &
	782	comm1dx, ierr )
	783	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
	784
	785	END SUBROUTINE ffty_tr_yx
	786
	787
	788	SUBROUTINE tr_xy_ffty( f_in, work, f_out )
	789
	790	!------------------------------------------------------------------------------!
	791	! Transposition x --> y with a subsequent backward Fourier transformation for
	792	! a 1d-decomposition along x
	793	!------------------------------------------------------------------------------!
	794
	795	USE control_parameters
	796	USE cpulog
	797	USE indices
	798	USE interfaces
	799	USE pegrid
	800	USE transpose_indices
	801
	802	IMPLICIT NONE
	803
	804	INTEGER :: i, iend, iouter, ir, j, k
	805	INTEGER, PARAMETER :: stridex = 4
	806
	807	REAL, DIMENSION(0:ny,stridex) :: work_ffty
	808	#if defined( __nec )
	809	REAL, DIMENSION(0:ny+1,1:nz,nxl:nxr) :: work_ffty_vec
	810	#endif
	811	REAL, DIMENSION(nnx,1:nza,nys_x:nyn_xa,pdims(1)) :: f_in
	812	REAL, DIMENSION(1:nza,0:nya,nxl:nxra) :: f_out
	813	REAL, DIMENSION(nxl:nxra,1:nza,0:nya) :: work
	814
	815	!
	816	!-- Transpose array
	817	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
[622]	818	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
[1]	819	CALL MPI_ALLTOALL( f_in(1,1,nys_x,1), sendrecvcount_xy, MPI_REAL, &
	820	work(nxl,1,0), sendrecvcount_xy, MPI_REAL, &
	821	comm1dx, ierr )
	822	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
	823
	824	!
	825	!-- Resort the data in a way that y becomes the first index and carry out the
	826	!-- backward fft along y.
	827	CALL cpu_log( log_point_s(7), 'fft_y', 'continue' )
	828
	829	IF ( host(1:3) == 'nec' ) THEN
	830	#if defined( __nec )
	831	!
	832	!-- Code optimized for vector processors
[85]	833	!$OMP PARALLEL PRIVATE ( i, j, k )
[1]	834	!$OMP DO
	835	DO k = 1, nz
	836	DO j = 0, ny
	837	DO i = nxl, nxr
	838	work_ffty_vec(j,k,i) = work(i,k,j)
	839	ENDDO
	840	ENDDO
	841	ENDDO
	842
	843	!$OMP DO
	844	DO i = nxl, nxr
	845
	846	CALL fft_y_m( work_ffty_vec(:,:,i), ny+1, 'backward' )
	847
	848	DO j = 0, ny
	849	DO k = 1, nz
	850	f_out(k,j,i) = work_ffty_vec(j,k,i)
	851	ENDDO
	852	ENDDO
	853
	854	ENDDO
	855	!$OMP END PARALLEL
	856	#endif
	857
	858	ELSE
	859
	860	!
	861	!-- Cache optimized code.
	862	!-- The i-(x-)direction is split into a strided outer loop and an inner
	863	!-- loop for better cache performance
	864	!$OMP PARALLEL PRIVATE ( i, iend, iouter, ir, j, k, work_ffty )
	865	!$OMP DO
	866	DO iouter = nxl, nxr, stridex
	867
	868	iend = MIN( iouter+stridex-1, nxr ) ! Upper bound for inner i loop
	869
	870	DO k = 1, nz
	871	!
	872	!-- Resort
	873	DO j = 0, ny
	874	DO i = iouter, iend
	875	work_ffty(j,i-iouter+1) = work(i,k,j)
	876	ENDDO
	877	ENDDO
	878
	879	DO i = iouter, iend
	880
	881	!
	882	!-- FFT along y
	883	ir = i-iouter+1 ! counter within a stride
	884	CALL fft_y( work_ffty(:,ir), 'backward' )
	885
	886	DO j = 0, ny
	887	f_out(k,j,i) = work_ffty(j,ir)
	888	ENDDO
	889	ENDDO
	890
	891	ENDDO
	892
	893	ENDDO
	894	!$OMP END PARALLEL
	895
	896	ENDIF
	897
	898	CALL cpu_log( log_point_s(7), 'fft_y', 'stop' )
	899
	900	END SUBROUTINE tr_xy_ffty
	901
	902
	903	SUBROUTINE fftx_tri_fftx( ar )
	904
	905	!------------------------------------------------------------------------------!
	906	! FFT along x, solution of the tridiagonal system and backward FFT for
	907	! a 1d-decomposition along x
	908	!
	909	! WARNING: this subroutine may still not work for hybrid parallelization
	910	! with OpenMP (for possible necessary changes see the original
	911	! routine poisfft_hybrid, developed by Klaus Ketelsen, May 2002)
	912	!------------------------------------------------------------------------------!
	913
	914	USE control_parameters
	915	USE cpulog
	916	USE grid_variables
	917	USE indices
	918	USE interfaces
	919	USE pegrid
	920	USE transpose_indices
	921
	922	IMPLICIT NONE
	923
	924	character(len=3) :: myth_char
	925
	926	INTEGER :: i, j, k, m, n, omp_get_thread_num, tn
	927
	928	REAL, DIMENSION(0:nx) :: work_fftx
	929	REAL, DIMENSION(0:nx,1:nz) :: work_trix
	930	REAL, DIMENSION(nnx,1:nza,nys_x:nyn_xa,pdims(1)) :: ar
	931	REAL, DIMENSION(:,:,:,:), ALLOCATABLE :: tri
	932
	933
	934	CALL cpu_log( log_point_s(33), 'fft_x + tridia', 'start' )
	935
	936	ALLOCATE( tri(5,0:nx,0:nz-1,0:threads_per_task-1) )
	937
	938	tn = 0 ! Default thread number in case of one thread
	939	!$OMP PARALLEL DO PRIVATE ( i, j, k, m, n, tn, work_fftx, work_trix )
	940	DO j = nys_x, nyn_x
	941
	942	!$ tn = omp_get_thread_num()
	943
	944	IF ( host(1:3) == 'nec' ) THEN
	945	!
	946	!-- Code optimized for vector processors
	947	DO k = 1, nz
	948
	949	m = 0
	950	DO n = 1, pdims(1)
	951	DO i = 1, nnx_pe( n-1 ) ! WARN: pcoord(i) should be used!!
	952	work_trix(m,k) = ar(i,k,j,n)
	953	m = m + 1
	954	ENDDO
	955	ENDDO
	956
	957	ENDDO
	958
	959	CALL fft_x_m( work_trix, 'forward' )
	960
	961	ELSE
	962	!
	963	!-- Cache optimized code
	964	DO k = 1, nz
	965
	966	m = 0
	967	DO n = 1, pdims(1)
	968	DO i = 1, nnx_pe( n-1 ) ! WARN: pcoord(i) should be used!!
	969	work_fftx(m) = ar(i,k,j,n)
	970	m = m + 1
	971	ENDDO
	972	ENDDO
	973
	974	CALL fft_x( work_fftx, 'forward' )
	975
	976	DO i = 0, nx
	977	work_trix(i,k) = work_fftx(i)
	978	ENDDO
	979
	980	ENDDO
	981
	982	ENDIF
	983
	984	!
	985	!-- Solve the linear equation system
	986	CALL tridia_1dd( ddx2, ddy2, nx, ny, j, work_trix, tri(:,:,:,tn) )
	987
	988	IF ( host(1:3) == 'nec' ) THEN
	989	!
	990	!-- Code optimized for vector processors
	991	CALL fft_x_m( work_trix, 'backward' )
	992
	993	DO k = 1, nz
	994
	995	m = 0
	996	DO n = 1, pdims(1)
	997	DO i = 1, nnx_pe( n-1 ) ! WARN: pcoord(i) should be used!!
	998	ar(i,k,j,n) = work_trix(m,k)
	999	m = m + 1
	1000	ENDDO
	1001	ENDDO
	1002
	1003	ENDDO
	1004
	1005	ELSE
	1006	!
	1007	!-- Cache optimized code
	1008	DO k = 1, nz
	1009
	1010	DO i = 0, nx
	1011	work_fftx(i) = work_trix(i,k)
	1012	ENDDO
	1013
	1014	CALL fft_x( work_fftx, 'backward' )
	1015
	1016	m = 0
	1017	DO n = 1, pdims(1)
	1018	DO i = 1, nnx_pe( n-1 ) ! WARN: pcoord(i) should be used!!
	1019	ar(i,k,j,n) = work_fftx(m)
	1020	m = m + 1
	1021	ENDDO
	1022	ENDDO
	1023
	1024	ENDDO
	1025
	1026	ENDIF
	1027
	1028	ENDDO
	1029
	1030	DEALLOCATE( tri )
	1031
	1032	CALL cpu_log( log_point_s(33), 'fft_x + tridia', 'stop' )
	1033
	1034	END SUBROUTINE fftx_tri_fftx
	1035
	1036
	1037	SUBROUTINE fftx_tr_xy( f_in, work, f_out )
	1038
	1039	!------------------------------------------------------------------------------!
	1040	! Fourier-transformation along x with subsequent transposition x --> y for
	1041	! a 1d-decomposition along y
	1042	!
	1043	! ATTENTION: The NEC-branch of this routine may significantly profit from
	1044	! further optimizations. So far, performance is much worse than
	1045	! for routine ffty_tr_yx (more than three times slower).
	1046	!------------------------------------------------------------------------------!
	1047
	1048	USE control_parameters
	1049	USE cpulog
	1050	USE indices
	1051	USE interfaces
	1052	USE pegrid
	1053	USE transpose_indices
	1054
	1055	IMPLICIT NONE
	1056
	1057	INTEGER :: i, j, k
	1058
	1059	REAL, DIMENSION(0:nx,1:nz,nys:nyn) :: work_fftx
	1060	REAL, DIMENSION(1:nza,nys:nyna,0:nxa) :: f_in
	1061	REAL, DIMENSION(nny,1:nza,nxl_y:nxr_ya,pdims(2)) :: f_out
	1062	REAL, DIMENSION(nys:nyna,1:nza,0:nxa) :: work
	1063
	1064	!
	1065	!-- Carry out the FFT along x, where all data are present due to the
	1066	!-- 1d-decomposition along y. Resort the data in a way that y becomes
	1067	!-- the first index.
	1068	CALL cpu_log( log_point_s(4), 'fft_x', 'start' )
	1069
	1070	IF ( host(1:3) == 'nec' ) THEN
	1071	!
	1072	!-- Code for vector processors
[85]	1073	!$OMP PARALLEL PRIVATE ( i, j, k )
[1]	1074	!$OMP DO
	1075	DO i = 0, nx
	1076
	1077	DO j = nys, nyn
	1078	DO k = 1, nz
	1079	work_fftx(i,k,j) = f_in(k,j,i)
	1080	ENDDO
	1081	ENDDO
	1082
	1083	ENDDO
	1084
	1085	!$OMP DO
	1086	DO j = nys, nyn
	1087
	1088	CALL fft_x_m( work_fftx(:,:,j), 'forward' )
	1089
	1090	DO k = 1, nz
	1091	DO i = 0, nx
	1092	work(j,k,i) = work_fftx(i,k,j)
	1093	ENDDO
	1094	ENDDO
	1095
	1096	ENDDO
	1097	!$OMP END PARALLEL
	1098
	1099	ELSE
	1100
	1101	!
	1102	!-- Cache optimized code (there might be still a potential for better
	1103	!-- optimization).
[696]	1104	!$OMP PARALLEL PRIVATE (i,j,k)
[1]	1105	!$OMP DO
	1106	DO i = 0, nx
	1107
	1108	DO j = nys, nyn
	1109	DO k = 1, nz
	1110	work_fftx(i,k,j) = f_in(k,j,i)
	1111	ENDDO
	1112	ENDDO
	1113
	1114	ENDDO
	1115
	1116	!$OMP DO
	1117	DO j = nys, nyn
	1118	DO k = 1, nz
	1119
	1120	CALL fft_x( work_fftx(0:nx,k,j), 'forward' )
	1121
	1122	DO i = 0, nx
	1123	work(j,k,i) = work_fftx(i,k,j)
	1124	ENDDO
	1125	ENDDO
	1126
	1127	ENDDO
	1128	!$OMP END PARALLEL
	1129
	1130	ENDIF
	1131	CALL cpu_log( log_point_s(4), 'fft_x', 'pause' )
	1132
	1133	!
	1134	!-- Transpose array
	1135	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
[622]	1136	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
[1]	1137	CALL MPI_ALLTOALL( work(nys,1,0), sendrecvcount_xy, MPI_REAL, &
	1138	f_out(1,1,nxl_y,1), sendrecvcount_xy, MPI_REAL, &
	1139	comm1dy, ierr )
	1140	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
	1141
	1142	END SUBROUTINE fftx_tr_xy
	1143
	1144
	1145	SUBROUTINE tr_yx_fftx( f_in, work, f_out )
	1146
	1147	!------------------------------------------------------------------------------!
	1148	! Transposition y --> x with a subsequent backward Fourier transformation for
	1149	! a 1d-decomposition along x
	1150	!------------------------------------------------------------------------------!
	1151
	1152	USE control_parameters
	1153	USE cpulog
	1154	USE indices
	1155	USE interfaces
	1156	USE pegrid
	1157	USE transpose_indices
	1158
	1159	IMPLICIT NONE
	1160
	1161	INTEGER :: i, j, k
	1162
	1163	REAL, DIMENSION(0:nx,1:nz,nys:nyn) :: work_fftx
	1164	REAL, DIMENSION(nny,1:nza,nxl_y:nxr_ya,pdims(2)) :: f_in
	1165	REAL, DIMENSION(1:nza,nys:nyna,0:nxa) :: f_out
	1166	REAL, DIMENSION(nys:nyna,1:nza,0:nxa) :: work
	1167
	1168	!
	1169	!-- Transpose array
	1170	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
[622]	1171	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
[1]	1172	CALL MPI_ALLTOALL( f_in(1,1,nxl_y,1), sendrecvcount_xy, MPI_REAL, &
	1173	work(nys,1,0), sendrecvcount_xy, MPI_REAL, &
	1174	comm1dy, ierr )
	1175	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
	1176
	1177	!
	1178	!-- Carry out the FFT along x, where all data are present due to the
	1179	!-- 1d-decomposition along y. Resort the data in a way that y becomes
	1180	!-- the first index.
	1181	CALL cpu_log( log_point_s(4), 'fft_x', 'continue' )
	1182
	1183	IF ( host(1:3) == 'nec' ) THEN
	1184	!
	1185	!-- Code optimized for vector processors
[85]	1186	!$OMP PARALLEL PRIVATE ( i, j, k )
[1]	1187	!$OMP DO
	1188	DO j = nys, nyn
	1189
	1190	DO k = 1, nz
	1191	DO i = 0, nx
	1192	work_fftx(i,k,j) = work(j,k,i)
	1193	ENDDO
	1194	ENDDO
	1195
	1196	CALL fft_x_m( work_fftx(:,:,j), 'backward' )
	1197
	1198	ENDDO
	1199
	1200	!$OMP DO
	1201	DO i = 0, nx
	1202	DO j = nys, nyn
	1203	DO k = 1, nz
	1204	f_out(k,j,i) = work_fftx(i,k,j)
	1205	ENDDO
	1206	ENDDO
	1207	ENDDO
	1208	!$OMP END PARALLEL
	1209
	1210	ELSE
	1211
	1212	!
	1213	!-- Cache optimized code (there might be still a potential for better
	1214	!-- optimization).
[696]	1215	!$OMP PARALLEL PRIVATE (i,j,k)
[1]	1216	!$OMP DO
	1217	DO j = nys, nyn
	1218	DO k = 1, nz
	1219
	1220	DO i = 0, nx
	1221	work_fftx(i,k,j) = work(j,k,i)
	1222	ENDDO
	1223
	1224	CALL fft_x( work_fftx(0:nx,k,j), 'backward' )
	1225
	1226	ENDDO
	1227	ENDDO
	1228
	1229	!$OMP DO
	1230	DO i = 0, nx
	1231	DO j = nys, nyn
	1232	DO k = 1, nz
	1233	f_out(k,j,i) = work_fftx(i,k,j)
	1234	ENDDO
	1235	ENDDO
	1236	ENDDO
	1237	!$OMP END PARALLEL
	1238
	1239	ENDIF
	1240	CALL cpu_log( log_point_s(4), 'fft_x', 'stop' )
	1241
	1242	END SUBROUTINE tr_yx_fftx
	1243
	1244
	1245	SUBROUTINE ffty_tri_ffty( ar )
	1246
	1247	!------------------------------------------------------------------------------!
	1248	! FFT along y, solution of the tridiagonal system and backward FFT for
	1249	! a 1d-decomposition along y
	1250	!
	1251	! WARNING: this subroutine may still not work for hybrid parallelization
	1252	! with OpenMP (for possible necessary changes see the original
	1253	! routine poisfft_hybrid, developed by Klaus Ketelsen, May 2002)
	1254	!------------------------------------------------------------------------------!
	1255
	1256	USE control_parameters
	1257	USE cpulog
	1258	USE grid_variables
	1259	USE indices
	1260	USE interfaces
	1261	USE pegrid
	1262	USE transpose_indices
	1263
	1264	IMPLICIT NONE
	1265
	1266	INTEGER :: i, j, k, m, n, omp_get_thread_num, tn
	1267
	1268	REAL, DIMENSION(0:ny) :: work_ffty
	1269	REAL, DIMENSION(0:ny,1:nz) :: work_triy
	1270	REAL, DIMENSION(nny,1:nza,nxl_y:nxr_ya,pdims(2)) :: ar
	1271	REAL, DIMENSION(:,:,:,:), ALLOCATABLE :: tri
	1272
	1273
	1274	CALL cpu_log( log_point_s(39), 'fft_y + tridia', 'start' )
	1275
	1276	ALLOCATE( tri(5,0:ny,0:nz-1,0:threads_per_task-1) )
	1277
	1278	tn = 0 ! Default thread number in case of one thread
[696]	1279	!$OMP PARALLEL DO PRIVATE ( i, j, k, m, n, tn, work_ffty, work_triy )
[1]	1280	DO i = nxl_y, nxr_y
	1281
	1282	!$ tn = omp_get_thread_num()
	1283
	1284	IF ( host(1:3) == 'nec' ) THEN
	1285	!
	1286	!-- Code optimized for vector processors
	1287	DO k = 1, nz
	1288
	1289	m = 0
	1290	DO n = 1, pdims(2)
	1291	DO j = 1, nny_pe( n-1 ) ! WARN: pcoord(j) should be used!!
	1292	work_triy(m,k) = ar(j,k,i,n)
	1293	m = m + 1
	1294	ENDDO
	1295	ENDDO
	1296
	1297	ENDDO
	1298
	1299	CALL fft_y_m( work_triy, ny, 'forward' )
	1300
	1301	ELSE
	1302	!
	1303	!-- Cache optimized code
	1304	DO k = 1, nz
	1305
	1306	m = 0
	1307	DO n = 1, pdims(2)
	1308	DO j = 1, nny_pe( n-1 ) ! WARN: pcoord(j) should be used!!
	1309	work_ffty(m) = ar(j,k,i,n)
	1310	m = m + 1
	1311	ENDDO
	1312	ENDDO
	1313
	1314	CALL fft_y( work_ffty, 'forward' )
	1315
	1316	DO j = 0, ny
	1317	work_triy(j,k) = work_ffty(j)
	1318	ENDDO
	1319
	1320	ENDDO
	1321
	1322	ENDIF
	1323
	1324	!
	1325	!-- Solve the linear equation system
	1326	CALL tridia_1dd( ddy2, ddx2, ny, nx, i, work_triy, tri(:,:,:,tn) )
	1327
	1328	IF ( host(1:3) == 'nec' ) THEN
	1329	!
	1330	!-- Code optimized for vector processors
	1331	CALL fft_y_m( work_triy, ny, 'backward' )
	1332
	1333	DO k = 1, nz
	1334
	1335	m = 0
	1336	DO n = 1, pdims(2)
	1337	DO j = 1, nny_pe( n-1 ) ! WARN: pcoord(j) should be used!!
	1338	ar(j,k,i,n) = work_triy(m,k)
	1339	m = m + 1
	1340	ENDDO
	1341	ENDDO
	1342
	1343	ENDDO
	1344
	1345	ELSE
	1346	!
	1347	!-- Cache optimized code
	1348	DO k = 1, nz
	1349
	1350	DO j = 0, ny
	1351	work_ffty(j) = work_triy(j,k)
	1352	ENDDO
	1353
	1354	CALL fft_y( work_ffty, 'backward' )
	1355
	1356	m = 0
	1357	DO n = 1, pdims(2)
	1358	DO j = 1, nny_pe( n-1 ) ! WARN: pcoord(j) should be used!!
	1359	ar(j,k,i,n) = work_ffty(m)
	1360	m = m + 1
	1361	ENDDO
	1362	ENDDO
	1363
	1364	ENDDO
	1365
	1366	ENDIF
	1367
	1368	ENDDO
	1369
	1370	DEALLOCATE( tri )
	1371
	1372	CALL cpu_log( log_point_s(39), 'fft_y + tridia', 'stop' )
	1373
	1374	END SUBROUTINE ffty_tri_ffty
	1375
	1376
	1377	SUBROUTINE tridia_1dd( ddx2, ddy2, nx, ny, j, ar, tri )
	1378
	1379	!------------------------------------------------------------------------------!
	1380	! Solves the linear system of equations for a 1d-decomposition along x (see
	1381	! tridia)
	1382	!
	1383	! Attention: when using the intel compiler, array tri must be passed as an
	1384	! argument to the contained subroutines. Otherwise addres faults
	1385	! will occur.
	1386	! On NEC, tri should not be passed (except for routine substi_1dd)
	1387	! because this causes very bad performance.
	1388	!------------------------------------------------------------------------------!
	1389
	1390	USE arrays_3d
	1391	USE control_parameters
	1392
	1393	USE pegrid
	1394
	1395	IMPLICIT NONE
	1396
	1397	INTEGER :: i, j, k, nnyh, nx, ny, omp_get_thread_num, tn
	1398
	1399	REAL :: ddx2, ddy2
	1400
	1401	REAL, DIMENSION(0:nx,1:nz) :: ar
	1402	REAL, DIMENSION(0:nx,0:nz-1) :: ar1
	1403	REAL, DIMENSION(5,0:nx,0:nz-1) :: tri
	1404
	1405
	1406	nnyh = ( ny + 1 ) / 2
	1407
	1408	!
	1409	!-- Define constant elements of the tridiagonal matrix.
	1410	!-- The compiler on SX6 does loop exchange. If 0:nx is a high power of 2,
	1411	!-- the exchanged loops create bank conflicts. The following directive
	1412	!-- prohibits loop exchange and the loops perform much better.
	1413	! tn = omp_get_thread_num()
	1414	! WRITE( 120+tn, * ) '+++ id=',myid,' nx=',nx,' thread=', omp_get_thread_num()
[82]	1415	! CALL local_flush( 120+tn )
[1]	1416	!CDIR NOLOOPCHG
	1417	DO k = 0, nz-1
	1418	DO i = 0,nx
[667]	1419	tri(2,i,k) = ddzu_pres(k+1) * ddzw(k+1)
	1420	tri(3,i,k) = ddzu_pres(k+2) * ddzw(k+1)
[1]	1421	ENDDO
	1422	ENDDO
	1423	! WRITE( 120+tn, * ) '+++ id=',myid,' end of first tridia loop thread=', omp_get_thread_num()
[82]	1424	! CALL local_flush( 120+tn )
[1]	1425
	1426	IF ( j <= nnyh ) THEN
[377]	1427	#if defined( __lc )
[1]	1428	CALL maketri_1dd( j, tri )
	1429	#else
	1430	CALL maketri_1dd( j )
	1431	#endif
	1432	ELSE
[377]	1433	#if defined( __lc )
[1]	1434	CALL maketri_1dd( ny+1-j, tri )
	1435	#else
	1436	CALL maketri_1dd( ny+1-j )
	1437	#endif
	1438	ENDIF
[377]	1439	#if defined( __lc )
[1]	1440	CALL split_1dd( tri )
	1441	#else
	1442	CALL split_1dd
	1443	#endif
	1444	CALL substi_1dd( ar, tri )
	1445
	1446	CONTAINS
	1447
[377]	1448	#if defined( __lc )
[1]	1449	SUBROUTINE maketri_1dd( j, tri )
	1450	#else
	1451	SUBROUTINE maketri_1dd( j )
	1452	#endif
	1453
	1454	!------------------------------------------------------------------------------!
	1455	! computes the i- and j-dependent component of the matrix
	1456	!------------------------------------------------------------------------------!
	1457
	1458	USE constants
	1459
	1460	IMPLICIT NONE
	1461
	1462	INTEGER :: i, j, k, nnxh
	1463	REAL :: a, c
	1464
	1465	REAL, DIMENSION(0:nx) :: l
	1466
[377]	1467	#if defined( __lc )
[1]	1468	REAL, DIMENSION(5,0:nx,0:nz-1) :: tri
	1469	#endif
	1470
	1471
	1472	nnxh = ( nx + 1 ) / 2
	1473	!
	1474	!-- Provide the tridiagonal matrix for solution of the Poisson equation in
	1475	!-- Fourier space. The coefficients are computed following the method of
	1476	!-- Schmidt et al. (DFVLR-Mitteilung 84-15), which departs from Stephan
	1477	!-- Siano's original version by discretizing the Poisson equation,
	1478	!-- before it is Fourier-transformed
	1479	DO i = 0, nx
[128]	1480	IF ( i >= 0 .AND. i <= nnxh ) THEN
[1]	1481	l(i) = 2.0 * ( 1.0 - COS( ( 2.0 * pi * i ) / &
	1482	FLOAT( nx+1 ) ) ) * ddx2 + &
	1483	2.0 * ( 1.0 - COS( ( 2.0 * pi * j ) / &
	1484	FLOAT( ny+1 ) ) ) * ddy2
	1485	ELSE
	1486	l(i) = 2.0 * ( 1.0 - COS( ( 2.0 * pi * ( nx+1-i ) ) / &
	1487	FLOAT( nx+1 ) ) ) * ddx2 + &
	1488	2.0 * ( 1.0 - COS( ( 2.0 * pi * j ) / &
	1489	FLOAT( ny+1 ) ) ) * ddy2
	1490	ENDIF
	1491	ENDDO
	1492
	1493	DO k = 0, nz-1
	1494	DO i = 0, nx
[667]	1495	a = -1.0 * ddzu_pres(k+2) * ddzw(k+1)
	1496	c = -1.0 * ddzu_pres(k+1) * ddzw(k+1)
[1]	1497	tri(1,i,k) = a + c - l(i)
	1498	ENDDO
	1499	ENDDO
	1500	IF ( ibc_p_b == 1 .OR. ibc_p_b == 2 ) THEN
	1501	DO i = 0, nx
	1502	tri(1,i,0) = tri(1,i,0) + tri(2,i,0)
	1503	ENDDO
	1504	ENDIF
	1505	IF ( ibc_p_t == 1 ) THEN
	1506	DO i = 0, nx
	1507	tri(1,i,nz-1) = tri(1,i,nz-1) + tri(3,i,nz-1)
	1508	ENDDO
	1509	ENDIF
	1510
	1511	END SUBROUTINE maketri_1dd
	1512
	1513
[377]	1514	#if defined( __lc )
[1]	1515	SUBROUTINE split_1dd( tri )
	1516	#else
	1517	SUBROUTINE split_1dd
	1518	#endif
	1519
	1520	!------------------------------------------------------------------------------!
	1521	! Splitting of the tridiagonal matrix (Thomas algorithm)
	1522	!------------------------------------------------------------------------------!
	1523
	1524	IMPLICIT NONE
	1525
	1526	INTEGER :: i, k
	1527
[377]	1528	#if defined( __lc )
[1]	1529	REAL, DIMENSION(5,0:nx,0:nz-1) :: tri
	1530	#endif
	1531
	1532
	1533	!
	1534	!-- Splitting
	1535	DO i = 0, nx
	1536	tri(4,i,0) = tri(1,i,0)
	1537	ENDDO
	1538	DO k = 1, nz-1
	1539	DO i = 0, nx
	1540	tri(5,i,k) = tri(2,i,k) / tri(4,i,k-1)
	1541	tri(4,i,k) = tri(1,i,k) - tri(3,i,k-1) * tri(5,i,k)
	1542	ENDDO
	1543	ENDDO
	1544
	1545	END SUBROUTINE split_1dd
	1546
	1547
	1548	SUBROUTINE substi_1dd( ar, tri )
	1549
	1550	!------------------------------------------------------------------------------!
	1551	! Substitution (Forward and Backward) (Thomas algorithm)
	1552	!------------------------------------------------------------------------------!
	1553
	1554	IMPLICIT NONE
	1555
[76]	1556	INTEGER :: i, k
[1]	1557
	1558	REAL, DIMENSION(0:nx,nz) :: ar
	1559	REAL, DIMENSION(0:nx,0:nz-1) :: ar1
	1560	REAL, DIMENSION(5,0:nx,0:nz-1) :: tri
	1561
	1562	!
	1563	!-- Forward substitution
	1564	DO i = 0, nx
	1565	ar1(i,0) = ar(i,1)
	1566	ENDDO
	1567	DO k = 1, nz-1
	1568	DO i = 0, nx
	1569	ar1(i,k) = ar(i,k+1) - tri(5,i,k) * ar1(i,k-1)
	1570	ENDDO
	1571	ENDDO
	1572
	1573	!
	1574	!-- Backward substitution
[763]	1575	!-- Note, the add of 1.0E-20 in the denominator is due to avoid divisions
	1576	!-- by zero appearing if the pressure bc is set to neumann at the top of
	1577	!-- the model domain.
[1]	1578	DO i = 0, nx
[761]	1579	ar(i,nz) = ar1(i,nz-1) / ( tri(4,i,nz-1) + 1.0E-20 )
[1]	1580	ENDDO
	1581	DO k = nz-2, 0, -1
	1582	DO i = 0, nx
	1583	ar(i,k+1) = ( ar1(i,k) - tri(3,i,k) * ar(i,k+2) ) &
	1584	/ tri(4,i,k)
	1585	ENDDO
	1586	ENDDO
	1587
[76]	1588	!
	1589	!-- Indices i=0, j=0 correspond to horizontally averaged pressure.
	1590	!-- The respective values of ar should be zero at all k-levels if
	1591	!-- acceleration of horizontally averaged vertical velocity is zero.
	1592	IF ( ibc_p_b == 1 .AND. ibc_p_t == 1 ) THEN
	1593	IF ( j == 0 ) THEN
	1594	DO k = 1, nz
	1595	ar(0,k) = 0.0
	1596	ENDDO
	1597	ENDIF
	1598	ENDIF
	1599
[1]	1600	END SUBROUTINE substi_1dd
	1601
	1602	END SUBROUTINE tridia_1dd
	1603
	1604	#endif
[807]	1605	#endif
[1]	1606	END MODULE poisfft_mod

Note: See TracBrowser for help on using the repository browser.

Download in other formats:

| Impressum | ©Leibniz Universität Hannover |