Home

Context Navigation

source: palm/trunk/SOURCE/poisfft.f90 @ 877

Last change on this file since 877 was 877, checked in by suehring, 13 years ago
Avoid divisions by zero in case of using a 'neumann' bc for the pressure at the top of the model domain.
Property svn:keywords set to `Id`
File size: 46.8 KB

Rev	Line
[1]	1	MODULE poisfft_mod
	2
	3	!------------------------------------------------------------------------------!
[484]	4	! Current revisions:
[1]	5	! -----------------
[877]	6	! Bugfix: Avoid divisions by zero in case of using a 'neumann' bc for the
	7	! pressure at the top of the model domain.
[1]	8	!
	9	! Former revisions:
	10	! -----------------
[3]	11	! $Id: poisfft.f90 877 2012-04-03 11:21:44Z suehring $
[77]	12	!
[810]	13	! 809 2012-01-30 13:32:58Z maronga
	14	! Bugfix: replaced .AND. and .NOT. with && and ! in the preprocessor directives
	15	!
[808]	16	! 807 2012-01-25 11:53:51Z maronga
	17	! New cpp directive "__check" implemented which is used by check_namelist_files
	18	! (most of the code is unneeded by check_namelist_files).
	19	!
[764]	20	! 763 2011-10-06 09:32:09Z suehring
	21	! Comment added concerning the last change.
	22	!
[762]	23	! 761 2011-10-05 17:58:52Z suehring
	24	! Bugfix: Avoid divisions by zero in case of using a 'neumann' bc for the
	25	! pressure at the top of the model domain.
	26	!
[697]	27	! 696 2011-03-18 07:03:49Z raasch
	28	! work_fftx removed from PRIVATE clauses in fftx_tr_xy and tr_yx_fftx
	29	!
[684]	30	! 683 2011-02-09 14:25:15Z raasch
	31	! openMP parallelization for 2d-domain-decomposition
	32	!
[668]	33	! 667 2010-12-23 12:06:00Z suehring/gryschka
	34	! ddzu replaced by ddzu_pres due to changes in zu(0)
	35	!
[623]	36	! 622 2010-12-10 08:08:13Z raasch
	37	! optional barriers included in order to speed up collective operations
	38	!
[392]	39	! 377 2009-09-04 11:09:00Z raasch
	40	! __lcmuk changed to __lc to avoid problems with Intel compiler on sgi-ice
	41	!
[198]	42	! 164 2008-05-15 08:46:15Z raasch
	43	! Arguments removed from transpose routines
	44	!
[139]	45	! 128 2007-10-26 13:11:14Z raasch
	46	! Bugfix: wavenumber calculation for even nx in routines maketri
	47	!
[90]	48	! 85 2007-05-11 09:35:14Z raasch
	49	! Bugfix: work_fft*_vec removed from some PRIVATE-declarations
	50	!
[77]	51	! 76 2007-03-29 00:58:32Z raasch
	52	! Tridiagonal coefficients adjusted for Neumann boundary conditions both at
	53	! the bottom and the top.
	54	!
[3]	55	! RCS Log replaced by Id keyword, revision history cleaned up
	56	!
[1]	57	! Revision 1.24 2006/08/04 15:00:24 raasch
	58	! Default setting of the thread number tn in case of not using OpenMP
	59	!
	60	! Revision 1.23 2006/02/23 12:48:38 raasch
	61	! Additional compiler directive in routine tridia_1dd for preventing loop
	62	! exchange on NEC-SX6
	63	!
	64	! Revision 1.20 2004/04/30 12:38:09 raasch
	65	! Parts of former poisfft_hybrid moved to this subroutine,
	66	! former subroutine changed to a module, renaming of FFT-subroutines and
	67	! -module, FFTs completely substituted by calls of fft_x and fft_y,
	68	! NAG fft used in the non-parallel case completely removed, l in maketri
	69	! is now a 1d-array, variables passed by modules instead of using parameter
	70	! lists, enlarged transposition arrays introduced
	71	!
	72	! Revision 1.1 1997/07/24 11:24:14 raasch
	73	! Initial revision
	74	!
	75	!
	76	! Description:
	77	! ------------
	78	! See below.
	79	!------------------------------------------------------------------------------!
	80
	81	!--------------------------------------------------------------------------!
	82	! poisfft !
	83	! !
	84	! Original version: Stephan Siano (pois3d) !
	85	! !
	86	! Institute of Meteorology and Climatology, University of Hannover !
	87	! Germany !
	88	! !
	89	! Version as of July 23,1996 !
	90	! !
	91	! !
	92	! Version for parallel computers: Siegfried Raasch !
	93	! !
	94	! Version as of July 03,1997 !
	95	! !
	96	! Solves the Poisson equation with a 2D spectral method !
	97	! d^2 p / dx^2 + d^2 p / dy^2 + d^2 p / dz^2 = s !
	98	! !
	99	! Input: !
	100	! real ar contains in the (nnx,nny,nnz) elements, !
	101	! starting from the element (1,nys,nxl), the !
	102	! values for s !
	103	! real work Temporary array !
	104	! !
	105	! Output: !
	106	! real ar contains the solution for p !
	107	!--------------------------------------------------------------------------!
	108
	109	USE fft_xy
	110	USE indices
	111	USE transpose_indices
	112
	113	IMPLICIT NONE
	114
	115	PRIVATE
[807]	116
[809]	117	#if ! defined ( __check )
[1]	118	PUBLIC poisfft, poisfft_init
	119
	120	INTERFACE poisfft
	121	MODULE PROCEDURE poisfft
	122	END INTERFACE poisfft
	123
	124	INTERFACE poisfft_init
	125	MODULE PROCEDURE poisfft_init
	126	END INTERFACE poisfft_init
[807]	127	#else
	128	PUBLIC poisfft_init
[1]	129
[807]	130	INTERFACE poisfft_init
	131	MODULE PROCEDURE poisfft_init
	132	END INTERFACE poisfft_init
	133	#endif
	134
[1]	135	CONTAINS
	136
	137	SUBROUTINE poisfft_init
	138
	139	CALL fft_init
	140
	141	END SUBROUTINE poisfft_init
	142
[809]	143	#if ! defined ( __check )
SUBROUTINE poisfft( ar, work )

!------------------------------------------------------------------------------!
! Driver of the FFT-based Poisson solver: forward Fourier transformations
! along x and y, solution of the tridiagonal systems along z, backward
! Fourier transformations.
!
! ar   : holds the source term on entry and is overwritten by the called
!        routines (see the module header: on output it contains the solution)
! work : temporary array handed to the transposition routines; it is only
!        referenced in the parallel build
!------------------------------------------------------------------------------!

   USE cpulog
   USE interfaces
   USE pegrid

   IMPLICIT NONE

   REAL, DIMENSION(1:nza,nys:nyna,nxl:nxra) ::  ar, work


   CALL cpu_log( log_point_s(3), 'poisfft', 'start' )

#if defined( __parallel )
!
!-- Parallel build: the sequence of FFTs and transpositions depends on the
!-- kind of domain decomposition.
   IF ( pdims(2) == 1 )  THEN

!
!--   Decomposition along x only.
!--   Step 1: FFT along y, then transposition y --> x
      CALL ffty_tr_yx( ar, work, ar )

!
!--   Step 2: FFT along x, tridiagonal solver, backward FFT along x
      CALL fftx_tri_fftx( ar )

!
!--   Step 3: transposition x --> y, then backward FFT along y
      CALL tr_xy_ffty( ar, work, ar )

   ELSE IF ( pdims(1) == 1 )  THEN

!
!--   Decomposition along y only.
!--   Step 1: FFT along x, then transposition x --> y
      CALL fftx_tr_xy( ar, work, ar )

!
!--   Step 2: FFT along y, tridiagonal solver, backward FFT along y
      CALL ffty_tri_ffty( ar )

!
!--   Step 3: transposition y --> x, then backward FFT along x
      CALL tr_yx_fftx( ar, work, ar )

   ELSE

!
!--   Decomposition along both x and y.
!--   z --> x
      CALL cpu_log( log_point_s(5), 'transpo forward', 'start' )
      CALL transpose_zx( ar, work, ar )
      CALL cpu_log( log_point_s(5), 'transpo forward', 'pause' )

      CALL cpu_log( log_point_s(4), 'fft_x', 'start' )
      CALL fftxp( ar, 'forward' )
      CALL cpu_log( log_point_s(4), 'fft_x', 'pause' )

!
!--   x --> y
      CALL cpu_log( log_point_s(5), 'transpo forward', 'continue' )
      CALL transpose_xy( ar, work, ar )
      CALL cpu_log( log_point_s(5), 'transpo forward', 'pause' )

      CALL cpu_log( log_point_s(7), 'fft_y', 'start' )
      CALL fftyp( ar, 'forward' )
      CALL cpu_log( log_point_s(7), 'fft_y', 'pause' )

!
!--   y --> z
      CALL cpu_log( log_point_s(5), 'transpo forward', 'continue' )
      CALL transpose_yz( ar, work, ar )
      CALL cpu_log( log_point_s(5), 'transpo forward', 'stop' )

!
!--   Tridiagonal systems along z
      CALL cpu_log( log_point_s(6), 'tridia', 'start' )
      CALL tridia( ar )
      CALL cpu_log( log_point_s(6), 'tridia', 'stop' )

!
!--   Way back: z --> y
      CALL cpu_log( log_point_s(8), 'transpo invers', 'start' )
      CALL transpose_zy( ar, work, ar )
      CALL cpu_log( log_point_s(8), 'transpo invers', 'pause' )

      CALL cpu_log( log_point_s(7), 'fft_y', 'continue' )
      CALL fftyp( ar, 'backward' )
      CALL cpu_log( log_point_s(7), 'fft_y', 'stop' )

!
!--   y --> x
      CALL cpu_log( log_point_s(8), 'transpo invers', 'continue' )
      CALL transpose_yx( ar, work, ar )
      CALL cpu_log( log_point_s(8), 'transpo invers', 'pause' )

      CALL cpu_log( log_point_s(4), 'fft_x', 'continue' )
      CALL fftxp( ar, 'backward' )
      CALL cpu_log( log_point_s(4), 'fft_x', 'stop' )

!
!--   x --> z
      CALL cpu_log( log_point_s(8), 'transpo invers', 'continue' )
      CALL transpose_xz( ar, work, ar )
      CALL cpu_log( log_point_s(8), 'transpo invers', 'stop' )

   END IF

#else

!
!-- Serial build: forward FFTs along x and y
   CALL cpu_log( log_point_s(4), 'fft_x', 'start' )
   CALL fftx( ar, 'forward' )
   CALL cpu_log( log_point_s(4), 'fft_x', 'pause' )
   CALL cpu_log( log_point_s(7), 'fft_y', 'start' )
   CALL ffty( ar, 'forward' )
   CALL cpu_log( log_point_s(7), 'fft_y', 'pause' )

!
!-- Tridiagonal systems along z
   CALL cpu_log( log_point_s(6), 'tridia', 'start' )
   CALL tridia( ar )
   CALL cpu_log( log_point_s(6), 'tridia', 'stop' )

!
!-- Backward FFTs along y and x
   CALL cpu_log( log_point_s(7), 'fft_y', 'continue' )
   CALL ffty( ar, 'backward' )
   CALL cpu_log( log_point_s(7), 'fft_y', 'stop' )
   CALL cpu_log( log_point_s(4), 'fft_x', 'continue' )
   CALL fftx( ar, 'backward' )
   CALL cpu_log( log_point_s(4), 'fft_x', 'stop' )

#endif

   CALL cpu_log( log_point_s(3), 'poisfft', 'stop' )

END SUBROUTINE poisfft
	285
	286
	287
	288	SUBROUTINE tridia( ar )
	289
	290	!------------------------------------------------------------------------------!
	291	! solves the linear system of equations:
	292	!
	293	! -(4 pi^2(i^2/(dx^2nnx^2)+j^2/(dy^2nny^2))+
	294	! 1/(dzu(k)dzw(k))+1/(dzu(k-1)dzw(k)))*p(i,j,k)+
	295	! 1/(dzu(k)dzw(k))p(i,j,k+1)+1/(dzu(k-1)dzw(k))p(i,j,k-1)=d(i,j,k)
	296	!
	297	! by using the Thomas algorithm
	298	!------------------------------------------------------------------------------!
	299
	300	USE arrays_3d
	301
	302	IMPLICIT NONE
	303
	304	INTEGER :: i, j, k, nnyh
	305
	306	REAL, DIMENSION(nxl_z:nxr_z,0:nz-1) :: ar1
	307	REAL, DIMENSION(5,nxl_z:nxr_z,0:nz-1) :: tri
	308
	309	#if defined( __parallel )
	310	REAL :: ar(nxl_z:nxr_za,nys_z:nyn_za,1:nza)
	311	#else
	312	REAL :: ar(1:nz,nys_z:nyn_z,nxl_z:nxr_z)
	313	#endif
	314
	315
	316	nnyh = (ny+1) / 2
	317
	318	!
	319	!-- Define constant elements of the tridiagonal matrix.
[683]	320	!$OMP PARALLEL PRIVATE ( k, i )
	321	!$OMP DO
[1]	322	DO k = 0, nz-1
	323	DO i = nxl_z, nxr_z
[667]	324	tri(2,i,k) = ddzu_pres(k+1) * ddzw(k+1)
	325	tri(3,i,k) = ddzu_pres(k+2) * ddzw(k+1)
[1]	326	ENDDO
	327	ENDDO
[683]	328	!$OMP END PARALLEL
[1]	329
	330	#if defined( __parallel )
	331	!
	332	!-- Repeat for all y-levels.
[683]	333	!$OMP PARALLEL FIRSTPRIVATE( tri ) PRIVATE ( ar1, j )
	334	!$OMP DO
[1]	335	DO j = nys_z, nyn_z
	336	IF ( j <= nnyh ) THEN
	337	CALL maketri( tri, j )
	338	ELSE
	339	CALL maketri( tri, ny+1-j )
	340	ENDIF
	341	CALL split( tri )
	342	CALL substi( ar, ar1, tri, j )
	343	ENDDO
[683]	344	!$OMP END PARALLEL
[1]	345	#else
	346	!
	347	!-- First y-level.
	348	CALL maketri( tri, nys_z )
	349	CALL split( tri )
	350	CALL substi( ar, ar1, tri, 0 )
	351
	352	!
	353	!-- Further y-levels.
	354	DO j = 1, nnyh - 1
	355	CALL maketri( tri, j )
	356	CALL split( tri )
	357	CALL substi( ar, ar1, tri, j )
	358	CALL substi( ar, ar1, tri, ny+1-j )
	359	ENDDO
	360	CALL maketri( tri, nnyh )
	361	CALL split( tri )
	362	CALL substi( ar, ar1, tri, nnyh+nys )
	363	#endif
	364
	365	CONTAINS
	366
	367	SUBROUTINE maketri( tri, j )
	368
	369	!------------------------------------------------------------------------------!
	370	! Computes the i- and j-dependent component of the matrix
	371	!------------------------------------------------------------------------------!
	372
	373	USE arrays_3d
	374	USE constants
	375	USE control_parameters
	376	USE grid_variables
	377
	378	IMPLICIT NONE
	379
	380	INTEGER :: i, j, k, nnxh
	381	REAL :: a, c
	382	REAL :: ll(nxl_z:nxr_z)
	383	REAL :: tri(5,nxl_z:nxr_z,0:nz-1)
	384
	385
	386	nnxh = ( nx + 1 ) / 2
	387
	388	!
	389	!-- Provide the tridiagonal matrix for solution of the Poisson equation in
	390	!-- Fourier space. The coefficients are computed following the method of
	391	!-- Schmidt et al. (DFVLR-Mitteilung 84-15), which departs from Stephan
	392	!-- Siano's original version by discretizing the Poisson equation,
	393	!-- before it is Fourier-transformed
	394	#if defined( __parallel )
	395	DO i = nxl_z, nxr_z
[128]	396	IF ( i >= 0 .AND. i <= nnxh ) THEN
[1]	397	ll(i) = 2.0 * ( 1.0 - COS( ( 2.0 * pi * i ) / &
	398	FLOAT( nx+1 ) ) ) / ( dx * dx ) + &
	399	2.0 * ( 1.0 - COS( ( 2.0 * pi * j ) / &
	400	FLOAT( ny+1 ) ) ) / ( dy * dy )
	401	ELSE
	402	ll(i) = 2.0 * ( 1.0 - COS( ( 2.0 * pi * ( nx+1-i ) ) / &
	403	FLOAT( nx+1 ) ) ) / ( dx * dx ) + &
	404	2.0 * ( 1.0 - COS( ( 2.0 * pi * j ) / &
	405	FLOAT( ny+1 ) ) ) / ( dy * dy )
	406	ENDIF
	407	DO k = 0,nz-1
[667]	408	a = -1.0 * ddzu_pres(k+2) * ddzw(k+1)
	409	c = -1.0 * ddzu_pres(k+1) * ddzw(k+1)
[1]	410	tri(1,i,k) = a + c - ll(i)
	411	ENDDO
	412	ENDDO
	413	#else
	414	DO i = 0, nnxh
	415	ll(i) = 2.0 * ( 1.0 - COS( ( 2.0 * pi * i ) / FLOAT( nx+1 ) ) ) / &
	416	( dx * dx ) + &
	417	2.0 * ( 1.0 - COS( ( 2.0 * pi * j ) / FLOAT( ny+1 ) ) ) / &
	418	( dy * dy )
	419	DO k = 0, nz-1
[667]	420	a = -1.0 * ddzu_pres(k+2) * ddzw(k+1)
	421	c = -1.0 * ddzu_pres(k+1) * ddzw(k+1)
[1]	422	tri(1,i,k) = a + c - ll(i)
	423	IF ( i >= 1 .and. i < nnxh ) THEN
	424	tri(1,nx+1-i,k) = tri(1,i,k)
	425	ENDIF
	426	ENDDO
	427	ENDDO
	428	#endif
	429	IF ( ibc_p_b == 1 .OR. ibc_p_b == 2 ) THEN
	430	DO i = nxl_z, nxr_z
	431	tri(1,i,0) = tri(1,i,0) + tri(2,i,0)
	432	ENDDO
	433	ENDIF
	434	IF ( ibc_p_t == 1 ) THEN
	435	DO i = nxl_z, nxr_z
	436	tri(1,i,nz-1) = tri(1,i,nz-1) + tri(3,i,nz-1)
	437	ENDDO
	438	ENDIF
	439
	440	END SUBROUTINE maketri
	441
	442
	443	SUBROUTINE substi( ar, ar1, tri, j )
	444
	445	!------------------------------------------------------------------------------!
	446	! Substitution (Forward and Backward) (Thomas algorithm)
	447	!------------------------------------------------------------------------------!
	448
[76]	449	USE control_parameters
	450
[1]	451	IMPLICIT NONE
	452
	453	INTEGER :: i, j, k
	454	REAL :: ar1(nxl_z:nxr_z,0:nz-1)
	455	REAL :: tri(5,nxl_z:nxr_z,0:nz-1)
	456	#if defined( __parallel )
	457	REAL :: ar(nxl_z:nxr_za,nys_z:nyn_za,1:nza)
	458	#else
	459	REAL :: ar(1:nz,nys_z:nyn_z,nxl_z:nxr_z)
	460	#endif
	461
	462	!
	463	!-- Forward substitution.
	464	DO i = nxl_z, nxr_z
	465	#if defined( __parallel )
	466	ar1(i,0) = ar(i,j,1)
	467	#else
	468	ar1(i,0) = ar(1,j,i)
	469	#endif
	470	ENDDO
	471	DO k = 1, nz - 1
	472	DO i = nxl_z, nxr_z
	473	#if defined( __parallel )
	474	ar1(i,k) = ar(i,j,k+1) - tri(5,i,k) * ar1(i,k-1)
	475	#else
	476	ar1(i,k) = ar(k+1,j,i) - tri(5,i,k) * ar1(i,k-1)
	477	#endif
	478	ENDDO
	479	ENDDO
	480
	481	!
[877]	482	!-- Backward substitution
	483	!-- Note, the 1.0E-20 in the denominator is due to avoid divisions
	484	!-- by zero appearing if the pressure bc is set to neumann at the top of
	485	!-- the model domain.
[1]	486	DO i = nxl_z, nxr_z
	487	#if defined( __parallel )
[877]	488	ar(i,j,nz) = ar1(i,nz-1) / ( tri(4,i,nz-1) + 1.0E-20 )
[1]	489	#else
[877]	490	ar(nz,j,i) = ar1(i,nz-1) / ( tri(4,i,nz-1) + 1.0E-20 )
[1]	491	#endif
	492	ENDDO
	493	DO k = nz-2, 0, -1
	494	DO i = nxl_z, nxr_z
	495	#if defined( __parallel )
	496	ar(i,j,k+1) = ( ar1(i,k) - tri(3,i,k) * ar(i,j,k+2) ) &
	497	/ tri(4,i,k)
	498	#else
	499	ar(k+1,j,i) = ( ar1(i,k) - tri(3,i,k) * ar(k+2,j,i) ) &
	500	/ tri(4,i,k)
	501	#endif
	502	ENDDO
	503	ENDDO
	504
[76]	505	!
	506	!-- Indices i=0, j=0 correspond to horizontally averaged pressure.
	507	!-- The respective values of ar should be zero at all k-levels if
	508	!-- acceleration of horizontally averaged vertical velocity is zero.
	509	IF ( ibc_p_b == 1 .AND. ibc_p_t == 1 ) THEN
	510	IF ( j == 0 .AND. nxl_z == 0 ) THEN
	511	#if defined( __parallel )
	512	DO k = 1, nz
	513	ar(nxl_z,j,k) = 0.0
	514	ENDDO
	515	#else
	516	DO k = 1, nz
	517	ar(k,j,nxl_z) = 0.0
	518	ENDDO
	519	#endif
	520	ENDIF
	521	ENDIF
	522
[1]	523	END SUBROUTINE substi
	524
	525
	526	SUBROUTINE split( tri )
	527
	528	!------------------------------------------------------------------------------!
	529	! Splitting of the tridiagonal matrix (Thomas algorithm)
	530	!------------------------------------------------------------------------------!
	531
	532	IMPLICIT NONE
	533
	534	INTEGER :: i, k
	535	REAL :: tri(5,nxl_z:nxr_z,0:nz-1)
	536
	537	!
	538	!-- Splitting.
	539	DO i = nxl_z, nxr_z
	540	tri(4,i,0) = tri(1,i,0)
	541	ENDDO
	542	DO k = 1, nz-1
	543	DO i = nxl_z, nxr_z
	544	tri(5,i,k) = tri(2,i,k) / tri(4,i,k-1)
	545	tri(4,i,k) = tri(1,i,k) - tri(3,i,k-1) * tri(5,i,k)
	546	ENDDO
	547	ENDDO
	548
	549	END SUBROUTINE split
	550
	551	END SUBROUTINE tridia
	552
	553
	554	#if defined( __parallel )
	555	SUBROUTINE fftxp( ar, direction )
	556
	557	!------------------------------------------------------------------------------!
	558	! Fourier-transformation along x-direction Parallelized version
	559	!------------------------------------------------------------------------------!
	560
	561	IMPLICIT NONE
	562
	563	CHARACTER (LEN=*) :: direction
	564	INTEGER :: j, k
	565	REAL :: ar(0:nxa,nys_x:nyn_xa,nzb_x:nzt_xa)
	566
	567	!
	568	!-- Performing the fft with one of the methods implemented
[683]	569	!$OMP PARALLEL PRIVATE ( j, k )
	570	!$OMP DO
[1]	571	DO k = nzb_x, nzt_x
	572	DO j = nys_x, nyn_x
	573	CALL fft_x( ar(0:nx,j,k), direction )
	574	ENDDO
	575	ENDDO
[683]	576	!$OMP END PARALLEL
[1]	577
	578	END SUBROUTINE fftxp
	579
	580	#else
	581	SUBROUTINE fftx( ar, direction )
	582
	583	!------------------------------------------------------------------------------!
	584	! Fourier-transformation along x-direction Non parallel version
	585	!------------------------------------------------------------------------------!
	586
	587	IMPLICIT NONE
	588
	589	CHARACTER (LEN=*) :: direction
	590	INTEGER :: i, j, k
	591	REAL :: ar(1:nz,0:ny,0:nx)
	592
	593	!
	594	!-- Performing the fft with one of the methods implemented
[683]	595	!$OMP PARALLEL PRIVATE ( j, k )
	596	!$OMP DO
[1]	597	DO k = 1, nz
	598	DO j = 0, ny
	599	CALL fft_x( ar(k,j,0:nx), direction )
	600	ENDDO
	601	ENDDO
[683]	602	!$OMP END PARALLEL
[1]	603
	604	END SUBROUTINE fftx
	605	#endif
	606
	607
	608	#if defined( __parallel )
	609	SUBROUTINE fftyp( ar, direction )
	610
	611	!------------------------------------------------------------------------------!
	612	! Fourier-transformation along y-direction Parallelized version
	613	!------------------------------------------------------------------------------!
	614
	615	IMPLICIT NONE
	616
	617	CHARACTER (LEN=*) :: direction
	618	INTEGER :: i, k
	619	REAL :: ar(0:nya,nxl_y:nxr_ya,nzb_y:nzt_ya)
	620
	621	!
	622	!-- Performing the fft with one of the methods implemented
[683]	623	!$OMP PARALLEL PRIVATE ( i, k )
	624	!$OMP DO
[1]	625	DO k = nzb_y, nzt_y
	626	DO i = nxl_y, nxr_y
	627	CALL fft_y( ar(0:ny,i,k), direction )
	628	ENDDO
	629	ENDDO
[683]	630	!$OMP END PARALLEL
[1]	631
	632	END SUBROUTINE fftyp
	633
	634	#else
	635	SUBROUTINE ffty( ar, direction )
	636
	637	!------------------------------------------------------------------------------!
	638	! Fourier-transformation along y-direction Non parallel version
	639	!------------------------------------------------------------------------------!
	640
	641	IMPLICIT NONE
	642
	643	CHARACTER (LEN=*) :: direction
	644	INTEGER :: i, k
	645	REAL :: ar(1:nz,0:ny,0:nx)
	646
	647	!
	648	!-- Performing the fft with one of the methods implemented
[683]	649	!$OMP PARALLEL PRIVATE ( i, k )
	650	!$OMP DO
[1]	651	DO k = 1, nz
	652	DO i = 0, nx
	653	CALL fft_y( ar(k,0:ny,i), direction )
	654	ENDDO
	655	ENDDO
[683]	656	!$OMP END PARALLEL
[1]	657
	658	END SUBROUTINE ffty
	659	#endif
	660
	661	#if defined( __parallel )
	662	SUBROUTINE ffty_tr_yx( f_in, work, f_out )
	663
	664	!------------------------------------------------------------------------------!
	665	! Fourier-transformation along y with subsequent transposition y --> x for
	666	! a 1d-decomposition along x
	667	!
	668	! ATTENTION: The performance of this routine is much faster on the NEC-SX6,
	669	! if the first index of work_ffty_vec is odd. Otherwise
	670	! memory bank conflicts may occur (especially if the index is a
	671	! multiple of 128). That's why work_ffty_vec is dimensioned as
	672	! 0:ny+1.
	673	! Of course, this will not work if users are using an odd number
	674	! of gridpoints along y.
	675	!------------------------------------------------------------------------------!
	676
	677	USE control_parameters
	678	USE cpulog
	679	USE indices
	680	USE interfaces
	681	USE pegrid
	682	USE transpose_indices
	683
	684	IMPLICIT NONE
	685
	686	INTEGER :: i, iend, iouter, ir, j, k
	687	INTEGER, PARAMETER :: stridex = 4
	688
	689	REAL, DIMENSION(0:ny,stridex) :: work_ffty
	690	#if defined( __nec )
	691	REAL, DIMENSION(0:ny+1,1:nz,nxl:nxr) :: work_ffty_vec
	692	#endif
	693	REAL, DIMENSION(1:nza,0:nya,nxl:nxra) :: f_in
	694	REAL, DIMENSION(nnx,1:nza,nys_x:nyn_xa,pdims(1)) :: f_out
	695	REAL, DIMENSION(nxl:nxra,1:nza,0:nya) :: work
	696
	697	!
	698	!-- Carry out the FFT along y, where all data are present due to the
	699	!-- 1d-decomposition along x. Resort the data in a way that x becomes
	700	!-- the first index.
	701	CALL cpu_log( log_point_s(7), 'fft_y', 'start' )
	702
	703	IF ( host(1:3) == 'nec' ) THEN
	704	#if defined( __nec )
	705	!
	706	!-- Code optimized for vector processors
[85]	707	!$OMP PARALLEL PRIVATE ( i, j, k )
[1]	708	!$OMP DO
	709	DO i = nxl, nxr
	710
	711	DO j = 0, ny
	712	DO k = 1, nz
	713	work_ffty_vec(j,k,i) = f_in(k,j,i)
	714	ENDDO
	715	ENDDO
	716
	717	CALL fft_y_m( work_ffty_vec(:,:,i), ny+1, 'forward' )
	718
	719	ENDDO
	720
	721	!$OMP DO
	722	DO k = 1, nz
	723	DO j = 0, ny
	724	DO i = nxl, nxr
	725	work(i,k,j) = work_ffty_vec(j,k,i)
	726	ENDDO
	727	ENDDO
	728	ENDDO
	729	!$OMP END PARALLEL
	730	#endif
	731
	732	ELSE
	733
	734	!
	735	!-- Cache optimized code.
	736	!-- The i-(x-)direction is split into a strided outer loop and an inner
	737	!-- loop for better cache performance
	738	!$OMP PARALLEL PRIVATE (i,iend,iouter,ir,j,k,work_ffty)
	739	!$OMP DO
	740	DO iouter = nxl, nxr, stridex
	741
	742	iend = MIN( iouter+stridex-1, nxr ) ! Upper bound for inner i loop
	743
	744	DO k = 1, nz
	745
	746	DO i = iouter, iend
	747
	748	ir = i-iouter+1 ! counter within a stride
	749	DO j = 0, ny
	750	work_ffty(j,ir) = f_in(k,j,i)
	751	ENDDO
	752	!
	753	!-- FFT along y
	754	CALL fft_y( work_ffty(:,ir), 'forward' )
	755
	756	ENDDO
	757
	758	!
	759	!-- Resort
	760	DO j = 0, ny
	761	DO i = iouter, iend
	762	work(i,k,j) = work_ffty(j,i-iouter+1)
	763	ENDDO
	764	ENDDO
	765
	766	ENDDO
	767
	768	ENDDO
	769	!$OMP END PARALLEL
	770
	771	ENDIF
	772	CALL cpu_log( log_point_s(7), 'fft_y', 'pause' )
	773
	774	!
	775	!-- Transpose array
	776	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
[622]	777	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
[1]	778	CALL MPI_ALLTOALL( work(nxl,1,0), sendrecvcount_xy, MPI_REAL, &
	779	f_out(1,1,nys_x,1), sendrecvcount_xy, MPI_REAL, &
	780	comm1dx, ierr )
	781	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
	782
	783	END SUBROUTINE ffty_tr_yx
	784
	785
	786	SUBROUTINE tr_xy_ffty( f_in, work, f_out )
	787
	788	!------------------------------------------------------------------------------!
	789	! Transposition x --> y with a subsequent backward Fourier transformation for
	790	! a 1d-decomposition along x
	791	!------------------------------------------------------------------------------!
	792
	793	USE control_parameters
	794	USE cpulog
	795	USE indices
	796	USE interfaces
	797	USE pegrid
	798	USE transpose_indices
	799
	800	IMPLICIT NONE
	801
	802	INTEGER :: i, iend, iouter, ir, j, k
	803	INTEGER, PARAMETER :: stridex = 4
	804
	805	REAL, DIMENSION(0:ny,stridex) :: work_ffty
	806	#if defined( __nec )
	807	REAL, DIMENSION(0:ny+1,1:nz,nxl:nxr) :: work_ffty_vec
	808	#endif
	809	REAL, DIMENSION(nnx,1:nza,nys_x:nyn_xa,pdims(1)) :: f_in
	810	REAL, DIMENSION(1:nza,0:nya,nxl:nxra) :: f_out
	811	REAL, DIMENSION(nxl:nxra,1:nza,0:nya) :: work
	812
	813	!
	814	!-- Transpose array
	815	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
[622]	816	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
[1]	817	CALL MPI_ALLTOALL( f_in(1,1,nys_x,1), sendrecvcount_xy, MPI_REAL, &
	818	work(nxl,1,0), sendrecvcount_xy, MPI_REAL, &
	819	comm1dx, ierr )
	820	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
	821
	822	!
	823	!-- Resort the data in a way that y becomes the first index and carry out the
	824	!-- backward fft along y.
	825	CALL cpu_log( log_point_s(7), 'fft_y', 'continue' )
	826
	827	IF ( host(1:3) == 'nec' ) THEN
	828	#if defined( __nec )
	829	!
	830	!-- Code optimized for vector processors
[85]	831	!$OMP PARALLEL PRIVATE ( i, j, k )
[1]	832	!$OMP DO
	833	DO k = 1, nz
	834	DO j = 0, ny
	835	DO i = nxl, nxr
	836	work_ffty_vec(j,k,i) = work(i,k,j)
	837	ENDDO
	838	ENDDO
	839	ENDDO
	840
	841	!$OMP DO
	842	DO i = nxl, nxr
	843
	844	CALL fft_y_m( work_ffty_vec(:,:,i), ny+1, 'backward' )
	845
	846	DO j = 0, ny
	847	DO k = 1, nz
	848	f_out(k,j,i) = work_ffty_vec(j,k,i)
	849	ENDDO
	850	ENDDO
	851
	852	ENDDO
	853	!$OMP END PARALLEL
	854	#endif
	855
	856	ELSE
	857
	858	!
	859	!-- Cache optimized code.
	860	!-- The i-(x-)direction is split into a strided outer loop and an inner
	861	!-- loop for better cache performance
	862	!$OMP PARALLEL PRIVATE ( i, iend, iouter, ir, j, k, work_ffty )
	863	!$OMP DO
	864	DO iouter = nxl, nxr, stridex
	865
	866	iend = MIN( iouter+stridex-1, nxr ) ! Upper bound for inner i loop
	867
	868	DO k = 1, nz
	869	!
	870	!-- Resort
	871	DO j = 0, ny
	872	DO i = iouter, iend
	873	work_ffty(j,i-iouter+1) = work(i,k,j)
	874	ENDDO
	875	ENDDO
	876
	877	DO i = iouter, iend
	878
	879	!
	880	!-- FFT along y
	881	ir = i-iouter+1 ! counter within a stride
	882	CALL fft_y( work_ffty(:,ir), 'backward' )
	883
	884	DO j = 0, ny
	885	f_out(k,j,i) = work_ffty(j,ir)
	886	ENDDO
	887	ENDDO
	888
	889	ENDDO
	890
	891	ENDDO
	892	!$OMP END PARALLEL
	893
	894	ENDIF
	895
	896	CALL cpu_log( log_point_s(7), 'fft_y', 'stop' )
	897
	898	END SUBROUTINE tr_xy_ffty
	899
	900
	901	SUBROUTINE fftx_tri_fftx( ar )
	902
	903	!------------------------------------------------------------------------------!
	904	! FFT along x, solution of the tridiagonal system and backward FFT for
	905	! a 1d-decomposition along x
	906	!
	907	! WARNING: this subroutine may still not work for hybrid parallelization
	908	! with OpenMP (for possible necessary changes see the original
	909	! routine poisfft_hybrid, developed by Klaus Ketelsen, May 2002)
	910	!------------------------------------------------------------------------------!
	911
	912	USE control_parameters
	913	USE cpulog
	914	USE grid_variables
	915	USE indices
	916	USE interfaces
	917	USE pegrid
	918	USE transpose_indices
	919
	920	IMPLICIT NONE
	921
	922	character(len=3) :: myth_char
	923
	924	INTEGER :: i, j, k, m, n, omp_get_thread_num, tn
	925
	926	REAL, DIMENSION(0:nx) :: work_fftx
	927	REAL, DIMENSION(0:nx,1:nz) :: work_trix
	928	REAL, DIMENSION(nnx,1:nza,nys_x:nyn_xa,pdims(1)) :: ar
	929	REAL, DIMENSION(:,:,:,:), ALLOCATABLE :: tri
	930
	931
	932	CALL cpu_log( log_point_s(33), 'fft_x + tridia', 'start' )
	933
	934	ALLOCATE( tri(5,0:nx,0:nz-1,0:threads_per_task-1) )
	935
	936	tn = 0 ! Default thread number in case of one thread
	937	!$OMP PARALLEL DO PRIVATE ( i, j, k, m, n, tn, work_fftx, work_trix )
	938	DO j = nys_x, nyn_x
	939
	940	!$ tn = omp_get_thread_num()
	941
	942	IF ( host(1:3) == 'nec' ) THEN
	943	!
	944	!-- Code optimized for vector processors
	945	DO k = 1, nz
	946
	947	m = 0
	948	DO n = 1, pdims(1)
	949	DO i = 1, nnx_pe( n-1 ) ! WARN: pcoord(i) should be used!!
	950	work_trix(m,k) = ar(i,k,j,n)
	951	m = m + 1
	952	ENDDO
	953	ENDDO
	954
	955	ENDDO
	956
	957	CALL fft_x_m( work_trix, 'forward' )
	958
	959	ELSE
	960	!
	961	!-- Cache optimized code
	962	DO k = 1, nz
	963
	964	m = 0
	965	DO n = 1, pdims(1)
	966	DO i = 1, nnx_pe( n-1 ) ! WARN: pcoord(i) should be used!!
	967	work_fftx(m) = ar(i,k,j,n)
	968	m = m + 1
	969	ENDDO
	970	ENDDO
	971
	972	CALL fft_x( work_fftx, 'forward' )
	973
	974	DO i = 0, nx
	975	work_trix(i,k) = work_fftx(i)
	976	ENDDO
	977
	978	ENDDO
	979
	980	ENDIF
	981
	982	!
	983	!-- Solve the linear equation system
	984	CALL tridia_1dd( ddx2, ddy2, nx, ny, j, work_trix, tri(:,:,:,tn) )
	985
	986	IF ( host(1:3) == 'nec' ) THEN
	987	!
	988	!-- Code optimized for vector processors
	989	CALL fft_x_m( work_trix, 'backward' )
	990
	991	DO k = 1, nz
	992
	993	m = 0
	994	DO n = 1, pdims(1)
	995	DO i = 1, nnx_pe( n-1 ) ! WARN: pcoord(i) should be used!!
	996	ar(i,k,j,n) = work_trix(m,k)
	997	m = m + 1
	998	ENDDO
	999	ENDDO
	1000
	1001	ENDDO
	1002
	1003	ELSE
	1004	!
	1005	!-- Cache optimized code
	1006	DO k = 1, nz
	1007
	1008	DO i = 0, nx
	1009	work_fftx(i) = work_trix(i,k)
	1010	ENDDO
	1011
	1012	CALL fft_x( work_fftx, 'backward' )
	1013
	1014	m = 0
	1015	DO n = 1, pdims(1)
	1016	DO i = 1, nnx_pe( n-1 ) ! WARN: pcoord(i) should be used!!
	1017	ar(i,k,j,n) = work_fftx(m)
	1018	m = m + 1
	1019	ENDDO
	1020	ENDDO
	1021
	1022	ENDDO
	1023
	1024	ENDIF
	1025
	1026	ENDDO
	1027
	1028	DEALLOCATE( tri )
	1029
	1030	CALL cpu_log( log_point_s(33), 'fft_x + tridia', 'stop' )
	1031
	1032	END SUBROUTINE fftx_tri_fftx
	1033
	1034
	1035	SUBROUTINE fftx_tr_xy( f_in, work, f_out )
	1036
	1037	!------------------------------------------------------------------------------!
	1038	! Fourier-transformation along x with subsequent transposition x --> y for
	1039	! a 1d-decomposition along y
	1040	!
	1041	! ATTENTION: The NEC-branch of this routine may significantly profit from
	1042	! further optimizations. So far, performance is much worse than
	1043	! for routine ffty_tr_yx (more than three times slower).
	1044	!------------------------------------------------------------------------------!
	1045
	1046	USE control_parameters
	1047	USE cpulog
	1048	USE indices
	1049	USE interfaces
	1050	USE pegrid
	1051	USE transpose_indices
	1052
	1053	IMPLICIT NONE
	1054
	1055	INTEGER :: i, j, k
	1056
	1057	REAL, DIMENSION(0:nx,1:nz,nys:nyn) :: work_fftx
	1058	REAL, DIMENSION(1:nza,nys:nyna,0:nxa) :: f_in
	1059	REAL, DIMENSION(nny,1:nza,nxl_y:nxr_ya,pdims(2)) :: f_out
	1060	REAL, DIMENSION(nys:nyna,1:nza,0:nxa) :: work
	1061
	1062	!
	1063	!-- Carry out the FFT along x, where all data are present due to the
	1064	!-- 1d-decomposition along y. Resort the data in a way that y becomes
	1065	!-- the first index.
	1066	CALL cpu_log( log_point_s(4), 'fft_x', 'start' )
	1067
	1068	IF ( host(1:3) == 'nec' ) THEN
	1069	!
	1070	!-- Code for vector processors
[85]	1071	!$OMP PARALLEL PRIVATE ( i, j, k )
[1]	1072	!$OMP DO
	1073	DO i = 0, nx
	1074
	1075	DO j = nys, nyn
	1076	DO k = 1, nz
	1077	work_fftx(i,k,j) = f_in(k,j,i)
	1078	ENDDO
	1079	ENDDO
	1080
	1081	ENDDO
	1082
	1083	!$OMP DO
	1084	DO j = nys, nyn
	1085
	1086	CALL fft_x_m( work_fftx(:,:,j), 'forward' )
	1087
	1088	DO k = 1, nz
	1089	DO i = 0, nx
	1090	work(j,k,i) = work_fftx(i,k,j)
	1091	ENDDO
	1092	ENDDO
	1093
	1094	ENDDO
	1095	!$OMP END PARALLEL
	1096
	1097	ELSE
	1098
	1099	!
	1100	!-- Cache optimized code (there might be still a potential for better
	1101	!-- optimization).
[696]	1102	!$OMP PARALLEL PRIVATE (i,j,k)
[1]	1103	!$OMP DO
	1104	DO i = 0, nx
	1105
	1106	DO j = nys, nyn
	1107	DO k = 1, nz
	1108	work_fftx(i,k,j) = f_in(k,j,i)
	1109	ENDDO
	1110	ENDDO
	1111
	1112	ENDDO
	1113
	1114	!$OMP DO
	1115	DO j = nys, nyn
	1116	DO k = 1, nz
	1117
	1118	CALL fft_x( work_fftx(0:nx,k,j), 'forward' )
	1119
	1120	DO i = 0, nx
	1121	work(j,k,i) = work_fftx(i,k,j)
	1122	ENDDO
	1123	ENDDO
	1124
	1125	ENDDO
	1126	!$OMP END PARALLEL
	1127
	1128	ENDIF
	1129	CALL cpu_log( log_point_s(4), 'fft_x', 'pause' )
	1130
	1131	!
	1132	!-- Transpose array
	1133	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
[622]	1134	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
[1]	1135	CALL MPI_ALLTOALL( work(nys,1,0), sendrecvcount_xy, MPI_REAL, &
	1136	f_out(1,1,nxl_y,1), sendrecvcount_xy, MPI_REAL, &
	1137	comm1dy, ierr )
	1138	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
	1139
	1140	END SUBROUTINE fftx_tr_xy
	1141
	1142
	SUBROUTINE tr_yx_fftx( f_in, work, f_out )

	!------------------------------------------------------------------------------!
	! Transposition y --> x with a subsequent backward Fourier transformation
	! along x.
	!
	! f_in  : send buffer handed to MPI_ALLTOALL
	! work  : receive buffer of MPI_ALLTOALL, read here in index order (j,k,i)
	! f_out : backward-transformed data, written in index order (k,j,i)
	!------------------------------------------------------------------------------!

	   USE control_parameters
	   USE cpulog
	   USE indices
	   USE interfaces
	   USE pegrid
	   USE transpose_indices

	   IMPLICIT NONE

	   INTEGER ::  i, j, k

	   LOGICAL ::  on_nec_host

	   REAL, DIMENSION(0:nx,1:nz,nys:nyn)               ::  work_fftx
	   REAL, DIMENSION(nny,1:nza,nxl_y:nxr_ya,pdims(2)) ::  f_in
	   REAL, DIMENSION(1:nza,nys:nyna,0:nxa)            ::  f_out
	   REAL, DIMENSION(nys:nyna,1:nza,0:nxa)            ::  work

	!
	!-- Transpose array (optionally synchronize all PEs first)
	   CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
	   IF ( collective_wait )  THEN
	      CALL MPI_BARRIER( comm2d, ierr )
	   ENDIF
	   CALL MPI_ALLTOALL( f_in(1,1,nxl_y,1), sendrecvcount_xy, MPI_REAL, &
	                      work(nys,1,0),     sendrecvcount_xy, MPI_REAL, &
	                      comm1dy, ierr )
	   CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )

	!
	!-- Resort the transposed data so that x becomes the first index, carry
	!-- out the backward FFT along x and finally store the result in f_out
	!-- with z as the first index.
	   CALL cpu_log( log_point_s(4), 'fft_x', 'continue' )

	!
	!-- The branch is the same for all threads, so every thread of the team
	!-- encounters the same worksharing loops inside the parallel region.
	   on_nec_host = ( host(1:3) == 'nec' )

	   !$OMP PARALLEL PRIVATE ( i, j, k )
	   IF ( on_nec_host )  THEN
	!
	!--    Code optimized for vector processors: one multi-FFT call per j
	      !$OMP DO
	      DO  j = nys, nyn

	         DO  k = 1, nz
	            work_fftx(0:nx,k,j) = work(j,k,0:nx)
	         ENDDO

	         CALL fft_x_m( work_fftx(:,:,j), 'backward' )

	      ENDDO

	   ELSE
	!
	!--    Cache optimized code (there might be still a potential for better
	!--    optimization): one FFT call per (k,j) line
	      !$OMP DO
	      DO  j = nys, nyn
	         DO  k = 1, nz

	            work_fftx(0:nx,k,j) = work(j,k,0:nx)

	            CALL fft_x( work_fftx(0:nx,k,j), 'backward' )

	         ENDDO
	      ENDDO

	   ENDIF

	!
	!-- Common to both branches: resort (i,k,j) --> (k,j,i). The implicit
	!-- barrier at the end of the preceding worksharing loop guarantees that
	!-- work_fftx is complete before it is read here.
	   !$OMP DO
	   DO  i = 0, nx
	      DO  j = nys, nyn
	         f_out(1:nz,j,i) = work_fftx(i,1:nz,j)
	      ENDDO
	   ENDDO
	   !$OMP END PARALLEL

	   CALL cpu_log( log_point_s(4), 'fft_x', 'stop' )

	END SUBROUTINE tr_yx_fftx
	1241
	1242
	SUBROUTINE ffty_tri_ffty( ar )

	!------------------------------------------------------------------------------!
	! FFT along y, solution of the tridiagonal system and backward FFT for
	! a 1d-decomposition along y
	!
	! ar is read and overwritten in place: for every i in nxl_y..nxr_y the
	! y-lines stored PE-block-wise in ar(:,k,i,:) are gathered, transformed,
	! passed through tridia_1dd, transformed back and scattered again.
	!
	! WARNING: this subroutine may still not work for hybrid parallelization
	!          with OpenMP (for possible necessary changes see the original
	!          routine poisfft_hybrid, developed by Klaus Ketelsen, May 2002)
	!------------------------------------------------------------------------------!

	   USE control_parameters
	   USE cpulog
	   USE grid_variables
	   USE indices
	   USE interfaces
	   USE pegrid
	   USE transpose_indices

	   IMPLICIT NONE

	   INTEGER ::  i, k, m, n, nyc, omp_get_thread_num, tn

	   LOGICAL ::  on_nec_host

	   REAL, DIMENSION(0:ny)                            ::  work_ffty
	   REAL, DIMENSION(0:ny,1:nz)                       ::  work_triy
	   REAL, DIMENSION(nny,1:nza,nxl_y:nxr_ya,pdims(2)) ::  ar
	   REAL, DIMENSION(:,:,:,:), ALLOCATABLE            ::  tri


	   CALL cpu_log( log_point_s(39), 'fft_y + tridia', 'start' )

	!
	!-- One slice of tri per thread (last dimension is the thread number)
	   ALLOCATE( tri(5,0:ny,0:nz-1,0:threads_per_task-1) )

	   on_nec_host = ( host(1:3) == 'nec' )

	   tn = 0           ! Default thread number in case of one thread
	   !$OMP PARALLEL DO PRIVATE ( i, k, m, n, nyc, tn, work_ffty, work_triy )
	   DO  i = nxl_y, nxr_y

	!
	!--    Only active if compiled with OpenMP (conditional compilation)
	      !$ tn = omp_get_thread_num()

	!
	!--    Gather the y-lines from the PE blocks of ar and transform forward.
	!--    nyc is the number of y-points taken from block n
	!--    (WARN: pcoord(j) should be used instead of n-1!!)
	      IF ( on_nec_host )  THEN
	!
	!--       Code optimized for vector processors
	         DO  k = 1, nz

	            m = 0
	            DO  n = 1, pdims(2)
	               nyc = nny_pe( n-1 )
	               work_triy(m:m+nyc-1,k) = ar(1:nyc,k,i,n)
	               m = m + nyc
	            ENDDO

	         ENDDO

	         CALL fft_y_m( work_triy, ny, 'forward' )

	      ELSE
	!
	!--       Cache optimized code
	         DO  k = 1, nz

	            m = 0
	            DO  n = 1, pdims(2)
	               nyc = nny_pe( n-1 )
	               work_ffty(m:m+nyc-1) = ar(1:nyc,k,i,n)
	               m = m + nyc
	            ENDDO

	            CALL fft_y( work_ffty, 'forward' )

	            work_triy(0:ny,k) = work_ffty(0:ny)

	         ENDDO

	      ENDIF

	!
	!--    Solve the linear equation system (x and y arguments are passed in
	!--    swapped order compared to the dummy argument names of tridia_1dd)
	      CALL tridia_1dd( ddy2, ddx2, ny, nx, i, work_triy, tri(:,:,:,tn) )

	!
	!--    Transform backward and scatter the y-lines back into the PE blocks
	!--    (WARN: pcoord(j) should be used instead of n-1!!)
	      IF ( on_nec_host )  THEN
	!
	!--       Code optimized for vector processors
	         CALL fft_y_m( work_triy, ny, 'backward' )

	         DO  k = 1, nz

	            m = 0
	            DO  n = 1, pdims(2)
	               nyc = nny_pe( n-1 )
	               ar(1:nyc,k,i,n) = work_triy(m:m+nyc-1,k)
	               m = m + nyc
	            ENDDO

	         ENDDO

	      ELSE
	!
	!--       Cache optimized code
	         DO  k = 1, nz

	            work_ffty(0:ny) = work_triy(0:ny,k)

	            CALL fft_y( work_ffty, 'backward' )

	            m = 0
	            DO  n = 1, pdims(2)
	               nyc = nny_pe( n-1 )
	               ar(1:nyc,k,i,n) = work_ffty(m:m+nyc-1)
	               m = m + nyc
	            ENDDO

	         ENDDO

	      ENDIF

	   ENDDO

	   DEALLOCATE( tri )

	   CALL cpu_log( log_point_s(39), 'fft_y + tridia', 'stop' )

	END SUBROUTINE ffty_tri_ffty
	1373
	1374
	SUBROUTINE tridia_1dd( ddx2, ddy2, nx, ny, j, ar, tri )

	!------------------------------------------------------------------------------!
	! Solves the linear system of equations for a 1d-decomposition along x (see
	! tridia)
	!
	! Arguments:
	!   ddx2, ddy2 : factors multiplying the i- and j-dependent terms of the
	!                matrix diagonal (see maketri_1dd)
	!   nx, ny     : upper index bounds of the two horizontal directions
	!   j          : index along the second horizontal direction
	!   ar         : right-hand side on input, solution on output
	!   tri        : work array holding the matrix:
	!                tri(1,:,:) diagonal, tri(2,:,:) / tri(3,:,:) off-diagonal
	!                coefficients, tri(4,:,:) / tri(5,:,:) results of the
	!                splitting (see split_1dd)
	!
	! Attention: when using the intel compiler, array tri must be passed as an
	!            argument to the contained subroutines. Otherwise address faults
	!            will occur.
	!            On NEC, tri should not be passed (except for routine substi_1dd)
	!            because this causes very bad performance.
	!------------------------------------------------------------------------------!

	   USE arrays_3d
	   USE control_parameters

	   USE pegrid

	   IMPLICIT NONE

	   INTEGER, INTENT(IN) ::  j, nx, ny

	   INTEGER ::  i, jm, k, nnyh

	   REAL, INTENT(IN) ::  ddx2, ddy2

	   REAL, DIMENSION(0:nx,1:nz),     INTENT(INOUT) ::  ar
	   REAL, DIMENSION(5,0:nx,0:nz-1), INTENT(INOUT) ::  tri


	   nnyh = ( ny + 1 ) / 2

	!
	!-- Define constant elements of the tridiagonal matrix.
	!-- The compiler on SX6 does loop exchange. If 0:nx is a high power of 2,
	!-- the exchanged loops create bank conflicts. The following directive
	!-- prohibits loop exchange and the loops perform much better.
!CDIR NOLOOPCHG
	   DO  k = 0, nz-1
	      DO  i = 0,nx
	         tri(2,i,k) = ddzu_pres(k+1) * ddzw(k+1)
	         tri(3,i,k) = ddzu_pres(k+2) * ddzw(k+1)
	      ENDDO
	   ENDDO

	!
	!-- Indices above nnyh are mirrored (ny+1-j) before the j-dependent part
	!-- of the matrix is computed.
	   IF ( j <= nnyh )  THEN
	      jm = j
	   ELSE
	      jm = ny + 1 - j
	   ENDIF

#if defined( __lc )
	   CALL maketri_1dd( jm, tri )
	   CALL split_1dd( tri )
#else
	   CALL maketri_1dd( jm )
	   CALL split_1dd
#endif
	   CALL substi_1dd( ar, tri )

	CONTAINS

#if defined( __lc )
	   SUBROUTINE maketri_1dd( j, tri )
#else
	   SUBROUTINE maketri_1dd( j )
#endif

	!------------------------------------------------------------------------------!
	! computes the i- and j-dependent component of the matrix
	! (j is the already mirrored index passed by tridia_1dd)
	!------------------------------------------------------------------------------!

	      USE constants

	      IMPLICIT NONE

	      INTEGER, INTENT(IN) ::  j

	      INTEGER ::  i, im, k, nnxh
	      REAL    ::  a, c

	      REAL, DIMENSION(0:nx) ::  l

#if defined( __lc )
	      REAL, DIMENSION(5,0:nx,0:nz-1), INTENT(INOUT) ::  tri
#endif


	      nnxh = ( nx + 1 ) / 2
	!
	!--   Provide the tridiagonal matrix for solution of the Poisson equation in
	!--   Fourier space. The coefficients are computed following the method of
	!--   Schmidt et al. (DFVLR-Mitteilung 84-15), which departs from Stephan
	!--   Siano's original version by discretizing the Poisson equation,
	!--   before it is Fourier-transformed
	      DO  i = 0, nx
	!
	!--      Indices above nnxh are mirrored (nx+1-i)
	         IF ( i <= nnxh )  THEN
	            im = i
	         ELSE
	            im = nx + 1 - i
	         ENDIF
	         l(i) = 2.0 * ( 1.0 - COS( ( 2.0 * pi * im ) / &
	                                   FLOAT( nx+1 ) ) ) * ddx2 + &
	                2.0 * ( 1.0 - COS( ( 2.0 * pi * j ) / &
	                                   FLOAT( ny+1 ) ) ) * ddy2
	      ENDDO

	!
	!--   a and c depend on k only, so they are computed once per level
	      DO  k = 0, nz-1
	         a = -1.0 * ddzu_pres(k+2) * ddzw(k+1)
	         c = -1.0 * ddzu_pres(k+1) * ddzw(k+1)
	         DO  i = 0, nx
	            tri(1,i,k) = a + c - l(i)
	         ENDDO
	      ENDDO

	!
	!--   Modify the diagonal at the bottom (ibc_p_b = 1 or 2) and at the top
	!--   (ibc_p_t = 1) according to the pressure boundary condition
	      IF ( ibc_p_b == 1  .OR.  ibc_p_b == 2 )  THEN
	         DO  i = 0, nx
	            tri(1,i,0) = tri(1,i,0) + tri(2,i,0)
	         ENDDO
	      ENDIF
	      IF ( ibc_p_t == 1 )  THEN
	         DO  i = 0, nx
	            tri(1,i,nz-1) = tri(1,i,nz-1) + tri(3,i,nz-1)
	         ENDDO
	      ENDIF

	   END SUBROUTINE maketri_1dd


#if defined( __lc )
	   SUBROUTINE split_1dd( tri )
#else
	   SUBROUTINE split_1dd
#endif

	!------------------------------------------------------------------------------!
	! Splitting of the tridiagonal matrix (Thomas algorithm)
	! Fills tri(4,:,:) and tri(5,:,1:nz-1) from tri(1:3,:,:)
	!------------------------------------------------------------------------------!

	      IMPLICIT NONE

	      INTEGER ::  i, k

#if defined( __lc )
	      REAL, DIMENSION(5,0:nx,0:nz-1), INTENT(INOUT) ::  tri
#endif


	!
	!--   Splitting
	      DO  i = 0, nx
	         tri(4,i,0) = tri(1,i,0)
	      ENDDO
	      DO  k = 1, nz-1
	         DO  i = 0, nx
	            tri(5,i,k) = tri(2,i,k) / tri(4,i,k-1)
	            tri(4,i,k) = tri(1,i,k) - tri(3,i,k-1) * tri(5,i,k)
	         ENDDO
	      ENDDO

	   END SUBROUTINE split_1dd


	   SUBROUTINE substi_1dd( ar, tri )

	!------------------------------------------------------------------------------!
	! Substitution (Forward and Backward) (Thomas algorithm)
	! ar holds the right-hand side on input and the solution on output
	!------------------------------------------------------------------------------!

	      IMPLICIT NONE

	      INTEGER ::  i, k

	      REAL, DIMENSION(0:nx,nz),       INTENT(INOUT) ::  ar
	      REAL, DIMENSION(0:nx,0:nz-1)                  ::  ar1
	      REAL, DIMENSION(5,0:nx,0:nz-1), INTENT(IN)    ::  tri

	!
	!--   Forward substitution
	      DO  i = 0, nx
	         ar1(i,0) = ar(i,1)
	      ENDDO
	      DO  k = 1, nz-1
	         DO  i = 0, nx
	            ar1(i,k) = ar(i,k+1) - tri(5,i,k) * ar1(i,k-1)
	         ENDDO
	      ENDDO

	!
	!--   Backward substitution
	!--   Note, the add of 1.0E-20 in the denominator is due to avoid divisions
	!--   by zero appearing if the pressure bc is set to neumann at the top of
	!--   the model domain.
	      DO  i = 0, nx
	         ar(i,nz) = ar1(i,nz-1) / ( tri(4,i,nz-1) + 1.0E-20 )
	      ENDDO
	      DO  k = nz-2, 0, -1
	         DO  i = 0, nx
	            ar(i,k+1) = ( ar1(i,k) - tri(3,i,k) * ar(i,k+2) ) &
	                        / tri(4,i,k)
	         ENDDO
	      ENDDO

	!
	!--   Indices i=0, j=0 correspond to horizontally averaged pressure.
	!--   The respective values of ar should be zero at all k-levels if
	!--   acceleration of horizontally averaged vertical velocity is zero.
	!--   (j is the host-associated dummy argument of tridia_1dd)
	      IF ( ibc_p_b == 1  .AND.  ibc_p_t == 1 )  THEN
	         IF ( j == 0 )  THEN
	            DO  k = 1, nz
	               ar(0,k) = 0.0
	            ENDDO
	         ENDIF
	      ENDIF

	   END SUBROUTINE substi_1dd

	END SUBROUTINE tridia_1dd
	1601
	1602	#endif
[807]	1603	#endif
[1]	1604	END MODULE poisfft_mod

Note: See TracBrowser for help on using the repository browser.

Download in other formats:

| Impressum | ©Leibniz Universität Hannover |