Home

Context Navigation

source: palm/trunk/SOURCE/poisfft.f90 @ 1096

Last change on this file since 1096 was 1093, checked in by raasch, 12 years ago
last commit documented
Property svn:keywords set to `Id`
File size: 47.7 KB

Rev	Line
[1]	1	MODULE poisfft_mod
	2
[1036]	3	!--------------------------------------------------------------------------------!
	4	! This file is part of PALM.
	5	!
	6	! PALM is free software: you can redistribute it and/or modify it under the terms
	7	! of the GNU General Public License as published by the Free Software Foundation,
	8	! either version 3 of the License, or (at your option) any later version.
	9	!
	10	! PALM is distributed in the hope that it will be useful, but WITHOUT ANY
	11	! WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
	12	! A PARTICULAR PURPOSE. See the GNU General Public License for more details.
	13	!
	14	! You should have received a copy of the GNU General Public License along with
	15	! PALM. If not, see <http://www.gnu.org/licenses/>.
	16	!
	17	! Copyright 1997-2012 Leibniz University Hannover
	18	!--------------------------------------------------------------------------------!
	19	!
[484]	20	! Current revisions:
[1]	21	! -----------------
[1093]	22	!
[1]	23	!
	24	! Former revisions:
	25	! -----------------
[3]	26	! $Id: poisfft.f90 1093 2013-02-02 12:58:49Z raasch $
[77]	27	!
[1093]	28	! 1092 2013-02-02 11:24:22Z raasch
	29	! unused variables removed
	30	!
[1037]	31	! 1036 2012-10-22 13:43:42Z raasch
	32	! code put under GPL (PALM 3.9)
	33	!
[1014]	34	! 1013 2012-09-21 07:03:55Z raasch
	35	! FLOAT type conversion replaced by REAL
	36	!
[1004]	37	! 1003 2012-09-14 14:35:53Z raasch
	38	! indices nxa, nya, etc. replaced by nx, ny, etc.
	39	!
[941]	40	! 940 2012-07-09 14:31:00Z raasch
	41	! special handling of tri-array as an argument in tridia_1dd routines switched
	42	! off because it caused segmentation faults with intel 12.1 compiler
	43	!
[878]	44	! 877 2012-04-03 11:21:44Z suehring
	45	! Bugfix: Avoid divisions by zero in case of using a 'neumann' bc for the
	46	! pressure at the top of the model domain.
	47	!
[810]	48	! 809 2012-01-30 13:32:58Z maronga
	49	! Bugfix: replaced .AND. and .NOT. with && and ! in the preprocessor directives
	50	!
[808]	51	! 807 2012-01-25 11:53:51Z maronga
	52	! New cpp directive "__check" implemented which is used by check_namelist_files
	53	! (most of the code is unneeded by check_namelist_files).
	54	!
[764]	55	! 763 2011-10-06 09:32:09Z suehring
	56	! Comment added concerning the last change.
	57	!
[762]	58	! 761 2011-10-05 17:58:52Z suehring
	59	! Bugfix: Avoid divisions by zero in case of using a 'neumann' bc for the
	60	! pressure at the top of the model domain.
	61	!
[697]	62	! 696 2011-03-18 07:03:49Z raasch
	63	! work_fftx removed from PRIVATE clauses in fftx_tr_xy and tr_yx_fftx
	64	!
[684]	65	! 683 2011-02-09 14:25:15Z raasch
	66	! openMP parallelization for 2d-domain-decomposition
	67	!
[668]	68	! 667 2010-12-23 12:06:00Z suehring/gryschka
	69	! ddzu replaced by ddzu_pres due to changes in zu(0)
	70	!
[623]	71	! 622 2010-12-10 08:08:13Z raasch
	72	! optional barriers included in order to speed up collective operations
	73	!
[392]	74	! 377 2009-09-04 11:09:00Z raasch
	75	! __lcmuk changed to __lc to avoid problems with Intel compiler on sgi-ice
	76	!
[198]	77	! 164 2008-05-15 08:46:15Z raasch
	78	! Arguments removed from transpose routines
	79	!
[139]	80	! 128 2007-10-26 13:11:14Z raasch
	81	! Bugfix: wavenumber calculation for even nx in routines maketri
	82	!
[90]	83	! 85 2007-05-11 09:35:14Z raasch
	84	! Bugfix: work_fft*_vec removed from some PRIVATE-declarations
	85	!
[77]	86	! 76 2007-03-29 00:58:32Z raasch
	87	! Tridiagonal coefficients adjusted for Neumann boundary conditions both at
	88	! the bottom and the top.
	89	!
[3]	90	! RCS Log replaced by Id keyword, revision history cleaned up
	91	!
[1]	92	! Revision 1.24 2006/08/04 15:00:24 raasch
	93	! Default setting of the thread number tn in case of not using OpenMP
	94	!
	95	! Revision 1.23 2006/02/23 12:48:38 raasch
	96	! Additional compiler directive in routine tridia_1dd for preventing loop
	97	! exchange on NEC-SX6
	98	!
	99	! Revision 1.20 2004/04/30 12:38:09 raasch
	100	! Parts of former poisfft_hybrid moved to this subroutine,
	101	! former subroutine changed to a module, renaming of FFT-subroutines and
	102	! -module, FFTs completely substituted by calls of fft_x and fft_y,
	103	! NAG fft used in the non-parallel case completely removed, l in maketri
	104	! is now a 1d-array, variables passed by modules instead of using parameter
	105	! lists, enlarged transposition arrays introduced
	106	!
	107	! Revision 1.1 1997/07/24 11:24:14 raasch
	108	! Initial revision
	109	!
	110	!
	111	! Description:
	112	! ------------
	113	! See below.
	114	!------------------------------------------------------------------------------!
	115
	116	!--------------------------------------------------------------------------!
	117	! poisfft !
	118	! !
	119	! Original version: Stephan Siano (pois3d) !
	120	! !
	121	! Institute of Meteorology and Climatology, University of Hannover !
	122	! Germany !
	123	! !
	124	! Version as of July 23,1996 !
	125	! !
	126	! !
	127	! Version for parallel computers: Siegfried Raasch !
	128	! !
	129	! Version as of July 03,1997 !
	130	! !
	131	! Solves the Poisson equation with a 2D spectral method !
	132	! d^2 p / dx^2 + d^2 p / dy^2 + d^2 p / dz^2 = s !
	133	! !
	134	! Input: !
	135	! real ar contains in the (nnx,nny,nnz) elements, !
	136	! starting from the element (1,nys,nxl), the !
	137	! values for s !
	138	! real work Temporary array !
	139	! !
	140	! Output: !
	141	! real ar contains the solution for p !
	142	!--------------------------------------------------------------------------!
	143
	144	USE fft_xy
	145	USE indices
	146	USE transpose_indices
	147
	148	IMPLICIT NONE
	149
	150	PRIVATE
[807]	151
[809]	152	#if ! defined ( __check )
[1]	153	PUBLIC poisfft, poisfft_init
	154
	155	INTERFACE poisfft
	156	MODULE PROCEDURE poisfft
	157	END INTERFACE poisfft
	158
	159	INTERFACE poisfft_init
	160	MODULE PROCEDURE poisfft_init
	161	END INTERFACE poisfft_init
[807]	162	#else
	163	PUBLIC poisfft_init
[1]	164
[807]	165	INTERFACE poisfft_init
	166	MODULE PROCEDURE poisfft_init
	167	END INTERFACE poisfft_init
	168	#endif
	169
[1]	170	CONTAINS
	171
	172	SUBROUTINE poisfft_init
	173
	174	CALL fft_init
	175
	176	END SUBROUTINE poisfft_init
	177
[809]	178	#if ! defined ( __check )
[1]	179	SUBROUTINE poisfft( ar, work )
	180
	181	USE cpulog
	182	USE interfaces
	183	USE pegrid
	184
	185	IMPLICIT NONE
	186
[1003]	187	REAL, DIMENSION(1:nz,nys:nyn,nxl:nxr) :: ar, work
[1]	188
	189
	190	CALL cpu_log( log_point_s(3), 'poisfft', 'start' )
	191
	192	!
	193	!-- Two-dimensional Fourier Transformation in x- and y-direction.
	194	#if defined( __parallel )
	195	IF ( pdims(2) == 1 ) THEN
	196
	197	!
	198	!-- 1d-domain-decomposition along x:
	199	!-- FFT along y and transposition y --> x
	200	CALL ffty_tr_yx( ar, work, ar )
	201
	202	!
	203	!-- FFT along x, solving the tridiagonal system and backward FFT
	204	CALL fftx_tri_fftx( ar )
	205
	206	!
	207	!-- Transposition x --> y and backward FFT along y
	208	CALL tr_xy_ffty( ar, work, ar )
	209
	210	ELSEIF ( pdims(1) == 1 ) THEN
	211
	212	!
	213	!-- 1d-domain-decomposition along y:
	214	!-- FFT along x and transposition x --> y
	215	CALL fftx_tr_xy( ar, work, ar )
	216
	217	!
	218	!-- FFT along y, solving the tridiagonal system and backward FFT
	219	CALL ffty_tri_ffty( ar )
	220
	221	!
	222	!-- Transposition y --> x and backward FFT along x
	223	CALL tr_yx_fftx( ar, work, ar )
	224
	225	ELSE
	226
	227	!
	228	!-- 2d-domain-decomposition
	229	!-- Transposition z --> x
	230	CALL cpu_log( log_point_s(5), 'transpo forward', 'start' )
[164]	231	CALL transpose_zx( ar, work, ar )
[1]	232	CALL cpu_log( log_point_s(5), 'transpo forward', 'pause' )
	233
	234	CALL cpu_log( log_point_s(4), 'fft_x', 'start' )
	235	CALL fftxp( ar, 'forward' )
	236	CALL cpu_log( log_point_s(4), 'fft_x', 'pause' )
	237
	238	!
	239	!-- Transposition x --> y
	240	CALL cpu_log( log_point_s(5), 'transpo forward', 'continue' )
[164]	241	CALL transpose_xy( ar, work, ar )
[1]	242	CALL cpu_log( log_point_s(5), 'transpo forward', 'pause' )
	243
	244	CALL cpu_log( log_point_s(7), 'fft_y', 'start' )
	245	CALL fftyp( ar, 'forward' )
	246	CALL cpu_log( log_point_s(7), 'fft_y', 'pause' )
	247
	248	!
	249	!-- Transposition y --> z
	250	CALL cpu_log( log_point_s(5), 'transpo forward', 'continue' )
[164]	251	CALL transpose_yz( ar, work, ar )
[1]	252	CALL cpu_log( log_point_s(5), 'transpo forward', 'stop' )
	253
	254	!
	255	!-- Solve the Poisson equation in z-direction in cartesian space.
	256	CALL cpu_log( log_point_s(6), 'tridia', 'start' )
	257	CALL tridia( ar )
	258	CALL cpu_log( log_point_s(6), 'tridia', 'stop' )
	259
	260	!
	261	!-- Inverse Fourier Transformation
	262	!-- Transposition z --> y
	263	CALL cpu_log( log_point_s(8), 'transpo invers', 'start' )
[164]	264	CALL transpose_zy( ar, work, ar )
[1]	265	CALL cpu_log( log_point_s(8), 'transpo invers', 'pause' )
	266
	267	CALL cpu_log( log_point_s(7), 'fft_y', 'continue' )
	268	CALL fftyp( ar, 'backward' )
	269	CALL cpu_log( log_point_s(7), 'fft_y', 'stop' )
	270
	271	!
	272	!-- Transposition y --> x
	273	CALL cpu_log( log_point_s(8), 'transpo invers', 'continue' )
[164]	274	CALL transpose_yx( ar, work, ar )
[1]	275	CALL cpu_log( log_point_s(8), 'transpo invers', 'pause' )
	276
	277	CALL cpu_log( log_point_s(4), 'fft_x', 'continue' )
	278	CALL fftxp( ar, 'backward' )
	279	CALL cpu_log( log_point_s(4), 'fft_x', 'stop' )
	280
	281	!
	282	!-- Transposition x --> z
	283	CALL cpu_log( log_point_s(8), 'transpo invers', 'continue' )
[164]	284	CALL transpose_xz( ar, work, ar )
[1]	285	CALL cpu_log( log_point_s(8), 'transpo invers', 'stop' )
	286
	287	ENDIF
	288
	289	#else
	290
	291	!
	292	!-- Two-dimensional Fourier Transformation along x- and y-direction.
	293	CALL cpu_log( log_point_s(4), 'fft_x', 'start' )
	294	CALL fftx( ar, 'forward' )
	295	CALL cpu_log( log_point_s(4), 'fft_x', 'pause' )
	296	CALL cpu_log( log_point_s(7), 'fft_y', 'start' )
	297	CALL ffty( ar, 'forward' )
	298	CALL cpu_log( log_point_s(7), 'fft_y', 'pause' )
	299
	300	!
	301	!-- Solve the Poisson equation in z-direction in cartesian space.
	302	CALL cpu_log( log_point_s(6), 'tridia', 'start' )
	303	CALL tridia( ar )
	304	CALL cpu_log( log_point_s(6), 'tridia', 'stop' )
	305
	306	!
	307	!-- Inverse Fourier Transformation.
	308	CALL cpu_log( log_point_s(7), 'fft_y', 'continue' )
	309	CALL ffty( ar, 'backward' )
	310	CALL cpu_log( log_point_s(7), 'fft_y', 'stop' )
	311	CALL cpu_log( log_point_s(4), 'fft_x', 'continue' )
	312	CALL fftx( ar, 'backward' )
	313	CALL cpu_log( log_point_s(4), 'fft_x', 'stop' )
	314
	315	#endif
	316
	317	CALL cpu_log( log_point_s(3), 'poisfft', 'stop' )
	318
	319	END SUBROUTINE poisfft
	320
	321
	322
	323	SUBROUTINE tridia( ar )
	324
	325	!------------------------------------------------------------------------------!
	326	! solves the linear system of equations:
	327	!
	328	! -(4 pi^2(i^2/(dx^2nnx^2)+j^2/(dy^2nny^2))+
	329	! 1/(dzu(k)dzw(k))+1/(dzu(k-1)dzw(k)))*p(i,j,k)+
	330	! 1/(dzu(k)dzw(k))p(i,j,k+1)+1/(dzu(k-1)dzw(k))p(i,j,k-1)=d(i,j,k)
	331	!
	332	! by using the Thomas algorithm
	333	!------------------------------------------------------------------------------!
	334
	335	USE arrays_3d
	336
	337	IMPLICIT NONE
	338
	339	INTEGER :: i, j, k, nnyh
	340
	341	REAL, DIMENSION(nxl_z:nxr_z,0:nz-1) :: ar1
	342	REAL, DIMENSION(5,nxl_z:nxr_z,0:nz-1) :: tri
	343
	344	#if defined( __parallel )
[1003]	345	REAL :: ar(nxl_z:nxr_z,nys_z:nyn_z,1:nz)
[1]	346	#else
	347	REAL :: ar(1:nz,nys_z:nyn_z,nxl_z:nxr_z)
	348	#endif
	349
	350
	351	nnyh = (ny+1) / 2
	352
	353	!
	354	!-- Define constant elements of the tridiagonal matrix.
[683]	355	!$OMP PARALLEL PRIVATE ( k, i )
	356	!$OMP DO
[1]	357	DO k = 0, nz-1
	358	DO i = nxl_z, nxr_z
[667]	359	tri(2,i,k) = ddzu_pres(k+1) * ddzw(k+1)
	360	tri(3,i,k) = ddzu_pres(k+2) * ddzw(k+1)
[1]	361	ENDDO
	362	ENDDO
[683]	363	!$OMP END PARALLEL
[1]	364
	365	#if defined( __parallel )
	366	!
	367	!-- Repeat for all y-levels.
[683]	368	!$OMP PARALLEL FIRSTPRIVATE( tri ) PRIVATE ( ar1, j )
	369	!$OMP DO
[1]	370	DO j = nys_z, nyn_z
	371	IF ( j <= nnyh ) THEN
	372	CALL maketri( tri, j )
	373	ELSE
	374	CALL maketri( tri, ny+1-j )
	375	ENDIF
	376	CALL split( tri )
	377	CALL substi( ar, ar1, tri, j )
	378	ENDDO
[683]	379	!$OMP END PARALLEL
[1]	380	#else
	381	!
	382	!-- First y-level.
	383	CALL maketri( tri, nys_z )
	384	CALL split( tri )
	385	CALL substi( ar, ar1, tri, 0 )
	386
	387	!
	388	!-- Further y-levels.
	389	DO j = 1, nnyh - 1
	390	CALL maketri( tri, j )
	391	CALL split( tri )
	392	CALL substi( ar, ar1, tri, j )
	393	CALL substi( ar, ar1, tri, ny+1-j )
	394	ENDDO
	395	CALL maketri( tri, nnyh )
	396	CALL split( tri )
	397	CALL substi( ar, ar1, tri, nnyh+nys )
	398	#endif
	399
	400	CONTAINS
	401
	402	SUBROUTINE maketri( tri, j )
	403
	404	!------------------------------------------------------------------------------!
	405	! Computes the i- and j-dependent component of the matrix
	406	!------------------------------------------------------------------------------!
	407
	408	USE arrays_3d
	409	USE constants
	410	USE control_parameters
	411	USE grid_variables
	412
	413	IMPLICIT NONE
	414
	415	INTEGER :: i, j, k, nnxh
	416	REAL :: a, c
	417	REAL :: ll(nxl_z:nxr_z)
	418	REAL :: tri(5,nxl_z:nxr_z,0:nz-1)
	419
	420
	421	nnxh = ( nx + 1 ) / 2
	422
	423	!
	424	!-- Provide the tridiagonal matrix for solution of the Poisson equation in
	425	!-- Fourier space. The coefficients are computed following the method of
	426	!-- Schmidt et al. (DFVLR-Mitteilung 84-15), which departs from Stephan
	427	!-- Siano's original version by discretizing the Poisson equation,
	428	!-- before it is Fourier-transformed
	429	#if defined( __parallel )
	430	DO i = nxl_z, nxr_z
[128]	431	IF ( i >= 0 .AND. i <= nnxh ) THEN
[1]	432	ll(i) = 2.0 * ( 1.0 - COS( ( 2.0 * pi * i ) / &
[1013]	433	REAL( nx+1 ) ) ) / ( dx * dx ) + &
[1]	434	2.0 * ( 1.0 - COS( ( 2.0 * pi * j ) / &
[1013]	435	REAL( ny+1 ) ) ) / ( dy * dy )
[1]	436	ELSE
	437	ll(i) = 2.0 * ( 1.0 - COS( ( 2.0 * pi * ( nx+1-i ) ) / &
[1013]	438	REAL( nx+1 ) ) ) / ( dx * dx ) + &
[1]	439	2.0 * ( 1.0 - COS( ( 2.0 * pi * j ) / &
[1013]	440	REAL( ny+1 ) ) ) / ( dy * dy )
[1]	441	ENDIF
	442	DO k = 0,nz-1
[667]	443	a = -1.0 * ddzu_pres(k+2) * ddzw(k+1)
	444	c = -1.0 * ddzu_pres(k+1) * ddzw(k+1)
[1]	445	tri(1,i,k) = a + c - ll(i)
	446	ENDDO
	447	ENDDO
	448	#else
	449	DO i = 0, nnxh
[1013]	450	ll(i) = 2.0 * ( 1.0 - COS( ( 2.0 * pi * i ) / REAL( nx+1 ) ) ) / &
[1]	451	( dx * dx ) + &
[1013]	452	2.0 * ( 1.0 - COS( ( 2.0 * pi * j ) / REAL( ny+1 ) ) ) / &
[1]	453	( dy * dy )
	454	DO k = 0, nz-1
[667]	455	a = -1.0 * ddzu_pres(k+2) * ddzw(k+1)
	456	c = -1.0 * ddzu_pres(k+1) * ddzw(k+1)
[1]	457	tri(1,i,k) = a + c - ll(i)
	458	IF ( i >= 1 .and. i < nnxh ) THEN
	459	tri(1,nx+1-i,k) = tri(1,i,k)
	460	ENDIF
	461	ENDDO
	462	ENDDO
	463	#endif
	464	IF ( ibc_p_b == 1 .OR. ibc_p_b == 2 ) THEN
	465	DO i = nxl_z, nxr_z
	466	tri(1,i,0) = tri(1,i,0) + tri(2,i,0)
	467	ENDDO
	468	ENDIF
	469	IF ( ibc_p_t == 1 ) THEN
	470	DO i = nxl_z, nxr_z
	471	tri(1,i,nz-1) = tri(1,i,nz-1) + tri(3,i,nz-1)
	472	ENDDO
	473	ENDIF
	474
	475	END SUBROUTINE maketri
	476
	477
	478	SUBROUTINE substi( ar, ar1, tri, j )
	479
	480	!------------------------------------------------------------------------------!
	481	! Substitution (Forward and Backward) (Thomas algorithm)
	482	!------------------------------------------------------------------------------!
	483
[76]	484	USE control_parameters
	485
[1]	486	IMPLICIT NONE
	487
	488	INTEGER :: i, j, k
	489	REAL :: ar1(nxl_z:nxr_z,0:nz-1)
	490	REAL :: tri(5,nxl_z:nxr_z,0:nz-1)
	491	#if defined( __parallel )
[1003]	492	REAL :: ar(nxl_z:nxr_z,nys_z:nyn_z,1:nz)
[1]	493	#else
	494	REAL :: ar(1:nz,nys_z:nyn_z,nxl_z:nxr_z)
	495	#endif
	496
	497	!
	498	!-- Forward substitution.
	499	DO i = nxl_z, nxr_z
	500	#if defined( __parallel )
	501	ar1(i,0) = ar(i,j,1)
	502	#else
	503	ar1(i,0) = ar(1,j,i)
	504	#endif
	505	ENDDO
	506	DO k = 1, nz - 1
	507	DO i = nxl_z, nxr_z
	508	#if defined( __parallel )
	509	ar1(i,k) = ar(i,j,k+1) - tri(5,i,k) * ar1(i,k-1)
	510	#else
	511	ar1(i,k) = ar(k+1,j,i) - tri(5,i,k) * ar1(i,k-1)
	512	#endif
	513	ENDDO
	514	ENDDO
	515
	516	!
[877]	517	!-- Backward substitution
	518	!-- Note, the 1.0E-20 in the denominator is due to avoid divisions
	519	!-- by zero appearing if the pressure bc is set to neumann at the top of
	520	!-- the model domain.
[1]	521	DO i = nxl_z, nxr_z
	522	#if defined( __parallel )
[877]	523	ar(i,j,nz) = ar1(i,nz-1) / ( tri(4,i,nz-1) + 1.0E-20 )
[1]	524	#else
[877]	525	ar(nz,j,i) = ar1(i,nz-1) / ( tri(4,i,nz-1) + 1.0E-20 )
[1]	526	#endif
	527	ENDDO
	528	DO k = nz-2, 0, -1
	529	DO i = nxl_z, nxr_z
	530	#if defined( __parallel )
	531	ar(i,j,k+1) = ( ar1(i,k) - tri(3,i,k) * ar(i,j,k+2) ) &
	532	/ tri(4,i,k)
	533	#else
	534	ar(k+1,j,i) = ( ar1(i,k) - tri(3,i,k) * ar(k+2,j,i) ) &
	535	/ tri(4,i,k)
	536	#endif
	537	ENDDO
	538	ENDDO
	539
[76]	540	!
	541	!-- Indices i=0, j=0 correspond to horizontally averaged pressure.
	542	!-- The respective values of ar should be zero at all k-levels if
	543	!-- acceleration of horizontally averaged vertical velocity is zero.
	544	IF ( ibc_p_b == 1 .AND. ibc_p_t == 1 ) THEN
	545	IF ( j == 0 .AND. nxl_z == 0 ) THEN
	546	#if defined( __parallel )
	547	DO k = 1, nz
	548	ar(nxl_z,j,k) = 0.0
	549	ENDDO
	550	#else
	551	DO k = 1, nz
	552	ar(k,j,nxl_z) = 0.0
	553	ENDDO
	554	#endif
	555	ENDIF
	556	ENDIF
	557
[1]	558	END SUBROUTINE substi
	559
	560
	561	SUBROUTINE split( tri )
	562
	563	!------------------------------------------------------------------------------!
	564	! Splitting of the tridiagonal matrix (Thomas algorithm)
	565	!------------------------------------------------------------------------------!
	566
	567	IMPLICIT NONE
	568
	569	INTEGER :: i, k
	570	REAL :: tri(5,nxl_z:nxr_z,0:nz-1)
	571
	572	!
	573	!-- Splitting.
	574	DO i = nxl_z, nxr_z
	575	tri(4,i,0) = tri(1,i,0)
	576	ENDDO
	577	DO k = 1, nz-1
	578	DO i = nxl_z, nxr_z
	579	tri(5,i,k) = tri(2,i,k) / tri(4,i,k-1)
	580	tri(4,i,k) = tri(1,i,k) - tri(3,i,k-1) * tri(5,i,k)
	581	ENDDO
	582	ENDDO
	583
	584	END SUBROUTINE split
	585
	586	END SUBROUTINE tridia
	587
	588
	589	#if defined( __parallel )
	590	SUBROUTINE fftxp( ar, direction )
	591
	592	!------------------------------------------------------------------------------!
	593	! Fourier-transformation along x-direction Parallelized version
	594	!------------------------------------------------------------------------------!
	595
	596	IMPLICIT NONE
	597
	598	CHARACTER (LEN=*) :: direction
	599	INTEGER :: j, k
[1003]	600	REAL :: ar(0:nx,nys_x:nyn_x,nzb_x:nzt_x)
[1]	601
	602	!
	603	!-- Performing the fft with one of the methods implemented
[683]	604	!$OMP PARALLEL PRIVATE ( j, k )
	605	!$OMP DO
[1]	606	DO k = nzb_x, nzt_x
	607	DO j = nys_x, nyn_x
	608	CALL fft_x( ar(0:nx,j,k), direction )
	609	ENDDO
	610	ENDDO
[683]	611	!$OMP END PARALLEL
[1]	612
	613	END SUBROUTINE fftxp
	614
	615	#else
	616	SUBROUTINE fftx( ar, direction )
	617
	618	!------------------------------------------------------------------------------!
	619	! Fourier-transformation along x-direction Non parallel version
	620	!------------------------------------------------------------------------------!
	621
	622	IMPLICIT NONE
	623
	624	CHARACTER (LEN=*) :: direction
	625	INTEGER :: i, j, k
	626	REAL :: ar(1:nz,0:ny,0:nx)
	627
	628	!
	629	!-- Performing the fft with one of the methods implemented
[683]	630	!$OMP PARALLEL PRIVATE ( j, k )
	631	!$OMP DO
[1]	632	DO k = 1, nz
	633	DO j = 0, ny
	634	CALL fft_x( ar(k,j,0:nx), direction )
	635	ENDDO
	636	ENDDO
[683]	637	!$OMP END PARALLEL
[1]	638
	639	END SUBROUTINE fftx
	640	#endif
	641
	642
	643	#if defined( __parallel )
	644	SUBROUTINE fftyp( ar, direction )
	645
	646	!------------------------------------------------------------------------------!
	647	! Fourier-transformation along y-direction Parallelized version
	648	!------------------------------------------------------------------------------!
	649
	650	IMPLICIT NONE
	651
	652	CHARACTER (LEN=*) :: direction
	653	INTEGER :: i, k
[1003]	654	REAL :: ar(0:ny,nxl_y:nxr_y,nzb_y:nzt_y)
[1]	655
	656	!
	657	!-- Performing the fft with one of the methods implemented
[683]	658	!$OMP PARALLEL PRIVATE ( i, k )
	659	!$OMP DO
[1]	660	DO k = nzb_y, nzt_y
	661	DO i = nxl_y, nxr_y
	662	CALL fft_y( ar(0:ny,i,k), direction )
	663	ENDDO
	664	ENDDO
[683]	665	!$OMP END PARALLEL
[1]	666
	667	END SUBROUTINE fftyp
	668
	669	#else
	670	SUBROUTINE ffty( ar, direction )
	671
	672	!------------------------------------------------------------------------------!
	673	! Fourier-transformation along y-direction Non parallel version
	674	!------------------------------------------------------------------------------!
	675
	676	IMPLICIT NONE
	677
	678	CHARACTER (LEN=*) :: direction
	679	INTEGER :: i, k
	680	REAL :: ar(1:nz,0:ny,0:nx)
	681
	682	!
	683	!-- Performing the fft with one of the methods implemented
[683]	684	!$OMP PARALLEL PRIVATE ( i, k )
	685	!$OMP DO
[1]	686	DO k = 1, nz
	687	DO i = 0, nx
	688	CALL fft_y( ar(k,0:ny,i), direction )
	689	ENDDO
	690	ENDDO
[683]	691	!$OMP END PARALLEL
[1]	692
	693	END SUBROUTINE ffty
	694	#endif
	695
	696	#if defined( __parallel )
	697	SUBROUTINE ffty_tr_yx( f_in, work, f_out )
	698
	699	!------------------------------------------------------------------------------!
	700	! Fourier-transformation along y with subsequent transposition y --> x for
	701	! a 1d-decomposition along x
	702	!
	703	! ATTENTION: The performance of this routine is much faster on the NEC-SX6,
	704	! if the first index of work_ffty_vec is odd. Otherwise
	705	! memory bank conflicts may occur (especially if the index is a
	706	! multiple of 128). That's why work_ffty_vec is dimensioned as
	707	! 0:ny+1.
	708	! Of course, this will not work if users are using an odd number
	709	! of gridpoints along y.
	710	!------------------------------------------------------------------------------!
	711
	712	USE control_parameters
	713	USE cpulog
	714	USE indices
	715	USE interfaces
	716	USE pegrid
	717	USE transpose_indices
	718
	719	IMPLICIT NONE
	720
	721	INTEGER :: i, iend, iouter, ir, j, k
	722	INTEGER, PARAMETER :: stridex = 4
	723
	724	REAL, DIMENSION(0:ny,stridex) :: work_ffty
	725	#if defined( __nec )
	726	REAL, DIMENSION(0:ny+1,1:nz,nxl:nxr) :: work_ffty_vec
	727	#endif
[1003]	728	REAL, DIMENSION(1:nz,0:ny,nxl:nxr) :: f_in
	729	REAL, DIMENSION(nnx,1:nz,nys_x:nyn_x,pdims(1)) :: f_out
	730	REAL, DIMENSION(nxl:nxr,1:nz,0:ny) :: work
[1]	731
	732	!
	733	!-- Carry out the FFT along y, where all data are present due to the
	734	!-- 1d-decomposition along x. Resort the data in a way that x becomes
	735	!-- the first index.
	736	CALL cpu_log( log_point_s(7), 'fft_y', 'start' )
	737
	738	IF ( host(1:3) == 'nec' ) THEN
	739	#if defined( __nec )
	740	!
	741	!-- Code optimized for vector processors
[85]	742	!$OMP PARALLEL PRIVATE ( i, j, k )
[1]	743	!$OMP DO
	744	DO i = nxl, nxr
	745
	746	DO j = 0, ny
	747	DO k = 1, nz
	748	work_ffty_vec(j,k,i) = f_in(k,j,i)
	749	ENDDO
	750	ENDDO
	751
	752	CALL fft_y_m( work_ffty_vec(:,:,i), ny+1, 'forward' )
	753
	754	ENDDO
	755
	756	!$OMP DO
	757	DO k = 1, nz
	758	DO j = 0, ny
	759	DO i = nxl, nxr
	760	work(i,k,j) = work_ffty_vec(j,k,i)
	761	ENDDO
	762	ENDDO
	763	ENDDO
	764	!$OMP END PARALLEL
	765	#endif
	766
	767	ELSE
	768
	769	!
	770	!-- Cache optimized code.
	771	!-- The i-(x-)direction is split into a strided outer loop and an inner
	772	!-- loop for better cache performance
	773	!$OMP PARALLEL PRIVATE (i,iend,iouter,ir,j,k,work_ffty)
	774	!$OMP DO
	775	DO iouter = nxl, nxr, stridex
	776
	777	iend = MIN( iouter+stridex-1, nxr ) ! Upper bound for inner i loop
	778
	779	DO k = 1, nz
	780
	781	DO i = iouter, iend
	782
	783	ir = i-iouter+1 ! counter within a stride
	784	DO j = 0, ny
	785	work_ffty(j,ir) = f_in(k,j,i)
	786	ENDDO
	787	!
	788	!-- FFT along y
	789	CALL fft_y( work_ffty(:,ir), 'forward' )
	790
	791	ENDDO
	792
	793	!
	794	!-- Resort
	795	DO j = 0, ny
	796	DO i = iouter, iend
	797	work(i,k,j) = work_ffty(j,i-iouter+1)
	798	ENDDO
	799	ENDDO
	800
	801	ENDDO
	802
	803	ENDDO
	804	!$OMP END PARALLEL
	805
	806	ENDIF
	807	CALL cpu_log( log_point_s(7), 'fft_y', 'pause' )
	808
	809	!
	810	!-- Transpose array
	811	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
[622]	812	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
[1]	813	CALL MPI_ALLTOALL( work(nxl,1,0), sendrecvcount_xy, MPI_REAL, &
	814	f_out(1,1,nys_x,1), sendrecvcount_xy, MPI_REAL, &
	815	comm1dx, ierr )
	816	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
	817
	818	END SUBROUTINE ffty_tr_yx
	819
	820
	821	SUBROUTINE tr_xy_ffty( f_in, work, f_out )
	822
	823	!------------------------------------------------------------------------------!
	824	! Transposition x --> y with a subsequent backward Fourier transformation for
	825	! a 1d-decomposition along x
	826	!------------------------------------------------------------------------------!
	827
	828	USE control_parameters
	829	USE cpulog
	830	USE indices
	831	USE interfaces
	832	USE pegrid
	833	USE transpose_indices
	834
	835	IMPLICIT NONE
	836
	837	INTEGER :: i, iend, iouter, ir, j, k
	838	INTEGER, PARAMETER :: stridex = 4
	839
	840	REAL, DIMENSION(0:ny,stridex) :: work_ffty
	841	#if defined( __nec )
	842	REAL, DIMENSION(0:ny+1,1:nz,nxl:nxr) :: work_ffty_vec
	843	#endif
[1003]	844	REAL, DIMENSION(nnx,1:nz,nys_x:nyn_x,pdims(1)) :: f_in
	845	REAL, DIMENSION(1:nz,0:ny,nxl:nxr) :: f_out
	846	REAL, DIMENSION(nxl:nxr,1:nz,0:ny) :: work
[1]	847
	848	!
	849	!-- Transpose array
	850	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
[622]	851	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
[1]	852	CALL MPI_ALLTOALL( f_in(1,1,nys_x,1), sendrecvcount_xy, MPI_REAL, &
	853	work(nxl,1,0), sendrecvcount_xy, MPI_REAL, &
	854	comm1dx, ierr )
	855	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
	856
	857	!
	858	!-- Resort the data in a way that y becomes the first index and carry out the
	859	!-- backward fft along y.
	860	CALL cpu_log( log_point_s(7), 'fft_y', 'continue' )
	861
	862	IF ( host(1:3) == 'nec' ) THEN
	863	#if defined( __nec )
	864	!
	865	!-- Code optimized for vector processors
[85]	866	!$OMP PARALLEL PRIVATE ( i, j, k )
[1]	867	!$OMP DO
	868	DO k = 1, nz
	869	DO j = 0, ny
	870	DO i = nxl, nxr
	871	work_ffty_vec(j,k,i) = work(i,k,j)
	872	ENDDO
	873	ENDDO
	874	ENDDO
	875
	876	!$OMP DO
	877	DO i = nxl, nxr
	878
	879	CALL fft_y_m( work_ffty_vec(:,:,i), ny+1, 'backward' )
	880
	881	DO j = 0, ny
	882	DO k = 1, nz
	883	f_out(k,j,i) = work_ffty_vec(j,k,i)
	884	ENDDO
	885	ENDDO
	886
	887	ENDDO
	888	!$OMP END PARALLEL
	889	#endif
	890
	891	ELSE
	892
	893	!
	894	!-- Cache optimized code.
	895	!-- The i-(x-)direction is split into a strided outer loop and an inner
	896	!-- loop for better cache performance
	897	!$OMP PARALLEL PRIVATE ( i, iend, iouter, ir, j, k, work_ffty )
	898	!$OMP DO
	899	DO iouter = nxl, nxr, stridex
	900
	901	iend = MIN( iouter+stridex-1, nxr ) ! Upper bound for inner i loop
	902
	903	DO k = 1, nz
	904	!
	905	!-- Resort
	906	DO j = 0, ny
	907	DO i = iouter, iend
	908	work_ffty(j,i-iouter+1) = work(i,k,j)
	909	ENDDO
	910	ENDDO
	911
	912	DO i = iouter, iend
	913
	914	!
	915	!-- FFT along y
	916	ir = i-iouter+1 ! counter within a stride
	917	CALL fft_y( work_ffty(:,ir), 'backward' )
	918
	919	DO j = 0, ny
	920	f_out(k,j,i) = work_ffty(j,ir)
	921	ENDDO
	922	ENDDO
	923
	924	ENDDO
	925
	926	ENDDO
	927	!$OMP END PARALLEL
	928
	929	ENDIF
	930
	931	CALL cpu_log( log_point_s(7), 'fft_y', 'stop' )
	932
	933	END SUBROUTINE tr_xy_ffty
	934
	935
	936	SUBROUTINE fftx_tri_fftx( ar )
	937
	938	!------------------------------------------------------------------------------!
	939	! FFT along x, solution of the tridiagonal system and backward FFT for
	940	! a 1d-decomposition along x
	941	!
	942	! WARNING: this subroutine may still not work for hybrid parallelization
	943	! with OpenMP (for possible necessary changes see the original
	944	! routine poisfft_hybrid, developed by Klaus Ketelsen, May 2002)
	945	!------------------------------------------------------------------------------!
	946
	947	USE control_parameters
	948	USE cpulog
	949	USE grid_variables
	950	USE indices
	951	USE interfaces
	952	USE pegrid
	953	USE transpose_indices
	954
	955	IMPLICIT NONE
	956
	957	INTEGER :: i, j, k, m, n, omp_get_thread_num, tn
	958
[1003]	959	REAL, DIMENSION(0:nx) :: work_fftx
	960	REAL, DIMENSION(0:nx,1:nz) :: work_trix
	961	REAL, DIMENSION(nnx,1:nz,nys_x:nyn_x,pdims(1)) :: ar
	962	REAL, DIMENSION(:,:,:,:), ALLOCATABLE :: tri
[1]	963
	964
	965	CALL cpu_log( log_point_s(33), 'fft_x + tridia', 'start' )
	966
	967	ALLOCATE( tri(5,0:nx,0:nz-1,0:threads_per_task-1) )
	968
	969	tn = 0 ! Default thread number in case of one thread
	970	!$OMP PARALLEL DO PRIVATE ( i, j, k, m, n, tn, work_fftx, work_trix )
	971	DO j = nys_x, nyn_x
	972
	973	!$ tn = omp_get_thread_num()
	974
	975	IF ( host(1:3) == 'nec' ) THEN
	976	!
	977	!-- Code optimized for vector processors
	978	DO k = 1, nz
	979
	980	m = 0
	981	DO n = 1, pdims(1)
[1003]	982	DO i = 1, nnx
[1]	983	work_trix(m,k) = ar(i,k,j,n)
	984	m = m + 1
	985	ENDDO
	986	ENDDO
	987
	988	ENDDO
	989
	990	CALL fft_x_m( work_trix, 'forward' )
	991
	992	ELSE
	993	!
	994	!-- Cache optimized code
	995	DO k = 1, nz
	996
	997	m = 0
	998	DO n = 1, pdims(1)
[1003]	999	DO i = 1, nnx
[1]	1000	work_fftx(m) = ar(i,k,j,n)
	1001	m = m + 1
	1002	ENDDO
	1003	ENDDO
	1004
	1005	CALL fft_x( work_fftx, 'forward' )
	1006
	1007	DO i = 0, nx
	1008	work_trix(i,k) = work_fftx(i)
	1009	ENDDO
	1010
	1011	ENDDO
	1012
	1013	ENDIF
	1014
	1015	!
	1016	!-- Solve the linear equation system
	1017	CALL tridia_1dd( ddx2, ddy2, nx, ny, j, work_trix, tri(:,:,:,tn) )
	1018
	1019	IF ( host(1:3) == 'nec' ) THEN
	1020	!
	1021	!-- Code optimized for vector processors
	1022	CALL fft_x_m( work_trix, 'backward' )
	1023
	1024	DO k = 1, nz
	1025
	1026	m = 0
	1027	DO n = 1, pdims(1)
[1003]	1028	DO i = 1, nnx
[1]	1029	ar(i,k,j,n) = work_trix(m,k)
	1030	m = m + 1
	1031	ENDDO
	1032	ENDDO
	1033
	1034	ENDDO
	1035
	1036	ELSE
	1037	!
	1038	!-- Cache optimized code
	1039	DO k = 1, nz
	1040
	1041	DO i = 0, nx
	1042	work_fftx(i) = work_trix(i,k)
	1043	ENDDO
	1044
	1045	CALL fft_x( work_fftx, 'backward' )
	1046
	1047	m = 0
	1048	DO n = 1, pdims(1)
[1003]	1049	DO i = 1, nnx
[1]	1050	ar(i,k,j,n) = work_fftx(m)
	1051	m = m + 1
	1052	ENDDO
	1053	ENDDO
	1054
	1055	ENDDO
	1056
	1057	ENDIF
	1058
	1059	ENDDO
	1060
	1061	DEALLOCATE( tri )
	1062
	1063	CALL cpu_log( log_point_s(33), 'fft_x + tridia', 'stop' )
	1064
	1065	END SUBROUTINE fftx_tri_fftx
	1066
	1067
	SUBROUTINE fftx_tr_xy( f_in, work, f_out )

	!------------------------------------------------------------------------------!
	! Forward Fourier transformation along x followed by the transposition
	! x --> y, for a 1d-decomposition along y.
	!
	! Arguments (shapes as declared below):
	!    f_in  : data to be transformed, stored as (k,j,i)
	!    work  : buffer receiving the transformed data resorted to (j,k,i); it is
	!            used as the send buffer of MPI_ALLTOALL
	!    f_out : receive buffer of MPI_ALLTOALL
	!
	! ATTENTION: The NEC-branch of this routine may significantly profit from
	!            further optimizations. So far, performance is much worse than
	!            for routine ffty_tr_yx (more than three times slower).
	!------------------------------------------------------------------------------!

	   USE control_parameters
	   USE cpulog
	   USE indices
	   USE interfaces
	   USE pegrid
	   USE transpose_indices

	   IMPLICIT NONE

	   INTEGER ::  i, j, k

	   REAL, DIMENSION(1:nz,nys:nyn,0:nx)             ::  f_in
	   REAL, DIMENSION(nny,1:nz,nxl_y:nxr_y,pdims(2)) ::  f_out
	   REAL, DIMENSION(nys:nyn,1:nz,0:nx)             ::  work
	   REAL, DIMENSION(0:nx,1:nz,nys:nyn)             ::  work_fftx

	!
	!-- The complete x-range (0:nx) is available locally, so the FFT along x can
	!-- be done without communication. The data are first resorted so that x is
	!-- the first index (input of the FFT) and afterwards so that y is the first
	!-- index (input of the transposition).
	   CALL cpu_log( log_point_s(4), 'fft_x', 'start' )

	   IF ( host(1:3) == 'nec' )  THEN
	!
	!--   Variant for vector processors: one multiple FFT per j-plane
	      !$OMP PARALLEL PRIVATE ( i, j, k )
	      !$OMP DO
	      DO  i = 0, nx
	         DO  j = nys, nyn
	            work_fftx(i,:,j) = f_in(:,j,i)
	         ENDDO
	      ENDDO

	      !$OMP DO
	      DO  j = nys, nyn

	         CALL fft_x_m( work_fftx(:,:,j), 'forward' )

	         DO  k = 1, nz
	            work(j,k,:) = work_fftx(:,k,j)
	         ENDDO

	      ENDDO
	      !$OMP END PARALLEL

	   ELSE
	!
	!--   Cache optimized variant: one single FFT per (k,j)-line (there might
	!--   still be a potential for better optimization).
	      !$OMP PARALLEL PRIVATE (i,j,k)
	      !$OMP DO
	      DO  i = 0, nx
	         DO  j = nys, nyn
	            work_fftx(i,:,j) = f_in(:,j,i)
	         ENDDO
	      ENDDO

	      !$OMP DO
	      DO  j = nys, nyn
	         DO  k = 1, nz

	            CALL fft_x( work_fftx(0:nx,k,j), 'forward' )

	            work(j,k,:) = work_fftx(:,k,j)

	         ENDDO
	      ENDDO
	      !$OMP END PARALLEL

	   ENDIF
	!
	!-- The measurement is only paused here; routine tr_yx_fftx continues and
	!-- stops it.
	   CALL cpu_log( log_point_s(4), 'fft_x', 'pause' )

	!
	!-- Transpose array
	   CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
	   IF ( collective_wait )  CALL MPI_BARRIER( comm2d, ierr )
	   CALL MPI_ALLTOALL( work(nys,1,0),      sendrecvcount_xy, MPI_REAL, &
	                      f_out(1,1,nxl_y,1), sendrecvcount_xy, MPI_REAL, &
	                      comm1dy, ierr )
	   CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )

	END SUBROUTINE fftx_tr_xy
	1174
	1175
	SUBROUTINE tr_yx_fftx( f_in, work, f_out )

	!------------------------------------------------------------------------------!
	! Transposition y --> x followed by the backward Fourier transformation
	! along x. This is the inverse operation of routine fftx_tr_xy.
	!
	! NOTE(review): the former header named a 1d-decomposition along x, but the
	!               routine works on the local range nys:nyn and communicates via
	!               comm1dy, like fftx_tr_xy (1d-decomposition along y) - confirm.
	!
	! Arguments (shapes as declared below):
	!    f_in  : send buffer of MPI_ALLTOALL
	!    work  : receive buffer of MPI_ALLTOALL, stored as (j,k,i)
	!    f_out : backward transformed data, stored as (k,j,i)
	!------------------------------------------------------------------------------!

	   USE control_parameters
	   USE cpulog
	   USE indices
	   USE interfaces
	   USE pegrid
	   USE transpose_indices

	   IMPLICIT NONE

	   INTEGER ::  i, j, k

	   REAL, DIMENSION(nny,1:nz,nxl_y:nxr_y,pdims(2)) ::  f_in
	   REAL, DIMENSION(1:nz,nys:nyn,0:nx)             ::  f_out
	   REAL, DIMENSION(nys:nyn,1:nz,0:nx)             ::  work
	   REAL, DIMENSION(0:nx,1:nz,nys:nyn)             ::  work_fftx

	!
	!-- Transpose array
	   CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
	   IF ( collective_wait )  CALL MPI_BARRIER( comm2d, ierr )
	   CALL MPI_ALLTOALL( f_in(1,1,nxl_y,1), sendrecvcount_xy, MPI_REAL, &
	                      work(nys,1,0),     sendrecvcount_xy, MPI_REAL, &
	                      comm1dy, ierr )
	   CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )

	!
	!-- After the transposition the complete x-range (0:nx) is available
	!-- locally. Resort the data so that x is the first index, carry out the
	!-- backward FFT along x and store the result with k as the first index.
	!-- The measurement continued here has been paused in routine fftx_tr_xy.
	   CALL cpu_log( log_point_s(4), 'fft_x', 'continue' )

	   IF ( host(1:3) == 'nec' )  THEN
	!
	!--   Variant for vector processors: one multiple FFT per j-plane
	      !$OMP PARALLEL PRIVATE ( i, j, k )
	      !$OMP DO
	      DO  j = nys, nyn

	         DO  k = 1, nz
	            work_fftx(:,k,j) = work(j,k,:)
	         ENDDO

	         CALL fft_x_m( work_fftx(:,:,j), 'backward' )

	      ENDDO

	      !$OMP DO
	      DO  i = 0, nx
	         DO  j = nys, nyn
	            f_out(:,j,i) = work_fftx(i,:,j)
	         ENDDO
	      ENDDO
	      !$OMP END PARALLEL

	   ELSE
	!
	!--   Cache optimized variant: one single FFT per (k,j)-line (there might
	!--   still be a potential for better optimization).
	      !$OMP PARALLEL PRIVATE (i,j,k)
	      !$OMP DO
	      DO  j = nys, nyn
	         DO  k = 1, nz

	            work_fftx(:,k,j) = work(j,k,:)

	            CALL fft_x( work_fftx(0:nx,k,j), 'backward' )

	         ENDDO
	      ENDDO

	      !$OMP DO
	      DO  i = 0, nx
	         DO  j = nys, nyn
	            f_out(:,j,i) = work_fftx(i,:,j)
	         ENDDO
	      ENDDO
	      !$OMP END PARALLEL

	   ENDIF
	   CALL cpu_log( log_point_s(4), 'fft_x', 'stop' )

	END SUBROUTINE tr_yx_fftx
	1274
	1275
	SUBROUTINE ffty_tri_ffty( ar )

	!------------------------------------------------------------------------------!
	! Forward FFT along y, solution of the tridiagonal system and backward FFT
	! along y for a 1d-decomposition along y. Array ar is overwritten with the
	! result.
	!
	! WARNING: this subroutine may still not work for hybrid parallelization
	!          with OpenMP (for possible necessary changes see the original
	!          routine poisfft_hybrid, developed by Klaus Ketelsen, May 2002)
	!------------------------------------------------------------------------------!

	   USE control_parameters
	   USE cpulog
	   USE grid_variables
	   USE indices
	   USE interfaces
	   USE pegrid
	   USE transpose_indices

	   IMPLICIT NONE

	   INTEGER ::  i, k, m, n, omp_get_thread_num, tn

	   REAL, DIMENSION(nny,1:nz,nxl_y:nxr_y,pdims(2)) ::  ar
	   REAL, DIMENSION(0:ny)                          ::  work_ffty
	   REAL, DIMENSION(0:ny,1:nz)                     ::  work_triy
	   REAL, DIMENSION(:,:,:,:), ALLOCATABLE          ::  tri


	   CALL cpu_log( log_point_s(39), 'fft_y + tridia', 'start' )

	!
	!-- One slice of the matrix workspace per thread (last dimension)
	   ALLOCATE( tri(5,0:ny,0:nz-1,0:threads_per_task-1) )

	   tn = 0           ! Default thread number in case of one thread
	   !$OMP PARALLEL DO PRIVATE ( i, k, m, n, tn, work_ffty, work_triy )
	   DO  i = nxl_y, nxr_y

	!
	!--   tn is private, so it is set in every iteration when compiled with OpenMP
	      !$ tn = omp_get_thread_num()

	      IF ( host(1:3) == 'nec' )  THEN
	!
	!--      Code optimized for vector processors:
	!--      the blocks ar(:,k,i,n), n = 1, pdims(2), are stored one after the
	!--      other along y, m being the offset of block n. Afterwards all
	!--      k-levels are transformed at once.
	         DO  k = 1, nz
	            DO  n = 1, pdims(2)
	               m = ( n - 1 ) * nny
	               work_triy(m:m+nny-1,k) = ar(:,k,i,n)
	            ENDDO
	         ENDDO

	         CALL fft_y_m( work_triy, ny, 'forward' )

	      ELSE
	!
	!--      Cache optimized code: gather and transform one k-level after the
	!--      other
	         DO  k = 1, nz

	            DO  n = 1, pdims(2)
	               m = ( n - 1 ) * nny
	               work_ffty(m:m+nny-1) = ar(:,k,i,n)
	            ENDDO

	            CALL fft_y( work_ffty, 'forward' )

	            work_triy(:,k) = work_ffty

	         ENDDO

	      ENDIF

	!
	!--   Solve the linear equation system. Compared with the dummy argument
	!--   list of tridia_1dd, the x- and y-quantities are exchanged here.
	      CALL tridia_1dd( ddy2, ddx2, ny, nx, i, work_triy, tri(:,:,:,tn) )

	      IF ( host(1:3) == 'nec' )  THEN
	!
	!--      Code optimized for vector processors
	         CALL fft_y_m( work_triy, ny, 'backward' )

	         DO  k = 1, nz
	            DO  n = 1, pdims(2)
	               m = ( n - 1 ) * nny
	               ar(:,k,i,n) = work_triy(m:m+nny-1,k)
	            ENDDO
	         ENDDO

	      ELSE
	!
	!--      Cache optimized code
	         DO  k = 1, nz

	            work_ffty = work_triy(:,k)

	            CALL fft_y( work_ffty, 'backward' )

	            DO  n = 1, pdims(2)
	               m = ( n - 1 ) * nny
	               ar(:,k,i,n) = work_ffty(m:m+nny-1)
	            ENDDO

	         ENDDO

	      ENDIF

	   ENDDO

	   DEALLOCATE( tri )

	   CALL cpu_log( log_point_s(39), 'fft_y + tridia', 'stop' )

	END SUBROUTINE ffty_tri_ffty
	1406
	1407
	SUBROUTINE tridia_1dd( ddx2, ddy2, nx, ny, j, ar, tri )

	!------------------------------------------------------------------------------!
	! Solves the linear system of equations for a 1d-decomposition along x (see
	! tridia)
	!
	! For the given index j, one tridiagonal system along k is set up for every
	! index i = 0, nx and solved with the Thomas algorithm. The dummy argument
	! names refer to x and y, but routine ffty_tri_ffty calls this routine with
	! the x- and y-quantities exchanged.
	!
	! Arguments:
	!    ddx2, ddy2 : factors of the i- and j-dependent part of the main diagonal
	!    nx, ny     : upper index bounds of the two horizontal directions
	!    j          : index along the second horizontal direction
	!    ar         : right-hand side on entry, solution on exit
	!    tri        : workspace for the matrix:
	!                 tri(1,:,:) main diagonal,
	!                 tri(2,:,:) sub-diagonal, tri(3,:,:) super-diagonal,
	!                 tri(4,:,:) pivots and tri(5,:,:) elimination factors of
	!                 the Thomas algorithm
	!
	! Attention: when using the intel compilers older than 12.0, array tri must
	!            be passed as an argument to the contained subroutines. Otherwise
	!            address faults will occur. This feature can be activated with
	!            cpp-switch __intel11
	!            On NEC, tri should not be passed (except for routine substi_1dd)
	!            because this causes very bad performance.
	!------------------------------------------------------------------------------!

	   USE arrays_3d
	   USE control_parameters

	   USE pegrid

	   IMPLICIT NONE

	   INTEGER, INTENT(IN) ::  j, nx, ny

	   REAL, INTENT(IN)    ::  ddx2, ddy2

	   REAL, DIMENSION(0:nx,1:nz), INTENT(INOUT)     ::  ar
	   REAL, DIMENSION(5,0:nx,0:nz-1), INTENT(INOUT) ::  tri

	   INTEGER ::  i, k, nnyh


	   nnyh = ( ny + 1 ) / 2

	!
	!-- Define constant elements of the tridiagonal matrix.
	!-- The compiler on SX6 does loop exchange. If 0:nx is a high power of 2,
	!-- the exchanged loops create bank conflicts. The following directive
	!-- prohibits loop exchange and the loops perform much better.
!CDIR NOLOOPCHG
	   DO  k = 0, nz-1
	      DO  i = 0, nx
	         tri(2,i,k) = ddzu_pres(k+1) * ddzw(k+1)
	         tri(3,i,k) = ddzu_pres(k+2) * ddzw(k+1)
	      ENDDO
	   ENDDO

	!
	!-- Indices j beyond nnyh are mapped to ny+1-j before the main diagonal is
	!-- computed
	   IF ( j <= nnyh )  THEN
#if defined( __intel11 )
	      CALL maketri_1dd( j, tri )
#else
	      CALL maketri_1dd( j )
#endif
	   ELSE
#if defined( __intel11 )
	      CALL maketri_1dd( ny+1-j, tri )
#else
	      CALL maketri_1dd( ny+1-j )
#endif
	   ENDIF
#if defined( __intel11 )
	   CALL split_1dd( tri )
#else
	   CALL split_1dd
#endif
	   CALL substi_1dd( ar, tri )

	CONTAINS

#if defined( __intel11 )
	   SUBROUTINE maketri_1dd( j, tri )
#else
	   SUBROUTINE maketri_1dd( j )
#endif

	!------------------------------------------------------------------------------!
	! computes the i- and j-dependent component of the matrix
	! (main diagonal tri(1,:,:), including the adjustments for the pressure
	! boundary conditions at the bottom and at the top)
	!------------------------------------------------------------------------------!

	      USE constants

	      IMPLICIT NONE

	      INTEGER, INTENT(IN) ::  j

	      INTEGER ::  i, ii, k
	      REAL    ::  a, c, l_j

	      REAL, DIMENSION(0:nx) ::  l

#if defined( __intel11 )
	      REAL, DIMENSION(5,0:nx,0:nz-1) ::  tri
#endif


	!
	!--   Provide the tridiagonal matrix for solution of the Poisson equation in
	!--   Fourier space. The coefficients are computed following the method of
	!--   Schmidt et al. (DFVLR-Mitteilung 84-15), which departs from Stephan
	!--   Siano's original version by discretizing the Poisson equation,
	!--   before it is Fourier-transformed
	!
	!--   The j-dependent part does not depend on i, so it is computed only once
	      l_j = 2.0 * ( 1.0 - COS( ( 2.0 * pi * j ) / REAL( ny+1 ) ) ) * ddy2

	!
	!--   Indices i beyond ( nx + 1 ) / 2 are mapped to nx+1-i, which is what
	!--   MIN( i, nx+1-i ) selects for all i = 0, nx
	      DO  i = 0, nx
	         ii   = MIN( i, nx+1-i )
	         l(i) = 2.0 * ( 1.0 - COS( ( 2.0 * pi * ii ) /                       &
	                                   REAL( nx+1 ) ) ) * ddx2 + l_j
	      ENDDO

	!
	!--   a and c depend on k only
	      DO  k = 0, nz-1
	         a = -1.0 * ddzu_pres(k+2) * ddzw(k+1)
	         c = -1.0 * ddzu_pres(k+1) * ddzw(k+1)
	         DO  i = 0, nx
	            tri(1,i,k) = a + c - l(i)
	         ENDDO
	      ENDDO
	!
	!--   Adjust the main diagonal at the lowest and uppermost level according
	!--   to the pressure boundary condition switches
	      IF ( ibc_p_b == 1  .OR.  ibc_p_b == 2 )  THEN
	         DO  i = 0, nx
	            tri(1,i,0) = tri(1,i,0) + tri(2,i,0)
	         ENDDO
	      ENDIF
	      IF ( ibc_p_t == 1 )  THEN
	         DO  i = 0, nx
	            tri(1,i,nz-1) = tri(1,i,nz-1) + tri(3,i,nz-1)
	         ENDDO
	      ENDIF

	   END SUBROUTINE maketri_1dd


#if defined( __intel11 )
	   SUBROUTINE split_1dd( tri )
#else
	   SUBROUTINE split_1dd
#endif

	!------------------------------------------------------------------------------!
	! Splitting of the tridiagonal matrix (Thomas algorithm):
	! computes the pivots tri(4,:,:) and the elimination factors tri(5,:,:)
	!------------------------------------------------------------------------------!

	      IMPLICIT NONE

	      INTEGER ::  i, k

#if defined( __intel11 )
	      REAL, DIMENSION(5,0:nx,0:nz-1) ::  tri
#endif


	!
	!--   Splitting
	      DO  i = 0, nx
	         tri(4,i,0) = tri(1,i,0)
	      ENDDO
	      DO  k = 1, nz-1
	         DO  i = 0, nx
	            tri(5,i,k) = tri(2,i,k) / tri(4,i,k-1)
	            tri(4,i,k) = tri(1,i,k) - tri(3,i,k-1) * tri(5,i,k)
	         ENDDO
	      ENDDO

	   END SUBROUTINE split_1dd


	   SUBROUTINE substi_1dd( ar, tri )

	!------------------------------------------------------------------------------!
	! Substitution (Forward and Backward) (Thomas algorithm)
	! ar holds the right-hand side on entry and the solution on exit
	!------------------------------------------------------------------------------!

	      IMPLICIT NONE

	      INTEGER ::  i, k

	      REAL, DIMENSION(0:nx,nz)       ::  ar
	      REAL, DIMENSION(0:nx,0:nz-1)   ::  ar1
	      REAL, DIMENSION(5,0:nx,0:nz-1) ::  tri

	!
	!--   Forward substitution
	      DO  i = 0, nx
	         ar1(i,0) = ar(i,1)
	      ENDDO
	      DO  k = 1, nz-1
	         DO  i = 0, nx
	            ar1(i,k) = ar(i,k+1) - tri(5,i,k) * ar1(i,k-1)
	         ENDDO
	      ENDDO

	!
	!--   Backward substitution
	!--   Note, the add of 1.0E-20 in the denominator is due to avoid divisions
	!--   by zero appearing if the pressure bc is set to neumann at the top of
	!--   the model domain.
	      DO  i = 0, nx
	         ar(i,nz) = ar1(i,nz-1) / ( tri(4,i,nz-1) + 1.0E-20 )
	      ENDDO
	      DO  k = nz-2, 0, -1
	         DO  i = 0, nx
	            ar(i,k+1) = ( ar1(i,k) - tri(3,i,k) * ar(i,k+2) )               &
	                        / tri(4,i,k)
	         ENDDO
	      ENDDO

	!
	!--   Indices i=0, j=0 correspond to horizontally averaged pressure.
	!--   The respective values of ar should be zero at all k-levels if
	!--   acceleration of horizontally averaged vertical velocity is zero.
	!--   (j is the dummy argument of the host routine tridia_1dd.)
	      IF ( ibc_p_b == 1  .AND.  ibc_p_t == 1 )  THEN
	         IF ( j == 0 )  THEN
	            DO  k = 1, nz
	               ar(0,k) = 0.0
	            ENDDO
	         ENDIF
	      ENDIF

	   END SUBROUTINE substi_1dd

	END SUBROUTINE tridia_1dd
	1634
	1635	#endif
[807]	1636	#endif
[1]	1637	END MODULE poisfft_mod

Note: See TracBrowser for help on using the repository browser.

Download in other formats:

| Impressum | ©Leibniz Universität Hannover |