Home

Context Navigation

source: palm/trunk/SOURCE/poisfft.f90 @ 1069

Last change on this file since 1069 was 1037, checked in by raasch, 12 years ago
last commit documented
Property svn:keywords set to `Id`
File size: 47.7 KB

Rev	Line
[1]	1	MODULE poisfft_mod
	2
[1036]	3	!--------------------------------------------------------------------------------!
	4	! This file is part of PALM.
	5	!
	6	! PALM is free software: you can redistribute it and/or modify it under the terms
	7	! of the GNU General Public License as published by the Free Software Foundation,
	8	! either version 3 of the License, or (at your option) any later version.
	9	!
	10	! PALM is distributed in the hope that it will be useful, but WITHOUT ANY
	11	! WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
	12	! A PARTICULAR PURPOSE. See the GNU General Public License for more details.
	13	!
	14	! You should have received a copy of the GNU General Public License along with
	15	! PALM. If not, see <http://www.gnu.org/licenses/>.
	16	!
	17	! Copyright 1997-2012 Leibniz University Hannover
	18	!--------------------------------------------------------------------------------!
	19	!
[484]	20	! Current revisions:
[1]	21	! -----------------
	22	!
[1014]	23	!
[1]	24	! Former revisions:
	25	! -----------------
[3]	26	! $Id: poisfft.f90 1037 2012-10-22 14:10:22Z maronga $
[77]	27	!
[1037]	28	! 1036 2012-10-22 13:43:42Z raasch
	29	! code put under GPL (PALM 3.9)
	30	!
[1014]	31	! 2012-09-21 07:03:55Z raasch
	32	! FLOAT type conversion replaced by REAL
	33	!
[1004]	34	! 1003 2012-09-14 14:35:53Z raasch
	35	! indices nxa, nya, etc. replaced by nx, ny, etc.
	36	!
[941]	37	! 940 2012-07-09 14:31:00Z raasch
	38	! special handling of tri-array as an argument in tridia_1dd routines switched
	39	! off because it caused segmentation faults with intel 12.1 compiler
	40	!
[878]	41	! 877 2012-04-03 11:21:44Z suehring
	42	! Bugfix: Avoid divisions by zero in case of using a 'neumann' bc for the
	43	! pressure at the top of the model domain.
	44	!
[810]	45	! 809 2012-01-30 13:32:58Z maronga
	46	! Bugfix: replaced .AND. and .NOT. with && and ! in the preprocessor directives
	47	!
[808]	48	! 807 2012-01-25 11:53:51Z maronga
	49	! New cpp directive "__check" implemented which is used by check_namelist_files
	50	! (most of the code is unneeded by check_namelist_files).
	51	!
[764]	52	! 763 2011-10-06 09:32:09Z suehring
	53	! Comment added concerning the last change.
	54	!
[762]	55	! 761 2011-10-05 17:58:52Z suehring
	56	! Bugfix: Avoid divisions by zero in case of using a 'neumann' bc for the
	57	! pressure at the top of the model domain.
	58	!
[697]	59	! 696 2011-03-18 07:03:49Z raasch
	60	! work_fftx removed from PRIVATE clauses in fftx_tr_xy and tr_yx_fftx
	61	!
[684]	62	! 683 2011-02-09 14:25:15Z raasch
	63	! openMP parallelization for 2d-domain-decomposition
	64	!
[668]	65	! 667 2010-12-23 12:06:00Z suehring/gryschka
	66	! ddzu replaced by ddzu_pres due to changes in zu(0)
	67	!
[623]	68	! 622 2010-12-10 08:08:13Z raasch
	69	! optional barriers included in order to speed up collective operations
	70	!
[392]	71	! 377 2009-09-04 11:09:00Z raasch
	72	! __lcmuk changed to __lc to avoid problems with Intel compiler on sgi-ice
	73	!
[198]	74	! 164 2008-05-15 08:46:15Z raasch
	75	! Arguments removed from transpose routines
	76	!
[139]	77	! 128 2007-10-26 13:11:14Z raasch
	78	! Bugfix: wavenumber calculation for even nx in routines maketri
	79	!
[90]	80	! 85 2007-05-11 09:35:14Z raasch
	81	! Bugfix: work_fft*_vec removed from some PRIVATE-declarations
	82	!
[77]	83	! 76 2007-03-29 00:58:32Z raasch
	84	! Tridiagonal coefficients adjusted for Neumann boundary conditions both at
	85	! the bottom and the top.
	86	!
[3]	87	! RCS Log replace by Id keyword, revision history cleaned up
	88	!
[1]	89	! Revision 1.24 2006/08/04 15:00:24 raasch
	90	! Default setting of the thread number tn in case of not using OpenMP
	91	!
	92	! Revision 1.23 2006/02/23 12:48:38 raasch
	93	! Additional compiler directive in routine tridia_1dd for preventing loop
	94	! exchange on NEC-SX6
	95	!
	96	! Revision 1.20 2004/04/30 12:38:09 raasch
	97	! Parts of former poisfft_hybrid moved to this subroutine,
	98	! former subroutine changed to a module, renaming of FFT-subroutines and
	99	! -module, FFTs completely substituted by calls of fft_x and fft_y,
	100	! NAG fft used in the non-parallel case completely removed, l in maketri
	101	! is now a 1d-array, variables passed by modules instead of using parameter
	102	! lists, enlarged transposition arrays introduced
	103	!
	104	! Revision 1.1 1997/07/24 11:24:14 raasch
	105	! Initial revision
	106	!
	107	!
	108	! Description:
	109	! ------------
	110	! See below.
	111	!------------------------------------------------------------------------------!
	112
	113	!--------------------------------------------------------------------------!
	114	! poisfft !
	115	! !
	116	! Original version: Stephan Siano (pois3d) !
	117	! !
	118	! Institute of Meteorology and Climatology, University of Hannover !
	119	! Germany !
	120	! !
	121	! Version as of July 23,1996 !
	122	! !
	123	! !
	124	! Version for parallel computers: Siegfried Raasch !
	125	! !
	126	! Version as of July 03,1997 !
	127	! !
	128	! Solves the Poisson equation with a 2D spectral method !
	129	! d^2 p / dx^2 + d^2 p / dy^2 + d^2 p / dz^2 = s !
	130	! !
	131	! Input: !
	132	! real ar contains in the (nnx,nny,nnz) elements, !
	133	! starting from the element (1,nys,nxl), the !
	134	! values for s !
	135	! real work Temporary array !
	136	! !
	137	! Output: !
	138	! real ar contains the solution for p !
	139	!--------------------------------------------------------------------------!
	140
	141	USE fft_xy
	142	USE indices
	143	USE transpose_indices
	144
	145	IMPLICIT NONE
	146
	147	PRIVATE
[807]	148
[809]	149	#if ! defined ( __check )
[1]	150	PUBLIC poisfft, poisfft_init
	151
	152	INTERFACE poisfft
	153	MODULE PROCEDURE poisfft
	154	END INTERFACE poisfft
	155
	156	INTERFACE poisfft_init
	157	MODULE PROCEDURE poisfft_init
	158	END INTERFACE poisfft_init
[807]	159	#else
	160	PUBLIC poisfft_init
[1]	161
[807]	162	INTERFACE poisfft_init
	163	MODULE PROCEDURE poisfft_init
	164	END INTERFACE poisfft_init
	165	#endif
	166
[1]	167	CONTAINS
	168
	169	SUBROUTINE poisfft_init
	170
	171	CALL fft_init
	172
	173	END SUBROUTINE poisfft_init
	174
[809]	175	#if ! defined ( __check )
[1]	176	SUBROUTINE poisfft( ar, work )
	177
	178	USE cpulog
	179	USE interfaces
	180	USE pegrid
	181
	182	IMPLICIT NONE
	183
[1003]	184	REAL, DIMENSION(1:nz,nys:nyn,nxl:nxr) :: ar, work
[1]	185
	186
	187	CALL cpu_log( log_point_s(3), 'poisfft', 'start' )
	188
	189	!
	190	!-- Two-dimensional Fourier Transformation in x- and y-direction.
	191	#if defined( __parallel )
	192	IF ( pdims(2) == 1 ) THEN
	193
	194	!
	195	!-- 1d-domain-decomposition along x:
	196	!-- FFT along y and transposition y --> x
	197	CALL ffty_tr_yx( ar, work, ar )
	198
	199	!
	200	!-- FFT along x, solving the tridiagonal system and backward FFT
	201	CALL fftx_tri_fftx( ar )
	202
	203	!
	204	!-- Transposition x --> y and backward FFT along y
	205	CALL tr_xy_ffty( ar, work, ar )
	206
	207	ELSEIF ( pdims(1) == 1 ) THEN
	208
	209	!
	210	!-- 1d-domain-decomposition along y:
	211	!-- FFT along x and transposition x --> y
	212	CALL fftx_tr_xy( ar, work, ar )
	213
	214	!
	215	!-- FFT along y, solving the tridiagonal system and backward FFT
	216	CALL ffty_tri_ffty( ar )
	217
	218	!
	219	!-- Transposition y --> x and backward FFT along x
	220	CALL tr_yx_fftx( ar, work, ar )
	221
	222	ELSE
	223
	224	!
	225	!-- 2d-domain-decomposition
	226	!-- Transposition z --> x
	227	CALL cpu_log( log_point_s(5), 'transpo forward', 'start' )
[164]	228	CALL transpose_zx( ar, work, ar )
[1]	229	CALL cpu_log( log_point_s(5), 'transpo forward', 'pause' )
	230
	231	CALL cpu_log( log_point_s(4), 'fft_x', 'start' )
	232	CALL fftxp( ar, 'forward' )
	233	CALL cpu_log( log_point_s(4), 'fft_x', 'pause' )
	234
	235	!
	236	!-- Transposition x --> y
	237	CALL cpu_log( log_point_s(5), 'transpo forward', 'continue' )
[164]	238	CALL transpose_xy( ar, work, ar )
[1]	239	CALL cpu_log( log_point_s(5), 'transpo forward', 'pause' )
	240
	241	CALL cpu_log( log_point_s(7), 'fft_y', 'start' )
	242	CALL fftyp( ar, 'forward' )
	243	CALL cpu_log( log_point_s(7), 'fft_y', 'pause' )
	244
	245	!
	246	!-- Transposition y --> z
	247	CALL cpu_log( log_point_s(5), 'transpo forward', 'continue' )
[164]	248	CALL transpose_yz( ar, work, ar )
[1]	249	CALL cpu_log( log_point_s(5), 'transpo forward', 'stop' )
	250
	251	!
	252	!-- Solve the Poisson equation in z-direction in cartesian space.
	253	CALL cpu_log( log_point_s(6), 'tridia', 'start' )
	254	CALL tridia( ar )
	255	CALL cpu_log( log_point_s(6), 'tridia', 'stop' )
	256
	257	!
	258	!-- Inverse Fourier Transformation
	259	!-- Transposition z --> y
	260	CALL cpu_log( log_point_s(8), 'transpo invers', 'start' )
[164]	261	CALL transpose_zy( ar, work, ar )
[1]	262	CALL cpu_log( log_point_s(8), 'transpo invers', 'pause' )
	263
	264	CALL cpu_log( log_point_s(7), 'fft_y', 'continue' )
	265	CALL fftyp( ar, 'backward' )
	266	CALL cpu_log( log_point_s(7), 'fft_y', 'stop' )
	267
	268	!
	269	!-- Transposition y --> x
	270	CALL cpu_log( log_point_s(8), 'transpo invers', 'continue' )
[164]	271	CALL transpose_yx( ar, work, ar )
[1]	272	CALL cpu_log( log_point_s(8), 'transpo invers', 'pause' )
	273
	274	CALL cpu_log( log_point_s(4), 'fft_x', 'continue' )
	275	CALL fftxp( ar, 'backward' )
	276	CALL cpu_log( log_point_s(4), 'fft_x', 'stop' )
	277
	278	!
	279	!-- Transposition x --> z
	280	CALL cpu_log( log_point_s(8), 'transpo invers', 'continue' )
[164]	281	CALL transpose_xz( ar, work, ar )
[1]	282	CALL cpu_log( log_point_s(8), 'transpo invers', 'stop' )
	283
	284	ENDIF
	285
	286	#else
	287
	288	!
	289	!-- Two-dimensional Fourier Transformation along x- and y-direction.
	290	CALL cpu_log( log_point_s(4), 'fft_x', 'start' )
	291	CALL fftx( ar, 'forward' )
	292	CALL cpu_log( log_point_s(4), 'fft_x', 'pause' )
	293	CALL cpu_log( log_point_s(7), 'fft_y', 'start' )
	294	CALL ffty( ar, 'forward' )
	295	CALL cpu_log( log_point_s(7), 'fft_y', 'pause' )
	296
	297	!
	298	!-- Solve the Poisson equation in z-direction in cartesian space.
	299	CALL cpu_log( log_point_s(6), 'tridia', 'start' )
	300	CALL tridia( ar )
	301	CALL cpu_log( log_point_s(6), 'tridia', 'stop' )
	302
	303	!
	304	!-- Inverse Fourier Transformation.
	305	CALL cpu_log( log_point_s(7), 'fft_y', 'continue' )
	306	CALL ffty( ar, 'backward' )
	307	CALL cpu_log( log_point_s(7), 'fft_y', 'stop' )
	308	CALL cpu_log( log_point_s(4), 'fft_x', 'continue' )
	309	CALL fftx( ar, 'backward' )
	310	CALL cpu_log( log_point_s(4), 'fft_x', 'stop' )
	311
	312	#endif
	313
	314	CALL cpu_log( log_point_s(3), 'poisfft', 'stop' )
	315
	316	END SUBROUTINE poisfft
	317
	318
	319
	320	SUBROUTINE tridia( ar )
	321
	322	!------------------------------------------------------------------------------!
	323	! solves the linear system of equations:
	324	!
	325	! -(4 pi^2(i^2/(dx^2nnx^2)+j^2/(dy^2nny^2))+
	326	! 1/(dzu(k)dzw(k))+1/(dzu(k-1)dzw(k)))*p(i,j,k)+
	327	! 1/(dzu(k)dzw(k))p(i,j,k+1)+1/(dzu(k-1)dzw(k))p(i,j,k-1)=d(i,j,k)
	328	!
	329	! by using the Thomas algorithm
	330	!------------------------------------------------------------------------------!
	331
	332	USE arrays_3d
	333
	334	IMPLICIT NONE
	335
	336	INTEGER :: i, j, k, nnyh
	337
	338	REAL, DIMENSION(nxl_z:nxr_z,0:nz-1) :: ar1
	339	REAL, DIMENSION(5,nxl_z:nxr_z,0:nz-1) :: tri
	340
	341	#if defined( __parallel )
[1003]	342	REAL :: ar(nxl_z:nxr_z,nys_z:nyn_z,1:nz)
[1]	343	#else
	344	REAL :: ar(1:nz,nys_z:nyn_z,nxl_z:nxr_z)
	345	#endif
	346
	347
	348	nnyh = (ny+1) / 2
	349
	350	!
	351	!-- Define constant elements of the tridiagonal matrix.
[683]	352	!$OMP PARALLEL PRIVATE ( k, i )
	353	!$OMP DO
[1]	354	DO k = 0, nz-1
	355	DO i = nxl_z, nxr_z
[667]	356	tri(2,i,k) = ddzu_pres(k+1) * ddzw(k+1)
	357	tri(3,i,k) = ddzu_pres(k+2) * ddzw(k+1)
[1]	358	ENDDO
	359	ENDDO
[683]	360	!$OMP END PARALLEL
[1]	361
	362	#if defined( __parallel )
	363	!
	364	!-- Repeat for all y-levels.
[683]	365	!$OMP PARALLEL FIRSTPRIVATE( tri ) PRIVATE ( ar1, j )
	366	!$OMP DO
[1]	367	DO j = nys_z, nyn_z
	368	IF ( j <= nnyh ) THEN
	369	CALL maketri( tri, j )
	370	ELSE
	371	CALL maketri( tri, ny+1-j )
	372	ENDIF
	373	CALL split( tri )
	374	CALL substi( ar, ar1, tri, j )
	375	ENDDO
[683]	376	!$OMP END PARALLEL
[1]	377	#else
	378	!
	379	!-- First y-level.
	380	CALL maketri( tri, nys_z )
	381	CALL split( tri )
	382	CALL substi( ar, ar1, tri, 0 )
	383
	384	!
	385	!-- Further y-levels.
	386	DO j = 1, nnyh - 1
	387	CALL maketri( tri, j )
	388	CALL split( tri )
	389	CALL substi( ar, ar1, tri, j )
	390	CALL substi( ar, ar1, tri, ny+1-j )
	391	ENDDO
	392	CALL maketri( tri, nnyh )
	393	CALL split( tri )
	394	CALL substi( ar, ar1, tri, nnyh+nys )
	395	#endif
	396
	397	CONTAINS
	398
	399	SUBROUTINE maketri( tri, j )
	400
	401	!------------------------------------------------------------------------------!
	402	! Computes the i- and j-dependent component of the matrix
	403	!------------------------------------------------------------------------------!
	404
	405	USE arrays_3d
	406	USE constants
	407	USE control_parameters
	408	USE grid_variables
	409
	410	IMPLICIT NONE
	411
	412	INTEGER :: i, j, k, nnxh
	413	REAL :: a, c
	414	REAL :: ll(nxl_z:nxr_z)
	415	REAL :: tri(5,nxl_z:nxr_z,0:nz-1)
	416
	417
	418	nnxh = ( nx + 1 ) / 2
	419
	420	!
	421	!-- Provide the tridiagonal matrix for solution of the Poisson equation in
	422	!-- Fourier space. The coefficients are computed following the method of
	423	!-- Schmidt et al. (DFVLR-Mitteilung 84-15), which departs from Stephan
	424	!-- Siano's original version by discretizing the Poisson equation,
	425	!-- before it is Fourier-transformed
	426	#if defined( __parallel )
	427	DO i = nxl_z, nxr_z
[128]	428	IF ( i >= 0 .AND. i <= nnxh ) THEN
[1]	429	ll(i) = 2.0 * ( 1.0 - COS( ( 2.0 * pi * i ) / &
[1013]	430	REAL( nx+1 ) ) ) / ( dx * dx ) + &
[1]	431	2.0 * ( 1.0 - COS( ( 2.0 * pi * j ) / &
[1013]	432	REAL( ny+1 ) ) ) / ( dy * dy )
[1]	433	ELSE
	434	ll(i) = 2.0 * ( 1.0 - COS( ( 2.0 * pi * ( nx+1-i ) ) / &
[1013]	435	REAL( nx+1 ) ) ) / ( dx * dx ) + &
[1]	436	2.0 * ( 1.0 - COS( ( 2.0 * pi * j ) / &
[1013]	437	REAL( ny+1 ) ) ) / ( dy * dy )
[1]	438	ENDIF
	439	DO k = 0,nz-1
[667]	440	a = -1.0 * ddzu_pres(k+2) * ddzw(k+1)
	441	c = -1.0 * ddzu_pres(k+1) * ddzw(k+1)
[1]	442	tri(1,i,k) = a + c - ll(i)
	443	ENDDO
	444	ENDDO
	445	#else
	446	DO i = 0, nnxh
[1013]	447	ll(i) = 2.0 * ( 1.0 - COS( ( 2.0 * pi * i ) / REAL( nx+1 ) ) ) / &
[1]	448	( dx * dx ) + &
[1013]	449	2.0 * ( 1.0 - COS( ( 2.0 * pi * j ) / REAL( ny+1 ) ) ) / &
[1]	450	( dy * dy )
	451	DO k = 0, nz-1
[667]	452	a = -1.0 * ddzu_pres(k+2) * ddzw(k+1)
	453	c = -1.0 * ddzu_pres(k+1) * ddzw(k+1)
[1]	454	tri(1,i,k) = a + c - ll(i)
	455	IF ( i >= 1 .and. i < nnxh ) THEN
	456	tri(1,nx+1-i,k) = tri(1,i,k)
	457	ENDIF
	458	ENDDO
	459	ENDDO
	460	#endif
	461	IF ( ibc_p_b == 1 .OR. ibc_p_b == 2 ) THEN
	462	DO i = nxl_z, nxr_z
	463	tri(1,i,0) = tri(1,i,0) + tri(2,i,0)
	464	ENDDO
	465	ENDIF
	466	IF ( ibc_p_t == 1 ) THEN
	467	DO i = nxl_z, nxr_z
	468	tri(1,i,nz-1) = tri(1,i,nz-1) + tri(3,i,nz-1)
	469	ENDDO
	470	ENDIF
	471
	472	END SUBROUTINE maketri
	473
	474
	475	SUBROUTINE substi( ar, ar1, tri, j )
	476
	477	!------------------------------------------------------------------------------!
	478	! Substitution (Forward and Backward) (Thomas algorithm)
	479	!------------------------------------------------------------------------------!
	480
[76]	481	USE control_parameters
	482
[1]	483	IMPLICIT NONE
	484
	485	INTEGER :: i, j, k
	486	REAL :: ar1(nxl_z:nxr_z,0:nz-1)
	487	REAL :: tri(5,nxl_z:nxr_z,0:nz-1)
	488	#if defined( __parallel )
[1003]	489	REAL :: ar(nxl_z:nxr_z,nys_z:nyn_z,1:nz)
[1]	490	#else
	491	REAL :: ar(1:nz,nys_z:nyn_z,nxl_z:nxr_z)
	492	#endif
	493
	494	!
	495	!-- Forward substitution.
	496	DO i = nxl_z, nxr_z
	497	#if defined( __parallel )
	498	ar1(i,0) = ar(i,j,1)
	499	#else
	500	ar1(i,0) = ar(1,j,i)
	501	#endif
	502	ENDDO
	503	DO k = 1, nz - 1
	504	DO i = nxl_z, nxr_z
	505	#if defined( __parallel )
	506	ar1(i,k) = ar(i,j,k+1) - tri(5,i,k) * ar1(i,k-1)
	507	#else
	508	ar1(i,k) = ar(k+1,j,i) - tri(5,i,k) * ar1(i,k-1)
	509	#endif
	510	ENDDO
	511	ENDDO
	512
	513	!
[877]	514	!-- Backward substitution
	515	!-- Note, the 1.0E-20 in the denominator is due to avoid divisions
	516	!-- by zero appearing if the pressure bc is set to neumann at the top of
	517	!-- the model domain.
[1]	518	DO i = nxl_z, nxr_z
	519	#if defined( __parallel )
[877]	520	ar(i,j,nz) = ar1(i,nz-1) / ( tri(4,i,nz-1) + 1.0E-20 )
[1]	521	#else
[877]	522	ar(nz,j,i) = ar1(i,nz-1) / ( tri(4,i,nz-1) + 1.0E-20 )
[1]	523	#endif
	524	ENDDO
	525	DO k = nz-2, 0, -1
	526	DO i = nxl_z, nxr_z
	527	#if defined( __parallel )
	528	ar(i,j,k+1) = ( ar1(i,k) - tri(3,i,k) * ar(i,j,k+2) ) &
	529	/ tri(4,i,k)
	530	#else
	531	ar(k+1,j,i) = ( ar1(i,k) - tri(3,i,k) * ar(k+2,j,i) ) &
	532	/ tri(4,i,k)
	533	#endif
	534	ENDDO
	535	ENDDO
	536
[76]	537	!
	538	!-- Indices i=0, j=0 correspond to horizontally averaged pressure.
	539	!-- The respective values of ar should be zero at all k-levels if
	540	!-- acceleration of horizontally averaged vertical velocity is zero.
	541	IF ( ibc_p_b == 1 .AND. ibc_p_t == 1 ) THEN
	542	IF ( j == 0 .AND. nxl_z == 0 ) THEN
	543	#if defined( __parallel )
	544	DO k = 1, nz
	545	ar(nxl_z,j,k) = 0.0
	546	ENDDO
	547	#else
	548	DO k = 1, nz
	549	ar(k,j,nxl_z) = 0.0
	550	ENDDO
	551	#endif
	552	ENDIF
	553	ENDIF
	554
[1]	555	END SUBROUTINE substi
	556
	557
	558	SUBROUTINE split( tri )
	559
	560	!------------------------------------------------------------------------------!
	561	! Splitting of the tridiagonal matrix (Thomas algorithm)
	562	!------------------------------------------------------------------------------!
	563
	564	IMPLICIT NONE
	565
	566	INTEGER :: i, k
	567	REAL :: tri(5,nxl_z:nxr_z,0:nz-1)
	568
	569	!
	570	!-- Splitting.
	571	DO i = nxl_z, nxr_z
	572	tri(4,i,0) = tri(1,i,0)
	573	ENDDO
	574	DO k = 1, nz-1
	575	DO i = nxl_z, nxr_z
	576	tri(5,i,k) = tri(2,i,k) / tri(4,i,k-1)
	577	tri(4,i,k) = tri(1,i,k) - tri(3,i,k-1) * tri(5,i,k)
	578	ENDDO
	579	ENDDO
	580
	581	END SUBROUTINE split
	582
	583	END SUBROUTINE tridia
	584
	585
	586	#if defined( __parallel )
	587	SUBROUTINE fftxp( ar, direction )
	588
	589	!------------------------------------------------------------------------------!
	590	! Fourier-transformation along x-direction Parallelized version
	591	!------------------------------------------------------------------------------!
	592
	593	IMPLICIT NONE
	594
	595	CHARACTER (LEN=*) :: direction
	596	INTEGER :: j, k
[1003]	597	REAL :: ar(0:nx,nys_x:nyn_x,nzb_x:nzt_x)
[1]	598
	599	!
	600	!-- Performing the fft with one of the methods implemented
[683]	601	!$OMP PARALLEL PRIVATE ( j, k )
	602	!$OMP DO
[1]	603	DO k = nzb_x, nzt_x
	604	DO j = nys_x, nyn_x
	605	CALL fft_x( ar(0:nx,j,k), direction )
	606	ENDDO
	607	ENDDO
[683]	608	!$OMP END PARALLEL
[1]	609
	610	END SUBROUTINE fftxp
	611
	612	#else
	613	SUBROUTINE fftx( ar, direction )
	614
	615	!------------------------------------------------------------------------------!
	616	! Fourier-transformation along x-direction Non parallel version
	617	!------------------------------------------------------------------------------!
	618
	619	IMPLICIT NONE
	620
	621	CHARACTER (LEN=*) :: direction
	622	INTEGER :: i, j, k
	623	REAL :: ar(1:nz,0:ny,0:nx)
	624
	625	!
	626	!-- Performing the fft with one of the methods implemented
[683]	627	!$OMP PARALLEL PRIVATE ( j, k )
	628	!$OMP DO
[1]	629	DO k = 1, nz
	630	DO j = 0, ny
	631	CALL fft_x( ar(k,j,0:nx), direction )
	632	ENDDO
	633	ENDDO
[683]	634	!$OMP END PARALLEL
[1]	635
	636	END SUBROUTINE fftx
	637	#endif
	638
	639
	640	#if defined( __parallel )
	641	SUBROUTINE fftyp( ar, direction )
	642
	643	!------------------------------------------------------------------------------!
	644	! Fourier-transformation along y-direction Parallelized version
	645	!------------------------------------------------------------------------------!
	646
	647	IMPLICIT NONE
	648
	649	CHARACTER (LEN=*) :: direction
	650	INTEGER :: i, k
[1003]	651	REAL :: ar(0:ny,nxl_y:nxr_y,nzb_y:nzt_y)
[1]	652
	653	!
	654	!-- Performing the fft with one of the methods implemented
[683]	655	!$OMP PARALLEL PRIVATE ( i, k )
	656	!$OMP DO
[1]	657	DO k = nzb_y, nzt_y
	658	DO i = nxl_y, nxr_y
	659	CALL fft_y( ar(0:ny,i,k), direction )
	660	ENDDO
	661	ENDDO
[683]	662	!$OMP END PARALLEL
[1]	663
	664	END SUBROUTINE fftyp
	665
	666	#else
	667	SUBROUTINE ffty( ar, direction )
	668
	669	!------------------------------------------------------------------------------!
	670	! Fourier-transformation along y-direction Non parallel version
	671	!------------------------------------------------------------------------------!
	672
	673	IMPLICIT NONE
	674
	675	CHARACTER (LEN=*) :: direction
	676	INTEGER :: i, k
	677	REAL :: ar(1:nz,0:ny,0:nx)
	678
	679	!
	680	!-- Performing the fft with one of the methods implemented
[683]	681	!$OMP PARALLEL PRIVATE ( i, k )
	682	!$OMP DO
[1]	683	DO k = 1, nz
	684	DO i = 0, nx
	685	CALL fft_y( ar(k,0:ny,i), direction )
	686	ENDDO
	687	ENDDO
[683]	688	!$OMP END PARALLEL
[1]	689
	690	END SUBROUTINE ffty
	691	#endif
	692
	693	#if defined( __parallel )
	694	SUBROUTINE ffty_tr_yx( f_in, work, f_out )
	695
	696	!------------------------------------------------------------------------------!
	697	! Fourier-transformation along y with subsequent transposition y --> x for
	698	! a 1d-decomposition along x
	699	!
	700	! ATTENTION: The performance of this routine is much faster on the NEC-SX6,
	701	! if the first index of work_ffty_vec is odd. Otherwise
	702	! memory bank conflicts may occur (especially if the index is a
	703	! multiple of 128). That's why work_ffty_vec is dimensioned as
	704	! 0:ny+1.
	705	! Of course, this will not work if users are using an odd number
	706	! of gridpoints along y.
	707	!------------------------------------------------------------------------------!
	708
	709	USE control_parameters
	710	USE cpulog
	711	USE indices
	712	USE interfaces
	713	USE pegrid
	714	USE transpose_indices
	715
	716	IMPLICIT NONE
	717
	718	INTEGER :: i, iend, iouter, ir, j, k
	719	INTEGER, PARAMETER :: stridex = 4
	720
	721	REAL, DIMENSION(0:ny,stridex) :: work_ffty
	722	#if defined( __nec )
	723	REAL, DIMENSION(0:ny+1,1:nz,nxl:nxr) :: work_ffty_vec
	724	#endif
[1003]	725	REAL, DIMENSION(1:nz,0:ny,nxl:nxr) :: f_in
	726	REAL, DIMENSION(nnx,1:nz,nys_x:nyn_x,pdims(1)) :: f_out
	727	REAL, DIMENSION(nxl:nxr,1:nz,0:ny) :: work
[1]	728
	729	!
	730	!-- Carry out the FFT along y, where all data are present due to the
	731	!-- 1d-decomposition along x. Resort the data in a way that x becomes
	732	!-- the first index.
	733	CALL cpu_log( log_point_s(7), 'fft_y', 'start' )
	734
	735	IF ( host(1:3) == 'nec' ) THEN
	736	#if defined( __nec )
	737	!
	738	!-- Code optimized for vector processors
[85]	739	!$OMP PARALLEL PRIVATE ( i, j, k )
[1]	740	!$OMP DO
	741	DO i = nxl, nxr
	742
	743	DO j = 0, ny
	744	DO k = 1, nz
	745	work_ffty_vec(j,k,i) = f_in(k,j,i)
	746	ENDDO
	747	ENDDO
	748
	749	CALL fft_y_m( work_ffty_vec(:,:,i), ny+1, 'forward' )
	750
	751	ENDDO
	752
	753	!$OMP DO
	754	DO k = 1, nz
	755	DO j = 0, ny
	756	DO i = nxl, nxr
	757	work(i,k,j) = work_ffty_vec(j,k,i)
	758	ENDDO
	759	ENDDO
	760	ENDDO
	761	!$OMP END PARALLEL
	762	#endif
	763
	764	ELSE
	765
	766	!
	767	!-- Cache optimized code.
	768	!-- The i-(x-)direction is split into a strided outer loop and an inner
	769	!-- loop for better cache performance
	770	!$OMP PARALLEL PRIVATE (i,iend,iouter,ir,j,k,work_ffty)
	771	!$OMP DO
	772	DO iouter = nxl, nxr, stridex
	773
	774	iend = MIN( iouter+stridex-1, nxr ) ! Upper bound for inner i loop
	775
	776	DO k = 1, nz
	777
	778	DO i = iouter, iend
	779
	780	ir = i-iouter+1 ! counter within a stride
	781	DO j = 0, ny
	782	work_ffty(j,ir) = f_in(k,j,i)
	783	ENDDO
	784	!
	785	!-- FFT along y
	786	CALL fft_y( work_ffty(:,ir), 'forward' )
	787
	788	ENDDO
	789
	790	!
	791	!-- Resort
	792	DO j = 0, ny
	793	DO i = iouter, iend
	794	work(i,k,j) = work_ffty(j,i-iouter+1)
	795	ENDDO
	796	ENDDO
	797
	798	ENDDO
	799
	800	ENDDO
	801	!$OMP END PARALLEL
	802
	803	ENDIF
	804	CALL cpu_log( log_point_s(7), 'fft_y', 'pause' )
	805
	806	!
	807	!-- Transpose array
	808	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
[622]	809	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
[1]	810	CALL MPI_ALLTOALL( work(nxl,1,0), sendrecvcount_xy, MPI_REAL, &
	811	f_out(1,1,nys_x,1), sendrecvcount_xy, MPI_REAL, &
	812	comm1dx, ierr )
	813	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
	814
	815	END SUBROUTINE ffty_tr_yx
	816
	817
	818	SUBROUTINE tr_xy_ffty( f_in, work, f_out )
	819
	820	!------------------------------------------------------------------------------!
	821	! Transposition x --> y with a subsequent backward Fourier transformation for
	822	! a 1d-decomposition along x
	823	!------------------------------------------------------------------------------!
	824
	825	USE control_parameters
	826	USE cpulog
	827	USE indices
	828	USE interfaces
	829	USE pegrid
	830	USE transpose_indices
	831
	832	IMPLICIT NONE
	833
	834	INTEGER :: i, iend, iouter, ir, j, k
	835	INTEGER, PARAMETER :: stridex = 4
	836
	837	REAL, DIMENSION(0:ny,stridex) :: work_ffty
	838	#if defined( __nec )
	839	REAL, DIMENSION(0:ny+1,1:nz,nxl:nxr) :: work_ffty_vec
	840	#endif
[1003]	841	REAL, DIMENSION(nnx,1:nz,nys_x:nyn_x,pdims(1)) :: f_in
	842	REAL, DIMENSION(1:nz,0:ny,nxl:nxr) :: f_out
	843	REAL, DIMENSION(nxl:nxr,1:nz,0:ny) :: work
[1]	844
	845	!
	846	!-- Transpose array
	847	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
[622]	848	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
[1]	849	CALL MPI_ALLTOALL( f_in(1,1,nys_x,1), sendrecvcount_xy, MPI_REAL, &
	850	work(nxl,1,0), sendrecvcount_xy, MPI_REAL, &
	851	comm1dx, ierr )
	852	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
	853
	854	!
	855	!-- Resort the data in a way that y becomes the first index and carry out the
	856	!-- backward fft along y.
	857	CALL cpu_log( log_point_s(7), 'fft_y', 'continue' )
	858
	859	IF ( host(1:3) == 'nec' ) THEN
	860	#if defined( __nec )
	861	!
	862	!-- Code optimized for vector processors
[85]	863	!$OMP PARALLEL PRIVATE ( i, j, k )
[1]	864	!$OMP DO
	865	DO k = 1, nz
	866	DO j = 0, ny
	867	DO i = nxl, nxr
	868	work_ffty_vec(j,k,i) = work(i,k,j)
	869	ENDDO
	870	ENDDO
	871	ENDDO
	872
	873	!$OMP DO
	874	DO i = nxl, nxr
	875
	876	CALL fft_y_m( work_ffty_vec(:,:,i), ny+1, 'backward' )
	877
	878	DO j = 0, ny
	879	DO k = 1, nz
	880	f_out(k,j,i) = work_ffty_vec(j,k,i)
	881	ENDDO
	882	ENDDO
	883
	884	ENDDO
	885	!$OMP END PARALLEL
	886	#endif
	887
	888	ELSE
	889
	890	!
	891	!-- Cache optimized code.
	892	!-- The i-(x-)direction is split into a strided outer loop and an inner
	893	!-- loop for better cache performance
	894	!$OMP PARALLEL PRIVATE ( i, iend, iouter, ir, j, k, work_ffty )
	895	!$OMP DO
	896	DO iouter = nxl, nxr, stridex
	897
	898	iend = MIN( iouter+stridex-1, nxr ) ! Upper bound for inner i loop
	899
	900	DO k = 1, nz
	901	!
	902	!-- Resort
	903	DO j = 0, ny
	904	DO i = iouter, iend
	905	work_ffty(j,i-iouter+1) = work(i,k,j)
	906	ENDDO
	907	ENDDO
	908
	909	DO i = iouter, iend
	910
	911	!
	912	!-- FFT along y
	913	ir = i-iouter+1 ! counter within a stride
	914	CALL fft_y( work_ffty(:,ir), 'backward' )
	915
	916	DO j = 0, ny
	917	f_out(k,j,i) = work_ffty(j,ir)
	918	ENDDO
	919	ENDDO
	920
	921	ENDDO
	922
	923	ENDDO
	924	!$OMP END PARALLEL
	925
	926	ENDIF
	927
	928	CALL cpu_log( log_point_s(7), 'fft_y', 'stop' )
	929
	930	END SUBROUTINE tr_xy_ffty
	931
	932
	933	SUBROUTINE fftx_tri_fftx( ar )
	934
	935	!------------------------------------------------------------------------------!
	936	! FFT along x, solution of the tridiagonal system and backward FFT for
	937	! a 1d-decomposition along x
	938	!
	939	! WARNING: this subroutine may still not work for hybrid parallelization
	940	! with OpenMP (for possible necessary changes see the original
	941	! routine poisfft_hybrid, developed by Klaus Ketelsen, May 2002)
	942	!------------------------------------------------------------------------------!
	943
	944	USE control_parameters
	945	USE cpulog
	946	USE grid_variables
	947	USE indices
	948	USE interfaces
	949	USE pegrid
	950	USE transpose_indices
	951
	952	IMPLICIT NONE
	953
	954	character(len=3) :: myth_char
	955
	956	INTEGER :: i, j, k, m, n, omp_get_thread_num, tn
	957
[1003]	958	REAL, DIMENSION(0:nx) :: work_fftx
	959	REAL, DIMENSION(0:nx,1:nz) :: work_trix
	960	REAL, DIMENSION(nnx,1:nz,nys_x:nyn_x,pdims(1)) :: ar
	961	REAL, DIMENSION(:,:,:,:), ALLOCATABLE :: tri
[1]	962
	963
	964	CALL cpu_log( log_point_s(33), 'fft_x + tridia', 'start' )
	965
	966	ALLOCATE( tri(5,0:nx,0:nz-1,0:threads_per_task-1) )
	967
	968	tn = 0 ! Default thread number in case of one thread
	969	!$OMP PARALLEL DO PRIVATE ( i, j, k, m, n, tn, work_fftx, work_trix )
	970	DO j = nys_x, nyn_x
	971
	972	!$ tn = omp_get_thread_num()
	973
	974	IF ( host(1:3) == 'nec' ) THEN
	975	!
	976	!-- Code optimized for vector processors
	977	DO k = 1, nz
	978
	979	m = 0
	980	DO n = 1, pdims(1)
[1003]	981	DO i = 1, nnx
[1]	982	work_trix(m,k) = ar(i,k,j,n)
	983	m = m + 1
	984	ENDDO
	985	ENDDO
	986
	987	ENDDO
	988
	989	CALL fft_x_m( work_trix, 'forward' )
	990
	991	ELSE
	992	!
	993	!-- Cache optimized code
	994	DO k = 1, nz
	995
	996	m = 0
	997	DO n = 1, pdims(1)
[1003]	998	DO i = 1, nnx
[1]	999	work_fftx(m) = ar(i,k,j,n)
	1000	m = m + 1
	1001	ENDDO
	1002	ENDDO
	1003
	1004	CALL fft_x( work_fftx, 'forward' )
	1005
	1006	DO i = 0, nx
	1007	work_trix(i,k) = work_fftx(i)
	1008	ENDDO
	1009
	1010	ENDDO
	1011
	1012	ENDIF
	1013
	1014	!
	1015	!-- Solve the linear equation system
	1016	CALL tridia_1dd( ddx2, ddy2, nx, ny, j, work_trix, tri(:,:,:,tn) )
	1017
	1018	IF ( host(1:3) == 'nec' ) THEN
	1019	!
	1020	!-- Code optimized for vector processors
	1021	CALL fft_x_m( work_trix, 'backward' )
	1022
	1023	DO k = 1, nz
	1024
	1025	m = 0
	1026	DO n = 1, pdims(1)
[1003]	1027	DO i = 1, nnx
[1]	1028	ar(i,k,j,n) = work_trix(m,k)
	1029	m = m + 1
	1030	ENDDO
	1031	ENDDO
	1032
	1033	ENDDO
	1034
	1035	ELSE
	1036	!
	1037	!-- Cache optimized code
	1038	DO k = 1, nz
	1039
	1040	DO i = 0, nx
	1041	work_fftx(i) = work_trix(i,k)
	1042	ENDDO
	1043
	1044	CALL fft_x( work_fftx, 'backward' )
	1045
	1046	m = 0
	1047	DO n = 1, pdims(1)
[1003]	1048	DO i = 1, nnx
[1]	1049	ar(i,k,j,n) = work_fftx(m)
	1050	m = m + 1
	1051	ENDDO
	1052	ENDDO
	1053
	1054	ENDDO
	1055
	1056	ENDIF
	1057
	1058	ENDDO
	1059
	1060	DEALLOCATE( tri )
	1061
	1062	CALL cpu_log( log_point_s(33), 'fft_x + tridia', 'stop' )
	1063
	1064	END SUBROUTINE fftx_tri_fftx
	1065
	1066
	1067	SUBROUTINE fftx_tr_xy( f_in, work, f_out )
	1068
	1069	!------------------------------------------------------------------------------!
	1070	! Fourier-transformation along x with subsequent transposition x --> y for
	1071	! a 1d-decomposition along y
	1072	!
	1073	! ATTENTION: The NEC-branch of this routine may significantly profit from
	1074	! further optimizations. So far, performance is much worse than
	1075	! for routine ffty_tr_yx (more than three times slower).
	1076	!------------------------------------------------------------------------------!
	1077
	1078	USE control_parameters
	1079	USE cpulog
	1080	USE indices
	1081	USE interfaces
	1082	USE pegrid
	1083	USE transpose_indices
	1084
	1085	IMPLICIT NONE
	1086
	1087	INTEGER :: i, j, k
	1088
[1003]	1089	REAL, DIMENSION(0:nx,1:nz,nys:nyn) :: work_fftx
	1090	REAL, DIMENSION(1:nz,nys:nyn,0:nx) :: f_in
	1091	REAL, DIMENSION(nny,1:nz,nxl_y:nxr_y,pdims(2)) :: f_out
	1092	REAL, DIMENSION(nys:nyn,1:nz,0:nx) :: work
[1]	1093
	1094	!
	1095	!-- Carry out the FFT along x, where all data are present due to the
	1096	!-- 1d-decomposition along y. Resort the data in a way that y becomes
	1097	!-- the first index.
	1098	CALL cpu_log( log_point_s(4), 'fft_x', 'start' )
	1099
	1100	IF ( host(1:3) == 'nec' ) THEN
	1101	!
	1102	!-- Code for vector processors
[85]	1103	!$OMP PARALLEL PRIVATE ( i, j, k )
[1]	1104	!$OMP DO
	1105	DO i = 0, nx
	1106
	1107	DO j = nys, nyn
	1108	DO k = 1, nz
	1109	work_fftx(i,k,j) = f_in(k,j,i)
	1110	ENDDO
	1111	ENDDO
	1112
	1113	ENDDO
	1114
	1115	!$OMP DO
	1116	DO j = nys, nyn
	1117
	1118	CALL fft_x_m( work_fftx(:,:,j), 'forward' )
	1119
	1120	DO k = 1, nz
	1121	DO i = 0, nx
	1122	work(j,k,i) = work_fftx(i,k,j)
	1123	ENDDO
	1124	ENDDO
	1125
	1126	ENDDO
	1127	!$OMP END PARALLEL
	1128
	1129	ELSE
	1130
	1131	!
	1132	!-- Cache optimized code (there might be still a potential for better
	1133	!-- optimization).
[696]	1134	!$OMP PARALLEL PRIVATE (i,j,k)
[1]	1135	!$OMP DO
	1136	DO i = 0, nx
	1137
	1138	DO j = nys, nyn
	1139	DO k = 1, nz
	1140	work_fftx(i,k,j) = f_in(k,j,i)
	1141	ENDDO
	1142	ENDDO
	1143
	1144	ENDDO
	1145
	1146	!$OMP DO
	1147	DO j = nys, nyn
	1148	DO k = 1, nz
	1149
	1150	CALL fft_x( work_fftx(0:nx,k,j), 'forward' )
	1151
	1152	DO i = 0, nx
	1153	work(j,k,i) = work_fftx(i,k,j)
	1154	ENDDO
	1155	ENDDO
	1156
	1157	ENDDO
	1158	!$OMP END PARALLEL
	1159
	1160	ENDIF
	1161	CALL cpu_log( log_point_s(4), 'fft_x', 'pause' )
	1162
	1163	!
	1164	!-- Transpose array
	1165	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
[622]	1166	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
[1]	1167	CALL MPI_ALLTOALL( work(nys,1,0), sendrecvcount_xy, MPI_REAL, &
	1168	f_out(1,1,nxl_y,1), sendrecvcount_xy, MPI_REAL, &
	1169	comm1dy, ierr )
	1170	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
	1171
	1172	END SUBROUTINE fftx_tr_xy
	1173
	1174
	1175	SUBROUTINE tr_yx_fftx( f_in, work, f_out )
	1176
	1177	!------------------------------------------------------------------------------!
	1178	! Transposition y --> x with a subsequent backward Fourier transformation for
	1179	! a 1d-decomposition along x
	1180	!------------------------------------------------------------------------------!
	1181
	1182	USE control_parameters
	1183	USE cpulog
	1184	USE indices
	1185	USE interfaces
	1186	USE pegrid
	1187	USE transpose_indices
	1188
	1189	IMPLICIT NONE
	1190
	1191	INTEGER :: i, j, k
	1192
[1003]	1193	REAL, DIMENSION(0:nx,1:nz,nys:nyn) :: work_fftx
	1194	REAL, DIMENSION(nny,1:nz,nxl_y:nxr_y,pdims(2)) :: f_in
	1195	REAL, DIMENSION(1:nz,nys:nyn,0:nx) :: f_out
	1196	REAL, DIMENSION(nys:nyn,1:nz,0:nx) :: work
[1]	1197
	1198	!
	1199	!-- Transpose array
	1200	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
[622]	1201	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
[1]	1202	CALL MPI_ALLTOALL( f_in(1,1,nxl_y,1), sendrecvcount_xy, MPI_REAL, &
	1203	work(nys,1,0), sendrecvcount_xy, MPI_REAL, &
	1204	comm1dy, ierr )
	1205	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
	1206
	1207	!
	1208	!-- Carry out the FFT along x, where all data are present due to the
	1209	!-- 1d-decomposition along y. Resort the data in a way that y becomes
	1210	!-- the first index.
	1211	CALL cpu_log( log_point_s(4), 'fft_x', 'continue' )
	1212
	1213	IF ( host(1:3) == 'nec' ) THEN
	1214	!
	1215	!-- Code optimized for vector processors
[85]	1216	!$OMP PARALLEL PRIVATE ( i, j, k )
[1]	1217	!$OMP DO
	1218	DO j = nys, nyn
	1219
	1220	DO k = 1, nz
	1221	DO i = 0, nx
	1222	work_fftx(i,k,j) = work(j,k,i)
	1223	ENDDO
	1224	ENDDO
	1225
	1226	CALL fft_x_m( work_fftx(:,:,j), 'backward' )
	1227
	1228	ENDDO
	1229
	1230	!$OMP DO
	1231	DO i = 0, nx
	1232	DO j = nys, nyn
	1233	DO k = 1, nz
	1234	f_out(k,j,i) = work_fftx(i,k,j)
	1235	ENDDO
	1236	ENDDO
	1237	ENDDO
	1238	!$OMP END PARALLEL
	1239
	1240	ELSE
	1241
	1242	!
	1243	!-- Cache optimized code (there might be still a potential for better
	1244	!-- optimization).
[696]	1245	!$OMP PARALLEL PRIVATE (i,j,k)
[1]	1246	!$OMP DO
	1247	DO j = nys, nyn
	1248	DO k = 1, nz
	1249
	1250	DO i = 0, nx
	1251	work_fftx(i,k,j) = work(j,k,i)
	1252	ENDDO
	1253
	1254	CALL fft_x( work_fftx(0:nx,k,j), 'backward' )
	1255
	1256	ENDDO
	1257	ENDDO
	1258
	1259	!$OMP DO
	1260	DO i = 0, nx
	1261	DO j = nys, nyn
	1262	DO k = 1, nz
	1263	f_out(k,j,i) = work_fftx(i,k,j)
	1264	ENDDO
	1265	ENDDO
	1266	ENDDO
	1267	!$OMP END PARALLEL
	1268
	1269	ENDIF
	1270	CALL cpu_log( log_point_s(4), 'fft_x', 'stop' )
	1271
	1272	END SUBROUTINE tr_yx_fftx
	1273
	1274
	1275	SUBROUTINE ffty_tri_ffty( ar )
	1276
	1277	!------------------------------------------------------------------------------!
	1278	! FFT along y, solution of the tridiagonal system and backward FFT for
	1279	! a 1d-decomposition along y
	1280	!
	1281	! WARNING: this subroutine may still not work for hybrid parallelization
	1282	! with OpenMP (for possible necessary changes see the original
	1283	! routine poisfft_hybrid, developed by Klaus Ketelsen, May 2002)
	1284	!------------------------------------------------------------------------------!
	1285
	1286	USE control_parameters
	1287	USE cpulog
	1288	USE grid_variables
	1289	USE indices
	1290	USE interfaces
	1291	USE pegrid
	1292	USE transpose_indices
	1293
	1294	IMPLICIT NONE
	1295
	1296	INTEGER :: i, j, k, m, n, omp_get_thread_num, tn
	1297
[1003]	1298	REAL, DIMENSION(0:ny) :: work_ffty
	1299	REAL, DIMENSION(0:ny,1:nz) :: work_triy
	1300	REAL, DIMENSION(nny,1:nz,nxl_y:nxr_y,pdims(2)) :: ar
	1301	REAL, DIMENSION(:,:,:,:), ALLOCATABLE :: tri
[1]	1302
	1303
	1304	CALL cpu_log( log_point_s(39), 'fft_y + tridia', 'start' )
	1305
	1306	ALLOCATE( tri(5,0:ny,0:nz-1,0:threads_per_task-1) )
	1307
	1308	tn = 0 ! Default thread number in case of one thread
[696]	1309	!$OMP PARALLEL DO PRIVATE ( i, j, k, m, n, tn, work_ffty, work_triy )
[1]	1310	DO i = nxl_y, nxr_y
	1311
	1312	!$ tn = omp_get_thread_num()
	1313
	1314	IF ( host(1:3) == 'nec' ) THEN
	1315	!
	1316	!-- Code optimized for vector processors
	1317	DO k = 1, nz
	1318
	1319	m = 0
	1320	DO n = 1, pdims(2)
[1003]	1321	DO j = 1, nny
[1]	1322	work_triy(m,k) = ar(j,k,i,n)
	1323	m = m + 1
	1324	ENDDO
	1325	ENDDO
	1326
	1327	ENDDO
	1328
	1329	CALL fft_y_m( work_triy, ny, 'forward' )
	1330
	1331	ELSE
	1332	!
	1333	!-- Cache optimized code
	1334	DO k = 1, nz
	1335
	1336	m = 0
	1337	DO n = 1, pdims(2)
[1003]	1338	DO j = 1, nny
[1]	1339	work_ffty(m) = ar(j,k,i,n)
	1340	m = m + 1
	1341	ENDDO
	1342	ENDDO
	1343
	1344	CALL fft_y( work_ffty, 'forward' )
	1345
	1346	DO j = 0, ny
	1347	work_triy(j,k) = work_ffty(j)
	1348	ENDDO
	1349
	1350	ENDDO
	1351
	1352	ENDIF
	1353
	1354	!
	1355	!-- Solve the linear equation system
	1356	CALL tridia_1dd( ddy2, ddx2, ny, nx, i, work_triy, tri(:,:,:,tn) )
	1357
	1358	IF ( host(1:3) == 'nec' ) THEN
	1359	!
	1360	!-- Code optimized for vector processors
	1361	CALL fft_y_m( work_triy, ny, 'backward' )
	1362
	1363	DO k = 1, nz
	1364
	1365	m = 0
	1366	DO n = 1, pdims(2)
[1003]	1367	DO j = 1, nny
[1]	1368	ar(j,k,i,n) = work_triy(m,k)
	1369	m = m + 1
	1370	ENDDO
	1371	ENDDO
	1372
	1373	ENDDO
	1374
	1375	ELSE
	1376	!
	1377	!-- Cache optimized code
	1378	DO k = 1, nz
	1379
	1380	DO j = 0, ny
	1381	work_ffty(j) = work_triy(j,k)
	1382	ENDDO
	1383
	1384	CALL fft_y( work_ffty, 'backward' )
	1385
	1386	m = 0
	1387	DO n = 1, pdims(2)
[1003]	1388	DO j = 1, nny
[1]	1389	ar(j,k,i,n) = work_ffty(m)
	1390	m = m + 1
	1391	ENDDO
	1392	ENDDO
	1393
	1394	ENDDO
	1395
	1396	ENDIF
	1397
	1398	ENDDO
	1399
	1400	DEALLOCATE( tri )
	1401
	1402	CALL cpu_log( log_point_s(39), 'fft_y + tridia', 'stop' )
	1403
	1404	END SUBROUTINE ffty_tri_ffty
	1405
	1406
	1407	SUBROUTINE tridia_1dd( ddx2, ddy2, nx, ny, j, ar, tri )
	1408
	1409	!------------------------------------------------------------------------------!
	1410	! Solves the linear system of equations for a 1d-decomposition along x (see
	1411	! tridia)
	1412	!
[940]	1413	! Attention: when using the intel compilers older than 12.0, array tri must
	1414	! be passed as an argument to the contained subroutines. Otherwise
	1415	! addres faults will occur. This feature can be activated with
	1416	! cpp-switch __intel11
[1]	1417	! On NEC, tri should not be passed (except for routine substi_1dd)
	1418	! because this causes very bad performance.
	1419	!------------------------------------------------------------------------------!
	1420
	1421	USE arrays_3d
	1422	USE control_parameters
	1423
	1424	USE pegrid
	1425
	1426	IMPLICIT NONE
	1427
	1428	INTEGER :: i, j, k, nnyh, nx, ny, omp_get_thread_num, tn
	1429
	1430	REAL :: ddx2, ddy2
	1431
	1432	REAL, DIMENSION(0:nx,1:nz) :: ar
	1433	REAL, DIMENSION(0:nx,0:nz-1) :: ar1
	1434	REAL, DIMENSION(5,0:nx,0:nz-1) :: tri
	1435
	1436
	1437	nnyh = ( ny + 1 ) / 2
	1438
	1439	!
	1440	!-- Define constant elements of the tridiagonal matrix.
	1441	!-- The compiler on SX6 does loop exchange. If 0:nx is a high power of 2,
	1442	!-- the exchanged loops create bank conflicts. The following directive
	1443	!-- prohibits loop exchange and the loops perform much better.
	1444	! tn = omp_get_thread_num()
	1445	! WRITE( 120+tn, * ) '+++ id=',myid,' nx=',nx,' thread=', omp_get_thread_num()
[82]	1446	! CALL local_flush( 120+tn )
[1]	1447	!CDIR NOLOOPCHG
	1448	DO k = 0, nz-1
	1449	DO i = 0,nx
[667]	1450	tri(2,i,k) = ddzu_pres(k+1) * ddzw(k+1)
	1451	tri(3,i,k) = ddzu_pres(k+2) * ddzw(k+1)
[1]	1452	ENDDO
	1453	ENDDO
	1454	! WRITE( 120+tn, * ) '+++ id=',myid,' end of first tridia loop thread=', omp_get_thread_num()
[82]	1455	! CALL local_flush( 120+tn )
[1]	1456
	1457	IF ( j <= nnyh ) THEN
[940]	1458	#if defined( __intel11 )
[1]	1459	CALL maketri_1dd( j, tri )
	1460	#else
	1461	CALL maketri_1dd( j )
	1462	#endif
	1463	ELSE
[940]	1464	#if defined( __intel11 )
[1]	1465	CALL maketri_1dd( ny+1-j, tri )
	1466	#else
	1467	CALL maketri_1dd( ny+1-j )
	1468	#endif
	1469	ENDIF
[940]	1470	#if defined( __intel11 )
[1]	1471	CALL split_1dd( tri )
	1472	#else
	1473	CALL split_1dd
	1474	#endif
	1475	CALL substi_1dd( ar, tri )
	1476
	1477	CONTAINS
	1478
[940]	1479	#if defined( __intel11 )
[1]	1480	SUBROUTINE maketri_1dd( j, tri )
	1481	#else
	1482	SUBROUTINE maketri_1dd( j )
	1483	#endif
	1484
	1485	!------------------------------------------------------------------------------!
	1486	! computes the i- and j-dependent component of the matrix
	1487	!------------------------------------------------------------------------------!
	1488
	1489	USE constants
	1490
	1491	IMPLICIT NONE
	1492
	1493	INTEGER :: i, j, k, nnxh
	1494	REAL :: a, c
	1495
	1496	REAL, DIMENSION(0:nx) :: l
	1497
[940]	1498	#if defined( __intel11 )
[1]	1499	REAL, DIMENSION(5,0:nx,0:nz-1) :: tri
	1500	#endif
	1501
	1502
	1503	nnxh = ( nx + 1 ) / 2
	1504	!
	1505	!-- Provide the tridiagonal matrix for solution of the Poisson equation in
	1506	!-- Fourier space. The coefficients are computed following the method of
	1507	!-- Schmidt et al. (DFVLR-Mitteilung 84-15), which departs from Stephan
	1508	!-- Siano's original version by discretizing the Poisson equation,
	1509	!-- before it is Fourier-transformed
	1510	DO i = 0, nx
[128]	1511	IF ( i >= 0 .AND. i <= nnxh ) THEN
[1]	1512	l(i) = 2.0 * ( 1.0 - COS( ( 2.0 * pi * i ) / &
[1013]	1513	REAL( nx+1 ) ) ) * ddx2 + &
[1]	1514	2.0 * ( 1.0 - COS( ( 2.0 * pi * j ) / &
[1013]	1515	REAL( ny+1 ) ) ) * ddy2
[1]	1516	ELSE
	1517	l(i) = 2.0 * ( 1.0 - COS( ( 2.0 * pi * ( nx+1-i ) ) / &
[1013]	1518	REAL( nx+1 ) ) ) * ddx2 + &
[1]	1519	2.0 * ( 1.0 - COS( ( 2.0 * pi * j ) / &
[1013]	1520	REAL( ny+1 ) ) ) * ddy2
[1]	1521	ENDIF
	1522	ENDDO
	1523
	1524	DO k = 0, nz-1
	1525	DO i = 0, nx
[667]	1526	a = -1.0 * ddzu_pres(k+2) * ddzw(k+1)
	1527	c = -1.0 * ddzu_pres(k+1) * ddzw(k+1)
[1]	1528	tri(1,i,k) = a + c - l(i)
	1529	ENDDO
	1530	ENDDO
	1531	IF ( ibc_p_b == 1 .OR. ibc_p_b == 2 ) THEN
	1532	DO i = 0, nx
	1533	tri(1,i,0) = tri(1,i,0) + tri(2,i,0)
	1534	ENDDO
	1535	ENDIF
	1536	IF ( ibc_p_t == 1 ) THEN
	1537	DO i = 0, nx
	1538	tri(1,i,nz-1) = tri(1,i,nz-1) + tri(3,i,nz-1)
	1539	ENDDO
	1540	ENDIF
	1541
	1542	END SUBROUTINE maketri_1dd
	1543
	1544
[940]	1545	#if defined( __intel11 )
[1]	1546	SUBROUTINE split_1dd( tri )
	1547	#else
	1548	SUBROUTINE split_1dd
	1549	#endif
	1550
	1551	!------------------------------------------------------------------------------!
	1552	! Splitting of the tridiagonal matrix (Thomas algorithm)
	1553	!------------------------------------------------------------------------------!
	1554
	1555	IMPLICIT NONE
	1556
	1557	INTEGER :: i, k
	1558
[940]	1559	#if defined( __intel11 )
[1]	1560	REAL, DIMENSION(5,0:nx,0:nz-1) :: tri
	1561	#endif
	1562
	1563
	1564	!
	1565	!-- Splitting
	1566	DO i = 0, nx
	1567	tri(4,i,0) = tri(1,i,0)
	1568	ENDDO
	1569	DO k = 1, nz-1
	1570	DO i = 0, nx
	1571	tri(5,i,k) = tri(2,i,k) / tri(4,i,k-1)
	1572	tri(4,i,k) = tri(1,i,k) - tri(3,i,k-1) * tri(5,i,k)
	1573	ENDDO
	1574	ENDDO
	1575
	1576	END SUBROUTINE split_1dd
	1577
	1578
	1579	SUBROUTINE substi_1dd( ar, tri )
	1580
	1581	!------------------------------------------------------------------------------!
	1582	! Substitution (Forward and Backward) (Thomas algorithm)
	1583	!------------------------------------------------------------------------------!
	1584
	1585	IMPLICIT NONE
	1586
[76]	1587	INTEGER :: i, k
[1]	1588
	1589	REAL, DIMENSION(0:nx,nz) :: ar
	1590	REAL, DIMENSION(0:nx,0:nz-1) :: ar1
	1591	REAL, DIMENSION(5,0:nx,0:nz-1) :: tri
	1592
	1593	!
	1594	!-- Forward substitution
	1595	DO i = 0, nx
	1596	ar1(i,0) = ar(i,1)
	1597	ENDDO
	1598	DO k = 1, nz-1
	1599	DO i = 0, nx
	1600	ar1(i,k) = ar(i,k+1) - tri(5,i,k) * ar1(i,k-1)
	1601	ENDDO
	1602	ENDDO
	1603
	1604	!
	1605	!-- Backward substitution
[763]	1606	!-- Note, the add of 1.0E-20 in the denominator is due to avoid divisions
	1607	!-- by zero appearing if the pressure bc is set to neumann at the top of
	1608	!-- the model domain.
[1]	1609	DO i = 0, nx
[761]	1610	ar(i,nz) = ar1(i,nz-1) / ( tri(4,i,nz-1) + 1.0E-20 )
[1]	1611	ENDDO
	1612	DO k = nz-2, 0, -1
	1613	DO i = 0, nx
	1614	ar(i,k+1) = ( ar1(i,k) - tri(3,i,k) * ar(i,k+2) ) &
	1615	/ tri(4,i,k)
	1616	ENDDO
	1617	ENDDO
	1618
[76]	1619	!
	1620	!-- Indices i=0, j=0 correspond to horizontally averaged pressure.
	1621	!-- The respective values of ar should be zero at all k-levels if
	1622	!-- acceleration of horizontally averaged vertical velocity is zero.
	1623	IF ( ibc_p_b == 1 .AND. ibc_p_t == 1 ) THEN
	1624	IF ( j == 0 ) THEN
	1625	DO k = 1, nz
	1626	ar(0,k) = 0.0
	1627	ENDDO
	1628	ENDIF
	1629	ENDIF
	1630
[1]	1631	END SUBROUTINE substi_1dd
	1632
	1633	END SUBROUTINE tridia_1dd
	1634
	1635	#endif
[807]	1636	#endif
[1]	1637	END MODULE poisfft_mod

Note: See TracBrowser for help on using the repository browser.

Download in other formats:

| Impressum | ©Leibniz Universität Hannover |