Home

Context Navigation

source: palm/trunk/SOURCE/poisfft.f90 @ 1036

Last change on this file since 1036 was 1036, checked in by raasch, 11 years ago
code has been put under the GNU General Public License (v3)
Property svn:keywords set to `Id`
File size: 47.6 KB

Rev	Line
[1]	1	MODULE poisfft_mod
	2
[1036]	3	!--------------------------------------------------------------------------------!
	4	! This file is part of PALM.
	5	!
	6	! PALM is free software: you can redistribute it and/or modify it under the terms
	7	! of the GNU General Public License as published by the Free Software Foundation,
	8	! either version 3 of the License, or (at your option) any later version.
	9	!
	10	! PALM is distributed in the hope that it will be useful, but WITHOUT ANY
	11	! WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
	12	! A PARTICULAR PURPOSE. See the GNU General Public License for more details.
	13	!
	14	! You should have received a copy of the GNU General Public License along with
	15	! PALM. If not, see <http://www.gnu.org/licenses/>.
	16	!
	17	! Copyright 1997-2012 Leibniz University Hannover
	18	!--------------------------------------------------------------------------------!
	19	!
[484]	20	! Current revisions:
[1]	21	! -----------------
	22	!
[1014]	23	!
[1]	24	! Former revisions:
	25	! -----------------
[3]	26	! $Id: poisfft.f90 1036 2012-10-22 13:43:42Z raasch $
[77]	27	!
[1014]	28	! 2012-09-21 07:03:55Z raasch
	29	! FLOAT type conversion replaced by REAL
	30	!
[1004]	31	! 1003 2012-09-14 14:35:53Z raasch
	32	! indices nxa, nya, etc. replaced by nx, ny, etc.
	33	!
[941]	34	! 940 2012-07-09 14:31:00Z raasch
	35	! special handling of tri-array as an argument in tridia_1dd routines switched
	36	! off because it caused segmentation faults with intel 12.1 compiler
	37	!
[878]	38	! 877 2012-04-03 11:21:44Z suehring
	39	! Bugfix: Avoid divisions by zero in case of using a 'neumann' bc for the
	40	! pressure at the top of the model domain.
	41	!
[810]	42	! 809 2012-01-30 13:32:58Z maronga
	43	! Bugfix: replaced .AND. and .NOT. with && and ! in the preprocessor directives
	44	!
[808]	45	! 807 2012-01-25 11:53:51Z maronga
	46	! New cpp directive "__check" implemented which is used by check_namelist_files
	47	! (most of the code is unneeded by check_namelist_files).
	48	!
[764]	49	! 763 2011-10-06 09:32:09Z suehring
	50	! Comment added concerning the last change.
	51	!
[762]	52	! 761 2011-10-05 17:58:52Z suehring
	53	! Bugfix: Avoid divisions by zero in case of using a 'neumann' bc for the
	54	! pressure at the top of the model domain.
	55	!
[697]	56	! 696 2011-03-18 07:03:49Z raasch
	57	! work_fftx removed from PRIVATE clauses in fftx_tr_xy and tr_yx_fftx
	58	!
[684]	59	! 683 2011-02-09 14:25:15Z raasch
	60	! openMP parallelization for 2d-domain-decomposition
	61	!
[668]	62	! 667 2010-12-23 12:06:00Z suehring/gryschka
	63	! ddzu replaced by ddzu_pres due to changes in zu(0)
	64	!
[623]	65	! 622 2010-12-10 08:08:13Z raasch
	66	! optional barriers included in order to speed up collective operations
	67	!
[392]	68	! 377 2009-09-04 11:09:00Z raasch
	69	! __lcmuk changed to __lc to avoid problems with Intel compiler on sgi-ice
	70	!
[198]	71	! 164 2008-05-15 08:46:15Z raasch
	72	! Arguments removed from transpose routines
	73	!
[139]	74	! 128 2007-10-26 13:11:14Z raasch
	75	! Bugfix: wavenumber calculation for even nx in routines maketri
	76	!
[90]	77	! 85 2007-05-11 09:35:14Z raasch
	78	! Bugfix: work_fft*_vec removed from some PRIVATE-declarations
	79	!
[77]	80	! 76 2007-03-29 00:58:32Z raasch
	81	! Tridiagonal coefficients adjusted for Neumann boundary conditions both at
	82	! the bottom and the top.
	83	!
[3]	84	! RCS Log replace by Id keyword, revision history cleaned up
	85	!
[1]	86	! Revision 1.24 2006/08/04 15:00:24 raasch
	87	! Default setting of the thread number tn in case of not using OpenMP
	88	!
	89	! Revision 1.23 2006/02/23 12:48:38 raasch
	90	! Additional compiler directive in routine tridia_1dd for preventing loop
	91	! exchange on NEC-SX6
	92	!
	93	! Revision 1.20 2004/04/30 12:38:09 raasch
	94	! Parts of former poisfft_hybrid moved to this subroutine,
	95	! former subroutine changed to a module, renaming of FFT-subroutines and
	96	! -module, FFTs completely substituted by calls of fft_x and fft_y,
	97	! NAG fft used in the non-parallel case completely removed, l in maketri
	98	! is now a 1d-array, variables passed by modules instead of using parameter
	99	! lists, enlarged transposition arrays introduced
	100	!
	101	! Revision 1.1 1997/07/24 11:24:14 raasch
	102	! Initial revision
	103	!
	104	!
	105	! Description:
	106	! ------------
	107	! See below.
	108	!------------------------------------------------------------------------------!
	109
	110	!--------------------------------------------------------------------------!
	111	! poisfft !
	112	! !
	113	! Original version: Stephan Siano (pois3d) !
	114	! !
	115	! Institute of Meteorology and Climatology, University of Hannover !
	116	! Germany !
	117	! !
	118	! Version as of July 23,1996 !
	119	! !
	120	! !
	121	! Version for parallel computers: Siegfried Raasch !
	122	! !
	123	! Version as of July 03,1997 !
	124	! !
	125	! Solves the Poisson equation with a 2D spectral method !
	126	! d^2 p / dx^2 + d^2 p / dy^2 + d^2 p / dz^2 = s !
	127	! !
	128	! Input: !
	129	! real ar contains in the (nnx,nny,nnz) elements, !
	130	! starting from the element (1,nys,nxl), the !
	131	! values for s !
	132	! real work Temporary array !
	133	! !
	134	! Output: !
	135	! real ar contains the solution for p !
	136	!--------------------------------------------------------------------------!
	137
	138	USE fft_xy
	139	USE indices
	140	USE transpose_indices
	141
	142	IMPLICIT NONE
	143
	144	PRIVATE
[807]	145
[809]	146	#if ! defined ( __check )
[1]	147	PUBLIC poisfft, poisfft_init
	148
	149	INTERFACE poisfft
	150	MODULE PROCEDURE poisfft
	151	END INTERFACE poisfft
	152
	153	INTERFACE poisfft_init
	154	MODULE PROCEDURE poisfft_init
	155	END INTERFACE poisfft_init
[807]	156	#else
	157	PUBLIC poisfft_init
[1]	158
[807]	159	INTERFACE poisfft_init
	160	MODULE PROCEDURE poisfft_init
	161	END INTERFACE poisfft_init
	162	#endif
	163
[1]	164	CONTAINS
	165
	166	SUBROUTINE poisfft_init
	167
	168	CALL fft_init
	169
	170	END SUBROUTINE poisfft_init
	171
[809]	172	#if ! defined ( __check )
[1]	173	SUBROUTINE poisfft( ar, work )
	174
	175	USE cpulog
	176	USE interfaces
	177	USE pegrid
	178
	179	IMPLICIT NONE
	180
[1003]	181	REAL, DIMENSION(1:nz,nys:nyn,nxl:nxr) :: ar, work
[1]	182
	183
	184	CALL cpu_log( log_point_s(3), 'poisfft', 'start' )
	185
	186	!
	187	!-- Two-dimensional Fourier Transformation in x- and y-direction.
	188	#if defined( __parallel )
	189	IF ( pdims(2) == 1 ) THEN
	190
	191	!
	192	!-- 1d-domain-decomposition along x:
	193	!-- FFT along y and transposition y --> x
	194	CALL ffty_tr_yx( ar, work, ar )
	195
	196	!
	197	!-- FFT along x, solving the tridiagonal system and backward FFT
	198	CALL fftx_tri_fftx( ar )
	199
	200	!
	201	!-- Transposition x --> y and backward FFT along y
	202	CALL tr_xy_ffty( ar, work, ar )
	203
	204	ELSEIF ( pdims(1) == 1 ) THEN
	205
	206	!
	207	!-- 1d-domain-decomposition along y:
	208	!-- FFT along x and transposition x --> y
	209	CALL fftx_tr_xy( ar, work, ar )
	210
	211	!
	212	!-- FFT along y, solving the tridiagonal system and backward FFT
	213	CALL ffty_tri_ffty( ar )
	214
	215	!
	216	!-- Transposition y --> x and backward FFT along x
	217	CALL tr_yx_fftx( ar, work, ar )
	218
	219	ELSE
	220
	221	!
	222	!-- 2d-domain-decomposition
	223	!-- Transposition z --> x
	224	CALL cpu_log( log_point_s(5), 'transpo forward', 'start' )
[164]	225	CALL transpose_zx( ar, work, ar )
[1]	226	CALL cpu_log( log_point_s(5), 'transpo forward', 'pause' )
	227
	228	CALL cpu_log( log_point_s(4), 'fft_x', 'start' )
	229	CALL fftxp( ar, 'forward' )
	230	CALL cpu_log( log_point_s(4), 'fft_x', 'pause' )
	231
	232	!
	233	!-- Transposition x --> y
	234	CALL cpu_log( log_point_s(5), 'transpo forward', 'continue' )
[164]	235	CALL transpose_xy( ar, work, ar )
[1]	236	CALL cpu_log( log_point_s(5), 'transpo forward', 'pause' )
	237
	238	CALL cpu_log( log_point_s(7), 'fft_y', 'start' )
	239	CALL fftyp( ar, 'forward' )
	240	CALL cpu_log( log_point_s(7), 'fft_y', 'pause' )
	241
	242	!
	243	!-- Transposition y --> z
	244	CALL cpu_log( log_point_s(5), 'transpo forward', 'continue' )
[164]	245	CALL transpose_yz( ar, work, ar )
[1]	246	CALL cpu_log( log_point_s(5), 'transpo forward', 'stop' )
	247
	248	!
	249	!-- Solve the Poisson equation in z-direction in cartesian space.
	250	CALL cpu_log( log_point_s(6), 'tridia', 'start' )
	251	CALL tridia( ar )
	252	CALL cpu_log( log_point_s(6), 'tridia', 'stop' )
	253
	254	!
	255	!-- Inverse Fourier Transformation
	256	!-- Transposition z --> y
	257	CALL cpu_log( log_point_s(8), 'transpo invers', 'start' )
[164]	258	CALL transpose_zy( ar, work, ar )
[1]	259	CALL cpu_log( log_point_s(8), 'transpo invers', 'pause' )
	260
	261	CALL cpu_log( log_point_s(7), 'fft_y', 'continue' )
	262	CALL fftyp( ar, 'backward' )
	263	CALL cpu_log( log_point_s(7), 'fft_y', 'stop' )
	264
	265	!
	266	!-- Transposition y --> x
	267	CALL cpu_log( log_point_s(8), 'transpo invers', 'continue' )
[164]	268	CALL transpose_yx( ar, work, ar )
[1]	269	CALL cpu_log( log_point_s(8), 'transpo invers', 'pause' )
	270
	271	CALL cpu_log( log_point_s(4), 'fft_x', 'continue' )
	272	CALL fftxp( ar, 'backward' )
	273	CALL cpu_log( log_point_s(4), 'fft_x', 'stop' )
	274
	275	!
	276	!-- Transposition x --> z
	277	CALL cpu_log( log_point_s(8), 'transpo invers', 'continue' )
[164]	278	CALL transpose_xz( ar, work, ar )
[1]	279	CALL cpu_log( log_point_s(8), 'transpo invers', 'stop' )
	280
	281	ENDIF
	282
	283	#else
	284
	285	!
	286	!-- Two-dimensional Fourier Transformation along x- and y-direction.
	287	CALL cpu_log( log_point_s(4), 'fft_x', 'start' )
	288	CALL fftx( ar, 'forward' )
	289	CALL cpu_log( log_point_s(4), 'fft_x', 'pause' )
	290	CALL cpu_log( log_point_s(7), 'fft_y', 'start' )
	291	CALL ffty( ar, 'forward' )
	292	CALL cpu_log( log_point_s(7), 'fft_y', 'pause' )
	293
	294	!
	295	!-- Solve the Poisson equation in z-direction in cartesian space.
	296	CALL cpu_log( log_point_s(6), 'tridia', 'start' )
	297	CALL tridia( ar )
	298	CALL cpu_log( log_point_s(6), 'tridia', 'stop' )
	299
	300	!
	301	!-- Inverse Fourier Transformation.
	302	CALL cpu_log( log_point_s(7), 'fft_y', 'continue' )
	303	CALL ffty( ar, 'backward' )
	304	CALL cpu_log( log_point_s(7), 'fft_y', 'stop' )
	305	CALL cpu_log( log_point_s(4), 'fft_x', 'continue' )
	306	CALL fftx( ar, 'backward' )
	307	CALL cpu_log( log_point_s(4), 'fft_x', 'stop' )
	308
	309	#endif
	310
	311	CALL cpu_log( log_point_s(3), 'poisfft', 'stop' )
	312
	313	END SUBROUTINE poisfft
	314
	315
	316
	317	SUBROUTINE tridia( ar )
	318
	319	!------------------------------------------------------------------------------!
	320	! solves the linear system of equations:
	321	!
	322	! -(4 pi^2(i^2/(dx^2nnx^2)+j^2/(dy^2nny^2))+
	323	! 1/(dzu(k)dzw(k))+1/(dzu(k-1)dzw(k)))*p(i,j,k)+
	324	! 1/(dzu(k)dzw(k))p(i,j,k+1)+1/(dzu(k-1)dzw(k))p(i,j,k-1)=d(i,j,k)
	325	!
	326	! by using the Thomas algorithm
	327	!------------------------------------------------------------------------------!
	328
	329	USE arrays_3d
	330
	331	IMPLICIT NONE
	332
	333	INTEGER :: i, j, k, nnyh
	334
	335	REAL, DIMENSION(nxl_z:nxr_z,0:nz-1) :: ar1
	336	REAL, DIMENSION(5,nxl_z:nxr_z,0:nz-1) :: tri
	337
	338	#if defined( __parallel )
[1003]	339	REAL :: ar(nxl_z:nxr_z,nys_z:nyn_z,1:nz)
[1]	340	#else
	341	REAL :: ar(1:nz,nys_z:nyn_z,nxl_z:nxr_z)
	342	#endif
	343
	344
	345	nnyh = (ny+1) / 2
	346
	347	!
	348	!-- Define constant elements of the tridiagonal matrix.
[683]	349	!$OMP PARALLEL PRIVATE ( k, i )
	350	!$OMP DO
[1]	351	DO k = 0, nz-1
	352	DO i = nxl_z, nxr_z
[667]	353	tri(2,i,k) = ddzu_pres(k+1) * ddzw(k+1)
	354	tri(3,i,k) = ddzu_pres(k+2) * ddzw(k+1)
[1]	355	ENDDO
	356	ENDDO
[683]	357	!$OMP END PARALLEL
[1]	358
	359	#if defined( __parallel )
	360	!
	361	!-- Repeat for all y-levels.
[683]	362	!$OMP PARALLEL FIRSTPRIVATE( tri ) PRIVATE ( ar1, j )
	363	!$OMP DO
[1]	364	DO j = nys_z, nyn_z
	365	IF ( j <= nnyh ) THEN
	366	CALL maketri( tri, j )
	367	ELSE
	368	CALL maketri( tri, ny+1-j )
	369	ENDIF
	370	CALL split( tri )
	371	CALL substi( ar, ar1, tri, j )
	372	ENDDO
[683]	373	!$OMP END PARALLEL
[1]	374	#else
	375	!
	376	!-- First y-level.
	377	CALL maketri( tri, nys_z )
	378	CALL split( tri )
	379	CALL substi( ar, ar1, tri, 0 )
	380
	381	!
	382	!-- Further y-levels.
	383	DO j = 1, nnyh - 1
	384	CALL maketri( tri, j )
	385	CALL split( tri )
	386	CALL substi( ar, ar1, tri, j )
	387	CALL substi( ar, ar1, tri, ny+1-j )
	388	ENDDO
	389	CALL maketri( tri, nnyh )
	390	CALL split( tri )
	391	CALL substi( ar, ar1, tri, nnyh+nys )
	392	#endif
	393
	394	CONTAINS
	395
	396	SUBROUTINE maketri( tri, j )
	397
	398	!------------------------------------------------------------------------------!
	399	! Computes the i- and j-dependent component of the matrix
	400	!------------------------------------------------------------------------------!
	401
	402	USE arrays_3d
	403	USE constants
	404	USE control_parameters
	405	USE grid_variables
	406
	407	IMPLICIT NONE
	408
	409	INTEGER :: i, j, k, nnxh
	410	REAL :: a, c
	411	REAL :: ll(nxl_z:nxr_z)
	412	REAL :: tri(5,nxl_z:nxr_z,0:nz-1)
	413
	414
	415	nnxh = ( nx + 1 ) / 2
	416
	417	!
	418	!-- Provide the tridiagonal matrix for solution of the Poisson equation in
	419	!-- Fourier space. The coefficients are computed following the method of
	420	!-- Schmidt et al. (DFVLR-Mitteilung 84-15), which departs from Stephan
	421	!-- Siano's original version by discretizing the Poisson equation,
	422	!-- before it is Fourier-transformed
	423	#if defined( __parallel )
	424	DO i = nxl_z, nxr_z
[128]	425	IF ( i >= 0 .AND. i <= nnxh ) THEN
[1]	426	ll(i) = 2.0 * ( 1.0 - COS( ( 2.0 * pi * i ) / &
[1013]	427	REAL( nx+1 ) ) ) / ( dx * dx ) + &
[1]	428	2.0 * ( 1.0 - COS( ( 2.0 * pi * j ) / &
[1013]	429	REAL( ny+1 ) ) ) / ( dy * dy )
[1]	430	ELSE
	431	ll(i) = 2.0 * ( 1.0 - COS( ( 2.0 * pi * ( nx+1-i ) ) / &
[1013]	432	REAL( nx+1 ) ) ) / ( dx * dx ) + &
[1]	433	2.0 * ( 1.0 - COS( ( 2.0 * pi * j ) / &
[1013]	434	REAL( ny+1 ) ) ) / ( dy * dy )
[1]	435	ENDIF
	436	DO k = 0,nz-1
[667]	437	a = -1.0 * ddzu_pres(k+2) * ddzw(k+1)
	438	c = -1.0 * ddzu_pres(k+1) * ddzw(k+1)
[1]	439	tri(1,i,k) = a + c - ll(i)
	440	ENDDO
	441	ENDDO
	442	#else
	443	DO i = 0, nnxh
[1013]	444	ll(i) = 2.0 * ( 1.0 - COS( ( 2.0 * pi * i ) / REAL( nx+1 ) ) ) / &
[1]	445	( dx * dx ) + &
[1013]	446	2.0 * ( 1.0 - COS( ( 2.0 * pi * j ) / REAL( ny+1 ) ) ) / &
[1]	447	( dy * dy )
	448	DO k = 0, nz-1
[667]	449	a = -1.0 * ddzu_pres(k+2) * ddzw(k+1)
	450	c = -1.0 * ddzu_pres(k+1) * ddzw(k+1)
[1]	451	tri(1,i,k) = a + c - ll(i)
	452	IF ( i >= 1 .and. i < nnxh ) THEN
	453	tri(1,nx+1-i,k) = tri(1,i,k)
	454	ENDIF
	455	ENDDO
	456	ENDDO
	457	#endif
	458	IF ( ibc_p_b == 1 .OR. ibc_p_b == 2 ) THEN
	459	DO i = nxl_z, nxr_z
	460	tri(1,i,0) = tri(1,i,0) + tri(2,i,0)
	461	ENDDO
	462	ENDIF
	463	IF ( ibc_p_t == 1 ) THEN
	464	DO i = nxl_z, nxr_z
	465	tri(1,i,nz-1) = tri(1,i,nz-1) + tri(3,i,nz-1)
	466	ENDDO
	467	ENDIF
	468
	469	END SUBROUTINE maketri
	470
	471
	472	SUBROUTINE substi( ar, ar1, tri, j )
	473
	474	!------------------------------------------------------------------------------!
	475	! Substitution (Forward and Backward) (Thomas algorithm)
	476	!------------------------------------------------------------------------------!
	477
[76]	478	USE control_parameters
	479
[1]	480	IMPLICIT NONE
	481
	482	INTEGER :: i, j, k
	483	REAL :: ar1(nxl_z:nxr_z,0:nz-1)
	484	REAL :: tri(5,nxl_z:nxr_z,0:nz-1)
	485	#if defined( __parallel )
[1003]	486	REAL :: ar(nxl_z:nxr_z,nys_z:nyn_z,1:nz)
[1]	487	#else
	488	REAL :: ar(1:nz,nys_z:nyn_z,nxl_z:nxr_z)
	489	#endif
	490
	491	!
	492	!-- Forward substitution.
	493	DO i = nxl_z, nxr_z
	494	#if defined( __parallel )
	495	ar1(i,0) = ar(i,j,1)
	496	#else
	497	ar1(i,0) = ar(1,j,i)
	498	#endif
	499	ENDDO
	500	DO k = 1, nz - 1
	501	DO i = nxl_z, nxr_z
	502	#if defined( __parallel )
	503	ar1(i,k) = ar(i,j,k+1) - tri(5,i,k) * ar1(i,k-1)
	504	#else
	505	ar1(i,k) = ar(k+1,j,i) - tri(5,i,k) * ar1(i,k-1)
	506	#endif
	507	ENDDO
	508	ENDDO
	509
	510	!
[877]	511	!-- Backward substitution
	512	!-- Note, the 1.0E-20 in the denominator is due to avoid divisions
	513	!-- by zero appearing if the pressure bc is set to neumann at the top of
	514	!-- the model domain.
[1]	515	DO i = nxl_z, nxr_z
	516	#if defined( __parallel )
[877]	517	ar(i,j,nz) = ar1(i,nz-1) / ( tri(4,i,nz-1) + 1.0E-20 )
[1]	518	#else
[877]	519	ar(nz,j,i) = ar1(i,nz-1) / ( tri(4,i,nz-1) + 1.0E-20 )
[1]	520	#endif
	521	ENDDO
	522	DO k = nz-2, 0, -1
	523	DO i = nxl_z, nxr_z
	524	#if defined( __parallel )
	525	ar(i,j,k+1) = ( ar1(i,k) - tri(3,i,k) * ar(i,j,k+2) ) &
	526	/ tri(4,i,k)
	527	#else
	528	ar(k+1,j,i) = ( ar1(i,k) - tri(3,i,k) * ar(k+2,j,i) ) &
	529	/ tri(4,i,k)
	530	#endif
	531	ENDDO
	532	ENDDO
	533
[76]	534	!
	535	!-- Indices i=0, j=0 correspond to horizontally averaged pressure.
	536	!-- The respective values of ar should be zero at all k-levels if
	537	!-- acceleration of horizontally averaged vertical velocity is zero.
	538	IF ( ibc_p_b == 1 .AND. ibc_p_t == 1 ) THEN
	539	IF ( j == 0 .AND. nxl_z == 0 ) THEN
	540	#if defined( __parallel )
	541	DO k = 1, nz
	542	ar(nxl_z,j,k) = 0.0
	543	ENDDO
	544	#else
	545	DO k = 1, nz
	546	ar(k,j,nxl_z) = 0.0
	547	ENDDO
	548	#endif
	549	ENDIF
	550	ENDIF
	551
[1]	552	END SUBROUTINE substi
	553
	554
	555	SUBROUTINE split( tri )
	556
	557	!------------------------------------------------------------------------------!
	558	! Splitting of the tridiagonal matrix (Thomas algorithm)
	559	!------------------------------------------------------------------------------!
	560
	561	IMPLICIT NONE
	562
	563	INTEGER :: i, k
	564	REAL :: tri(5,nxl_z:nxr_z,0:nz-1)
	565
	566	!
	567	!-- Splitting.
	568	DO i = nxl_z, nxr_z
	569	tri(4,i,0) = tri(1,i,0)
	570	ENDDO
	571	DO k = 1, nz-1
	572	DO i = nxl_z, nxr_z
	573	tri(5,i,k) = tri(2,i,k) / tri(4,i,k-1)
	574	tri(4,i,k) = tri(1,i,k) - tri(3,i,k-1) * tri(5,i,k)
	575	ENDDO
	576	ENDDO
	577
	578	END SUBROUTINE split
	579
	580	END SUBROUTINE tridia
	581
	582
	583	#if defined( __parallel )
	584	SUBROUTINE fftxp( ar, direction )
	585
	586	!------------------------------------------------------------------------------!
	587	! Fourier-transformation along x-direction Parallelized version
	588	!------------------------------------------------------------------------------!
	589
	590	IMPLICIT NONE
	591
	592	CHARACTER (LEN=*) :: direction
	593	INTEGER :: j, k
[1003]	594	REAL :: ar(0:nx,nys_x:nyn_x,nzb_x:nzt_x)
[1]	595
	596	!
	597	!-- Performing the fft with one of the methods implemented
[683]	598	!$OMP PARALLEL PRIVATE ( j, k )
	599	!$OMP DO
[1]	600	DO k = nzb_x, nzt_x
	601	DO j = nys_x, nyn_x
	602	CALL fft_x( ar(0:nx,j,k), direction )
	603	ENDDO
	604	ENDDO
[683]	605	!$OMP END PARALLEL
[1]	606
	607	END SUBROUTINE fftxp
	608
	609	#else
	610	SUBROUTINE fftx( ar, direction )
	611
	612	!------------------------------------------------------------------------------!
	613	! Fourier-transformation along x-direction Non parallel version
	614	!------------------------------------------------------------------------------!
	615
	616	IMPLICIT NONE
	617
	618	CHARACTER (LEN=*) :: direction
	619	INTEGER :: i, j, k
	620	REAL :: ar(1:nz,0:ny,0:nx)
	621
	622	!
	623	!-- Performing the fft with one of the methods implemented
[683]	624	!$OMP PARALLEL PRIVATE ( j, k )
	625	!$OMP DO
[1]	626	DO k = 1, nz
	627	DO j = 0, ny
	628	CALL fft_x( ar(k,j,0:nx), direction )
	629	ENDDO
	630	ENDDO
[683]	631	!$OMP END PARALLEL
[1]	632
	633	END SUBROUTINE fftx
	634	#endif
	635
	636
	637	#if defined( __parallel )
	638	SUBROUTINE fftyp( ar, direction )
	639
	640	!------------------------------------------------------------------------------!
	641	! Fourier-transformation along y-direction Parallelized version
	642	!------------------------------------------------------------------------------!
	643
	644	IMPLICIT NONE
	645
	646	CHARACTER (LEN=*) :: direction
	647	INTEGER :: i, k
[1003]	648	REAL :: ar(0:ny,nxl_y:nxr_y,nzb_y:nzt_y)
[1]	649
	650	!
	651	!-- Performing the fft with one of the methods implemented
[683]	652	!$OMP PARALLEL PRIVATE ( i, k )
	653	!$OMP DO
[1]	654	DO k = nzb_y, nzt_y
	655	DO i = nxl_y, nxr_y
	656	CALL fft_y( ar(0:ny,i,k), direction )
	657	ENDDO
	658	ENDDO
[683]	659	!$OMP END PARALLEL
[1]	660
	661	END SUBROUTINE fftyp
	662
	663	#else
	664	SUBROUTINE ffty( ar, direction )
	665
	666	!------------------------------------------------------------------------------!
	667	! Fourier-transformation along y-direction Non parallel version
	668	!------------------------------------------------------------------------------!
	669
	670	IMPLICIT NONE
	671
	672	CHARACTER (LEN=*) :: direction
	673	INTEGER :: i, k
	674	REAL :: ar(1:nz,0:ny,0:nx)
	675
	676	!
	677	!-- Performing the fft with one of the methods implemented
[683]	678	!$OMP PARALLEL PRIVATE ( i, k )
	679	!$OMP DO
[1]	680	DO k = 1, nz
	681	DO i = 0, nx
	682	CALL fft_y( ar(k,0:ny,i), direction )
	683	ENDDO
	684	ENDDO
[683]	685	!$OMP END PARALLEL
[1]	686
	687	END SUBROUTINE ffty
	688	#endif
	689
	690	#if defined( __parallel )
	691	SUBROUTINE ffty_tr_yx( f_in, work, f_out )
	692
	693	!------------------------------------------------------------------------------!
	694	! Fourier-transformation along y with subsequent transposition y --> x for
	695	! a 1d-decomposition along x
	696	!
	697	! ATTENTION: The performance of this routine is much faster on the NEC-SX6,
	698	! if the first index of work_ffty_vec is odd. Otherwise
	699	! memory bank conflicts may occur (especially if the index is a
	700	! multiple of 128). That's why work_ffty_vec is dimensioned as
	701	! 0:ny+1.
	702	! Of course, this will not work if users are using an odd number
	703	! of gridpoints along y.
	704	!------------------------------------------------------------------------------!
	705
	706	USE control_parameters
	707	USE cpulog
	708	USE indices
	709	USE interfaces
	710	USE pegrid
	711	USE transpose_indices
	712
	713	IMPLICIT NONE
	714
	715	INTEGER :: i, iend, iouter, ir, j, k
	716	INTEGER, PARAMETER :: stridex = 4
	717
	718	REAL, DIMENSION(0:ny,stridex) :: work_ffty
	719	#if defined( __nec )
	720	REAL, DIMENSION(0:ny+1,1:nz,nxl:nxr) :: work_ffty_vec
	721	#endif
[1003]	722	REAL, DIMENSION(1:nz,0:ny,nxl:nxr) :: f_in
	723	REAL, DIMENSION(nnx,1:nz,nys_x:nyn_x,pdims(1)) :: f_out
	724	REAL, DIMENSION(nxl:nxr,1:nz,0:ny) :: work
[1]	725
	726	!
	727	!-- Carry out the FFT along y, where all data are present due to the
	728	!-- 1d-decomposition along x. Resort the data in a way that x becomes
	729	!-- the first index.
	730	CALL cpu_log( log_point_s(7), 'fft_y', 'start' )
	731
	732	IF ( host(1:3) == 'nec' ) THEN
	733	#if defined( __nec )
	734	!
	735	!-- Code optimized for vector processors
[85]	736	!$OMP PARALLEL PRIVATE ( i, j, k )
[1]	737	!$OMP DO
	738	DO i = nxl, nxr
	739
	740	DO j = 0, ny
	741	DO k = 1, nz
	742	work_ffty_vec(j,k,i) = f_in(k,j,i)
	743	ENDDO
	744	ENDDO
	745
	746	CALL fft_y_m( work_ffty_vec(:,:,i), ny+1, 'forward' )
	747
	748	ENDDO
	749
	750	!$OMP DO
	751	DO k = 1, nz
	752	DO j = 0, ny
	753	DO i = nxl, nxr
	754	work(i,k,j) = work_ffty_vec(j,k,i)
	755	ENDDO
	756	ENDDO
	757	ENDDO
	758	!$OMP END PARALLEL
	759	#endif
	760
	761	ELSE
	762
	763	!
	764	!-- Cache optimized code.
	765	!-- The i-(x-)direction is split into a strided outer loop and an inner
	766	!-- loop for better cache performance
	767	!$OMP PARALLEL PRIVATE (i,iend,iouter,ir,j,k,work_ffty)
	768	!$OMP DO
	769	DO iouter = nxl, nxr, stridex
	770
	771	iend = MIN( iouter+stridex-1, nxr ) ! Upper bound for inner i loop
	772
	773	DO k = 1, nz
	774
	775	DO i = iouter, iend
	776
	777	ir = i-iouter+1 ! counter within a stride
	778	DO j = 0, ny
	779	work_ffty(j,ir) = f_in(k,j,i)
	780	ENDDO
	781	!
	782	!-- FFT along y
	783	CALL fft_y( work_ffty(:,ir), 'forward' )
	784
	785	ENDDO
	786
	787	!
	788	!-- Resort
	789	DO j = 0, ny
	790	DO i = iouter, iend
	791	work(i,k,j) = work_ffty(j,i-iouter+1)
	792	ENDDO
	793	ENDDO
	794
	795	ENDDO
	796
	797	ENDDO
	798	!$OMP END PARALLEL
	799
	800	ENDIF
	801	CALL cpu_log( log_point_s(7), 'fft_y', 'pause' )
	802
	803	!
	804	!-- Transpose array
	805	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
[622]	806	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
[1]	807	CALL MPI_ALLTOALL( work(nxl,1,0), sendrecvcount_xy, MPI_REAL, &
	808	f_out(1,1,nys_x,1), sendrecvcount_xy, MPI_REAL, &
	809	comm1dx, ierr )
	810	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
	811
	812	END SUBROUTINE ffty_tr_yx
	813
	814
	815	SUBROUTINE tr_xy_ffty( f_in, work, f_out )
	816
	817	!------------------------------------------------------------------------------!
	818	! Transposition x --> y with a subsequent backward Fourier transformation for
	819	! a 1d-decomposition along x
	820	!------------------------------------------------------------------------------!
	821
	822	USE control_parameters
	823	USE cpulog
	824	USE indices
	825	USE interfaces
	826	USE pegrid
	827	USE transpose_indices
	828
	829	IMPLICIT NONE
	830
	831	INTEGER :: i, iend, iouter, ir, j, k
	832	INTEGER, PARAMETER :: stridex = 4
	833
	834	REAL, DIMENSION(0:ny,stridex) :: work_ffty
	835	#if defined( __nec )
	836	REAL, DIMENSION(0:ny+1,1:nz,nxl:nxr) :: work_ffty_vec
	837	#endif
[1003]	838	REAL, DIMENSION(nnx,1:nz,nys_x:nyn_x,pdims(1)) :: f_in
	839	REAL, DIMENSION(1:nz,0:ny,nxl:nxr) :: f_out
	840	REAL, DIMENSION(nxl:nxr,1:nz,0:ny) :: work
[1]	841
	842	!
	843	!-- Transpose array
	844	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
[622]	845	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
[1]	846	CALL MPI_ALLTOALL( f_in(1,1,nys_x,1), sendrecvcount_xy, MPI_REAL, &
	847	work(nxl,1,0), sendrecvcount_xy, MPI_REAL, &
	848	comm1dx, ierr )
	849	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
	850
	851	!
	852	!-- Resort the data in a way that y becomes the first index and carry out the
	853	!-- backward fft along y.
	854	CALL cpu_log( log_point_s(7), 'fft_y', 'continue' )
	855
	856	IF ( host(1:3) == 'nec' ) THEN
	857	#if defined( __nec )
	858	!
	859	!-- Code optimized for vector processors
[85]	860	!$OMP PARALLEL PRIVATE ( i, j, k )
[1]	861	!$OMP DO
	862	DO k = 1, nz
	863	DO j = 0, ny
	864	DO i = nxl, nxr
	865	work_ffty_vec(j,k,i) = work(i,k,j)
	866	ENDDO
	867	ENDDO
	868	ENDDO
	869
	870	!$OMP DO
	871	DO i = nxl, nxr
	872
	873	CALL fft_y_m( work_ffty_vec(:,:,i), ny+1, 'backward' )
	874
	875	DO j = 0, ny
	876	DO k = 1, nz
	877	f_out(k,j,i) = work_ffty_vec(j,k,i)
	878	ENDDO
	879	ENDDO
	880
	881	ENDDO
	882	!$OMP END PARALLEL
	883	#endif
	884
	885	ELSE
	886
	887	!
	888	!-- Cache optimized code.
	889	!-- The i-(x-)direction is split into a strided outer loop and an inner
	890	!-- loop for better cache performance
	891	!$OMP PARALLEL PRIVATE ( i, iend, iouter, ir, j, k, work_ffty )
	892	!$OMP DO
	893	DO iouter = nxl, nxr, stridex
	894
	895	iend = MIN( iouter+stridex-1, nxr ) ! Upper bound for inner i loop
	896
	897	DO k = 1, nz
	898	!
	899	!-- Resort
	900	DO j = 0, ny
	901	DO i = iouter, iend
	902	work_ffty(j,i-iouter+1) = work(i,k,j)
	903	ENDDO
	904	ENDDO
	905
	906	DO i = iouter, iend
	907
	908	!
	909	!-- FFT along y
	910	ir = i-iouter+1 ! counter within a stride
	911	CALL fft_y( work_ffty(:,ir), 'backward' )
	912
	913	DO j = 0, ny
	914	f_out(k,j,i) = work_ffty(j,ir)
	915	ENDDO
	916	ENDDO
	917
	918	ENDDO
	919
	920	ENDDO
	921	!$OMP END PARALLEL
	922
	923	ENDIF
	924
	925	CALL cpu_log( log_point_s(7), 'fft_y', 'stop' )
	926
	927	END SUBROUTINE tr_xy_ffty
	928
	929
	930	SUBROUTINE fftx_tri_fftx( ar )
	931
	932	!------------------------------------------------------------------------------!
	933	! FFT along x, solution of the tridiagonal system and backward FFT for
	934	! a 1d-decomposition along x
	935	!
	936	! WARNING: this subroutine may still not work for hybrid parallelization
	937	! with OpenMP (for possible necessary changes see the original
	938	! routine poisfft_hybrid, developed by Klaus Ketelsen, May 2002)
	939	!------------------------------------------------------------------------------!
	940
	941	USE control_parameters
	942	USE cpulog
	943	USE grid_variables
	944	USE indices
	945	USE interfaces
	946	USE pegrid
	947	USE transpose_indices
	948
	949	IMPLICIT NONE
	950
	951	character(len=3) :: myth_char
	952
	953	INTEGER :: i, j, k, m, n, omp_get_thread_num, tn
	954
[1003]	955	REAL, DIMENSION(0:nx) :: work_fftx
	956	REAL, DIMENSION(0:nx,1:nz) :: work_trix
	957	REAL, DIMENSION(nnx,1:nz,nys_x:nyn_x,pdims(1)) :: ar
	958	REAL, DIMENSION(:,:,:,:), ALLOCATABLE :: tri
[1]	959
	960
	961	CALL cpu_log( log_point_s(33), 'fft_x + tridia', 'start' )
	962
	963	ALLOCATE( tri(5,0:nx,0:nz-1,0:threads_per_task-1) )
	964
	965	tn = 0 ! Default thread number in case of one thread
	966	!$OMP PARALLEL DO PRIVATE ( i, j, k, m, n, tn, work_fftx, work_trix )
	967	DO j = nys_x, nyn_x
	968
	969	!$ tn = omp_get_thread_num()
	970
	971	IF ( host(1:3) == 'nec' ) THEN
	972	!
	973	!-- Code optimized for vector processors
	974	DO k = 1, nz
	975
	976	m = 0
	977	DO n = 1, pdims(1)
[1003]	978	DO i = 1, nnx
[1]	979	work_trix(m,k) = ar(i,k,j,n)
	980	m = m + 1
	981	ENDDO
	982	ENDDO
	983
	984	ENDDO
	985
	986	CALL fft_x_m( work_trix, 'forward' )
	987
	988	ELSE
	989	!
	990	!-- Cache optimized code
	991	DO k = 1, nz
	992
	993	m = 0
	994	DO n = 1, pdims(1)
[1003]	995	DO i = 1, nnx
[1]	996	work_fftx(m) = ar(i,k,j,n)
	997	m = m + 1
	998	ENDDO
	999	ENDDO
	1000
	1001	CALL fft_x( work_fftx, 'forward' )
	1002
	1003	DO i = 0, nx
	1004	work_trix(i,k) = work_fftx(i)
	1005	ENDDO
	1006
	1007	ENDDO
	1008
	1009	ENDIF
	1010
	1011	!
	1012	!-- Solve the linear equation system
	1013	CALL tridia_1dd( ddx2, ddy2, nx, ny, j, work_trix, tri(:,:,:,tn) )
	1014
	1015	IF ( host(1:3) == 'nec' ) THEN
	1016	!
	1017	!-- Code optimized for vector processors
	1018	CALL fft_x_m( work_trix, 'backward' )
	1019
	1020	DO k = 1, nz
	1021
	1022	m = 0
	1023	DO n = 1, pdims(1)
[1003]	1024	DO i = 1, nnx
[1]	1025	ar(i,k,j,n) = work_trix(m,k)
	1026	m = m + 1
	1027	ENDDO
	1028	ENDDO
	1029
	1030	ENDDO
	1031
	1032	ELSE
	1033	!
	1034	!-- Cache optimized code
	1035	DO k = 1, nz
	1036
	1037	DO i = 0, nx
	1038	work_fftx(i) = work_trix(i,k)
	1039	ENDDO
	1040
	1041	CALL fft_x( work_fftx, 'backward' )
	1042
	1043	m = 0
	1044	DO n = 1, pdims(1)
[1003]	1045	DO i = 1, nnx
[1]	1046	ar(i,k,j,n) = work_fftx(m)
	1047	m = m + 1
	1048	ENDDO
	1049	ENDDO
	1050
	1051	ENDDO
	1052
	1053	ENDIF
	1054
	1055	ENDDO
	1056
	1057	DEALLOCATE( tri )
	1058
	1059	CALL cpu_log( log_point_s(33), 'fft_x + tridia', 'stop' )
	1060
	1061	END SUBROUTINE fftx_tri_fftx
	1062
	1063
	1064	SUBROUTINE fftx_tr_xy( f_in, work, f_out )
	1065
	1066	!------------------------------------------------------------------------------!
	1067	! Fourier-transformation along x with subsequent transposition x --> y for
	1068	! a 1d-decomposition along y
	1069	!
	1070	! ATTENTION: The NEC-branch of this routine may significantly profit from
	1071	! further optimizations. So far, performance is much worse than
	1072	! for routine ffty_tr_yx (more than three times slower).
	1073	!------------------------------------------------------------------------------!
	1074
	1075	USE control_parameters
	1076	USE cpulog
	1077	USE indices
	1078	USE interfaces
	1079	USE pegrid
	1080	USE transpose_indices
	1081
	1082	IMPLICIT NONE
	1083
	1084	INTEGER :: i, j, k
	1085
[1003]	1086	REAL, DIMENSION(0:nx,1:nz,nys:nyn) :: work_fftx
	1087	REAL, DIMENSION(1:nz,nys:nyn,0:nx) :: f_in
	1088	REAL, DIMENSION(nny,1:nz,nxl_y:nxr_y,pdims(2)) :: f_out
	1089	REAL, DIMENSION(nys:nyn,1:nz,0:nx) :: work
[1]	1090
	1091	!
	1092	!-- Carry out the FFT along x, where all data are present due to the
	1093	!-- 1d-decomposition along y. Resort the data in a way that y becomes
	1094	!-- the first index.
	1095	CALL cpu_log( log_point_s(4), 'fft_x', 'start' )
	1096
	1097	IF ( host(1:3) == 'nec' ) THEN
	1098	!
	1099	!-- Code for vector processors
[85]	1100	!$OMP PARALLEL PRIVATE ( i, j, k )
[1]	1101	!$OMP DO
	1102	DO i = 0, nx
	1103
	1104	DO j = nys, nyn
	1105	DO k = 1, nz
	1106	work_fftx(i,k,j) = f_in(k,j,i)
	1107	ENDDO
	1108	ENDDO
	1109
	1110	ENDDO
	1111
	1112	!$OMP DO
	1113	DO j = nys, nyn
	1114
	1115	CALL fft_x_m( work_fftx(:,:,j), 'forward' )
	1116
	1117	DO k = 1, nz
	1118	DO i = 0, nx
	1119	work(j,k,i) = work_fftx(i,k,j)
	1120	ENDDO
	1121	ENDDO
	1122
	1123	ENDDO
	1124	!$OMP END PARALLEL
	1125
	1126	ELSE
	1127
	1128	!
	1129	!-- Cache optimized code (there might be still a potential for better
	1130	!-- optimization).
[696]	1131	!$OMP PARALLEL PRIVATE (i,j,k)
[1]	1132	!$OMP DO
	1133	DO i = 0, nx
	1134
	1135	DO j = nys, nyn
	1136	DO k = 1, nz
	1137	work_fftx(i,k,j) = f_in(k,j,i)
	1138	ENDDO
	1139	ENDDO
	1140
	1141	ENDDO
	1142
	1143	!$OMP DO
	1144	DO j = nys, nyn
	1145	DO k = 1, nz
	1146
	1147	CALL fft_x( work_fftx(0:nx,k,j), 'forward' )
	1148
	1149	DO i = 0, nx
	1150	work(j,k,i) = work_fftx(i,k,j)
	1151	ENDDO
	1152	ENDDO
	1153
	1154	ENDDO
	1155	!$OMP END PARALLEL
	1156
	1157	ENDIF
	1158	CALL cpu_log( log_point_s(4), 'fft_x', 'pause' )
	1159
	1160	!
	1161	!-- Transpose array
	1162	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
[622]	1163	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
[1]	1164	CALL MPI_ALLTOALL( work(nys,1,0), sendrecvcount_xy, MPI_REAL, &
	1165	f_out(1,1,nxl_y,1), sendrecvcount_xy, MPI_REAL, &
	1166	comm1dy, ierr )
	1167	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
	1168
	1169	END SUBROUTINE fftx_tr_xy
	1170
	1171
	1172	SUBROUTINE tr_yx_fftx( f_in, work, f_out )
	1173
	1174	!------------------------------------------------------------------------------!
	1175	! Transposition y --> x with a subsequent backward Fourier transformation for
	1176	! a 1d-decomposition along x
	1177	!------------------------------------------------------------------------------!
	1178
	1179	USE control_parameters
	1180	USE cpulog
	1181	USE indices
	1182	USE interfaces
	1183	USE pegrid
	1184	USE transpose_indices
	1185
	1186	IMPLICIT NONE
	1187
	1188	INTEGER :: i, j, k
	1189
[1003]	1190	REAL, DIMENSION(0:nx,1:nz,nys:nyn) :: work_fftx
	1191	REAL, DIMENSION(nny,1:nz,nxl_y:nxr_y,pdims(2)) :: f_in
	1192	REAL, DIMENSION(1:nz,nys:nyn,0:nx) :: f_out
	1193	REAL, DIMENSION(nys:nyn,1:nz,0:nx) :: work
[1]	1194
	1195	!
	1196	!-- Transpose array
	1197	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
[622]	1198	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
[1]	1199	CALL MPI_ALLTOALL( f_in(1,1,nxl_y,1), sendrecvcount_xy, MPI_REAL, &
	1200	work(nys,1,0), sendrecvcount_xy, MPI_REAL, &
	1201	comm1dy, ierr )
	1202	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
	1203
	1204	!
	1205	!-- Carry out the FFT along x, where all data are present due to the
	1206	!-- 1d-decomposition along y. Resort the data in a way that y becomes
	1207	!-- the first index.
	1208	CALL cpu_log( log_point_s(4), 'fft_x', 'continue' )
	1209
	1210	IF ( host(1:3) == 'nec' ) THEN
	1211	!
	1212	!-- Code optimized for vector processors
[85]	1213	!$OMP PARALLEL PRIVATE ( i, j, k )
[1]	1214	!$OMP DO
	1215	DO j = nys, nyn
	1216
	1217	DO k = 1, nz
	1218	DO i = 0, nx
	1219	work_fftx(i,k,j) = work(j,k,i)
	1220	ENDDO
	1221	ENDDO
	1222
	1223	CALL fft_x_m( work_fftx(:,:,j), 'backward' )
	1224
	1225	ENDDO
	1226
	1227	!$OMP DO
	1228	DO i = 0, nx
	1229	DO j = nys, nyn
	1230	DO k = 1, nz
	1231	f_out(k,j,i) = work_fftx(i,k,j)
	1232	ENDDO
	1233	ENDDO
	1234	ENDDO
	1235	!$OMP END PARALLEL
	1236
	1237	ELSE
	1238
	1239	!
	1240	!-- Cache optimized code (there might be still a potential for better
	1241	!-- optimization).
[696]	1242	!$OMP PARALLEL PRIVATE (i,j,k)
[1]	1243	!$OMP DO
	1244	DO j = nys, nyn
	1245	DO k = 1, nz
	1246
	1247	DO i = 0, nx
	1248	work_fftx(i,k,j) = work(j,k,i)
	1249	ENDDO
	1250
	1251	CALL fft_x( work_fftx(0:nx,k,j), 'backward' )
	1252
	1253	ENDDO
	1254	ENDDO
	1255
	1256	!$OMP DO
	1257	DO i = 0, nx
	1258	DO j = nys, nyn
	1259	DO k = 1, nz
	1260	f_out(k,j,i) = work_fftx(i,k,j)
	1261	ENDDO
	1262	ENDDO
	1263	ENDDO
	1264	!$OMP END PARALLEL
	1265
	1266	ENDIF
	1267	CALL cpu_log( log_point_s(4), 'fft_x', 'stop' )
	1268
	1269	END SUBROUTINE tr_yx_fftx
	1270
	1271
	1272	SUBROUTINE ffty_tri_ffty( ar )
	1273
	1274	!------------------------------------------------------------------------------!
	1275	! FFT along y, solution of the tridiagonal system and backward FFT for
	1276	! a 1d-decomposition along y
	1277	!
	1278	! WARNING: this subroutine may still not work for hybrid parallelization
	1279	! with OpenMP (for possible necessary changes see the original
	1280	! routine poisfft_hybrid, developed by Klaus Ketelsen, May 2002)
	1281	!------------------------------------------------------------------------------!
	1282
	1283	USE control_parameters
	1284	USE cpulog
	1285	USE grid_variables
	1286	USE indices
	1287	USE interfaces
	1288	USE pegrid
	1289	USE transpose_indices
	1290
	1291	IMPLICIT NONE
	1292
	1293	INTEGER :: i, j, k, m, n, omp_get_thread_num, tn
	1294
[1003]	1295	REAL, DIMENSION(0:ny) :: work_ffty
	1296	REAL, DIMENSION(0:ny,1:nz) :: work_triy
	1297	REAL, DIMENSION(nny,1:nz,nxl_y:nxr_y,pdims(2)) :: ar
	1298	REAL, DIMENSION(:,:,:,:), ALLOCATABLE :: tri
[1]	1299
	1300
	1301	CALL cpu_log( log_point_s(39), 'fft_y + tridia', 'start' )
	1302
	1303	ALLOCATE( tri(5,0:ny,0:nz-1,0:threads_per_task-1) )
	1304
	1305	tn = 0 ! Default thread number in case of one thread
[696]	1306	!$OMP PARALLEL DO PRIVATE ( i, j, k, m, n, tn, work_ffty, work_triy )
[1]	1307	DO i = nxl_y, nxr_y
	1308
	1309	!$ tn = omp_get_thread_num()
	1310
	1311	IF ( host(1:3) == 'nec' ) THEN
	1312	!
	1313	!-- Code optimized for vector processors
	1314	DO k = 1, nz
	1315
	1316	m = 0
	1317	DO n = 1, pdims(2)
[1003]	1318	DO j = 1, nny
[1]	1319	work_triy(m,k) = ar(j,k,i,n)
	1320	m = m + 1
	1321	ENDDO
	1322	ENDDO
	1323
	1324	ENDDO
	1325
	1326	CALL fft_y_m( work_triy, ny, 'forward' )
	1327
	1328	ELSE
	1329	!
	1330	!-- Cache optimized code
	1331	DO k = 1, nz
	1332
	1333	m = 0
	1334	DO n = 1, pdims(2)
[1003]	1335	DO j = 1, nny
[1]	1336	work_ffty(m) = ar(j,k,i,n)
	1337	m = m + 1
	1338	ENDDO
	1339	ENDDO
	1340
	1341	CALL fft_y( work_ffty, 'forward' )
	1342
	1343	DO j = 0, ny
	1344	work_triy(j,k) = work_ffty(j)
	1345	ENDDO
	1346
	1347	ENDDO
	1348
	1349	ENDIF
	1350
	1351	!
	1352	!-- Solve the linear equation system
	1353	CALL tridia_1dd( ddy2, ddx2, ny, nx, i, work_triy, tri(:,:,:,tn) )
	1354
	1355	IF ( host(1:3) == 'nec' ) THEN
	1356	!
	1357	!-- Code optimized for vector processors
	1358	CALL fft_y_m( work_triy, ny, 'backward' )
	1359
	1360	DO k = 1, nz
	1361
	1362	m = 0
	1363	DO n = 1, pdims(2)
[1003]	1364	DO j = 1, nny
[1]	1365	ar(j,k,i,n) = work_triy(m,k)
	1366	m = m + 1
	1367	ENDDO
	1368	ENDDO
	1369
	1370	ENDDO
	1371
	1372	ELSE
	1373	!
	1374	!-- Cache optimized code
	1375	DO k = 1, nz
	1376
	1377	DO j = 0, ny
	1378	work_ffty(j) = work_triy(j,k)
	1379	ENDDO
	1380
	1381	CALL fft_y( work_ffty, 'backward' )
	1382
	1383	m = 0
	1384	DO n = 1, pdims(2)
[1003]	1385	DO j = 1, nny
[1]	1386	ar(j,k,i,n) = work_ffty(m)
	1387	m = m + 1
	1388	ENDDO
	1389	ENDDO
	1390
	1391	ENDDO
	1392
	1393	ENDIF
	1394
	1395	ENDDO
	1396
	1397	DEALLOCATE( tri )
	1398
	1399	CALL cpu_log( log_point_s(39), 'fft_y + tridia', 'stop' )
	1400
	1401	END SUBROUTINE ffty_tri_ffty
	1402
	1403
	1404	SUBROUTINE tridia_1dd( ddx2, ddy2, nx, ny, j, ar, tri )
	1405
	1406	!------------------------------------------------------------------------------!
	1407	! Solves the linear system of equations for a 1d-decomposition along x (see
	1408	! tridia)
	1409	!
[940]	1410	! Attention: when using the intel compilers older than 12.0, array tri must
	1411	! be passed as an argument to the contained subroutines. Otherwise
	1412	! addres faults will occur. This feature can be activated with
	1413	! cpp-switch __intel11
[1]	1414	! On NEC, tri should not be passed (except for routine substi_1dd)
	1415	! because this causes very bad performance.
	1416	!------------------------------------------------------------------------------!
	1417
	1418	USE arrays_3d
	1419	USE control_parameters
	1420
	1421	USE pegrid
	1422
	1423	IMPLICIT NONE
	1424
	1425	INTEGER :: i, j, k, nnyh, nx, ny, omp_get_thread_num, tn
	1426
	1427	REAL :: ddx2, ddy2
	1428
	1429	REAL, DIMENSION(0:nx,1:nz) :: ar
	1430	REAL, DIMENSION(0:nx,0:nz-1) :: ar1
	1431	REAL, DIMENSION(5,0:nx,0:nz-1) :: tri
	1432
	1433
	1434	nnyh = ( ny + 1 ) / 2
	1435
	1436	!
	1437	!-- Define constant elements of the tridiagonal matrix.
	1438	!-- The compiler on SX6 does loop exchange. If 0:nx is a high power of 2,
	1439	!-- the exchanged loops create bank conflicts. The following directive
	1440	!-- prohibits loop exchange and the loops perform much better.
	1441	! tn = omp_get_thread_num()
	1442	! WRITE( 120+tn, * ) '+++ id=',myid,' nx=',nx,' thread=', omp_get_thread_num()
[82]	1443	! CALL local_flush( 120+tn )
[1]	1444	!CDIR NOLOOPCHG
	1445	DO k = 0, nz-1
	1446	DO i = 0,nx
[667]	1447	tri(2,i,k) = ddzu_pres(k+1) * ddzw(k+1)
	1448	tri(3,i,k) = ddzu_pres(k+2) * ddzw(k+1)
[1]	1449	ENDDO
	1450	ENDDO
	1451	! WRITE( 120+tn, * ) '+++ id=',myid,' end of first tridia loop thread=', omp_get_thread_num()
[82]	1452	! CALL local_flush( 120+tn )
[1]	1453
	1454	IF ( j <= nnyh ) THEN
[940]	1455	#if defined( __intel11 )
[1]	1456	CALL maketri_1dd( j, tri )
	1457	#else
	1458	CALL maketri_1dd( j )
	1459	#endif
	1460	ELSE
[940]	1461	#if defined( __intel11 )
[1]	1462	CALL maketri_1dd( ny+1-j, tri )
	1463	#else
	1464	CALL maketri_1dd( ny+1-j )
	1465	#endif
	1466	ENDIF
[940]	1467	#if defined( __intel11 )
[1]	1468	CALL split_1dd( tri )
	1469	#else
	1470	CALL split_1dd
	1471	#endif
	1472	CALL substi_1dd( ar, tri )
	1473
	1474	CONTAINS
	1475
[940]	1476	#if defined( __intel11 )
[1]	1477	SUBROUTINE maketri_1dd( j, tri )
	1478	#else
	1479	SUBROUTINE maketri_1dd( j )
	1480	#endif
	1481
	1482	!------------------------------------------------------------------------------!
	1483	! computes the i- and j-dependent component of the matrix
	1484	!------------------------------------------------------------------------------!
	1485
	1486	USE constants
	1487
	1488	IMPLICIT NONE
	1489
	1490	INTEGER :: i, j, k, nnxh
	1491	REAL :: a, c
	1492
	1493	REAL, DIMENSION(0:nx) :: l
	1494
[940]	1495	#if defined( __intel11 )
[1]	1496	REAL, DIMENSION(5,0:nx,0:nz-1) :: tri
	1497	#endif
	1498
	1499
	1500	nnxh = ( nx + 1 ) / 2
	1501	!
	1502	!-- Provide the tridiagonal matrix for solution of the Poisson equation in
	1503	!-- Fourier space. The coefficients are computed following the method of
	1504	!-- Schmidt et al. (DFVLR-Mitteilung 84-15), which departs from Stephan
	1505	!-- Siano's original version by discretizing the Poisson equation,
	1506	!-- before it is Fourier-transformed
	1507	DO i = 0, nx
[128]	1508	IF ( i >= 0 .AND. i <= nnxh ) THEN
[1]	1509	l(i) = 2.0 * ( 1.0 - COS( ( 2.0 * pi * i ) / &
[1013]	1510	REAL( nx+1 ) ) ) * ddx2 + &
[1]	1511	2.0 * ( 1.0 - COS( ( 2.0 * pi * j ) / &
[1013]	1512	REAL( ny+1 ) ) ) * ddy2
[1]	1513	ELSE
	1514	l(i) = 2.0 * ( 1.0 - COS( ( 2.0 * pi * ( nx+1-i ) ) / &
[1013]	1515	REAL( nx+1 ) ) ) * ddx2 + &
[1]	1516	2.0 * ( 1.0 - COS( ( 2.0 * pi * j ) / &
[1013]	1517	REAL( ny+1 ) ) ) * ddy2
[1]	1518	ENDIF
	1519	ENDDO
	1520
	1521	DO k = 0, nz-1
	1522	DO i = 0, nx
[667]	1523	a = -1.0 * ddzu_pres(k+2) * ddzw(k+1)
	1524	c = -1.0 * ddzu_pres(k+1) * ddzw(k+1)
[1]	1525	tri(1,i,k) = a + c - l(i)
	1526	ENDDO
	1527	ENDDO
	1528	IF ( ibc_p_b == 1 .OR. ibc_p_b == 2 ) THEN
	1529	DO i = 0, nx
	1530	tri(1,i,0) = tri(1,i,0) + tri(2,i,0)
	1531	ENDDO
	1532	ENDIF
	1533	IF ( ibc_p_t == 1 ) THEN
	1534	DO i = 0, nx
	1535	tri(1,i,nz-1) = tri(1,i,nz-1) + tri(3,i,nz-1)
	1536	ENDDO
	1537	ENDIF
	1538
	1539	END SUBROUTINE maketri_1dd
	1540
	1541
[940]	1542	#if defined( __intel11 )
[1]	1543	SUBROUTINE split_1dd( tri )
	1544	#else
	1545	SUBROUTINE split_1dd
	1546	#endif
	1547
	1548	!------------------------------------------------------------------------------!
	1549	! Splitting of the tridiagonal matrix (Thomas algorithm)
	1550	!------------------------------------------------------------------------------!
	1551
	1552	IMPLICIT NONE
	1553
	1554	INTEGER :: i, k
	1555
[940]	1556	#if defined( __intel11 )
[1]	1557	REAL, DIMENSION(5,0:nx,0:nz-1) :: tri
	1558	#endif
	1559
	1560
	1561	!
	1562	!-- Splitting
	1563	DO i = 0, nx
	1564	tri(4,i,0) = tri(1,i,0)
	1565	ENDDO
	1566	DO k = 1, nz-1
	1567	DO i = 0, nx
	1568	tri(5,i,k) = tri(2,i,k) / tri(4,i,k-1)
	1569	tri(4,i,k) = tri(1,i,k) - tri(3,i,k-1) * tri(5,i,k)
	1570	ENDDO
	1571	ENDDO
	1572
	1573	END SUBROUTINE split_1dd
	1574
	1575
	1576	SUBROUTINE substi_1dd( ar, tri )
	1577
	1578	!------------------------------------------------------------------------------!
	1579	! Substitution (Forward and Backward) (Thomas algorithm)
	1580	!------------------------------------------------------------------------------!
	1581
	1582	IMPLICIT NONE
	1583
[76]	1584	INTEGER :: i, k
[1]	1585
	1586	REAL, DIMENSION(0:nx,nz) :: ar
	1587	REAL, DIMENSION(0:nx,0:nz-1) :: ar1
	1588	REAL, DIMENSION(5,0:nx,0:nz-1) :: tri
	1589
	1590	!
	1591	!-- Forward substitution
	1592	DO i = 0, nx
	1593	ar1(i,0) = ar(i,1)
	1594	ENDDO
	1595	DO k = 1, nz-1
	1596	DO i = 0, nx
	1597	ar1(i,k) = ar(i,k+1) - tri(5,i,k) * ar1(i,k-1)
	1598	ENDDO
	1599	ENDDO
	1600
	1601	!
	1602	!-- Backward substitution
[763]	1603	!-- Note, the add of 1.0E-20 in the denominator is due to avoid divisions
	1604	!-- by zero appearing if the pressure bc is set to neumann at the top of
	1605	!-- the model domain.
[1]	1606	DO i = 0, nx
[761]	1607	ar(i,nz) = ar1(i,nz-1) / ( tri(4,i,nz-1) + 1.0E-20 )
[1]	1608	ENDDO
	1609	DO k = nz-2, 0, -1
	1610	DO i = 0, nx
	1611	ar(i,k+1) = ( ar1(i,k) - tri(3,i,k) * ar(i,k+2) ) &
	1612	/ tri(4,i,k)
	1613	ENDDO
	1614	ENDDO
	1615
[76]	1616	!
	1617	!-- Indices i=0, j=0 correspond to horizontally averaged pressure.
	1618	!-- The respective values of ar should be zero at all k-levels if
	1619	!-- acceleration of horizontally averaged vertical velocity is zero.
	1620	IF ( ibc_p_b == 1 .AND. ibc_p_t == 1 ) THEN
	1621	IF ( j == 0 ) THEN
	1622	DO k = 1, nz
	1623	ar(0,k) = 0.0
	1624	ENDDO
	1625	ENDIF
	1626	ENDIF
	1627
[1]	1628	END SUBROUTINE substi_1dd
	1629
	1630	END SUBROUTINE tridia_1dd
	1631
	1632	#endif
[807]	1633	#endif
[1]	1634	END MODULE poisfft_mod

Note: See TracBrowser for help on using the repository browser.

Download in other formats:

| Impressum | ©Leibniz Universität Hannover |