Home

Context Navigation

source: palm/trunk/SOURCE/poisfft.f90 @ 1103

Last change on this file since 1103 was 1103, checked in by raasch, 11 years ago
small bugfixes; mrun and subjob scripts are made bash compatible; further adjustments for lckyuh
Property svn:keywords set to `Id`
File size: 47.4 KB

Rev	Line
[1]	1	MODULE poisfft_mod
	2
[1036]	3	!--------------------------------------------------------------------------------!
	4	! This file is part of PALM.
	5	!
	6	! PALM is free software: you can redistribute it and/or modify it under the terms
	7	! of the GNU General Public License as published by the Free Software Foundation,
	8	! either version 3 of the License, or (at your option) any later version.
	9	!
	10	! PALM is distributed in the hope that it will be useful, but WITHOUT ANY
	11	! WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
	12	! A PARTICULAR PURPOSE. See the GNU General Public License for more details.
	13	!
	14	! You should have received a copy of the GNU General Public License along with
	15	! PALM. If not, see <http://www.gnu.org/licenses/>.
	16	!
	17	! Copyright 1997-2012 Leibniz University Hannover
	18	!--------------------------------------------------------------------------------!
	19	!
[484]	20	! Current revisions:
[1]	21	! -----------------
[1103]	22	! tri, ar, and ar1 arguments in tridia-routines (2d) are removed because they
	23	! sometimes cause segmentation faults with intel 12.1 compiler
[1]	24	!
	25	! Former revisions:
	26	! -----------------
[3]	27	! $Id: poisfft.f90 1103 2013-02-20 02:15:53Z raasch $
[77]	28	!
[1093]	29	! 1092 2013-02-02 11:24:22Z raasch
	30	! unused variables removed
	31	!
[1037]	32	! 1036 2012-10-22 13:43:42Z raasch
	33	! code put under GPL (PALM 3.9)
	34	!
[1014]	35	! 1013 2012-09-21 07:03:55Z raasch
	36	! FLOAT type conversion replaced by REAL
	37	!
[1004]	38	! 1003 2012-09-14 14:35:53Z raasch
	39	! indices nxa, nya, etc. replaced by nx, ny, etc.
	40	!
[941]	41	! 940 2012-07-09 14:31:00Z raasch
	42	! special handling of tri-array as an argument in tridia_1dd routines switched
	43	! off because it caused segmentation faults with intel 12.1 compiler
	44	!
[878]	45	! 877 2012-04-03 11:21:44Z suehring
	46	! Bugfix: Avoid divisions by zero in case of using a 'neumann' bc for the
	47	! pressure at the top of the model domain.
	48	!
[810]	49	! 809 2012-01-30 13:32:58Z maronga
	50	! Bugfix: replaced .AND. and .NOT. with && and ! in the preprocessor directives
	51	!
[808]	52	! 807 2012-01-25 11:53:51Z maronga
	53	! New cpp directive "__check" implemented which is used by check_namelist_files
	54	! (most of the code is unneeded by check_namelist_files).
	55	!
[764]	56	! 763 2011-10-06 09:32:09Z suehring
	57	! Comment added concerning the last change.
	58	!
[762]	59	! 761 2011-10-05 17:58:52Z suehring
	60	! Bugfix: Avoid divisions by zero in case of using a 'neumann' bc for the
	61	! pressure at the top of the model domain.
	62	!
[697]	63	! 696 2011-03-18 07:03:49Z raasch
	64	! work_fftx removed from PRIVATE clauses in fftx_tr_xy and tr_yx_fftx
	65	!
[684]	66	! 683 2011-02-09 14:25:15Z raasch
	67	! openMP parallelization for 2d-domain-decomposition
	68	!
[668]	69	! 667 2010-12-23 12:06:00Z suehring/gryschka
	70	! ddzu replaced by ddzu_pres due to changes in zu(0)
	71	!
[623]	72	! 622 2010-12-10 08:08:13Z raasch
	73	! optional barriers included in order to speed up collective operations
	74	!
[392]	75	! 377 2009-09-04 11:09:00Z raasch
	76	! __lcmuk changed to __lc to avoid problems with Intel compiler on sgi-ice
	77	!
[198]	78	! 164 2008-05-15 08:46:15Z raasch
	79	! Arguments removed from transpose routines
	80	!
[139]	81	! 128 2007-10-26 13:11:14Z raasch
	82	! Bugfix: wavenumber calculation for even nx in routines maketri
	83	!
[90]	84	! 85 2007-05-11 09:35:14Z raasch
	85	! Bugfix: work_fft*_vec removed from some PRIVATE-declarations
	86	!
[77]	87	! 76 2007-03-29 00:58:32Z raasch
	88	! Tridiagonal coefficients adjusted for Neumann boundary conditions both at
	89	! the bottom and the top.
	90	!
[3]	91	! RCS Log replaced by Id keyword, revision history cleaned up
	92	!
[1]	93	! Revision 1.24 2006/08/04 15:00:24 raasch
	94	! Default setting of the thread number tn in case of not using OpenMP
	95	!
	96	! Revision 1.23 2006/02/23 12:48:38 raasch
	97	! Additional compiler directive in routine tridia_1dd for preventing loop
	98	! exchange on NEC-SX6
	99	!
	100	! Revision 1.20 2004/04/30 12:38:09 raasch
	101	! Parts of former poisfft_hybrid moved to this subroutine,
	102	! former subroutine changed to a module, renaming of FFT-subroutines and
	103	! -module, FFTs completely substituted by calls of fft_x and fft_y,
	104	! NAG fft used in the non-parallel case completely removed, l in maketri
	105	! is now a 1d-array, variables passed by modules instead of using parameter
	106	! lists, enlarged transposition arrays introduced
	107	!
	108	! Revision 1.1 1997/07/24 11:24:14 raasch
	109	! Initial revision
	110	!
	111	!
	112	! Description:
	113	! ------------
	114	! See below.
	115	!------------------------------------------------------------------------------!
	116
	117	!--------------------------------------------------------------------------!
	118	! poisfft !
	119	! !
	120	! Original version: Stephan Siano (pois3d) !
	121	! !
	122	! Institute of Meteorology and Climatology, University of Hannover !
	123	! Germany !
	124	! !
	125	! Version as of July 23,1996 !
	126	! !
	127	! !
	128	! Version for parallel computers: Siegfried Raasch !
	129	! !
	130	! Version as of July 03,1997 !
	131	! !
	132	! Solves the Poisson equation with a 2D spectral method !
	133	! d^2 p / dx^2 + d^2 p / dy^2 + d^2 p / dz^2 = s !
	134	! !
	135	! Input: !
	136	! real ar contains in the (nnx,nny,nnz) elements, !
	137	! starting from the element (1,nys,nxl), the !
	138	! values for s !
	139	! real work Temporary array !
	140	! !
	141	! Output: !
	142	! real ar contains the solution for p !
	143	!--------------------------------------------------------------------------!
	144
	145	USE fft_xy
	146	USE indices
	147	USE transpose_indices
	148
	149	IMPLICIT NONE
	150
	151	PRIVATE
[807]	152
[809]	153	#if ! defined ( __check )
[1]	154	PUBLIC poisfft, poisfft_init
	155
	156	INTERFACE poisfft
	157	MODULE PROCEDURE poisfft
	158	END INTERFACE poisfft
	159
	160	INTERFACE poisfft_init
	161	MODULE PROCEDURE poisfft_init
	162	END INTERFACE poisfft_init
[807]	163	#else
	164	PUBLIC poisfft_init
[1]	165
[807]	166	INTERFACE poisfft_init
	167	MODULE PROCEDURE poisfft_init
	168	END INTERFACE poisfft_init
	169	#endif
	170
[1]	171	CONTAINS
	172
	173	SUBROUTINE poisfft_init
	174
	175	CALL fft_init
	176
	177	END SUBROUTINE poisfft_init
	178
[809]	179	#if ! defined ( __check )
[1]	180	SUBROUTINE poisfft( ar, work )
	181
	182	USE cpulog
	183	USE interfaces
	184	USE pegrid
	185
	186	IMPLICIT NONE
	187
[1003]	188	REAL, DIMENSION(1:nz,nys:nyn,nxl:nxr) :: ar, work
[1]	189
	190
	191	CALL cpu_log( log_point_s(3), 'poisfft', 'start' )
	192
	193	!
	194	!-- Two-dimensional Fourier Transformation in x- and y-direction.
	195	#if defined( __parallel )
	196	IF ( pdims(2) == 1 ) THEN
	197
	198	!
	199	!-- 1d-domain-decomposition along x:
	200	!-- FFT along y and transposition y --> x
	201	CALL ffty_tr_yx( ar, work, ar )
	202
	203	!
	204	!-- FFT along x, solving the tridiagonal system and backward FFT
	205	CALL fftx_tri_fftx( ar )
	206
	207	!
	208	!-- Transposition x --> y and backward FFT along y
	209	CALL tr_xy_ffty( ar, work, ar )
	210
	211	ELSEIF ( pdims(1) == 1 ) THEN
	212
	213	!
	214	!-- 1d-domain-decomposition along y:
	215	!-- FFT along x and transposition x --> y
	216	CALL fftx_tr_xy( ar, work, ar )
	217
	218	!
	219	!-- FFT along y, solving the tridiagonal system and backward FFT
	220	CALL ffty_tri_ffty( ar )
	221
	222	!
	223	!-- Transposition y --> x and backward FFT along x
	224	CALL tr_yx_fftx( ar, work, ar )
	225
	226	ELSE
	227
	228	!
	229	!-- 2d-domain-decomposition
	230	!-- Transposition z --> x
	231	CALL cpu_log( log_point_s(5), 'transpo forward', 'start' )
[164]	232	CALL transpose_zx( ar, work, ar )
[1]	233	CALL cpu_log( log_point_s(5), 'transpo forward', 'pause' )
	234
	235	CALL cpu_log( log_point_s(4), 'fft_x', 'start' )
	236	CALL fftxp( ar, 'forward' )
	237	CALL cpu_log( log_point_s(4), 'fft_x', 'pause' )
	238
	239	!
	240	!-- Transposition x --> y
	241	CALL cpu_log( log_point_s(5), 'transpo forward', 'continue' )
[164]	242	CALL transpose_xy( ar, work, ar )
[1]	243	CALL cpu_log( log_point_s(5), 'transpo forward', 'pause' )
	244
	245	CALL cpu_log( log_point_s(7), 'fft_y', 'start' )
	246	CALL fftyp( ar, 'forward' )
	247	CALL cpu_log( log_point_s(7), 'fft_y', 'pause' )
	248
	249	!
	250	!-- Transposition y --> z
	251	CALL cpu_log( log_point_s(5), 'transpo forward', 'continue' )
[164]	252	CALL transpose_yz( ar, work, ar )
[1]	253	CALL cpu_log( log_point_s(5), 'transpo forward', 'stop' )
	254
	255	!
	256	!-- Solve the Poisson equation in z-direction in cartesian space.
	257	CALL cpu_log( log_point_s(6), 'tridia', 'start' )
	258	CALL tridia( ar )
	259	CALL cpu_log( log_point_s(6), 'tridia', 'stop' )
	260
	261	!
	262	!-- Inverse Fourier Transformation
	263	!-- Transposition z --> y
	264	CALL cpu_log( log_point_s(8), 'transpo invers', 'start' )
[164]	265	CALL transpose_zy( ar, work, ar )
[1]	266	CALL cpu_log( log_point_s(8), 'transpo invers', 'pause' )
	267
	268	CALL cpu_log( log_point_s(7), 'fft_y', 'continue' )
	269	CALL fftyp( ar, 'backward' )
	270	CALL cpu_log( log_point_s(7), 'fft_y', 'stop' )
	271
	272	!
	273	!-- Transposition y --> x
	274	CALL cpu_log( log_point_s(8), 'transpo invers', 'continue' )
[164]	275	CALL transpose_yx( ar, work, ar )
[1]	276	CALL cpu_log( log_point_s(8), 'transpo invers', 'pause' )
	277
	278	CALL cpu_log( log_point_s(4), 'fft_x', 'continue' )
	279	CALL fftxp( ar, 'backward' )
	280	CALL cpu_log( log_point_s(4), 'fft_x', 'stop' )
	281
	282	!
	283	!-- Transposition x --> z
	284	CALL cpu_log( log_point_s(8), 'transpo invers', 'continue' )
[164]	285	CALL transpose_xz( ar, work, ar )
[1]	286	CALL cpu_log( log_point_s(8), 'transpo invers', 'stop' )
	287
	288	ENDIF
	289
	290	#else
	291
	292	!
	293	!-- Two-dimensional Fourier Transformation along x- and y-direction.
	294	CALL cpu_log( log_point_s(4), 'fft_x', 'start' )
	295	CALL fftx( ar, 'forward' )
	296	CALL cpu_log( log_point_s(4), 'fft_x', 'pause' )
	297	CALL cpu_log( log_point_s(7), 'fft_y', 'start' )
	298	CALL ffty( ar, 'forward' )
	299	CALL cpu_log( log_point_s(7), 'fft_y', 'pause' )
	300
	301	!
	302	!-- Solve the Poisson equation in z-direction in cartesian space.
	303	CALL cpu_log( log_point_s(6), 'tridia', 'start' )
	304	CALL tridia( ar )
	305	CALL cpu_log( log_point_s(6), 'tridia', 'stop' )
	306
	307	!
	308	!-- Inverse Fourier Transformation.
	309	CALL cpu_log( log_point_s(7), 'fft_y', 'continue' )
	310	CALL ffty( ar, 'backward' )
	311	CALL cpu_log( log_point_s(7), 'fft_y', 'stop' )
	312	CALL cpu_log( log_point_s(4), 'fft_x', 'continue' )
	313	CALL fftx( ar, 'backward' )
	314	CALL cpu_log( log_point_s(4), 'fft_x', 'stop' )
	315
	316	#endif
	317
	318	CALL cpu_log( log_point_s(3), 'poisfft', 'stop' )
	319
	320	END SUBROUTINE poisfft
	321
	322
	323
	324	SUBROUTINE tridia( ar )
	325
	326	!------------------------------------------------------------------------------!
	327	! solves the linear system of equations:
	328	!
	329	! -(4 pi^2(i^2/(dx^2nnx^2)+j^2/(dy^2nny^2))+
	330	! 1/(dzu(k)dzw(k))+1/(dzu(k-1)dzw(k)))*p(i,j,k)+
	331	! 1/(dzu(k)dzw(k))p(i,j,k+1)+1/(dzu(k-1)dzw(k))p(i,j,k-1)=d(i,j,k)
	332	!
	333	! by using the Thomas algorithm
	334	!------------------------------------------------------------------------------!
	335
	336	USE arrays_3d
	337
	338	IMPLICIT NONE
	339
	340	INTEGER :: i, j, k, nnyh
	341
	342	REAL, DIMENSION(nxl_z:nxr_z,0:nz-1) :: ar1
	343	REAL, DIMENSION(5,nxl_z:nxr_z,0:nz-1) :: tri
	344
	345	#if defined( __parallel )
[1003]	346	REAL :: ar(nxl_z:nxr_z,nys_z:nyn_z,1:nz)
[1]	347	#else
	348	REAL :: ar(1:nz,nys_z:nyn_z,nxl_z:nxr_z)
	349	#endif
	350
	351
	352	nnyh = (ny+1) / 2
	353
	354	!
	355	!-- Define constant elements of the tridiagonal matrix.
[683]	356	!$OMP PARALLEL PRIVATE ( k, i )
	357	!$OMP DO
[1]	358	DO k = 0, nz-1
	359	DO i = nxl_z, nxr_z
[667]	360	tri(2,i,k) = ddzu_pres(k+1) * ddzw(k+1)
	361	tri(3,i,k) = ddzu_pres(k+2) * ddzw(k+1)
[1]	362	ENDDO
	363	ENDDO
[683]	364	!$OMP END PARALLEL
[1]	365
	366	#if defined( __parallel )
	367	!
	368	!-- Repeat for all y-levels.
[683]	369	!$OMP PARALLEL FIRSTPRIVATE( tri ) PRIVATE ( ar1, j )
	370	!$OMP DO
[1]	371	DO j = nys_z, nyn_z
	372	IF ( j <= nnyh ) THEN
[1103]	373	CALL maketri( j )
[1]	374	ELSE
[1103]	375	CALL maketri( ny+1-j )
[1]	376	ENDIF
[1103]	377	CALL split
	378	CALL substi( j )
[1]	379	ENDDO
[683]	380	!$OMP END PARALLEL
[1]	381	#else
	382	!
	383	!-- First y-level.
[1103]	384	CALL maketri( nys_z )
	385	CALL split
	386	CALL substi( 0 )
[1]	387
	388	!
	389	!-- Further y-levels.
	390	DO j = 1, nnyh - 1
[1103]	391	CALL maketri( j )
	392	CALL split
	393	CALL substi( j )
	394	CALL substi( ny+1-j )
[1]	395	ENDDO
[1103]	396	CALL maketri( nnyh )
	397	CALL split
	398	CALL substi( nnyh+nys )
[1]	399	#endif
	400
	401	CONTAINS
	402
[1103]	403	SUBROUTINE maketri( j )
[1]	404
	405	!------------------------------------------------------------------------------!
	406	! Computes the i- and j-dependent component of the matrix
	407	!------------------------------------------------------------------------------!
	408
	409	USE arrays_3d
	410	USE constants
	411	USE control_parameters
	412	USE grid_variables
	413
	414	IMPLICIT NONE
	415
	416	INTEGER :: i, j, k, nnxh
	417	REAL :: a, c
	418	REAL :: ll(nxl_z:nxr_z)
	419
	420
	421	nnxh = ( nx + 1 ) / 2
	422
	423	!
	424	!-- Provide the tridiagonal matrix for solution of the Poisson equation in
	425	!-- Fourier space. The coefficients are computed following the method of
	426	!-- Schmidt et al. (DFVLR-Mitteilung 84-15), which departs from Stephan
	427	!-- Siano's original version by discretizing the Poisson equation,
	428	!-- before it is Fourier-transformed
	429	#if defined( __parallel )
	430	DO i = nxl_z, nxr_z
[128]	431	IF ( i >= 0 .AND. i <= nnxh ) THEN
[1]	432	ll(i) = 2.0 * ( 1.0 - COS( ( 2.0 * pi * i ) / &
[1013]	433	REAL( nx+1 ) ) ) / ( dx * dx ) + &
[1]	434	2.0 * ( 1.0 - COS( ( 2.0 * pi * j ) / &
[1013]	435	REAL( ny+1 ) ) ) / ( dy * dy )
[1]	436	ELSE
	437	ll(i) = 2.0 * ( 1.0 - COS( ( 2.0 * pi * ( nx+1-i ) ) / &
[1013]	438	REAL( nx+1 ) ) ) / ( dx * dx ) + &
[1]	439	2.0 * ( 1.0 - COS( ( 2.0 * pi * j ) / &
[1013]	440	REAL( ny+1 ) ) ) / ( dy * dy )
[1]	441	ENDIF
	442	DO k = 0,nz-1
[667]	443	a = -1.0 * ddzu_pres(k+2) * ddzw(k+1)
	444	c = -1.0 * ddzu_pres(k+1) * ddzw(k+1)
[1]	445	tri(1,i,k) = a + c - ll(i)
	446	ENDDO
	447	ENDDO
	448	#else
	449	DO i = 0, nnxh
[1013]	450	ll(i) = 2.0 * ( 1.0 - COS( ( 2.0 * pi * i ) / REAL( nx+1 ) ) ) / &
[1]	451	( dx * dx ) + &
[1013]	452	2.0 * ( 1.0 - COS( ( 2.0 * pi * j ) / REAL( ny+1 ) ) ) / &
[1]	453	( dy * dy )
	454	DO k = 0, nz-1
[667]	455	a = -1.0 * ddzu_pres(k+2) * ddzw(k+1)
	456	c = -1.0 * ddzu_pres(k+1) * ddzw(k+1)
[1]	457	tri(1,i,k) = a + c - ll(i)
	458	IF ( i >= 1 .and. i < nnxh ) THEN
	459	tri(1,nx+1-i,k) = tri(1,i,k)
	460	ENDIF
	461	ENDDO
	462	ENDDO
	463	#endif
	464	IF ( ibc_p_b == 1 .OR. ibc_p_b == 2 ) THEN
	465	DO i = nxl_z, nxr_z
	466	tri(1,i,0) = tri(1,i,0) + tri(2,i,0)
	467	ENDDO
	468	ENDIF
	469	IF ( ibc_p_t == 1 ) THEN
	470	DO i = nxl_z, nxr_z
	471	tri(1,i,nz-1) = tri(1,i,nz-1) + tri(3,i,nz-1)
	472	ENDDO
	473	ENDIF
	474
	475	END SUBROUTINE maketri
	476
	477
[1103]	478	SUBROUTINE substi( j )
[1]	479
	480	!------------------------------------------------------------------------------!
	481	! Substitution (Forward and Backward) (Thomas algorithm)
	482	!------------------------------------------------------------------------------!
	483
[76]	484	USE control_parameters
	485
[1]	486	IMPLICIT NONE
	487
	488	INTEGER :: i, j, k
	489
	490	!
	491	!-- Forward substitution.
	492	DO i = nxl_z, nxr_z
	493	#if defined( __parallel )
	494	ar1(i,0) = ar(i,j,1)
	495	#else
	496	ar1(i,0) = ar(1,j,i)
	497	#endif
	498	ENDDO
	499	DO k = 1, nz - 1
	500	DO i = nxl_z, nxr_z
	501	#if defined( __parallel )
	502	ar1(i,k) = ar(i,j,k+1) - tri(5,i,k) * ar1(i,k-1)
	503	#else
	504	ar1(i,k) = ar(k+1,j,i) - tri(5,i,k) * ar1(i,k-1)
	505	#endif
	506	ENDDO
	507	ENDDO
	508
	509	!
[877]	510	!-- Backward substitution
	511	!-- Note, the 1.0E-20 in the denominator is due to avoid divisions
	512	!-- by zero appearing if the pressure bc is set to neumann at the top of
	513	!-- the model domain.
[1]	514	DO i = nxl_z, nxr_z
	515	#if defined( __parallel )
[877]	516	ar(i,j,nz) = ar1(i,nz-1) / ( tri(4,i,nz-1) + 1.0E-20 )
[1]	517	#else
[877]	518	ar(nz,j,i) = ar1(i,nz-1) / ( tri(4,i,nz-1) + 1.0E-20 )
[1]	519	#endif
	520	ENDDO
	521	DO k = nz-2, 0, -1
	522	DO i = nxl_z, nxr_z
	523	#if defined( __parallel )
	524	ar(i,j,k+1) = ( ar1(i,k) - tri(3,i,k) * ar(i,j,k+2) ) &
	525	/ tri(4,i,k)
	526	#else
	527	ar(k+1,j,i) = ( ar1(i,k) - tri(3,i,k) * ar(k+2,j,i) ) &
	528	/ tri(4,i,k)
	529	#endif
	530	ENDDO
	531	ENDDO
	532
[76]	533	!
	534	!-- Indices i=0, j=0 correspond to horizontally averaged pressure.
	535	!-- The respective values of ar should be zero at all k-levels if
	536	!-- acceleration of horizontally averaged vertical velocity is zero.
	537	IF ( ibc_p_b == 1 .AND. ibc_p_t == 1 ) THEN
	538	IF ( j == 0 .AND. nxl_z == 0 ) THEN
	539	#if defined( __parallel )
	540	DO k = 1, nz
	541	ar(nxl_z,j,k) = 0.0
	542	ENDDO
	543	#else
	544	DO k = 1, nz
	545	ar(k,j,nxl_z) = 0.0
	546	ENDDO
	547	#endif
	548	ENDIF
	549	ENDIF
	550
[1]	551	END SUBROUTINE substi
	552
	553
[1103]	554	SUBROUTINE split
[1]	555
	556	!------------------------------------------------------------------------------!
	557	! Splitting of the tridiagonal matrix (Thomas algorithm)
	558	!------------------------------------------------------------------------------!
	559
	560	IMPLICIT NONE
	561
	562	INTEGER :: i, k
	563
	564	!
	565	!-- Splitting.
	566	DO i = nxl_z, nxr_z
	567	tri(4,i,0) = tri(1,i,0)
	568	ENDDO
	569	DO k = 1, nz-1
	570	DO i = nxl_z, nxr_z
	571	tri(5,i,k) = tri(2,i,k) / tri(4,i,k-1)
	572	tri(4,i,k) = tri(1,i,k) - tri(3,i,k-1) * tri(5,i,k)
	573	ENDDO
	574	ENDDO
	575
	576	END SUBROUTINE split
	577
	578	END SUBROUTINE tridia
	579
	580
	581	#if defined( __parallel )
	582	SUBROUTINE fftxp( ar, direction )
	583
	584	!------------------------------------------------------------------------------!
	585	! Fourier-transformation along x-direction Parallelized version
	586	!------------------------------------------------------------------------------!
	587
	588	IMPLICIT NONE
	589
	590	CHARACTER (LEN=*) :: direction
	591	INTEGER :: j, k
[1003]	592	REAL :: ar(0:nx,nys_x:nyn_x,nzb_x:nzt_x)
[1]	593
	594	!
	595	!-- Performing the fft with one of the methods implemented
[683]	596	!$OMP PARALLEL PRIVATE ( j, k )
	597	!$OMP DO
[1]	598	DO k = nzb_x, nzt_x
	599	DO j = nys_x, nyn_x
	600	CALL fft_x( ar(0:nx,j,k), direction )
	601	ENDDO
	602	ENDDO
[683]	603	!$OMP END PARALLEL
[1]	604
	605	END SUBROUTINE fftxp
	606
	607	#else
	608	SUBROUTINE fftx( ar, direction )
	609
	610	!------------------------------------------------------------------------------!
	611	! Fourier-transformation along x-direction Non parallel version
	612	!------------------------------------------------------------------------------!
	613
	614	IMPLICIT NONE
	615
	616	CHARACTER (LEN=*) :: direction
	617	INTEGER :: i, j, k
	618	REAL :: ar(1:nz,0:ny,0:nx)
	619
	620	!
	621	!-- Performing the fft with one of the methods implemented
[683]	622	!$OMP PARALLEL PRIVATE ( j, k )
	623	!$OMP DO
[1]	624	DO k = 1, nz
	625	DO j = 0, ny
	626	CALL fft_x( ar(k,j,0:nx), direction )
	627	ENDDO
	628	ENDDO
[683]	629	!$OMP END PARALLEL
[1]	630
	631	END SUBROUTINE fftx
	632	#endif
	633
	634
	635	#if defined( __parallel )
	636	SUBROUTINE fftyp( ar, direction )
	637
	638	!------------------------------------------------------------------------------!
	639	! Fourier-transformation along y-direction Parallelized version
	640	!------------------------------------------------------------------------------!
	641
	642	IMPLICIT NONE
	643
	644	CHARACTER (LEN=*) :: direction
	645	INTEGER :: i, k
[1003]	646	REAL :: ar(0:ny,nxl_y:nxr_y,nzb_y:nzt_y)
[1]	647
	648	!
	649	!-- Performing the fft with one of the methods implemented
[683]	650	!$OMP PARALLEL PRIVATE ( i, k )
	651	!$OMP DO
[1]	652	DO k = nzb_y, nzt_y
	653	DO i = nxl_y, nxr_y
	654	CALL fft_y( ar(0:ny,i,k), direction )
	655	ENDDO
	656	ENDDO
[683]	657	!$OMP END PARALLEL
[1]	658
	659	END SUBROUTINE fftyp
	660
	661	#else
	662	SUBROUTINE ffty( ar, direction )
	663
	664	!------------------------------------------------------------------------------!
	665	! Fourier-transformation along y-direction Non parallel version
	666	!------------------------------------------------------------------------------!
	667
	668	IMPLICIT NONE
	669
	670	CHARACTER (LEN=*) :: direction
	671	INTEGER :: i, k
	672	REAL :: ar(1:nz,0:ny,0:nx)
	673
	674	!
	675	!-- Performing the fft with one of the methods implemented
[683]	676	!$OMP PARALLEL PRIVATE ( i, k )
	677	!$OMP DO
[1]	678	DO k = 1, nz
	679	DO i = 0, nx
	680	CALL fft_y( ar(k,0:ny,i), direction )
	681	ENDDO
	682	ENDDO
[683]	683	!$OMP END PARALLEL
[1]	684
	685	END SUBROUTINE ffty
	686	#endif
	687
	688	#if defined( __parallel )
	689	SUBROUTINE ffty_tr_yx( f_in, work, f_out )
	690
	691	!------------------------------------------------------------------------------!
	692	! Fourier-transformation along y with subsequent transposition y --> x for
	693	! a 1d-decomposition along x
	694	!
	695	! ATTENTION: The performance of this routine is much faster on the NEC-SX6,
	696	! if the first index of work_ffty_vec is odd. Otherwise
	697	! memory bank conflicts may occur (especially if the index is a
	698	! multiple of 128). That's why work_ffty_vec is dimensioned as
	699	! 0:ny+1.
	700	! Of course, this will not work if users are using an odd number
	701	! of gridpoints along y.
	702	!------------------------------------------------------------------------------!
	703
	704	USE control_parameters
	705	USE cpulog
	706	USE indices
	707	USE interfaces
	708	USE pegrid
	709	USE transpose_indices
	710
	711	IMPLICIT NONE
	712
	713	INTEGER :: i, iend, iouter, ir, j, k
	714	INTEGER, PARAMETER :: stridex = 4
	715
	716	REAL, DIMENSION(0:ny,stridex) :: work_ffty
	717	#if defined( __nec )
	718	REAL, DIMENSION(0:ny+1,1:nz,nxl:nxr) :: work_ffty_vec
	719	#endif
[1003]	720	REAL, DIMENSION(1:nz,0:ny,nxl:nxr) :: f_in
	721	REAL, DIMENSION(nnx,1:nz,nys_x:nyn_x,pdims(1)) :: f_out
	722	REAL, DIMENSION(nxl:nxr,1:nz,0:ny) :: work
[1]	723
	724	!
	725	!-- Carry out the FFT along y, where all data are present due to the
	726	!-- 1d-decomposition along x. Resort the data in a way that x becomes
	727	!-- the first index.
	728	CALL cpu_log( log_point_s(7), 'fft_y', 'start' )
	729
	730	IF ( host(1:3) == 'nec' ) THEN
	731	#if defined( __nec )
	732	!
	733	!-- Code optimized for vector processors
[85]	734	!$OMP PARALLEL PRIVATE ( i, j, k )
[1]	735	!$OMP DO
	736	DO i = nxl, nxr
	737
	738	DO j = 0, ny
	739	DO k = 1, nz
	740	work_ffty_vec(j,k,i) = f_in(k,j,i)
	741	ENDDO
	742	ENDDO
	743
	744	CALL fft_y_m( work_ffty_vec(:,:,i), ny+1, 'forward' )
	745
	746	ENDDO
	747
	748	!$OMP DO
	749	DO k = 1, nz
	750	DO j = 0, ny
	751	DO i = nxl, nxr
	752	work(i,k,j) = work_ffty_vec(j,k,i)
	753	ENDDO
	754	ENDDO
	755	ENDDO
	756	!$OMP END PARALLEL
	757	#endif
	758
	759	ELSE
	760
	761	!
	762	!-- Cache optimized code.
	763	!-- The i-(x-)direction is split into a strided outer loop and an inner
	764	!-- loop for better cache performance
	765	!$OMP PARALLEL PRIVATE (i,iend,iouter,ir,j,k,work_ffty)
	766	!$OMP DO
	767	DO iouter = nxl, nxr, stridex
	768
	769	iend = MIN( iouter+stridex-1, nxr ) ! Upper bound for inner i loop
	770
	771	DO k = 1, nz
	772
	773	DO i = iouter, iend
	774
	775	ir = i-iouter+1 ! counter within a stride
	776	DO j = 0, ny
	777	work_ffty(j,ir) = f_in(k,j,i)
	778	ENDDO
	779	!
	780	!-- FFT along y
	781	CALL fft_y( work_ffty(:,ir), 'forward' )
	782
	783	ENDDO
	784
	785	!
	786	!-- Resort
	787	DO j = 0, ny
	788	DO i = iouter, iend
	789	work(i,k,j) = work_ffty(j,i-iouter+1)
	790	ENDDO
	791	ENDDO
	792
	793	ENDDO
	794
	795	ENDDO
	796	!$OMP END PARALLEL
	797
	798	ENDIF
	799	CALL cpu_log( log_point_s(7), 'fft_y', 'pause' )
	800
	801	!
	802	!-- Transpose array
	803	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
[622]	804	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
[1]	805	CALL MPI_ALLTOALL( work(nxl,1,0), sendrecvcount_xy, MPI_REAL, &
	806	f_out(1,1,nys_x,1), sendrecvcount_xy, MPI_REAL, &
	807	comm1dx, ierr )
	808	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
	809
	810	END SUBROUTINE ffty_tr_yx
	811
	812
	813	SUBROUTINE tr_xy_ffty( f_in, work, f_out )
	814
	815	!------------------------------------------------------------------------------!
	816	! Transposition x --> y with a subsequent backward Fourier transformation for
	817	! a 1d-decomposition along x
	818	!------------------------------------------------------------------------------!
	819
	820	USE control_parameters
	821	USE cpulog
	822	USE indices
	823	USE interfaces
	824	USE pegrid
	825	USE transpose_indices
	826
	827	IMPLICIT NONE
	828
	829	INTEGER :: i, iend, iouter, ir, j, k
	830	INTEGER, PARAMETER :: stridex = 4
	831
	832	REAL, DIMENSION(0:ny,stridex) :: work_ffty
	833	#if defined( __nec )
	834	REAL, DIMENSION(0:ny+1,1:nz,nxl:nxr) :: work_ffty_vec
	835	#endif
[1003]	836	REAL, DIMENSION(nnx,1:nz,nys_x:nyn_x,pdims(1)) :: f_in
	837	REAL, DIMENSION(1:nz,0:ny,nxl:nxr) :: f_out
	838	REAL, DIMENSION(nxl:nxr,1:nz,0:ny) :: work
[1]	839
	840	!
	841	!-- Transpose array
	842	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
[622]	843	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
[1]	844	CALL MPI_ALLTOALL( f_in(1,1,nys_x,1), sendrecvcount_xy, MPI_REAL, &
	845	work(nxl,1,0), sendrecvcount_xy, MPI_REAL, &
	846	comm1dx, ierr )
	847	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
	848
	849	!
	850	!-- Resort the data in a way that y becomes the first index and carry out the
	851	!-- backward fft along y.
	852	CALL cpu_log( log_point_s(7), 'fft_y', 'continue' )
	853
	854	IF ( host(1:3) == 'nec' ) THEN
	855	#if defined( __nec )
	856	!
	857	!-- Code optimized for vector processors
[85]	858	!$OMP PARALLEL PRIVATE ( i, j, k )
[1]	859	!$OMP DO
	860	DO k = 1, nz
	861	DO j = 0, ny
	862	DO i = nxl, nxr
	863	work_ffty_vec(j,k,i) = work(i,k,j)
	864	ENDDO
	865	ENDDO
	866	ENDDO
	867
	868	!$OMP DO
	869	DO i = nxl, nxr
	870
	871	CALL fft_y_m( work_ffty_vec(:,:,i), ny+1, 'backward' )
	872
	873	DO j = 0, ny
	874	DO k = 1, nz
	875	f_out(k,j,i) = work_ffty_vec(j,k,i)
	876	ENDDO
	877	ENDDO
	878
	879	ENDDO
	880	!$OMP END PARALLEL
	881	#endif
	882
	883	ELSE
	884
	885	!
	886	!-- Cache optimized code.
	887	!-- The i-(x-)direction is split into a strided outer loop and an inner
	888	!-- loop for better cache performance
	889	!$OMP PARALLEL PRIVATE ( i, iend, iouter, ir, j, k, work_ffty )
	890	!$OMP DO
	891	DO iouter = nxl, nxr, stridex
	892
	893	iend = MIN( iouter+stridex-1, nxr ) ! Upper bound for inner i loop
	894
	895	DO k = 1, nz
	896	!
	897	!-- Resort
	898	DO j = 0, ny
	899	DO i = iouter, iend
	900	work_ffty(j,i-iouter+1) = work(i,k,j)
	901	ENDDO
	902	ENDDO
	903
	904	DO i = iouter, iend
	905
	906	!
	907	!-- FFT along y
	908	ir = i-iouter+1 ! counter within a stride
	909	CALL fft_y( work_ffty(:,ir), 'backward' )
	910
	911	DO j = 0, ny
	912	f_out(k,j,i) = work_ffty(j,ir)
	913	ENDDO
	914	ENDDO
	915
	916	ENDDO
	917
	918	ENDDO
	919	!$OMP END PARALLEL
	920
	921	ENDIF
	922
	923	CALL cpu_log( log_point_s(7), 'fft_y', 'stop' )
	924
	925	END SUBROUTINE tr_xy_ffty
	926
	927
	928	SUBROUTINE fftx_tri_fftx( ar )
	929
	930	!------------------------------------------------------------------------------!
	931	! FFT along x, solution of the tridiagonal system and backward FFT for
	932	! a 1d-decomposition along x
	933	!
	934	! WARNING: this subroutine may still not work for hybrid parallelization
	935	! with OpenMP (for possible necessary changes see the original
	936	! routine poisfft_hybrid, developed by Klaus Ketelsen, May 2002)
	937	!------------------------------------------------------------------------------!
	938
	939	USE control_parameters
	940	USE cpulog
	941	USE grid_variables
	942	USE indices
	943	USE interfaces
	944	USE pegrid
	945	USE transpose_indices
	946
	947	IMPLICIT NONE
	948
	949	INTEGER :: i, j, k, m, n, omp_get_thread_num, tn
	950
[1003]	951	REAL, DIMENSION(0:nx) :: work_fftx
	952	REAL, DIMENSION(0:nx,1:nz) :: work_trix
	953	REAL, DIMENSION(nnx,1:nz,nys_x:nyn_x,pdims(1)) :: ar
	954	REAL, DIMENSION(:,:,:,:), ALLOCATABLE :: tri
[1]	955
	956
	957	CALL cpu_log( log_point_s(33), 'fft_x + tridia', 'start' )
	958
	959	ALLOCATE( tri(5,0:nx,0:nz-1,0:threads_per_task-1) )
	960
	961	tn = 0 ! Default thread number in case of one thread
	962	!$OMP PARALLEL DO PRIVATE ( i, j, k, m, n, tn, work_fftx, work_trix )
	963	DO j = nys_x, nyn_x
	964
	965	!$ tn = omp_get_thread_num()
	966
	967	IF ( host(1:3) == 'nec' ) THEN
	968	!
	969	!-- Code optimized for vector processors
	970	DO k = 1, nz
	971
	972	m = 0
	973	DO n = 1, pdims(1)
[1003]	974	DO i = 1, nnx
[1]	975	work_trix(m,k) = ar(i,k,j,n)
	976	m = m + 1
	977	ENDDO
	978	ENDDO
	979
	980	ENDDO
	981
	982	CALL fft_x_m( work_trix, 'forward' )
	983
	984	ELSE
	985	!
	986	!-- Cache optimized code
	987	DO k = 1, nz
	988
	989	m = 0
	990	DO n = 1, pdims(1)
[1003]	991	DO i = 1, nnx
[1]	992	work_fftx(m) = ar(i,k,j,n)
	993	m = m + 1
	994	ENDDO
	995	ENDDO
	996
	997	CALL fft_x( work_fftx, 'forward' )
	998
	999	DO i = 0, nx
	1000	work_trix(i,k) = work_fftx(i)
	1001	ENDDO
	1002
	1003	ENDDO
	1004
	1005	ENDIF
	1006
	1007	!
	1008	!-- Solve the linear equation system
	1009	CALL tridia_1dd( ddx2, ddy2, nx, ny, j, work_trix, tri(:,:,:,tn) )
	1010
	1011	IF ( host(1:3) == 'nec' ) THEN
	1012	!
	1013	!-- Code optimized for vector processors
	1014	CALL fft_x_m( work_trix, 'backward' )
	1015
	1016	DO k = 1, nz
	1017
	1018	m = 0
	1019	DO n = 1, pdims(1)
[1003]	1020	DO i = 1, nnx
[1]	1021	ar(i,k,j,n) = work_trix(m,k)
	1022	m = m + 1
	1023	ENDDO
	1024	ENDDO
	1025
	1026	ENDDO
	1027
	1028	ELSE
	1029	!
	1030	!-- Cache optimized code
	1031	DO k = 1, nz
	1032
	1033	DO i = 0, nx
	1034	work_fftx(i) = work_trix(i,k)
	1035	ENDDO
	1036
	1037	CALL fft_x( work_fftx, 'backward' )
	1038
	1039	m = 0
	1040	DO n = 1, pdims(1)
[1003]	1041	DO i = 1, nnx
[1]	1042	ar(i,k,j,n) = work_fftx(m)
	1043	m = m + 1
	1044	ENDDO
	1045	ENDDO
	1046
	1047	ENDDO
	1048
	1049	ENDIF
	1050
	1051	ENDDO
	1052
	1053	DEALLOCATE( tri )
	1054
	1055	CALL cpu_log( log_point_s(33), 'fft_x + tridia', 'stop' )
	1056
	1057	END SUBROUTINE fftx_tri_fftx
	1058
	1059
	1060	SUBROUTINE fftx_tr_xy( f_in, work, f_out )
	1061
	1062	!------------------------------------------------------------------------------!
	1063	! Fourier-transformation along x with subsequent transposition x --> y for
	1064	! a 1d-decomposition along y
	1065	!
	1066	! ATTENTION: The NEC-branch of this routine may significantly profit from
	1067	! further optimizations. So far, performance is much worse than
	1068	! for routine ffty_tr_yx (more than three times slower).
	1069	!------------------------------------------------------------------------------!
	1070
	1071	USE control_parameters
	1072	USE cpulog
	1073	USE indices
	1074	USE interfaces
	1075	USE pegrid
	1076	USE transpose_indices
	1077
	1078	IMPLICIT NONE
	1079
	1080	INTEGER :: i, j, k
	1081
[1003]	1082	REAL, DIMENSION(0:nx,1:nz,nys:nyn) :: work_fftx
	1083	REAL, DIMENSION(1:nz,nys:nyn,0:nx) :: f_in
	1084	REAL, DIMENSION(nny,1:nz,nxl_y:nxr_y,pdims(2)) :: f_out
	1085	REAL, DIMENSION(nys:nyn,1:nz,0:nx) :: work
[1]	1086
	1087	!
	1088	!-- Carry out the FFT along x, where all data are present due to the
	1089	!-- 1d-decomposition along y. Resort the data in a way that y becomes
	1090	!-- the first index.
	1091	CALL cpu_log( log_point_s(4), 'fft_x', 'start' )
	1092
	1093	IF ( host(1:3) == 'nec' ) THEN
	1094	!
	1095	!-- Code for vector processors
[85]	1096	!$OMP PARALLEL PRIVATE ( i, j, k )
[1]	1097	!$OMP DO
	1098	DO i = 0, nx
	1099
	1100	DO j = nys, nyn
	1101	DO k = 1, nz
	1102	work_fftx(i,k,j) = f_in(k,j,i)
	1103	ENDDO
	1104	ENDDO
	1105
	1106	ENDDO
	1107
	1108	!$OMP DO
	1109	DO j = nys, nyn
	1110
	1111	CALL fft_x_m( work_fftx(:,:,j), 'forward' )
	1112
	1113	DO k = 1, nz
	1114	DO i = 0, nx
	1115	work(j,k,i) = work_fftx(i,k,j)
	1116	ENDDO
	1117	ENDDO
	1118
	1119	ENDDO
	1120	!$OMP END PARALLEL
	1121
	1122	ELSE
	1123
	1124	!
	1125	!-- Cache optimized code (there might be still a potential for better
	1126	!-- optimization).
[696]	1127	!$OMP PARALLEL PRIVATE (i,j,k)
[1]	1128	!$OMP DO
	1129	DO i = 0, nx
	1130
	1131	DO j = nys, nyn
	1132	DO k = 1, nz
	1133	work_fftx(i,k,j) = f_in(k,j,i)
	1134	ENDDO
	1135	ENDDO
	1136
	1137	ENDDO
	1138
	1139	!$OMP DO
	1140	DO j = nys, nyn
	1141	DO k = 1, nz
	1142
	1143	CALL fft_x( work_fftx(0:nx,k,j), 'forward' )
	1144
	1145	DO i = 0, nx
	1146	work(j,k,i) = work_fftx(i,k,j)
	1147	ENDDO
	1148	ENDDO
	1149
	1150	ENDDO
	1151	!$OMP END PARALLEL
	1152
	1153	ENDIF
	1154	CALL cpu_log( log_point_s(4), 'fft_x', 'pause' )
	1155
	1156	!
	1157	!-- Transpose array
	1158	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
[622]	1159	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
[1]	1160	CALL MPI_ALLTOALL( work(nys,1,0), sendrecvcount_xy, MPI_REAL, &
	1161	f_out(1,1,nxl_y,1), sendrecvcount_xy, MPI_REAL, &
	1162	comm1dy, ierr )
	1163	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
	1164
	1165	END SUBROUTINE fftx_tr_xy
	1166
	1167
	1168	SUBROUTINE tr_yx_fftx( f_in, work, f_out )
	1169
	1170	!------------------------------------------------------------------------------!
	1171	! Transposition y --> x with a subsequent backward Fourier transformation for
	1172	! a 1d-decomposition along x
	1173	!------------------------------------------------------------------------------!
	1174
	1175	USE control_parameters
	1176	USE cpulog
	1177	USE indices
	1178	USE interfaces
	1179	USE pegrid
	1180	USE transpose_indices
	1181
	1182	IMPLICIT NONE
	1183
	1184	INTEGER :: i, j, k
	1185
[1003]	1186	REAL, DIMENSION(0:nx,1:nz,nys:nyn) :: work_fftx
	1187	REAL, DIMENSION(nny,1:nz,nxl_y:nxr_y,pdims(2)) :: f_in
	1188	REAL, DIMENSION(1:nz,nys:nyn,0:nx) :: f_out
	1189	REAL, DIMENSION(nys:nyn,1:nz,0:nx) :: work
[1]	1190
	1191	!
	1192	!-- Transpose array
	1193	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
[622]	1194	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
[1]	1195	CALL MPI_ALLTOALL( f_in(1,1,nxl_y,1), sendrecvcount_xy, MPI_REAL, &
	1196	work(nys,1,0), sendrecvcount_xy, MPI_REAL, &
	1197	comm1dy, ierr )
	1198	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
	1199
	1200	!
	1201	!-- Carry out the FFT along x, where all data are present due to the
	1202	!-- 1d-decomposition along y. Resort the data in a way that y becomes
	1203	!-- the first index.
	1204	CALL cpu_log( log_point_s(4), 'fft_x', 'continue' )
	1205
	1206	IF ( host(1:3) == 'nec' ) THEN
	1207	!
	1208	!-- Code optimized for vector processors
[85]	1209	!$OMP PARALLEL PRIVATE ( i, j, k )
[1]	1210	!$OMP DO
	1211	DO j = nys, nyn
	1212
	1213	DO k = 1, nz
	1214	DO i = 0, nx
	1215	work_fftx(i,k,j) = work(j,k,i)
	1216	ENDDO
	1217	ENDDO
	1218
	1219	CALL fft_x_m( work_fftx(:,:,j), 'backward' )
	1220
	1221	ENDDO
	1222
	1223	!$OMP DO
	1224	DO i = 0, nx
	1225	DO j = nys, nyn
	1226	DO k = 1, nz
	1227	f_out(k,j,i) = work_fftx(i,k,j)
	1228	ENDDO
	1229	ENDDO
	1230	ENDDO
	1231	!$OMP END PARALLEL
	1232
	1233	ELSE
	1234
	1235	!
	1236	!-- Cache optimized code (there might be still a potential for better
	1237	!-- optimization).
[696]	1238	!$OMP PARALLEL PRIVATE (i,j,k)
[1]	1239	!$OMP DO
	1240	DO j = nys, nyn
	1241	DO k = 1, nz
	1242
	1243	DO i = 0, nx
	1244	work_fftx(i,k,j) = work(j,k,i)
	1245	ENDDO
	1246
	1247	CALL fft_x( work_fftx(0:nx,k,j), 'backward' )
	1248
	1249	ENDDO
	1250	ENDDO
	1251
	1252	!$OMP DO
	1253	DO i = 0, nx
	1254	DO j = nys, nyn
	1255	DO k = 1, nz
	1256	f_out(k,j,i) = work_fftx(i,k,j)
	1257	ENDDO
	1258	ENDDO
	1259	ENDDO
	1260	!$OMP END PARALLEL
	1261
	1262	ENDIF
	1263	CALL cpu_log( log_point_s(4), 'fft_x', 'stop' )
	1264
	1265	END SUBROUTINE tr_yx_fftx
	1266
	1267
	1268	SUBROUTINE ffty_tri_ffty( ar )
	1269
	1270	!------------------------------------------------------------------------------!
	1271	! FFT along y, solution of the tridiagonal system and backward FFT for
	1272	! a 1d-decomposition along y
	1273	!
	1274	! WARNING: this subroutine may still not work for hybrid parallelization
	1275	! with OpenMP (for possible necessary changes see the original
	1276	! routine poisfft_hybrid, developed by Klaus Ketelsen, May 2002)
	1277	!------------------------------------------------------------------------------!
	1278
	1279	USE control_parameters
	1280	USE cpulog
	1281	USE grid_variables
	1282	USE indices
	1283	USE interfaces
	1284	USE pegrid
	1285	USE transpose_indices
	1286
	1287	IMPLICIT NONE
	1288
	1289	INTEGER :: i, j, k, m, n, omp_get_thread_num, tn
	1290
[1003]	1291	REAL, DIMENSION(0:ny) :: work_ffty
	1292	REAL, DIMENSION(0:ny,1:nz) :: work_triy
	1293	REAL, DIMENSION(nny,1:nz,nxl_y:nxr_y,pdims(2)) :: ar
	1294	REAL, DIMENSION(:,:,:,:), ALLOCATABLE :: tri
[1]	1295
	1296
	1297	CALL cpu_log( log_point_s(39), 'fft_y + tridia', 'start' )
	1298
	1299	ALLOCATE( tri(5,0:ny,0:nz-1,0:threads_per_task-1) )
	1300
	1301	tn = 0 ! Default thread number in case of one thread
[696]	1302	!$OMP PARALLEL DO PRIVATE ( i, j, k, m, n, tn, work_ffty, work_triy )
[1]	1303	DO i = nxl_y, nxr_y
	1304
	1305	!$ tn = omp_get_thread_num()
	1306
	1307	IF ( host(1:3) == 'nec' ) THEN
	1308	!
	1309	!-- Code optimized for vector processors
	1310	DO k = 1, nz
	1311
	1312	m = 0
	1313	DO n = 1, pdims(2)
[1003]	1314	DO j = 1, nny
[1]	1315	work_triy(m,k) = ar(j,k,i,n)
	1316	m = m + 1
	1317	ENDDO
	1318	ENDDO
	1319
	1320	ENDDO
	1321
	1322	CALL fft_y_m( work_triy, ny, 'forward' )
	1323
	1324	ELSE
	1325	!
	1326	!-- Cache optimized code
	1327	DO k = 1, nz
	1328
	1329	m = 0
	1330	DO n = 1, pdims(2)
[1003]	1331	DO j = 1, nny
[1]	1332	work_ffty(m) = ar(j,k,i,n)
	1333	m = m + 1
	1334	ENDDO
	1335	ENDDO
	1336
	1337	CALL fft_y( work_ffty, 'forward' )
	1338
	1339	DO j = 0, ny
	1340	work_triy(j,k) = work_ffty(j)
	1341	ENDDO
	1342
	1343	ENDDO
	1344
	1345	ENDIF
	1346
	1347	!
	1348	!-- Solve the linear equation system
	1349	CALL tridia_1dd( ddy2, ddx2, ny, nx, i, work_triy, tri(:,:,:,tn) )
	1350
	1351	IF ( host(1:3) == 'nec' ) THEN
	1352	!
	1353	!-- Code optimized for vector processors
	1354	CALL fft_y_m( work_triy, ny, 'backward' )
	1355
	1356	DO k = 1, nz
	1357
	1358	m = 0
	1359	DO n = 1, pdims(2)
[1003]	1360	DO j = 1, nny
[1]	1361	ar(j,k,i,n) = work_triy(m,k)
	1362	m = m + 1
	1363	ENDDO
	1364	ENDDO
	1365
	1366	ENDDO
	1367
	1368	ELSE
	1369	!
	1370	!-- Cache optimized code
	1371	DO k = 1, nz
	1372
	1373	DO j = 0, ny
	1374	work_ffty(j) = work_triy(j,k)
	1375	ENDDO
	1376
	1377	CALL fft_y( work_ffty, 'backward' )
	1378
	1379	m = 0
	1380	DO n = 1, pdims(2)
[1003]	1381	DO j = 1, nny
[1]	1382	ar(j,k,i,n) = work_ffty(m)
	1383	m = m + 1
	1384	ENDDO
	1385	ENDDO
	1386
	1387	ENDDO
	1388
	1389	ENDIF
	1390
	1391	ENDDO
	1392
	1393	DEALLOCATE( tri )
	1394
	1395	CALL cpu_log( log_point_s(39), 'fft_y + tridia', 'stop' )
	1396
	1397	END SUBROUTINE ffty_tri_ffty
	1398
	1399
	1400	SUBROUTINE tridia_1dd( ddx2, ddy2, nx, ny, j, ar, tri )
	1401
	1402	!------------------------------------------------------------------------------!
	1403	! Solves the linear system of equations for a 1d-decomposition along x (see
	1404	! tridia)
	1405	!
[940]	1406	! Attention: when using the intel compilers older than 12.0, array tri must
	1407	! be passed as an argument to the contained subroutines. Otherwise
	1408	! addres faults will occur. This feature can be activated with
	1409	! cpp-switch __intel11
[1]	1410	! On NEC, tri should not be passed (except for routine substi_1dd)
	1411	! because this causes very bad performance.
	1412	!------------------------------------------------------------------------------!
	1413
	1414	USE arrays_3d
	1415	USE control_parameters
	1416
	1417	USE pegrid
	1418
	1419	IMPLICIT NONE
	1420
	1421	INTEGER :: i, j, k, nnyh, nx, ny, omp_get_thread_num, tn
	1422
	1423	REAL :: ddx2, ddy2
	1424
	1425	REAL, DIMENSION(0:nx,1:nz) :: ar
	1426	REAL, DIMENSION(5,0:nx,0:nz-1) :: tri
	1427
	1428
	1429	nnyh = ( ny + 1 ) / 2
	1430
	1431	!
	1432	!-- Define constant elements of the tridiagonal matrix.
	1433	!-- The compiler on SX6 does loop exchange. If 0:nx is a high power of 2,
	1434	!-- the exchanged loops create bank conflicts. The following directive
	1435	!-- prohibits loop exchange and the loops perform much better.
	1436	! tn = omp_get_thread_num()
	1437	! WRITE( 120+tn, * ) '+++ id=',myid,' nx=',nx,' thread=', omp_get_thread_num()
[82]	1438	! CALL local_flush( 120+tn )
[1]	1439	!CDIR NOLOOPCHG
	1440	DO k = 0, nz-1
	1441	DO i = 0,nx
[667]	1442	tri(2,i,k) = ddzu_pres(k+1) * ddzw(k+1)
	1443	tri(3,i,k) = ddzu_pres(k+2) * ddzw(k+1)
[1]	1444	ENDDO
	1445	ENDDO
	1446	! WRITE( 120+tn, * ) '+++ id=',myid,' end of first tridia loop thread=', omp_get_thread_num()
[82]	1447	! CALL local_flush( 120+tn )
[1]	1448
	1449	IF ( j <= nnyh ) THEN
[940]	1450	#if defined( __intel11 )
[1]	1451	CALL maketri_1dd( j, tri )
	1452	#else
	1453	CALL maketri_1dd( j )
	1454	#endif
	1455	ELSE
[940]	1456	#if defined( __intel11 )
[1]	1457	CALL maketri_1dd( ny+1-j, tri )
	1458	#else
	1459	CALL maketri_1dd( ny+1-j )
	1460	#endif
	1461	ENDIF
[940]	1462	#if defined( __intel11 )
[1]	1463	CALL split_1dd( tri )
	1464	#else
	1465	CALL split_1dd
	1466	#endif
	1467	CALL substi_1dd( ar, tri )
	1468
	1469	CONTAINS
	1470
[940]	1471	#if defined( __intel11 )
[1]	1472	SUBROUTINE maketri_1dd( j, tri )
	1473	#else
	1474	SUBROUTINE maketri_1dd( j )
	1475	#endif
	1476
	1477	!------------------------------------------------------------------------------!
	1478	! computes the i- and j-dependent component of the matrix
	1479	!------------------------------------------------------------------------------!
	1480
	1481	USE constants
	1482
	1483	IMPLICIT NONE
	1484
	1485	INTEGER :: i, j, k, nnxh
	1486	REAL :: a, c
	1487
	1488	REAL, DIMENSION(0:nx) :: l
	1489
[940]	1490	#if defined( __intel11 )
[1]	1491	REAL, DIMENSION(5,0:nx,0:nz-1) :: tri
	1492	#endif
	1493
	1494
	1495	nnxh = ( nx + 1 ) / 2
	1496	!
	1497	!-- Provide the tridiagonal matrix for solution of the Poisson equation in
	1498	!-- Fourier space. The coefficients are computed following the method of
	1499	!-- Schmidt et al. (DFVLR-Mitteilung 84-15), which departs from Stephan
	1500	!-- Siano's original version by discretizing the Poisson equation,
	1501	!-- before it is Fourier-transformed
	1502	DO i = 0, nx
[128]	1503	IF ( i >= 0 .AND. i <= nnxh ) THEN
[1]	1504	l(i) = 2.0 * ( 1.0 - COS( ( 2.0 * pi * i ) / &
[1013]	1505	REAL( nx+1 ) ) ) * ddx2 + &
[1]	1506	2.0 * ( 1.0 - COS( ( 2.0 * pi * j ) / &
[1013]	1507	REAL( ny+1 ) ) ) * ddy2
[1]	1508	ELSE
	1509	l(i) = 2.0 * ( 1.0 - COS( ( 2.0 * pi * ( nx+1-i ) ) / &
[1013]	1510	REAL( nx+1 ) ) ) * ddx2 + &
[1]	1511	2.0 * ( 1.0 - COS( ( 2.0 * pi * j ) / &
[1013]	1512	REAL( ny+1 ) ) ) * ddy2
[1]	1513	ENDIF
	1514	ENDDO
	1515
	1516	DO k = 0, nz-1
	1517	DO i = 0, nx
[667]	1518	a = -1.0 * ddzu_pres(k+2) * ddzw(k+1)
	1519	c = -1.0 * ddzu_pres(k+1) * ddzw(k+1)
[1]	1520	tri(1,i,k) = a + c - l(i)
	1521	ENDDO
	1522	ENDDO
	1523	IF ( ibc_p_b == 1 .OR. ibc_p_b == 2 ) THEN
	1524	DO i = 0, nx
	1525	tri(1,i,0) = tri(1,i,0) + tri(2,i,0)
	1526	ENDDO
	1527	ENDIF
	1528	IF ( ibc_p_t == 1 ) THEN
	1529	DO i = 0, nx
	1530	tri(1,i,nz-1) = tri(1,i,nz-1) + tri(3,i,nz-1)
	1531	ENDDO
	1532	ENDIF
	1533
	1534	END SUBROUTINE maketri_1dd
	1535
	1536
[940]	1537	#if defined( __intel11 )
[1]	1538	SUBROUTINE split_1dd( tri )
	1539	#else
	1540	SUBROUTINE split_1dd
	1541	#endif
	1542
	1543	!------------------------------------------------------------------------------!
	1544	! Splitting of the tridiagonal matrix (Thomas algorithm)
	1545	!------------------------------------------------------------------------------!
	1546
	1547	IMPLICIT NONE
	1548
	1549	INTEGER :: i, k
	1550
[940]	1551	#if defined( __intel11 )
[1]	1552	REAL, DIMENSION(5,0:nx,0:nz-1) :: tri
	1553	#endif
	1554
	1555
	1556	!
	1557	!-- Splitting
	1558	DO i = 0, nx
	1559	tri(4,i,0) = tri(1,i,0)
	1560	ENDDO
	1561	DO k = 1, nz-1
	1562	DO i = 0, nx
	1563	tri(5,i,k) = tri(2,i,k) / tri(4,i,k-1)
	1564	tri(4,i,k) = tri(1,i,k) - tri(3,i,k-1) * tri(5,i,k)
	1565	ENDDO
	1566	ENDDO
	1567
	1568	END SUBROUTINE split_1dd
	1569
	1570
	1571	SUBROUTINE substi_1dd( ar, tri )
	1572
	1573	!------------------------------------------------------------------------------!
	1574	! Substitution (Forward and Backward) (Thomas algorithm)
	1575	!------------------------------------------------------------------------------!
	1576
	1577	IMPLICIT NONE
	1578
[76]	1579	INTEGER :: i, k
[1]	1580
	1581	REAL, DIMENSION(0:nx,nz) :: ar
	1582	REAL, DIMENSION(0:nx,0:nz-1) :: ar1
	1583	REAL, DIMENSION(5,0:nx,0:nz-1) :: tri
	1584
	1585	!
	1586	!-- Forward substitution
	1587	DO i = 0, nx
	1588	ar1(i,0) = ar(i,1)
	1589	ENDDO
	1590	DO k = 1, nz-1
	1591	DO i = 0, nx
	1592	ar1(i,k) = ar(i,k+1) - tri(5,i,k) * ar1(i,k-1)
	1593	ENDDO
	1594	ENDDO
	1595
	1596	!
	1597	!-- Backward substitution
[763]	1598	!-- Note, the add of 1.0E-20 in the denominator is due to avoid divisions
	1599	!-- by zero appearing if the pressure bc is set to neumann at the top of
	1600	!-- the model domain.
[1]	1601	DO i = 0, nx
[761]	1602	ar(i,nz) = ar1(i,nz-1) / ( tri(4,i,nz-1) + 1.0E-20 )
[1]	1603	ENDDO
	1604	DO k = nz-2, 0, -1
	1605	DO i = 0, nx
	1606	ar(i,k+1) = ( ar1(i,k) - tri(3,i,k) * ar(i,k+2) ) &
	1607	/ tri(4,i,k)
	1608	ENDDO
	1609	ENDDO
	1610
[76]	1611	!
	1612	!-- Indices i=0, j=0 correspond to horizontally averaged pressure.
	1613	!-- The respective values of ar should be zero at all k-levels if
	1614	!-- acceleration of horizontally averaged vertical velocity is zero.
	1615	IF ( ibc_p_b == 1 .AND. ibc_p_t == 1 ) THEN
	1616	IF ( j == 0 ) THEN
	1617	DO k = 1, nz
	1618	ar(0,k) = 0.0
	1619	ENDDO
	1620	ENDIF
	1621	ENDIF
	1622
[1]	1623	END SUBROUTINE substi_1dd
	1624
	1625	END SUBROUTINE tridia_1dd
	1626
	1627	#endif
[807]	1628	#endif
[1]	1629	END MODULE poisfft_mod

Note: See TracBrowser for help on using the repository browser.

Download in other formats:

| Impressum | ©Leibniz Universität Hannover |