Home

Context Navigation

source: palm/trunk/SOURCE/poisfft.f90 @ 1018

Last change on this file since 1018 was 1014, checked in by raasch, 12 years ago
last commit documented
Property svn:keywords set to `Id`
File size: 46.9 KB

Rev	Line
[1]	1	MODULE poisfft_mod
	2
	3	!------------------------------------------------------------------------------!
[484]	4	! Current revisions:
[1]	5	! -----------------
	6	!
[1014]	7	!
[1]	8	! Former revisions:
	9	! -----------------
[3]	10	! $Id: poisfft.f90 1014 2012-09-21 07:09:03Z raasch $
[77]	11	!
[1014]	12	! 2012-09-21 07:03:55Z raasch
	13	! FLOAT type conversion replaced by REAL
	14	!
[1004]	15	! 1003 2012-09-14 14:35:53Z raasch
	16	! indices nxa, nya, etc. replaced by nx, ny, etc.
	17	!
[941]	18	! 940 2012-07-09 14:31:00Z raasch
	19	! special handling of tri-array as an argument in tridia_1dd routines switched
	20	! off because it caused segmentation faults with intel 12.1 compiler
	21	!
[878]	22	! 877 2012-04-03 11:21:44Z suehring
	23	! Bugfix: Avoid divisions by zero in case of using a 'neumann' bc for the
	24	! pressure at the top of the model domain.
	25	!
[810]	26	! 809 2012-01-30 13:32:58Z maronga
	27	! Bugfix: replaced .AND. and .NOT. with && and ! in the preprocessor directives
	28	!
[808]	29	! 807 2012-01-25 11:53:51Z maronga
	30	! New cpp directive "__check" implemented which is used by check_namelist_files
	31	! (most of the code is unneeded by check_namelist_files).
	32	!
[764]	33	! 763 2011-10-06 09:32:09Z suehring
	34	! Comment added concerning the last change.
	35	!
[762]	36	! 761 2011-10-05 17:58:52Z suehring
	37	! Bugfix: Avoid divisions by zero in case of using a 'neumann' bc for the
	38	! pressure at the top of the model domain.
	39	!
[697]	40	! 696 2011-03-18 07:03:49Z raasch
	41	! work_fftx removed from PRIVATE clauses in fftx_tr_xy and tr_yx_fftx
	42	!
[684]	43	! 683 2011-02-09 14:25:15Z raasch
	44	! openMP parallelization for 2d-domain-decomposition
	45	!
[668]	46	! 667 2010-12-23 12:06:00Z suehring/gryschka
	47	! ddzu replaced by ddzu_pres due to changes in zu(0)
	48	!
[623]	49	! 622 2010-12-10 08:08:13Z raasch
	50	! optional barriers included in order to speed up collective operations
	51	!
[392]	52	! 377 2009-09-04 11:09:00Z raasch
	53	! __lcmuk changed to __lc to avoid problems with Intel compiler on sgi-ice
	54	!
[198]	55	! 164 2008-05-15 08:46:15Z raasch
	56	! Arguments removed from transpose routines
	57	!
[139]	58	! 128 2007-10-26 13:11:14Z raasch
	59	! Bugfix: wavenumber calculation for even nx in routines maketri
	60	!
[90]	61	! 85 2007-05-11 09:35:14Z raasch
	62	! Bugfix: work_fft*_vec removed from some PRIVATE-declarations
	63	!
[77]	64	! 76 2007-03-29 00:58:32Z raasch
	65	! Tridiagonal coefficients adjusted for Neumann boundary conditions both at
	66	! the bottom and the top.
	67	!
[3]	68	! RCS Log replace by Id keyword, revision history cleaned up
	69	!
[1]	70	! Revision 1.24 2006/08/04 15:00:24 raasch
	71	! Default setting of the thread number tn in case of not using OpenMP
	72	!
	73	! Revision 1.23 2006/02/23 12:48:38 raasch
	74	! Additional compiler directive in routine tridia_1dd for preventing loop
	75	! exchange on NEC-SX6
	76	!
	77	! Revision 1.20 2004/04/30 12:38:09 raasch
	78	! Parts of former poisfft_hybrid moved to this subroutine,
	79	! former subroutine changed to a module, renaming of FFT-subroutines and
	80	! -module, FFTs completely substituted by calls of fft_x and fft_y,
	81	! NAG fft used in the non-parallel case completely removed, l in maketri
	82	! is now a 1d-array, variables passed by modules instead of using parameter
	83	! lists, enlarged transposition arrays introduced
	84	!
	85	! Revision 1.1 1997/07/24 11:24:14 raasch
	86	! Initial revision
	87	!
	88	!
	89	! Description:
	90	! ------------
	91	! See below.
	92	!------------------------------------------------------------------------------!
	93
	94	!--------------------------------------------------------------------------!
	95	! poisfft !
	96	! !
	97	! Original version: Stephan Siano (pois3d) !
	98	! !
	99	! Institute of Meteorology and Climatology, University of Hannover !
	100	! Germany !
	101	! !
	102	! Version as of July 23,1996 !
	103	! !
	104	! !
	105	! Version for parallel computers: Siegfried Raasch !
	106	! !
	107	! Version as of July 03,1997 !
	108	! !
	109	! Solves the Poisson equation with a 2D spectral method !
	110	! d^2 p / dx^2 + d^2 p / dy^2 + d^2 p / dz^2 = s !
	111	! !
	112	! Input: !
	113	! real ar contains in the (nnx,nny,nnz) elements, !
	114	! starting from the element (1,nys,nxl), the !
	115	! values for s !
	116	! real work Temporary array !
	117	! !
	118	! Output: !
	119	! real ar contains the solution for p !
	120	!--------------------------------------------------------------------------!
	121
	122	USE fft_xy
	123	USE indices
	124	USE transpose_indices
	125
	126	IMPLICIT NONE
	127
	128	PRIVATE
[807]	129
[809]	130	#if ! defined ( __check )
[1]	131	PUBLIC poisfft, poisfft_init
	132
	133	INTERFACE poisfft
	134	MODULE PROCEDURE poisfft
	135	END INTERFACE poisfft
	136
	137	INTERFACE poisfft_init
	138	MODULE PROCEDURE poisfft_init
	139	END INTERFACE poisfft_init
[807]	140	#else
	141	PUBLIC poisfft_init
[1]	142
[807]	143	INTERFACE poisfft_init
	144	MODULE PROCEDURE poisfft_init
	145	END INTERFACE poisfft_init
	146	#endif
	147
[1]	148	CONTAINS
	149
	150	SUBROUTINE poisfft_init
	151
	152	CALL fft_init
	153
	154	END SUBROUTINE poisfft_init
	155
[809]	156	#if ! defined ( __check )
[1]	157	SUBROUTINE poisfft( ar, work )
	158
	159	USE cpulog
	160	USE interfaces
	161	USE pegrid
	162
	163	IMPLICIT NONE
	164
[1003]	165	REAL, DIMENSION(1:nz,nys:nyn,nxl:nxr) :: ar, work
[1]	166
	167
	168	CALL cpu_log( log_point_s(3), 'poisfft', 'start' )
	169
	170	!
	171	!-- Two-dimensional Fourier Transformation in x- and y-direction.
	172	#if defined( __parallel )
	173	IF ( pdims(2) == 1 ) THEN
	174
	175	!
	176	!-- 1d-domain-decomposition along x:
	177	!-- FFT along y and transposition y --> x
	178	CALL ffty_tr_yx( ar, work, ar )
	179
	180	!
	181	!-- FFT along x, solving the tridiagonal system and backward FFT
	182	CALL fftx_tri_fftx( ar )
	183
	184	!
	185	!-- Transposition x --> y and backward FFT along y
	186	CALL tr_xy_ffty( ar, work, ar )
	187
	188	ELSEIF ( pdims(1) == 1 ) THEN
	189
	190	!
	191	!-- 1d-domain-decomposition along y:
	192	!-- FFT along x and transposition x --> y
	193	CALL fftx_tr_xy( ar, work, ar )
	194
	195	!
	196	!-- FFT along y, solving the tridiagonal system and backward FFT
	197	CALL ffty_tri_ffty( ar )
	198
	199	!
	200	!-- Transposition y --> x and backward FFT along x
	201	CALL tr_yx_fftx( ar, work, ar )
	202
	203	ELSE
	204
	205	!
	206	!-- 2d-domain-decomposition
	207	!-- Transposition z --> x
	208	CALL cpu_log( log_point_s(5), 'transpo forward', 'start' )
[164]	209	CALL transpose_zx( ar, work, ar )
[1]	210	CALL cpu_log( log_point_s(5), 'transpo forward', 'pause' )
	211
	212	CALL cpu_log( log_point_s(4), 'fft_x', 'start' )
	213	CALL fftxp( ar, 'forward' )
	214	CALL cpu_log( log_point_s(4), 'fft_x', 'pause' )
	215
	216	!
	217	!-- Transposition x --> y
	218	CALL cpu_log( log_point_s(5), 'transpo forward', 'continue' )
[164]	219	CALL transpose_xy( ar, work, ar )
[1]	220	CALL cpu_log( log_point_s(5), 'transpo forward', 'pause' )
	221
	222	CALL cpu_log( log_point_s(7), 'fft_y', 'start' )
	223	CALL fftyp( ar, 'forward' )
	224	CALL cpu_log( log_point_s(7), 'fft_y', 'pause' )
	225
	226	!
	227	!-- Transposition y --> z
	228	CALL cpu_log( log_point_s(5), 'transpo forward', 'continue' )
[164]	229	CALL transpose_yz( ar, work, ar )
[1]	230	CALL cpu_log( log_point_s(5), 'transpo forward', 'stop' )
	231
	232	!
	233	!-- Solve the Poisson equation in z-direction in cartesian space.
	234	CALL cpu_log( log_point_s(6), 'tridia', 'start' )
	235	CALL tridia( ar )
	236	CALL cpu_log( log_point_s(6), 'tridia', 'stop' )
	237
	238	!
	239	!-- Inverse Fourier Transformation
	240	!-- Transposition z --> y
	241	CALL cpu_log( log_point_s(8), 'transpo invers', 'start' )
[164]	242	CALL transpose_zy( ar, work, ar )
[1]	243	CALL cpu_log( log_point_s(8), 'transpo invers', 'pause' )
	244
	245	CALL cpu_log( log_point_s(7), 'fft_y', 'continue' )
	246	CALL fftyp( ar, 'backward' )
	247	CALL cpu_log( log_point_s(7), 'fft_y', 'stop' )
	248
	249	!
	250	!-- Transposition y --> x
	251	CALL cpu_log( log_point_s(8), 'transpo invers', 'continue' )
[164]	252	CALL transpose_yx( ar, work, ar )
[1]	253	CALL cpu_log( log_point_s(8), 'transpo invers', 'pause' )
	254
	255	CALL cpu_log( log_point_s(4), 'fft_x', 'continue' )
	256	CALL fftxp( ar, 'backward' )
	257	CALL cpu_log( log_point_s(4), 'fft_x', 'stop' )
	258
	259	!
	260	!-- Transposition x --> z
	261	CALL cpu_log( log_point_s(8), 'transpo invers', 'continue' )
[164]	262	CALL transpose_xz( ar, work, ar )
[1]	263	CALL cpu_log( log_point_s(8), 'transpo invers', 'stop' )
	264
	265	ENDIF
	266
	267	#else
	268
	269	!
	270	!-- Two-dimensional Fourier Transformation along x- and y-direction.
	271	CALL cpu_log( log_point_s(4), 'fft_x', 'start' )
	272	CALL fftx( ar, 'forward' )
	273	CALL cpu_log( log_point_s(4), 'fft_x', 'pause' )
	274	CALL cpu_log( log_point_s(7), 'fft_y', 'start' )
	275	CALL ffty( ar, 'forward' )
	276	CALL cpu_log( log_point_s(7), 'fft_y', 'pause' )
	277
	278	!
	279	!-- Solve the Poisson equation in z-direction in cartesian space.
	280	CALL cpu_log( log_point_s(6), 'tridia', 'start' )
	281	CALL tridia( ar )
	282	CALL cpu_log( log_point_s(6), 'tridia', 'stop' )
	283
	284	!
	285	!-- Inverse Fourier Transformation.
	286	CALL cpu_log( log_point_s(7), 'fft_y', 'continue' )
	287	CALL ffty( ar, 'backward' )
	288	CALL cpu_log( log_point_s(7), 'fft_y', 'stop' )
	289	CALL cpu_log( log_point_s(4), 'fft_x', 'continue' )
	290	CALL fftx( ar, 'backward' )
	291	CALL cpu_log( log_point_s(4), 'fft_x', 'stop' )
	292
	293	#endif
	294
	295	CALL cpu_log( log_point_s(3), 'poisfft', 'stop' )
	296
	297	END SUBROUTINE poisfft
	298
	299
	300
	301	SUBROUTINE tridia( ar )
	302
	303	!------------------------------------------------------------------------------!
	304	! solves the linear system of equations:
	305	!
	306	! -(4 pi^2(i^2/(dx^2nnx^2)+j^2/(dy^2nny^2))+
	307	! 1/(dzu(k)dzw(k))+1/(dzu(k-1)dzw(k)))*p(i,j,k)+
	308	! 1/(dzu(k)dzw(k))p(i,j,k+1)+1/(dzu(k-1)dzw(k))p(i,j,k-1)=d(i,j,k)
	309	!
	310	! by using the Thomas algorithm
	311	!------------------------------------------------------------------------------!
	312
	313	USE arrays_3d
	314
	315	IMPLICIT NONE
	316
	317	INTEGER :: i, j, k, nnyh
	318
	319	REAL, DIMENSION(nxl_z:nxr_z,0:nz-1) :: ar1
	320	REAL, DIMENSION(5,nxl_z:nxr_z,0:nz-1) :: tri
	321
	322	#if defined( __parallel )
[1003]	323	REAL :: ar(nxl_z:nxr_z,nys_z:nyn_z,1:nz)
[1]	324	#else
	325	REAL :: ar(1:nz,nys_z:nyn_z,nxl_z:nxr_z)
	326	#endif
	327
	328
	329	nnyh = (ny+1) / 2
	330
	331	!
	332	!-- Define constant elements of the tridiagonal matrix.
[683]	333	!$OMP PARALLEL PRIVATE ( k, i )
	334	!$OMP DO
[1]	335	DO k = 0, nz-1
	336	DO i = nxl_z, nxr_z
[667]	337	tri(2,i,k) = ddzu_pres(k+1) * ddzw(k+1)
	338	tri(3,i,k) = ddzu_pres(k+2) * ddzw(k+1)
[1]	339	ENDDO
	340	ENDDO
[683]	341	!$OMP END PARALLEL
[1]	342
	343	#if defined( __parallel )
	344	!
	345	!-- Repeat for all y-levels.
[683]	346	!$OMP PARALLEL FIRSTPRIVATE( tri ) PRIVATE ( ar1, j )
	347	!$OMP DO
[1]	348	DO j = nys_z, nyn_z
	349	IF ( j <= nnyh ) THEN
	350	CALL maketri( tri, j )
	351	ELSE
	352	CALL maketri( tri, ny+1-j )
	353	ENDIF
	354	CALL split( tri )
	355	CALL substi( ar, ar1, tri, j )
	356	ENDDO
[683]	357	!$OMP END PARALLEL
[1]	358	#else
	359	!
	360	!-- First y-level.
	361	CALL maketri( tri, nys_z )
	362	CALL split( tri )
	363	CALL substi( ar, ar1, tri, 0 )
	364
	365	!
	366	!-- Further y-levels.
	367	DO j = 1, nnyh - 1
	368	CALL maketri( tri, j )
	369	CALL split( tri )
	370	CALL substi( ar, ar1, tri, j )
	371	CALL substi( ar, ar1, tri, ny+1-j )
	372	ENDDO
	373	CALL maketri( tri, nnyh )
	374	CALL split( tri )
	375	CALL substi( ar, ar1, tri, nnyh+nys )
	376	#endif
	377
	378	CONTAINS
	379
	380	SUBROUTINE maketri( tri, j )
	381
	382	!------------------------------------------------------------------------------!
	383	! Computes the i- and j-dependent component of the matrix
	384	!------------------------------------------------------------------------------!
	385
	386	USE arrays_3d
	387	USE constants
	388	USE control_parameters
	389	USE grid_variables
	390
	391	IMPLICIT NONE
	392
	393	INTEGER :: i, j, k, nnxh
	394	REAL :: a, c
	395	REAL :: ll(nxl_z:nxr_z)
	396	REAL :: tri(5,nxl_z:nxr_z,0:nz-1)
	397
	398
	399	nnxh = ( nx + 1 ) / 2
	400
	401	!
	402	!-- Provide the tridiagonal matrix for solution of the Poisson equation in
	403	!-- Fourier space. The coefficients are computed following the method of
	404	!-- Schmidt et al. (DFVLR-Mitteilung 84-15), which departs from Stephan
	405	!-- Siano's original version by discretizing the Poisson equation,
	406	!-- before it is Fourier-transformed
	407	#if defined( __parallel )
	408	DO i = nxl_z, nxr_z
[128]	409	IF ( i >= 0 .AND. i <= nnxh ) THEN
[1]	410	ll(i) = 2.0 * ( 1.0 - COS( ( 2.0 * pi * i ) / &
[1013]	411	REAL( nx+1 ) ) ) / ( dx * dx ) + &
[1]	412	2.0 * ( 1.0 - COS( ( 2.0 * pi * j ) / &
[1013]	413	REAL( ny+1 ) ) ) / ( dy * dy )
[1]	414	ELSE
	415	ll(i) = 2.0 * ( 1.0 - COS( ( 2.0 * pi * ( nx+1-i ) ) / &
[1013]	416	REAL( nx+1 ) ) ) / ( dx * dx ) + &
[1]	417	2.0 * ( 1.0 - COS( ( 2.0 * pi * j ) / &
[1013]	418	REAL( ny+1 ) ) ) / ( dy * dy )
[1]	419	ENDIF
	420	DO k = 0,nz-1
[667]	421	a = -1.0 * ddzu_pres(k+2) * ddzw(k+1)
	422	c = -1.0 * ddzu_pres(k+1) * ddzw(k+1)
[1]	423	tri(1,i,k) = a + c - ll(i)
	424	ENDDO
	425	ENDDO
	426	#else
	427	DO i = 0, nnxh
[1013]	428	ll(i) = 2.0 * ( 1.0 - COS( ( 2.0 * pi * i ) / REAL( nx+1 ) ) ) / &
[1]	429	( dx * dx ) + &
[1013]	430	2.0 * ( 1.0 - COS( ( 2.0 * pi * j ) / REAL( ny+1 ) ) ) / &
[1]	431	( dy * dy )
	432	DO k = 0, nz-1
[667]	433	a = -1.0 * ddzu_pres(k+2) * ddzw(k+1)
	434	c = -1.0 * ddzu_pres(k+1) * ddzw(k+1)
[1]	435	tri(1,i,k) = a + c - ll(i)
	436	IF ( i >= 1 .and. i < nnxh ) THEN
	437	tri(1,nx+1-i,k) = tri(1,i,k)
	438	ENDIF
	439	ENDDO
	440	ENDDO
	441	#endif
	442	IF ( ibc_p_b == 1 .OR. ibc_p_b == 2 ) THEN
	443	DO i = nxl_z, nxr_z
	444	tri(1,i,0) = tri(1,i,0) + tri(2,i,0)
	445	ENDDO
	446	ENDIF
	447	IF ( ibc_p_t == 1 ) THEN
	448	DO i = nxl_z, nxr_z
	449	tri(1,i,nz-1) = tri(1,i,nz-1) + tri(3,i,nz-1)
	450	ENDDO
	451	ENDIF
	452
	453	END SUBROUTINE maketri
	454
	455
	456	SUBROUTINE substi( ar, ar1, tri, j )
	457
	458	!------------------------------------------------------------------------------!
	459	! Substitution (Forward and Backward) (Thomas algorithm)
	460	!------------------------------------------------------------------------------!
	461
[76]	462	USE control_parameters
	463
[1]	464	IMPLICIT NONE
	465
	466	INTEGER :: i, j, k
	467	REAL :: ar1(nxl_z:nxr_z,0:nz-1)
	468	REAL :: tri(5,nxl_z:nxr_z,0:nz-1)
	469	#if defined( __parallel )
[1003]	470	REAL :: ar(nxl_z:nxr_z,nys_z:nyn_z,1:nz)
[1]	471	#else
	472	REAL :: ar(1:nz,nys_z:nyn_z,nxl_z:nxr_z)
	473	#endif
	474
	475	!
	476	!-- Forward substitution.
	477	DO i = nxl_z, nxr_z
	478	#if defined( __parallel )
	479	ar1(i,0) = ar(i,j,1)
	480	#else
	481	ar1(i,0) = ar(1,j,i)
	482	#endif
	483	ENDDO
	484	DO k = 1, nz - 1
	485	DO i = nxl_z, nxr_z
	486	#if defined( __parallel )
	487	ar1(i,k) = ar(i,j,k+1) - tri(5,i,k) * ar1(i,k-1)
	488	#else
	489	ar1(i,k) = ar(k+1,j,i) - tri(5,i,k) * ar1(i,k-1)
	490	#endif
	491	ENDDO
	492	ENDDO
	493
	494	!
[877]	495	!-- Backward substitution
	496	!-- Note, the 1.0E-20 in the denominator is due to avoid divisions
	497	!-- by zero appearing if the pressure bc is set to neumann at the top of
	498	!-- the model domain.
[1]	499	DO i = nxl_z, nxr_z
	500	#if defined( __parallel )
[877]	501	ar(i,j,nz) = ar1(i,nz-1) / ( tri(4,i,nz-1) + 1.0E-20 )
[1]	502	#else
[877]	503	ar(nz,j,i) = ar1(i,nz-1) / ( tri(4,i,nz-1) + 1.0E-20 )
[1]	504	#endif
	505	ENDDO
	506	DO k = nz-2, 0, -1
	507	DO i = nxl_z, nxr_z
	508	#if defined( __parallel )
	509	ar(i,j,k+1) = ( ar1(i,k) - tri(3,i,k) * ar(i,j,k+2) ) &
	510	/ tri(4,i,k)
	511	#else
	512	ar(k+1,j,i) = ( ar1(i,k) - tri(3,i,k) * ar(k+2,j,i) ) &
	513	/ tri(4,i,k)
	514	#endif
	515	ENDDO
	516	ENDDO
	517
[76]	518	!
	519	!-- Indices i=0, j=0 correspond to horizontally averaged pressure.
	520	!-- The respective values of ar should be zero at all k-levels if
	521	!-- acceleration of horizontally averaged vertical velocity is zero.
	522	IF ( ibc_p_b == 1 .AND. ibc_p_t == 1 ) THEN
	523	IF ( j == 0 .AND. nxl_z == 0 ) THEN
	524	#if defined( __parallel )
	525	DO k = 1, nz
	526	ar(nxl_z,j,k) = 0.0
	527	ENDDO
	528	#else
	529	DO k = 1, nz
	530	ar(k,j,nxl_z) = 0.0
	531	ENDDO
	532	#endif
	533	ENDIF
	534	ENDIF
	535
[1]	536	END SUBROUTINE substi
	537
	538
	539	SUBROUTINE split( tri )
	540
	541	!------------------------------------------------------------------------------!
	542	! Splitting of the tridiagonal matrix (Thomas algorithm)
	543	!------------------------------------------------------------------------------!
	544
	545	IMPLICIT NONE
	546
	547	INTEGER :: i, k
	548	REAL :: tri(5,nxl_z:nxr_z,0:nz-1)
	549
	550	!
	551	!-- Splitting.
	552	DO i = nxl_z, nxr_z
	553	tri(4,i,0) = tri(1,i,0)
	554	ENDDO
	555	DO k = 1, nz-1
	556	DO i = nxl_z, nxr_z
	557	tri(5,i,k) = tri(2,i,k) / tri(4,i,k-1)
	558	tri(4,i,k) = tri(1,i,k) - tri(3,i,k-1) * tri(5,i,k)
	559	ENDDO
	560	ENDDO
	561
	562	END SUBROUTINE split
	563
	564	END SUBROUTINE tridia
	565
	566
	567	#if defined( __parallel )
	568	SUBROUTINE fftxp( ar, direction )
	569
	570	!------------------------------------------------------------------------------!
	571	! Fourier-transformation along x-direction Parallelized version
	572	!------------------------------------------------------------------------------!
	573
	574	IMPLICIT NONE
	575
	576	CHARACTER (LEN=*) :: direction
	577	INTEGER :: j, k
[1003]	578	REAL :: ar(0:nx,nys_x:nyn_x,nzb_x:nzt_x)
[1]	579
	580	!
	581	!-- Performing the fft with one of the methods implemented
[683]	582	!$OMP PARALLEL PRIVATE ( j, k )
	583	!$OMP DO
[1]	584	DO k = nzb_x, nzt_x
	585	DO j = nys_x, nyn_x
	586	CALL fft_x( ar(0:nx,j,k), direction )
	587	ENDDO
	588	ENDDO
[683]	589	!$OMP END PARALLEL
[1]	590
	591	END SUBROUTINE fftxp
	592
	593	#else
	594	SUBROUTINE fftx( ar, direction )
	595
	596	!------------------------------------------------------------------------------!
	597	! Fourier-transformation along x-direction Non parallel version
	598	!------------------------------------------------------------------------------!
	599
	600	IMPLICIT NONE
	601
	602	CHARACTER (LEN=*) :: direction
	603	INTEGER :: i, j, k
	604	REAL :: ar(1:nz,0:ny,0:nx)
	605
	606	!
	607	!-- Performing the fft with one of the methods implemented
[683]	608	!$OMP PARALLEL PRIVATE ( j, k )
	609	!$OMP DO
[1]	610	DO k = 1, nz
	611	DO j = 0, ny
	612	CALL fft_x( ar(k,j,0:nx), direction )
	613	ENDDO
	614	ENDDO
[683]	615	!$OMP END PARALLEL
[1]	616
	617	END SUBROUTINE fftx
	618	#endif
	619
	620
	621	#if defined( __parallel )
	622	SUBROUTINE fftyp( ar, direction )
	623
	624	!------------------------------------------------------------------------------!
	625	! Fourier-transformation along y-direction Parallelized version
	626	!------------------------------------------------------------------------------!
	627
	628	IMPLICIT NONE
	629
	630	CHARACTER (LEN=*) :: direction
	631	INTEGER :: i, k
[1003]	632	REAL :: ar(0:ny,nxl_y:nxr_y,nzb_y:nzt_y)
[1]	633
	634	!
	635	!-- Performing the fft with one of the methods implemented
[683]	636	!$OMP PARALLEL PRIVATE ( i, k )
	637	!$OMP DO
[1]	638	DO k = nzb_y, nzt_y
	639	DO i = nxl_y, nxr_y
	640	CALL fft_y( ar(0:ny,i,k), direction )
	641	ENDDO
	642	ENDDO
[683]	643	!$OMP END PARALLEL
[1]	644
	645	END SUBROUTINE fftyp
	646
	647	#else
	648	SUBROUTINE ffty( ar, direction )
	649
	650	!------------------------------------------------------------------------------!
	651	! Fourier-transformation along y-direction Non parallel version
	652	!------------------------------------------------------------------------------!
	653
	654	IMPLICIT NONE
	655
	656	CHARACTER (LEN=*) :: direction
	657	INTEGER :: i, k
	658	REAL :: ar(1:nz,0:ny,0:nx)
	659
	660	!
	661	!-- Performing the fft with one of the methods implemented
[683]	662	!$OMP PARALLEL PRIVATE ( i, k )
	663	!$OMP DO
[1]	664	DO k = 1, nz
	665	DO i = 0, nx
	666	CALL fft_y( ar(k,0:ny,i), direction )
	667	ENDDO
	668	ENDDO
[683]	669	!$OMP END PARALLEL
[1]	670
	671	END SUBROUTINE ffty
	672	#endif
	673
	674	#if defined( __parallel )
	675	SUBROUTINE ffty_tr_yx( f_in, work, f_out )
	676
	677	!------------------------------------------------------------------------------!
	678	! Fourier-transformation along y with subsequent transposition y --> x for
	679	! a 1d-decomposition along x
	680	!
	681	! ATTENTION: The performance of this routine is much faster on the NEC-SX6,
	682	! if the first index of work_ffty_vec is odd. Otherwise
	683	! memory bank conflicts may occur (especially if the index is a
	684	! multiple of 128). That's why work_ffty_vec is dimensioned as
	685	! 0:ny+1.
	686	! Of course, this will not work if users are using an odd number
	687	! of gridpoints along y.
	688	!------------------------------------------------------------------------------!
	689
	690	USE control_parameters
	691	USE cpulog
	692	USE indices
	693	USE interfaces
	694	USE pegrid
	695	USE transpose_indices
	696
	697	IMPLICIT NONE
	698
	699	INTEGER :: i, iend, iouter, ir, j, k
	700	INTEGER, PARAMETER :: stridex = 4
	701
	702	REAL, DIMENSION(0:ny,stridex) :: work_ffty
	703	#if defined( __nec )
	704	REAL, DIMENSION(0:ny+1,1:nz,nxl:nxr) :: work_ffty_vec
	705	#endif
[1003]	706	REAL, DIMENSION(1:nz,0:ny,nxl:nxr) :: f_in
	707	REAL, DIMENSION(nnx,1:nz,nys_x:nyn_x,pdims(1)) :: f_out
	708	REAL, DIMENSION(nxl:nxr,1:nz,0:ny) :: work
[1]	709
	710	!
	711	!-- Carry out the FFT along y, where all data are present due to the
	712	!-- 1d-decomposition along x. Resort the data in a way that x becomes
	713	!-- the first index.
	714	CALL cpu_log( log_point_s(7), 'fft_y', 'start' )
	715
	716	IF ( host(1:3) == 'nec' ) THEN
	717	#if defined( __nec )
	718	!
	719	!-- Code optimized for vector processors
[85]	720	!$OMP PARALLEL PRIVATE ( i, j, k )
[1]	721	!$OMP DO
	722	DO i = nxl, nxr
	723
	724	DO j = 0, ny
	725	DO k = 1, nz
	726	work_ffty_vec(j,k,i) = f_in(k,j,i)
	727	ENDDO
	728	ENDDO
	729
	730	CALL fft_y_m( work_ffty_vec(:,:,i), ny+1, 'forward' )
	731
	732	ENDDO
	733
	734	!$OMP DO
	735	DO k = 1, nz
	736	DO j = 0, ny
	737	DO i = nxl, nxr
	738	work(i,k,j) = work_ffty_vec(j,k,i)
	739	ENDDO
	740	ENDDO
	741	ENDDO
	742	!$OMP END PARALLEL
	743	#endif
	744
	745	ELSE
	746
	747	!
	748	!-- Cache optimized code.
	749	!-- The i-(x-)direction is split into a strided outer loop and an inner
	750	!-- loop for better cache performance
	751	!$OMP PARALLEL PRIVATE (i,iend,iouter,ir,j,k,work_ffty)
	752	!$OMP DO
	753	DO iouter = nxl, nxr, stridex
	754
	755	iend = MIN( iouter+stridex-1, nxr ) ! Upper bound for inner i loop
	756
	757	DO k = 1, nz
	758
	759	DO i = iouter, iend
	760
	761	ir = i-iouter+1 ! counter within a stride
	762	DO j = 0, ny
	763	work_ffty(j,ir) = f_in(k,j,i)
	764	ENDDO
	765	!
	766	!-- FFT along y
	767	CALL fft_y( work_ffty(:,ir), 'forward' )
	768
	769	ENDDO
	770
	771	!
	772	!-- Resort
	773	DO j = 0, ny
	774	DO i = iouter, iend
	775	work(i,k,j) = work_ffty(j,i-iouter+1)
	776	ENDDO
	777	ENDDO
	778
	779	ENDDO
	780
	781	ENDDO
	782	!$OMP END PARALLEL
	783
	784	ENDIF
	785	CALL cpu_log( log_point_s(7), 'fft_y', 'pause' )
	786
	787	!
	788	!-- Transpose array
	789	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
[622]	790	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
[1]	791	CALL MPI_ALLTOALL( work(nxl,1,0), sendrecvcount_xy, MPI_REAL, &
	792	f_out(1,1,nys_x,1), sendrecvcount_xy, MPI_REAL, &
	793	comm1dx, ierr )
	794	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
	795
	796	END SUBROUTINE ffty_tr_yx
	797
	798
	799	SUBROUTINE tr_xy_ffty( f_in, work, f_out )
	800
	801	!------------------------------------------------------------------------------!
	802	! Transposition x --> y with a subsequent backward Fourier transformation for
	803	! a 1d-decomposition along x
	804	!------------------------------------------------------------------------------!
	805
	806	USE control_parameters
	807	USE cpulog
	808	USE indices
	809	USE interfaces
	810	USE pegrid
	811	USE transpose_indices
	812
	813	IMPLICIT NONE
	814
	815	INTEGER :: i, iend, iouter, ir, j, k
	816	INTEGER, PARAMETER :: stridex = 4
	817
	818	REAL, DIMENSION(0:ny,stridex) :: work_ffty
	819	#if defined( __nec )
	820	REAL, DIMENSION(0:ny+1,1:nz,nxl:nxr) :: work_ffty_vec
	821	#endif
[1003]	822	REAL, DIMENSION(nnx,1:nz,nys_x:nyn_x,pdims(1)) :: f_in
	823	REAL, DIMENSION(1:nz,0:ny,nxl:nxr) :: f_out
	824	REAL, DIMENSION(nxl:nxr,1:nz,0:ny) :: work
[1]	825
	826	!
	827	!-- Transpose array
	828	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
[622]	829	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
[1]	830	CALL MPI_ALLTOALL( f_in(1,1,nys_x,1), sendrecvcount_xy, MPI_REAL, &
	831	work(nxl,1,0), sendrecvcount_xy, MPI_REAL, &
	832	comm1dx, ierr )
	833	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
	834
	835	!
	836	!-- Resort the data in a way that y becomes the first index and carry out the
	837	!-- backward fft along y.
	838	CALL cpu_log( log_point_s(7), 'fft_y', 'continue' )
	839
	840	IF ( host(1:3) == 'nec' ) THEN
	841	#if defined( __nec )
	842	!
	843	!-- Code optimized for vector processors
[85]	844	!$OMP PARALLEL PRIVATE ( i, j, k )
[1]	845	!$OMP DO
	846	DO k = 1, nz
	847	DO j = 0, ny
	848	DO i = nxl, nxr
	849	work_ffty_vec(j,k,i) = work(i,k,j)
	850	ENDDO
	851	ENDDO
	852	ENDDO
	853
	854	!$OMP DO
	855	DO i = nxl, nxr
	856
	857	CALL fft_y_m( work_ffty_vec(:,:,i), ny+1, 'backward' )
	858
	859	DO j = 0, ny
	860	DO k = 1, nz
	861	f_out(k,j,i) = work_ffty_vec(j,k,i)
	862	ENDDO
	863	ENDDO
	864
	865	ENDDO
	866	!$OMP END PARALLEL
	867	#endif
	868
	869	ELSE
	870
	871	!
	872	!-- Cache optimized code.
	873	!-- The i-(x-)direction is split into a strided outer loop and an inner
	874	!-- loop for better cache performance
	875	!$OMP PARALLEL PRIVATE ( i, iend, iouter, ir, j, k, work_ffty )
	876	!$OMP DO
	877	DO iouter = nxl, nxr, stridex
	878
	879	iend = MIN( iouter+stridex-1, nxr ) ! Upper bound for inner i loop
	880
	881	DO k = 1, nz
	882	!
	883	!-- Resort
	884	DO j = 0, ny
	885	DO i = iouter, iend
	886	work_ffty(j,i-iouter+1) = work(i,k,j)
	887	ENDDO
	888	ENDDO
	889
	890	DO i = iouter, iend
	891
	892	!
	893	!-- FFT along y
	894	ir = i-iouter+1 ! counter within a stride
	895	CALL fft_y( work_ffty(:,ir), 'backward' )
	896
	897	DO j = 0, ny
	898	f_out(k,j,i) = work_ffty(j,ir)
	899	ENDDO
	900	ENDDO
	901
	902	ENDDO
	903
	904	ENDDO
	905	!$OMP END PARALLEL
	906
	907	ENDIF
	908
	909	CALL cpu_log( log_point_s(7), 'fft_y', 'stop' )
	910
	911	END SUBROUTINE tr_xy_ffty
	912
	913
	914	SUBROUTINE fftx_tri_fftx( ar )
	915
	916	!------------------------------------------------------------------------------!
	917	! FFT along x, solution of the tridiagonal system and backward FFT for
	918	! a 1d-decomposition along x
	919	!
	920	! WARNING: this subroutine may still not work for hybrid parallelization
	921	! with OpenMP (for possible necessary changes see the original
	922	! routine poisfft_hybrid, developed by Klaus Ketelsen, May 2002)
	923	!------------------------------------------------------------------------------!
	924
	925	USE control_parameters
	926	USE cpulog
	927	USE grid_variables
	928	USE indices
	929	USE interfaces
	930	USE pegrid
	931	USE transpose_indices
	932
	933	IMPLICIT NONE
	934
	935	character(len=3) :: myth_char
	936
	937	INTEGER :: i, j, k, m, n, omp_get_thread_num, tn
	938
[1003]	939	REAL, DIMENSION(0:nx) :: work_fftx
	940	REAL, DIMENSION(0:nx,1:nz) :: work_trix
	941	REAL, DIMENSION(nnx,1:nz,nys_x:nyn_x,pdims(1)) :: ar
	942	REAL, DIMENSION(:,:,:,:), ALLOCATABLE :: tri
[1]	943
	944
	945	CALL cpu_log( log_point_s(33), 'fft_x + tridia', 'start' )
	946
	947	ALLOCATE( tri(5,0:nx,0:nz-1,0:threads_per_task-1) )
	948
	949	tn = 0 ! Default thread number in case of one thread
	950	!$OMP PARALLEL DO PRIVATE ( i, j, k, m, n, tn, work_fftx, work_trix )
	951	DO j = nys_x, nyn_x
	952
	953	!$ tn = omp_get_thread_num()
	954
	955	IF ( host(1:3) == 'nec' ) THEN
	956	!
	957	!-- Code optimized for vector processors
	958	DO k = 1, nz
	959
	960	m = 0
	961	DO n = 1, pdims(1)
[1003]	962	DO i = 1, nnx
[1]	963	work_trix(m,k) = ar(i,k,j,n)
	964	m = m + 1
	965	ENDDO
	966	ENDDO
	967
	968	ENDDO
	969
	970	CALL fft_x_m( work_trix, 'forward' )
	971
	972	ELSE
	973	!
	974	!-- Cache optimized code
	975	DO k = 1, nz
	976
	977	m = 0
	978	DO n = 1, pdims(1)
[1003]	979	DO i = 1, nnx
[1]	980	work_fftx(m) = ar(i,k,j,n)
	981	m = m + 1
	982	ENDDO
	983	ENDDO
	984
	985	CALL fft_x( work_fftx, 'forward' )
	986
	987	DO i = 0, nx
	988	work_trix(i,k) = work_fftx(i)
	989	ENDDO
	990
	991	ENDDO
	992
	993	ENDIF
	994
	995	!
	996	!-- Solve the linear equation system
	997	CALL tridia_1dd( ddx2, ddy2, nx, ny, j, work_trix, tri(:,:,:,tn) )
	998
	999	IF ( host(1:3) == 'nec' ) THEN
	1000	!
	1001	!-- Code optimized for vector processors
	1002	CALL fft_x_m( work_trix, 'backward' )
	1003
	1004	DO k = 1, nz
	1005
	1006	m = 0
	1007	DO n = 1, pdims(1)
[1003]	1008	DO i = 1, nnx
[1]	1009	ar(i,k,j,n) = work_trix(m,k)
	1010	m = m + 1
	1011	ENDDO
	1012	ENDDO
	1013
	1014	ENDDO
	1015
	1016	ELSE
	1017	!
	1018	!-- Cache optimized code
	1019	DO k = 1, nz
	1020
	1021	DO i = 0, nx
	1022	work_fftx(i) = work_trix(i,k)
	1023	ENDDO
	1024
	1025	CALL fft_x( work_fftx, 'backward' )
	1026
	1027	m = 0
	1028	DO n = 1, pdims(1)
[1003]	1029	DO i = 1, nnx
[1]	1030	ar(i,k,j,n) = work_fftx(m)
	1031	m = m + 1
	1032	ENDDO
	1033	ENDDO
	1034
	1035	ENDDO
	1036
	1037	ENDIF
	1038
	1039	ENDDO
	1040
	1041	DEALLOCATE( tri )
	1042
	1043	CALL cpu_log( log_point_s(33), 'fft_x + tridia', 'stop' )
	1044
	1045	END SUBROUTINE fftx_tri_fftx
	1046
	1047
	1048	SUBROUTINE fftx_tr_xy( f_in, work, f_out )
	1049
	1050	!------------------------------------------------------------------------------!
	1051	! Fourier-transformation along x with subsequent transposition x --> y for
	1052	! a 1d-decomposition along y
	1053	!
	1054	! ATTENTION: The NEC-branch of this routine may significantly profit from
	1055	! further optimizations. So far, performance is much worse than
	1056	! for routine ffty_tr_yx (more than three times slower).
	1057	!------------------------------------------------------------------------------!
	1058
	1059	USE control_parameters
	1060	USE cpulog
	1061	USE indices
	1062	USE interfaces
	1063	USE pegrid
	1064	USE transpose_indices
	1065
	1066	IMPLICIT NONE
	1067
	1068	INTEGER :: i, j, k
	1069
[1003]	1070	REAL, DIMENSION(0:nx,1:nz,nys:nyn) :: work_fftx
	1071	REAL, DIMENSION(1:nz,nys:nyn,0:nx) :: f_in
	1072	REAL, DIMENSION(nny,1:nz,nxl_y:nxr_y,pdims(2)) :: f_out
	1073	REAL, DIMENSION(nys:nyn,1:nz,0:nx) :: work
[1]	1074
	1075	!
	1076	!-- Carry out the FFT along x, where all data are present due to the
	1077	!-- 1d-decomposition along y. Resort the data in a way that y becomes
	1078	!-- the first index.
	1079	CALL cpu_log( log_point_s(4), 'fft_x', 'start' )
	1080
	1081	IF ( host(1:3) == 'nec' ) THEN
	1082	!
	1083	!-- Code for vector processors
[85]	1084	!$OMP PARALLEL PRIVATE ( i, j, k )
[1]	1085	!$OMP DO
	1086	DO i = 0, nx
	1087
	1088	DO j = nys, nyn
	1089	DO k = 1, nz
	1090	work_fftx(i,k,j) = f_in(k,j,i)
	1091	ENDDO
	1092	ENDDO
	1093
	1094	ENDDO
	1095
	1096	!$OMP DO
	1097	DO j = nys, nyn
	1098
	1099	CALL fft_x_m( work_fftx(:,:,j), 'forward' )
	1100
	1101	DO k = 1, nz
	1102	DO i = 0, nx
	1103	work(j,k,i) = work_fftx(i,k,j)
	1104	ENDDO
	1105	ENDDO
	1106
	1107	ENDDO
	1108	!$OMP END PARALLEL
	1109
	1110	ELSE
	1111
	1112	!
	1113	!-- Cache optimized code (there might be still a potential for better
	1114	!-- optimization).
[696]	1115	!$OMP PARALLEL PRIVATE (i,j,k)
[1]	1116	!$OMP DO
	1117	DO i = 0, nx
	1118
	1119	DO j = nys, nyn
	1120	DO k = 1, nz
	1121	work_fftx(i,k,j) = f_in(k,j,i)
	1122	ENDDO
	1123	ENDDO
	1124
	1125	ENDDO
	1126
	1127	!$OMP DO
	1128	DO j = nys, nyn
	1129	DO k = 1, nz
	1130
	1131	CALL fft_x( work_fftx(0:nx,k,j), 'forward' )
	1132
	1133	DO i = 0, nx
	1134	work(j,k,i) = work_fftx(i,k,j)
	1135	ENDDO
	1136	ENDDO
	1137
	1138	ENDDO
	1139	!$OMP END PARALLEL
	1140
	1141	ENDIF
	1142	CALL cpu_log( log_point_s(4), 'fft_x', 'pause' )
	1143
	1144	!
	1145	!-- Transpose array
	1146	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
[622]	1147	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
[1]	1148	CALL MPI_ALLTOALL( work(nys,1,0), sendrecvcount_xy, MPI_REAL, &
	1149	f_out(1,1,nxl_y,1), sendrecvcount_xy, MPI_REAL, &
	1150	comm1dy, ierr )
	1151	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
	1152
	1153	END SUBROUTINE fftx_tr_xy
	1154
	1155
	1156	SUBROUTINE tr_yx_fftx( f_in, work, f_out )
	1157
	1158	!------------------------------------------------------------------------------!
	1159	! Transposition y --> x with a subsequent backward Fourier transformation for
	1160	! a 1d-decomposition along x
	1161	!------------------------------------------------------------------------------!
	1162
	1163	USE control_parameters
	1164	USE cpulog
	1165	USE indices
	1166	USE interfaces
	1167	USE pegrid
	1168	USE transpose_indices
	1169
	1170	IMPLICIT NONE
	1171
	1172	INTEGER :: i, j, k
	1173
[1003]	1174	REAL, DIMENSION(0:nx,1:nz,nys:nyn) :: work_fftx
	1175	REAL, DIMENSION(nny,1:nz,nxl_y:nxr_y,pdims(2)) :: f_in
	1176	REAL, DIMENSION(1:nz,nys:nyn,0:nx) :: f_out
	1177	REAL, DIMENSION(nys:nyn,1:nz,0:nx) :: work
[1]	1178
	1179	!
	1180	!-- Transpose array
	1181	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
[622]	1182	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
[1]	1183	CALL MPI_ALLTOALL( f_in(1,1,nxl_y,1), sendrecvcount_xy, MPI_REAL, &
	1184	work(nys,1,0), sendrecvcount_xy, MPI_REAL, &
	1185	comm1dy, ierr )
	1186	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
	1187
	1188	!
	1189	!-- Carry out the FFT along x, where all data are present due to the
	1190	!-- 1d-decomposition along y. Resort the data in a way that y becomes
	1191	!-- the first index.
	1192	CALL cpu_log( log_point_s(4), 'fft_x', 'continue' )
	1193
	1194	IF ( host(1:3) == 'nec' ) THEN
	1195	!
	1196	!-- Code optimized for vector processors
[85]	1197	!$OMP PARALLEL PRIVATE ( i, j, k )
[1]	1198	!$OMP DO
	1199	DO j = nys, nyn
	1200
	1201	DO k = 1, nz
	1202	DO i = 0, nx
	1203	work_fftx(i,k,j) = work(j,k,i)
	1204	ENDDO
	1205	ENDDO
	1206
	1207	CALL fft_x_m( work_fftx(:,:,j), 'backward' )
	1208
	1209	ENDDO
	1210
	1211	!$OMP DO
	1212	DO i = 0, nx
	1213	DO j = nys, nyn
	1214	DO k = 1, nz
	1215	f_out(k,j,i) = work_fftx(i,k,j)
	1216	ENDDO
	1217	ENDDO
	1218	ENDDO
	1219	!$OMP END PARALLEL
	1220
	1221	ELSE
	1222
	1223	!
	1224	!-- Cache optimized code (there might be still a potential for better
	1225	!-- optimization).
[696]	1226	!$OMP PARALLEL PRIVATE (i,j,k)
[1]	1227	!$OMP DO
	1228	DO j = nys, nyn
	1229	DO k = 1, nz
	1230
	1231	DO i = 0, nx
	1232	work_fftx(i,k,j) = work(j,k,i)
	1233	ENDDO
	1234
	1235	CALL fft_x( work_fftx(0:nx,k,j), 'backward' )
	1236
	1237	ENDDO
	1238	ENDDO
	1239
	1240	!$OMP DO
	1241	DO i = 0, nx
	1242	DO j = nys, nyn
	1243	DO k = 1, nz
	1244	f_out(k,j,i) = work_fftx(i,k,j)
	1245	ENDDO
	1246	ENDDO
	1247	ENDDO
	1248	!$OMP END PARALLEL
	1249
	1250	ENDIF
	1251	CALL cpu_log( log_point_s(4), 'fft_x', 'stop' )
	1252
	1253	END SUBROUTINE tr_yx_fftx
	1254
	1255
	1256	SUBROUTINE ffty_tri_ffty( ar )
	1257
	1258	!------------------------------------------------------------------------------!
	1259	! FFT along y, solution of the tridiagonal system and backward FFT for
	1260	! a 1d-decomposition along y
	1261	!
	1262	! WARNING: this subroutine may still not work for hybrid parallelization
	1263	! with OpenMP (for possible necessary changes see the original
	1264	! routine poisfft_hybrid, developed by Klaus Ketelsen, May 2002)
	1265	!------------------------------------------------------------------------------!
	1266
	1267	USE control_parameters
	1268	USE cpulog
	1269	USE grid_variables
	1270	USE indices
	1271	USE interfaces
	1272	USE pegrid
	1273	USE transpose_indices
	1274
	1275	IMPLICIT NONE
	1276
	1277	INTEGER :: i, j, k, m, n, omp_get_thread_num, tn
	1278
[1003]	1279	REAL, DIMENSION(0:ny) :: work_ffty
	1280	REAL, DIMENSION(0:ny,1:nz) :: work_triy
	1281	REAL, DIMENSION(nny,1:nz,nxl_y:nxr_y,pdims(2)) :: ar
	1282	REAL, DIMENSION(:,:,:,:), ALLOCATABLE :: tri
[1]	1283
	1284
	1285	CALL cpu_log( log_point_s(39), 'fft_y + tridia', 'start' )
	1286
	1287	ALLOCATE( tri(5,0:ny,0:nz-1,0:threads_per_task-1) )
	1288
	1289	tn = 0 ! Default thread number in case of one thread
[696]	1290	!$OMP PARALLEL DO PRIVATE ( i, j, k, m, n, tn, work_ffty, work_triy )
[1]	1291	DO i = nxl_y, nxr_y
	1292
	1293	!$ tn = omp_get_thread_num()
	1294
	1295	IF ( host(1:3) == 'nec' ) THEN
	1296	!
	1297	!-- Code optimized for vector processors
	1298	DO k = 1, nz
	1299
	1300	m = 0
	1301	DO n = 1, pdims(2)
[1003]	1302	DO j = 1, nny
[1]	1303	work_triy(m,k) = ar(j,k,i,n)
	1304	m = m + 1
	1305	ENDDO
	1306	ENDDO
	1307
	1308	ENDDO
	1309
	1310	CALL fft_y_m( work_triy, ny, 'forward' )
	1311
	1312	ELSE
	1313	!
	1314	!-- Cache optimized code
	1315	DO k = 1, nz
	1316
	1317	m = 0
	1318	DO n = 1, pdims(2)
[1003]	1319	DO j = 1, nny
[1]	1320	work_ffty(m) = ar(j,k,i,n)
	1321	m = m + 1
	1322	ENDDO
	1323	ENDDO
	1324
	1325	CALL fft_y( work_ffty, 'forward' )
	1326
	1327	DO j = 0, ny
	1328	work_triy(j,k) = work_ffty(j)
	1329	ENDDO
	1330
	1331	ENDDO
	1332
	1333	ENDIF
	1334
	1335	!
	1336	!-- Solve the linear equation system
	1337	CALL tridia_1dd( ddy2, ddx2, ny, nx, i, work_triy, tri(:,:,:,tn) )
	1338
	1339	IF ( host(1:3) == 'nec' ) THEN
	1340	!
	1341	!-- Code optimized for vector processors
	1342	CALL fft_y_m( work_triy, ny, 'backward' )
	1343
	1344	DO k = 1, nz
	1345
	1346	m = 0
	1347	DO n = 1, pdims(2)
[1003]	1348	DO j = 1, nny
[1]	1349	ar(j,k,i,n) = work_triy(m,k)
	1350	m = m + 1
	1351	ENDDO
	1352	ENDDO
	1353
	1354	ENDDO
	1355
	1356	ELSE
	1357	!
	1358	!-- Cache optimized code
	1359	DO k = 1, nz
	1360
	1361	DO j = 0, ny
	1362	work_ffty(j) = work_triy(j,k)
	1363	ENDDO
	1364
	1365	CALL fft_y( work_ffty, 'backward' )
	1366
	1367	m = 0
	1368	DO n = 1, pdims(2)
[1003]	1369	DO j = 1, nny
[1]	1370	ar(j,k,i,n) = work_ffty(m)
	1371	m = m + 1
	1372	ENDDO
	1373	ENDDO
	1374
	1375	ENDDO
	1376
	1377	ENDIF
	1378
	1379	ENDDO
	1380
	1381	DEALLOCATE( tri )
	1382
	1383	CALL cpu_log( log_point_s(39), 'fft_y + tridia', 'stop' )
	1384
	1385	END SUBROUTINE ffty_tri_ffty
	1386
	1387
	1388	SUBROUTINE tridia_1dd( ddx2, ddy2, nx, ny, j, ar, tri )
	1389
	1390	!------------------------------------------------------------------------------!
	1391	! Solves the linear system of equations for a 1d-decomposition along x (see
	1392	! tridia)
	1393	!
[940]	1394	! Attention: when using the intel compilers older than 12.0, array tri must
	1395	! be passed as an argument to the contained subroutines. Otherwise
	1396	! addres faults will occur. This feature can be activated with
	1397	! cpp-switch __intel11
[1]	1398	! On NEC, tri should not be passed (except for routine substi_1dd)
	1399	! because this causes very bad performance.
	1400	!------------------------------------------------------------------------------!
	1401
	1402	USE arrays_3d
	1403	USE control_parameters
	1404
	1405	USE pegrid
	1406
	1407	IMPLICIT NONE
	1408
	1409	INTEGER :: i, j, k, nnyh, nx, ny, omp_get_thread_num, tn
	1410
	1411	REAL :: ddx2, ddy2
	1412
	1413	REAL, DIMENSION(0:nx,1:nz) :: ar
	1414	REAL, DIMENSION(0:nx,0:nz-1) :: ar1
	1415	REAL, DIMENSION(5,0:nx,0:nz-1) :: tri
	1416
	1417
	1418	nnyh = ( ny + 1 ) / 2
	1419
	1420	!
	1421	!-- Define constant elements of the tridiagonal matrix.
	1422	!-- The compiler on SX6 does loop exchange. If 0:nx is a high power of 2,
	1423	!-- the exchanged loops create bank conflicts. The following directive
	1424	!-- prohibits loop exchange and the loops perform much better.
	1425	! tn = omp_get_thread_num()
	1426	! WRITE( 120+tn, * ) '+++ id=',myid,' nx=',nx,' thread=', omp_get_thread_num()
[82]	1427	! CALL local_flush( 120+tn )
[1]	1428	!CDIR NOLOOPCHG
	1429	DO k = 0, nz-1
	1430	DO i = 0,nx
[667]	1431	tri(2,i,k) = ddzu_pres(k+1) * ddzw(k+1)
	1432	tri(3,i,k) = ddzu_pres(k+2) * ddzw(k+1)
[1]	1433	ENDDO
	1434	ENDDO
	1435	! WRITE( 120+tn, * ) '+++ id=',myid,' end of first tridia loop thread=', omp_get_thread_num()
[82]	1436	! CALL local_flush( 120+tn )
[1]	1437
	1438	IF ( j <= nnyh ) THEN
[940]	1439	#if defined( __intel11 )
[1]	1440	CALL maketri_1dd( j, tri )
	1441	#else
	1442	CALL maketri_1dd( j )
	1443	#endif
	1444	ELSE
[940]	1445	#if defined( __intel11 )
[1]	1446	CALL maketri_1dd( ny+1-j, tri )
	1447	#else
	1448	CALL maketri_1dd( ny+1-j )
	1449	#endif
	1450	ENDIF
[940]	1451	#if defined( __intel11 )
[1]	1452	CALL split_1dd( tri )
	1453	#else
	1454	CALL split_1dd
	1455	#endif
	1456	CALL substi_1dd( ar, tri )
	1457
	1458	CONTAINS
	1459
[940]	1460	#if defined( __intel11 )
[1]	1461	SUBROUTINE maketri_1dd( j, tri )
	1462	#else
	1463	SUBROUTINE maketri_1dd( j )
	1464	#endif
	1465
	1466	!------------------------------------------------------------------------------!
	1467	! computes the i- and j-dependent component of the matrix
	1468	!------------------------------------------------------------------------------!
	1469
	1470	USE constants
	1471
	1472	IMPLICIT NONE
	1473
	1474	INTEGER :: i, j, k, nnxh
	1475	REAL :: a, c
	1476
	1477	REAL, DIMENSION(0:nx) :: l
	1478
[940]	1479	#if defined( __intel11 )
[1]	1480	REAL, DIMENSION(5,0:nx,0:nz-1) :: tri
	1481	#endif
	1482
	1483
	1484	nnxh = ( nx + 1 ) / 2
	1485	!
	1486	!-- Provide the tridiagonal matrix for solution of the Poisson equation in
	1487	!-- Fourier space. The coefficients are computed following the method of
	1488	!-- Schmidt et al. (DFVLR-Mitteilung 84-15), which departs from Stephan
	1489	!-- Siano's original version by discretizing the Poisson equation,
	1490	!-- before it is Fourier-transformed
	1491	DO i = 0, nx
[128]	1492	IF ( i >= 0 .AND. i <= nnxh ) THEN
[1]	1493	l(i) = 2.0 * ( 1.0 - COS( ( 2.0 * pi * i ) / &
[1013]	1494	REAL( nx+1 ) ) ) * ddx2 + &
[1]	1495	2.0 * ( 1.0 - COS( ( 2.0 * pi * j ) / &
[1013]	1496	REAL( ny+1 ) ) ) * ddy2
[1]	1497	ELSE
	1498	l(i) = 2.0 * ( 1.0 - COS( ( 2.0 * pi * ( nx+1-i ) ) / &
[1013]	1499	REAL( nx+1 ) ) ) * ddx2 + &
[1]	1500	2.0 * ( 1.0 - COS( ( 2.0 * pi * j ) / &
[1013]	1501	REAL( ny+1 ) ) ) * ddy2
[1]	1502	ENDIF
	1503	ENDDO
	1504
	1505	DO k = 0, nz-1
	1506	DO i = 0, nx
[667]	1507	a = -1.0 * ddzu_pres(k+2) * ddzw(k+1)
	1508	c = -1.0 * ddzu_pres(k+1) * ddzw(k+1)
[1]	1509	tri(1,i,k) = a + c - l(i)
	1510	ENDDO
	1511	ENDDO
	1512	IF ( ibc_p_b == 1 .OR. ibc_p_b == 2 ) THEN
	1513	DO i = 0, nx
	1514	tri(1,i,0) = tri(1,i,0) + tri(2,i,0)
	1515	ENDDO
	1516	ENDIF
	1517	IF ( ibc_p_t == 1 ) THEN
	1518	DO i = 0, nx
	1519	tri(1,i,nz-1) = tri(1,i,nz-1) + tri(3,i,nz-1)
	1520	ENDDO
	1521	ENDIF
	1522
	1523	END SUBROUTINE maketri_1dd
	1524
	1525
[940]	1526	#if defined( __intel11 )
[1]	1527	SUBROUTINE split_1dd( tri )
	1528	#else
	1529	SUBROUTINE split_1dd
	1530	#endif
	1531
	1532	!------------------------------------------------------------------------------!
	1533	! Splitting of the tridiagonal matrix (Thomas algorithm)
	1534	!------------------------------------------------------------------------------!
	1535
	1536	IMPLICIT NONE
	1537
	1538	INTEGER :: i, k
	1539
[940]	1540	#if defined( __intel11 )
[1]	1541	REAL, DIMENSION(5,0:nx,0:nz-1) :: tri
	1542	#endif
	1543
	1544
	1545	!
	1546	!-- Splitting
	1547	DO i = 0, nx
	1548	tri(4,i,0) = tri(1,i,0)
	1549	ENDDO
	1550	DO k = 1, nz-1
	1551	DO i = 0, nx
	1552	tri(5,i,k) = tri(2,i,k) / tri(4,i,k-1)
	1553	tri(4,i,k) = tri(1,i,k) - tri(3,i,k-1) * tri(5,i,k)
	1554	ENDDO
	1555	ENDDO
	1556
	1557	END SUBROUTINE split_1dd
	1558
	1559
	1560	SUBROUTINE substi_1dd( ar, tri )
	1561
	1562	!------------------------------------------------------------------------------!
	1563	! Substitution (Forward and Backward) (Thomas algorithm)
	1564	!------------------------------------------------------------------------------!
	1565
	1566	IMPLICIT NONE
	1567
[76]	1568	INTEGER :: i, k
[1]	1569
	1570	REAL, DIMENSION(0:nx,nz) :: ar
	1571	REAL, DIMENSION(0:nx,0:nz-1) :: ar1
	1572	REAL, DIMENSION(5,0:nx,0:nz-1) :: tri
	1573
	1574	!
	1575	!-- Forward substitution
	1576	DO i = 0, nx
	1577	ar1(i,0) = ar(i,1)
	1578	ENDDO
	1579	DO k = 1, nz-1
	1580	DO i = 0, nx
	1581	ar1(i,k) = ar(i,k+1) - tri(5,i,k) * ar1(i,k-1)
	1582	ENDDO
	1583	ENDDO
	1584
	1585	!
	1586	!-- Backward substitution
[763]	1587	!-- Note, the add of 1.0E-20 in the denominator is due to avoid divisions
	1588	!-- by zero appearing if the pressure bc is set to neumann at the top of
	1589	!-- the model domain.
[1]	1590	DO i = 0, nx
[761]	1591	ar(i,nz) = ar1(i,nz-1) / ( tri(4,i,nz-1) + 1.0E-20 )
[1]	1592	ENDDO
	1593	DO k = nz-2, 0, -1
	1594	DO i = 0, nx
	1595	ar(i,k+1) = ( ar1(i,k) - tri(3,i,k) * ar(i,k+2) ) &
	1596	/ tri(4,i,k)
	1597	ENDDO
	1598	ENDDO
	1599
[76]	1600	!
	1601	!-- Indices i=0, j=0 correspond to horizontally averaged pressure.
	1602	!-- The respective values of ar should be zero at all k-levels if
	1603	!-- acceleration of horizontally averaged vertical velocity is zero.
	1604	IF ( ibc_p_b == 1 .AND. ibc_p_t == 1 ) THEN
	1605	IF ( j == 0 ) THEN
	1606	DO k = 1, nz
	1607	ar(0,k) = 0.0
	1608	ENDDO
	1609	ENDIF
	1610	ENDIF
	1611
[1]	1612	END SUBROUTINE substi_1dd
	1613
	1614	END SUBROUTINE tridia_1dd
	1615
	1616	#endif
[807]	1617	#endif
[1]	1618	END MODULE poisfft_mod

Note: See TracBrowser for help on using the repository browser.

Download in other formats:

| Impressum | ©Leibniz Universität Hannover |