Home

Context Navigation

source: palm/trunk/SOURCE/poisfft.f90 @ 1011

Last change on this file since 1011 was 1004, checked in by raasch, 12 years ago
last commit documented
Property svn:keywords set to `Id`
File size: 46.8 KB

Rev	Line
[1]	1	MODULE poisfft_mod
	2
	3	!------------------------------------------------------------------------------!
[484]	4	! Current revisions:
[1]	5	! -----------------
	6	!
[1004]	7	!
[1]	8	! Former revisions:
	9	! -----------------
[3]	10	! $Id: poisfft.f90 1004 2012-09-14 14:56:50Z raasch $
[77]	11	!
[1004]	12	! 1003 2012-09-14 14:35:53Z raasch
	13	! indices nxa, nya, etc. replaced by nx, ny, etc.
	14	!
[941]	15	! 940 2012-07-09 14:31:00Z raasch
	16	! special handling of tri-array as an argument in tridia_1dd routines switched
	17	! off because it caused segmentation faults with intel 12.1 compiler
	18	!
[878]	19	! 877 2012-04-03 11:21:44Z suehring
	20	! Bugfix: Avoid divisions by zero in case of using a 'neumann' bc for the
	21	! pressure at the top of the model domain.
	22	!
[810]	23	! 809 2012-01-30 13:32:58Z maronga
	24	! Bugfix: replaced .AND. and .NOT. with && and ! in the preprocessor directives
	25	!
[808]	26	! 807 2012-01-25 11:53:51Z maronga
	27	! New cpp directive "__check" implemented which is used by check_namelist_files
	28	! (most of the code is unneeded by check_namelist_files).
	29	!
[764]	30	! 763 2011-10-06 09:32:09Z suehring
	31	! Comment added concerning the last change.
	32	!
[762]	33	! 761 2011-10-05 17:58:52Z suehring
	34	! Bugfix: Avoid divisions by zero in case of using a 'neumann' bc for the
	35	! pressure at the top of the model domain.
	36	!
[697]	37	! 696 2011-03-18 07:03:49Z raasch
	38	! work_fftx removed from PRIVATE clauses in fftx_tr_xy and tr_yx_fftx
	39	!
[684]	40	! 683 2011-02-09 14:25:15Z raasch
	41	! openMP parallelization for 2d-domain-decomposition
	42	!
[668]	43	! 667 2010-12-23 12:06:00Z suehring/gryschka
	44	! ddzu replaced by ddzu_pres due to changes in zu(0)
	45	!
[623]	46	! 622 2010-12-10 08:08:13Z raasch
	47	! optional barriers included in order to speed up collective operations
	48	!
[392]	49	! 377 2009-09-04 11:09:00Z raasch
	50	! __lcmuk changed to __lc to avoid problems with Intel compiler on sgi-ice
	51	!
[198]	52	! 164 2008-05-15 08:46:15Z raasch
	53	! Arguments removed from transpose routines
	54	!
[139]	55	! 128 2007-10-26 13:11:14Z raasch
	56	! Bugfix: wavenumber calculation for even nx in routines maketri
	57	!
[90]	58	! 85 2007-05-11 09:35:14Z raasch
	59	! Bugfix: work_fft*_vec removed from some PRIVATE-declarations
	60	!
[77]	61	! 76 2007-03-29 00:58:32Z raasch
	62	! Tridiagonal coefficients adjusted for Neumann boundary conditions both at
	63	! the bottom and the top.
	64	!
[3]	65	! RCS Log replace by Id keyword, revision history cleaned up
	66	!
[1]	67	! Revision 1.24 2006/08/04 15:00:24 raasch
	68	! Default setting of the thread number tn in case of not using OpenMP
	69	!
	70	! Revision 1.23 2006/02/23 12:48:38 raasch
	71	! Additional compiler directive in routine tridia_1dd for preventing loop
	72	! exchange on NEC-SX6
	73	!
	74	! Revision 1.20 2004/04/30 12:38:09 raasch
	75	! Parts of former poisfft_hybrid moved to this subroutine,
	76	! former subroutine changed to a module, renaming of FFT-subroutines and
	77	! -module, FFTs completely substituted by calls of fft_x and fft_y,
	78	! NAG fft used in the non-parallel case completely removed, l in maketri
	79	! is now a 1d-array, variables passed by modules instead of using parameter
	80	! lists, enlarged transposition arrays introduced
	81	!
	82	! Revision 1.1 1997/07/24 11:24:14 raasch
	83	! Initial revision
	84	!
	85	!
	86	! Description:
	87	! ------------
	88	! See below.
	89	!------------------------------------------------------------------------------!
	90
	91	!--------------------------------------------------------------------------!
	92	! poisfft !
	93	! !
	94	! Original version: Stephan Siano (pois3d) !
	95	! !
	96	! Institute of Meteorology and Climatology, University of Hannover !
	97	! Germany !
	98	! !
	99	! Version as of July 23,1996 !
	100	! !
	101	! !
	102	! Version for parallel computers: Siegfried Raasch !
	103	! !
	104	! Version as of July 03,1997 !
	105	! !
	106	! Solves the Poisson equation with a 2D spectral method !
	107	! d^2 p / dx^2 + d^2 p / dy^2 + d^2 p / dz^2 = s !
	108	! !
	109	! Input: !
	110	! real ar contains in the (nnx,nny,nnz) elements, !
	111	! starting from the element (1,nys,nxl), the !
	112	! values for s !
	113	! real work Temporary array !
	114	! !
	115	! Output: !
	116	! real ar contains the solution for p !
	117	!--------------------------------------------------------------------------!
	118
	119	USE fft_xy
	120	USE indices
	121	USE transpose_indices
	122
	123	IMPLICIT NONE
	124
	125	PRIVATE
[807]	126
[809]	127	#if ! defined ( __check )
[1]	128	PUBLIC poisfft, poisfft_init
	129
	130	INTERFACE poisfft
	131	MODULE PROCEDURE poisfft
	132	END INTERFACE poisfft
	133
	134	INTERFACE poisfft_init
	135	MODULE PROCEDURE poisfft_init
	136	END INTERFACE poisfft_init
[807]	137	#else
	138	PUBLIC poisfft_init
[1]	139
[807]	140	INTERFACE poisfft_init
	141	MODULE PROCEDURE poisfft_init
	142	END INTERFACE poisfft_init
	143	#endif
	144
[1]	145	CONTAINS
	146
	147	SUBROUTINE poisfft_init
	148
	149	CALL fft_init
	150
	151	END SUBROUTINE poisfft_init
	152
[809]	153	#if ! defined ( __check )
subroutine poisfft( ar, work )

!------------------------------------------------------------------------------!
! Driver of the FFT based pressure solver.
!
! The sequence is always: forward FFTs along x and y, solution of the
! tridiagonal systems along z, backward FFTs along y and x. Depending on the
! processor grid (pdims) the FFTs are combined with different transpositions.
!
! ar   : on entry the right hand side, on return the solution (see the module
!        header)
! work : scratch array handed to the transposition routines
!
! Note that ar is deliberately passed as input and as output array to the
! transposition routines.
!------------------------------------------------------------------------------!

   use cpulog
   use interfaces
   use pegrid

   implicit none

   real, dimension(1:nz,nys:nyn,nxl:nxr) ::  ar, work


   call cpu_log( log_point_s(3), 'poisfft', 'start' )

#if defined( __parallel )
   if ( pdims(2) == 1 )  then

!
!--   1d-decomposition along x.
!--   Step 1: FFT along y, then transposition y --> x
      call ffty_tr_yx( ar, work, ar )

!
!--   Step 2: FFT along x, tridiagonal solver, backward FFT along x
      call fftx_tri_fftx( ar )

!
!--   Step 3: transposition x --> y, then backward FFT along y
      call tr_xy_ffty( ar, work, ar )

   else if ( pdims(1) == 1 )  then

!
!--   1d-decomposition along y.
!--   Step 1: FFT along x, then transposition x --> y
      call fftx_tr_xy( ar, work, ar )

!
!--   Step 2: FFT along y, tridiagonal solver, backward FFT along y
      call ffty_tri_ffty( ar )

!
!--   Step 3: transposition y --> x, then backward FFT along x
      call tr_yx_fftx( ar, work, ar )

   else

!
!--   2d-decomposition.
!--   z --> x
      call cpu_log( log_point_s(5), 'transpo forward', 'start' )
      call transpose_zx( ar, work, ar )
      call cpu_log( log_point_s(5), 'transpo forward', 'pause' )

      call cpu_log( log_point_s(4), 'fft_x', 'start' )
      call fftxp( ar, 'forward' )
      call cpu_log( log_point_s(4), 'fft_x', 'pause' )

!
!--   x --> y
      call cpu_log( log_point_s(5), 'transpo forward', 'continue' )
      call transpose_xy( ar, work, ar )
      call cpu_log( log_point_s(5), 'transpo forward', 'pause' )

      call cpu_log( log_point_s(7), 'fft_y', 'start' )
      call fftyp( ar, 'forward' )
      call cpu_log( log_point_s(7), 'fft_y', 'pause' )

!
!--   y --> z
      call cpu_log( log_point_s(5), 'transpo forward', 'continue' )
      call transpose_yz( ar, work, ar )
      call cpu_log( log_point_s(5), 'transpo forward', 'stop' )

!
!--   Tridiagonal systems along z
      call cpu_log( log_point_s(6), 'tridia', 'start' )
      call tridia( ar )
      call cpu_log( log_point_s(6), 'tridia', 'stop' )

!
!--   Way back: z --> y
      call cpu_log( log_point_s(8), 'transpo invers', 'start' )
      call transpose_zy( ar, work, ar )
      call cpu_log( log_point_s(8), 'transpo invers', 'pause' )

      call cpu_log( log_point_s(7), 'fft_y', 'continue' )
      call fftyp( ar, 'backward' )
      call cpu_log( log_point_s(7), 'fft_y', 'stop' )

!
!--   y --> x
      call cpu_log( log_point_s(8), 'transpo invers', 'continue' )
      call transpose_yx( ar, work, ar )
      call cpu_log( log_point_s(8), 'transpo invers', 'pause' )

      call cpu_log( log_point_s(4), 'fft_x', 'continue' )
      call fftxp( ar, 'backward' )
      call cpu_log( log_point_s(4), 'fft_x', 'stop' )

!
!--   x --> z
      call cpu_log( log_point_s(8), 'transpo invers', 'continue' )
      call transpose_xz( ar, work, ar )
      call cpu_log( log_point_s(8), 'transpo invers', 'stop' )

   endif

#else

!
!-- Serial run: all data are local, no transpositions are required.
!-- Forward FFTs along x and y
   call cpu_log( log_point_s(4), 'fft_x', 'start' )
   call fftx( ar, 'forward' )
   call cpu_log( log_point_s(4), 'fft_x', 'pause' )
   call cpu_log( log_point_s(7), 'fft_y', 'start' )
   call ffty( ar, 'forward' )
   call cpu_log( log_point_s(7), 'fft_y', 'pause' )

!
!-- Tridiagonal systems along z
   call cpu_log( log_point_s(6), 'tridia', 'start' )
   call tridia( ar )
   call cpu_log( log_point_s(6), 'tridia', 'stop' )

!
!-- Backward FFTs along y and x
   call cpu_log( log_point_s(7), 'fft_y', 'continue' )
   call ffty( ar, 'backward' )
   call cpu_log( log_point_s(7), 'fft_y', 'stop' )
   call cpu_log( log_point_s(4), 'fft_x', 'continue' )
   call fftx( ar, 'backward' )
   call cpu_log( log_point_s(4), 'fft_x', 'stop' )

#endif

   call cpu_log( log_point_s(3), 'poisfft', 'stop' )

end subroutine poisfft
	295
	296
	297
	298	SUBROUTINE tridia( ar )
	299
	300	!------------------------------------------------------------------------------!
	301	! solves the linear system of equations:
	302	!
	303	! -(4 pi^2(i^2/(dx^2nnx^2)+j^2/(dy^2nny^2))+
	304	! 1/(dzu(k)dzw(k))+1/(dzu(k-1)dzw(k)))*p(i,j,k)+
	305	! 1/(dzu(k)dzw(k))p(i,j,k+1)+1/(dzu(k-1)dzw(k))p(i,j,k-1)=d(i,j,k)
	306	!
	307	! by using the Thomas algorithm
	308	!------------------------------------------------------------------------------!
	309
	310	USE arrays_3d
	311
	312	IMPLICIT NONE
	313
	314	INTEGER :: i, j, k, nnyh
	315
	316	REAL, DIMENSION(nxl_z:nxr_z,0:nz-1) :: ar1
	317	REAL, DIMENSION(5,nxl_z:nxr_z,0:nz-1) :: tri
	318
	319	#if defined( __parallel )
[1003]	320	REAL :: ar(nxl_z:nxr_z,nys_z:nyn_z,1:nz)
[1]	321	#else
	322	REAL :: ar(1:nz,nys_z:nyn_z,nxl_z:nxr_z)
	323	#endif
	324
	325
	326	nnyh = (ny+1) / 2
	327
	328	!
	329	!-- Define constant elements of the tridiagonal matrix.
[683]	330	!$OMP PARALLEL PRIVATE ( k, i )
	331	!$OMP DO
[1]	332	DO k = 0, nz-1
	333	DO i = nxl_z, nxr_z
[667]	334	tri(2,i,k) = ddzu_pres(k+1) * ddzw(k+1)
	335	tri(3,i,k) = ddzu_pres(k+2) * ddzw(k+1)
[1]	336	ENDDO
	337	ENDDO
[683]	338	!$OMP END PARALLEL
[1]	339
	340	#if defined( __parallel )
	341	!
	342	!-- Repeat for all y-levels.
[683]	343	!$OMP PARALLEL FIRSTPRIVATE( tri ) PRIVATE ( ar1, j )
	344	!$OMP DO
[1]	345	DO j = nys_z, nyn_z
	346	IF ( j <= nnyh ) THEN
	347	CALL maketri( tri, j )
	348	ELSE
	349	CALL maketri( tri, ny+1-j )
	350	ENDIF
	351	CALL split( tri )
	352	CALL substi( ar, ar1, tri, j )
	353	ENDDO
[683]	354	!$OMP END PARALLEL
[1]	355	#else
	356	!
	357	!-- First y-level.
	358	CALL maketri( tri, nys_z )
	359	CALL split( tri )
	360	CALL substi( ar, ar1, tri, 0 )
	361
	362	!
	363	!-- Further y-levels.
	364	DO j = 1, nnyh - 1
	365	CALL maketri( tri, j )
	366	CALL split( tri )
	367	CALL substi( ar, ar1, tri, j )
	368	CALL substi( ar, ar1, tri, ny+1-j )
	369	ENDDO
	370	CALL maketri( tri, nnyh )
	371	CALL split( tri )
	372	CALL substi( ar, ar1, tri, nnyh+nys )
	373	#endif
	374
	375	CONTAINS
	376
	377	SUBROUTINE maketri( tri, j )
	378
	379	!------------------------------------------------------------------------------!
	380	! Computes the i- and j-dependent component of the matrix
	381	!------------------------------------------------------------------------------!
	382
	383	USE arrays_3d
	384	USE constants
	385	USE control_parameters
	386	USE grid_variables
	387
	388	IMPLICIT NONE
	389
	390	INTEGER :: i, j, k, nnxh
	391	REAL :: a, c
	392	REAL :: ll(nxl_z:nxr_z)
	393	REAL :: tri(5,nxl_z:nxr_z,0:nz-1)
	394
	395
	396	nnxh = ( nx + 1 ) / 2
	397
	398	!
	399	!-- Provide the tridiagonal matrix for solution of the Poisson equation in
	400	!-- Fourier space. The coefficients are computed following the method of
	401	!-- Schmidt et al. (DFVLR-Mitteilung 84-15), which departs from Stephan
	402	!-- Siano's original version by discretizing the Poisson equation,
	403	!-- before it is Fourier-transformed
	404	#if defined( __parallel )
	405	DO i = nxl_z, nxr_z
[128]	406	IF ( i >= 0 .AND. i <= nnxh ) THEN
[1]	407	ll(i) = 2.0 * ( 1.0 - COS( ( 2.0 * pi * i ) / &
	408	FLOAT( nx+1 ) ) ) / ( dx * dx ) + &
	409	2.0 * ( 1.0 - COS( ( 2.0 * pi * j ) / &
	410	FLOAT( ny+1 ) ) ) / ( dy * dy )
	411	ELSE
	412	ll(i) = 2.0 * ( 1.0 - COS( ( 2.0 * pi * ( nx+1-i ) ) / &
	413	FLOAT( nx+1 ) ) ) / ( dx * dx ) + &
	414	2.0 * ( 1.0 - COS( ( 2.0 * pi * j ) / &
	415	FLOAT( ny+1 ) ) ) / ( dy * dy )
	416	ENDIF
	417	DO k = 0,nz-1
[667]	418	a = -1.0 * ddzu_pres(k+2) * ddzw(k+1)
	419	c = -1.0 * ddzu_pres(k+1) * ddzw(k+1)
[1]	420	tri(1,i,k) = a + c - ll(i)
	421	ENDDO
	422	ENDDO
	423	#else
	424	DO i = 0, nnxh
	425	ll(i) = 2.0 * ( 1.0 - COS( ( 2.0 * pi * i ) / FLOAT( nx+1 ) ) ) / &
	426	( dx * dx ) + &
	427	2.0 * ( 1.0 - COS( ( 2.0 * pi * j ) / FLOAT( ny+1 ) ) ) / &
	428	( dy * dy )
	429	DO k = 0, nz-1
[667]	430	a = -1.0 * ddzu_pres(k+2) * ddzw(k+1)
	431	c = -1.0 * ddzu_pres(k+1) * ddzw(k+1)
[1]	432	tri(1,i,k) = a + c - ll(i)
	433	IF ( i >= 1 .and. i < nnxh ) THEN
	434	tri(1,nx+1-i,k) = tri(1,i,k)
	435	ENDIF
	436	ENDDO
	437	ENDDO
	438	#endif
	439	IF ( ibc_p_b == 1 .OR. ibc_p_b == 2 ) THEN
	440	DO i = nxl_z, nxr_z
	441	tri(1,i,0) = tri(1,i,0) + tri(2,i,0)
	442	ENDDO
	443	ENDIF
	444	IF ( ibc_p_t == 1 ) THEN
	445	DO i = nxl_z, nxr_z
	446	tri(1,i,nz-1) = tri(1,i,nz-1) + tri(3,i,nz-1)
	447	ENDDO
	448	ENDIF
	449
	450	END SUBROUTINE maketri
	451
	452
	453	SUBROUTINE substi( ar, ar1, tri, j )
	454
	455	!------------------------------------------------------------------------------!
	456	! Substitution (Forward and Backward) (Thomas algorithm)
	457	!------------------------------------------------------------------------------!
	458
[76]	459	USE control_parameters
	460
[1]	461	IMPLICIT NONE
	462
	463	INTEGER :: i, j, k
	464	REAL :: ar1(nxl_z:nxr_z,0:nz-1)
	465	REAL :: tri(5,nxl_z:nxr_z,0:nz-1)
	466	#if defined( __parallel )
[1003]	467	REAL :: ar(nxl_z:nxr_z,nys_z:nyn_z,1:nz)
[1]	468	#else
	469	REAL :: ar(1:nz,nys_z:nyn_z,nxl_z:nxr_z)
	470	#endif
	471
	472	!
	473	!-- Forward substitution.
	474	DO i = nxl_z, nxr_z
	475	#if defined( __parallel )
	476	ar1(i,0) = ar(i,j,1)
	477	#else
	478	ar1(i,0) = ar(1,j,i)
	479	#endif
	480	ENDDO
	481	DO k = 1, nz - 1
	482	DO i = nxl_z, nxr_z
	483	#if defined( __parallel )
	484	ar1(i,k) = ar(i,j,k+1) - tri(5,i,k) * ar1(i,k-1)
	485	#else
	486	ar1(i,k) = ar(k+1,j,i) - tri(5,i,k) * ar1(i,k-1)
	487	#endif
	488	ENDDO
	489	ENDDO
	490
	491	!
[877]	492	!-- Backward substitution
	493	!-- Note, the 1.0E-20 in the denominator is due to avoid divisions
	494	!-- by zero appearing if the pressure bc is set to neumann at the top of
	495	!-- the model domain.
[1]	496	DO i = nxl_z, nxr_z
	497	#if defined( __parallel )
[877]	498	ar(i,j,nz) = ar1(i,nz-1) / ( tri(4,i,nz-1) + 1.0E-20 )
[1]	499	#else
[877]	500	ar(nz,j,i) = ar1(i,nz-1) / ( tri(4,i,nz-1) + 1.0E-20 )
[1]	501	#endif
	502	ENDDO
	503	DO k = nz-2, 0, -1
	504	DO i = nxl_z, nxr_z
	505	#if defined( __parallel )
	506	ar(i,j,k+1) = ( ar1(i,k) - tri(3,i,k) * ar(i,j,k+2) ) &
	507	/ tri(4,i,k)
	508	#else
	509	ar(k+1,j,i) = ( ar1(i,k) - tri(3,i,k) * ar(k+2,j,i) ) &
	510	/ tri(4,i,k)
	511	#endif
	512	ENDDO
	513	ENDDO
	514
[76]	515	!
	516	!-- Indices i=0, j=0 correspond to horizontally averaged pressure.
	517	!-- The respective values of ar should be zero at all k-levels if
	518	!-- acceleration of horizontally averaged vertical velocity is zero.
	519	IF ( ibc_p_b == 1 .AND. ibc_p_t == 1 ) THEN
	520	IF ( j == 0 .AND. nxl_z == 0 ) THEN
	521	#if defined( __parallel )
	522	DO k = 1, nz
	523	ar(nxl_z,j,k) = 0.0
	524	ENDDO
	525	#else
	526	DO k = 1, nz
	527	ar(k,j,nxl_z) = 0.0
	528	ENDDO
	529	#endif
	530	ENDIF
	531	ENDIF
	532
[1]	533	END SUBROUTINE substi
	534
	535
	536	SUBROUTINE split( tri )
	537
	538	!------------------------------------------------------------------------------!
	539	! Splitting of the tridiagonal matrix (Thomas algorithm)
	540	!------------------------------------------------------------------------------!
	541
	542	IMPLICIT NONE
	543
	544	INTEGER :: i, k
	545	REAL :: tri(5,nxl_z:nxr_z,0:nz-1)
	546
	547	!
	548	!-- Splitting.
	549	DO i = nxl_z, nxr_z
	550	tri(4,i,0) = tri(1,i,0)
	551	ENDDO
	552	DO k = 1, nz-1
	553	DO i = nxl_z, nxr_z
	554	tri(5,i,k) = tri(2,i,k) / tri(4,i,k-1)
	555	tri(4,i,k) = tri(1,i,k) - tri(3,i,k-1) * tri(5,i,k)
	556	ENDDO
	557	ENDDO
	558
	559	END SUBROUTINE split
	560
	561	END SUBROUTINE tridia
	562
	563
	564	#if defined( __parallel )
	565	SUBROUTINE fftxp( ar, direction )
	566
	567	!------------------------------------------------------------------------------!
	568	! Fourier-transformation along x-direction Parallelized version
	569	!------------------------------------------------------------------------------!
	570
	571	IMPLICIT NONE
	572
	573	CHARACTER (LEN=*) :: direction
	574	INTEGER :: j, k
[1003]	575	REAL :: ar(0:nx,nys_x:nyn_x,nzb_x:nzt_x)
[1]	576
	577	!
	578	!-- Performing the fft with one of the methods implemented
[683]	579	!$OMP PARALLEL PRIVATE ( j, k )
	580	!$OMP DO
[1]	581	DO k = nzb_x, nzt_x
	582	DO j = nys_x, nyn_x
	583	CALL fft_x( ar(0:nx,j,k), direction )
	584	ENDDO
	585	ENDDO
[683]	586	!$OMP END PARALLEL
[1]	587
	588	END SUBROUTINE fftxp
	589
	590	#else
	591	SUBROUTINE fftx( ar, direction )
	592
	593	!------------------------------------------------------------------------------!
	594	! Fourier-transformation along x-direction Non parallel version
	595	!------------------------------------------------------------------------------!
	596
	597	IMPLICIT NONE
	598
	599	CHARACTER (LEN=*) :: direction
	600	INTEGER :: i, j, k
	601	REAL :: ar(1:nz,0:ny,0:nx)
	602
	603	!
	604	!-- Performing the fft with one of the methods implemented
[683]	605	!$OMP PARALLEL PRIVATE ( j, k )
	606	!$OMP DO
[1]	607	DO k = 1, nz
	608	DO j = 0, ny
	609	CALL fft_x( ar(k,j,0:nx), direction )
	610	ENDDO
	611	ENDDO
[683]	612	!$OMP END PARALLEL
[1]	613
	614	END SUBROUTINE fftx
	615	#endif
	616
	617
	618	#if defined( __parallel )
	619	SUBROUTINE fftyp( ar, direction )
	620
	621	!------------------------------------------------------------------------------!
	622	! Fourier-transformation along y-direction Parallelized version
	623	!------------------------------------------------------------------------------!
	624
	625	IMPLICIT NONE
	626
	627	CHARACTER (LEN=*) :: direction
	628	INTEGER :: i, k
[1003]	629	REAL :: ar(0:ny,nxl_y:nxr_y,nzb_y:nzt_y)
[1]	630
	631	!
	632	!-- Performing the fft with one of the methods implemented
[683]	633	!$OMP PARALLEL PRIVATE ( i, k )
	634	!$OMP DO
[1]	635	DO k = nzb_y, nzt_y
	636	DO i = nxl_y, nxr_y
	637	CALL fft_y( ar(0:ny,i,k), direction )
	638	ENDDO
	639	ENDDO
[683]	640	!$OMP END PARALLEL
[1]	641
	642	END SUBROUTINE fftyp
	643
	644	#else
	645	SUBROUTINE ffty( ar, direction )
	646
	647	!------------------------------------------------------------------------------!
	648	! Fourier-transformation along y-direction Non parallel version
	649	!------------------------------------------------------------------------------!
	650
	651	IMPLICIT NONE
	652
	653	CHARACTER (LEN=*) :: direction
	654	INTEGER :: i, k
	655	REAL :: ar(1:nz,0:ny,0:nx)
	656
	657	!
	658	!-- Performing the fft with one of the methods implemented
[683]	659	!$OMP PARALLEL PRIVATE ( i, k )
	660	!$OMP DO
[1]	661	DO k = 1, nz
	662	DO i = 0, nx
	663	CALL fft_y( ar(k,0:ny,i), direction )
	664	ENDDO
	665	ENDDO
[683]	666	!$OMP END PARALLEL
[1]	667
	668	END SUBROUTINE ffty
	669	#endif
	670
	671	#if defined( __parallel )
	672	SUBROUTINE ffty_tr_yx( f_in, work, f_out )
	673
	674	!------------------------------------------------------------------------------!
	675	! Fourier-transformation along y with subsequent transposition y --> x for
	676	! a 1d-decomposition along x
	677	!
	678	! ATTENTION: The performance of this routine is much faster on the NEC-SX6,
	679	! if the first index of work_ffty_vec is odd. Otherwise
	680	! memory bank conflicts may occur (especially if the index is a
	681	! multiple of 128). That's why work_ffty_vec is dimensioned as
	682	! 0:ny+1.
	683	! Of course, this will not work if users are using an odd number
	684	! of gridpoints along y.
	685	!------------------------------------------------------------------------------!
	686
	687	USE control_parameters
	688	USE cpulog
	689	USE indices
	690	USE interfaces
	691	USE pegrid
	692	USE transpose_indices
	693
	694	IMPLICIT NONE
	695
	696	INTEGER :: i, iend, iouter, ir, j, k
	697	INTEGER, PARAMETER :: stridex = 4
	698
	699	REAL, DIMENSION(0:ny,stridex) :: work_ffty
	700	#if defined( __nec )
	701	REAL, DIMENSION(0:ny+1,1:nz,nxl:nxr) :: work_ffty_vec
	702	#endif
[1003]	703	REAL, DIMENSION(1:nz,0:ny,nxl:nxr) :: f_in
	704	REAL, DIMENSION(nnx,1:nz,nys_x:nyn_x,pdims(1)) :: f_out
	705	REAL, DIMENSION(nxl:nxr,1:nz,0:ny) :: work
[1]	706
	707	!
	708	!-- Carry out the FFT along y, where all data are present due to the
	709	!-- 1d-decomposition along x. Resort the data in a way that x becomes
	710	!-- the first index.
	711	CALL cpu_log( log_point_s(7), 'fft_y', 'start' )
	712
	713	IF ( host(1:3) == 'nec' ) THEN
	714	#if defined( __nec )
	715	!
	716	!-- Code optimized for vector processors
[85]	717	!$OMP PARALLEL PRIVATE ( i, j, k )
[1]	718	!$OMP DO
	719	DO i = nxl, nxr
	720
	721	DO j = 0, ny
	722	DO k = 1, nz
	723	work_ffty_vec(j,k,i) = f_in(k,j,i)
	724	ENDDO
	725	ENDDO
	726
	727	CALL fft_y_m( work_ffty_vec(:,:,i), ny+1, 'forward' )
	728
	729	ENDDO
	730
	731	!$OMP DO
	732	DO k = 1, nz
	733	DO j = 0, ny
	734	DO i = nxl, nxr
	735	work(i,k,j) = work_ffty_vec(j,k,i)
	736	ENDDO
	737	ENDDO
	738	ENDDO
	739	!$OMP END PARALLEL
	740	#endif
	741
	742	ELSE
	743
	744	!
	745	!-- Cache optimized code.
	746	!-- The i-(x-)direction is split into a strided outer loop and an inner
	747	!-- loop for better cache performance
	748	!$OMP PARALLEL PRIVATE (i,iend,iouter,ir,j,k,work_ffty)
	749	!$OMP DO
	750	DO iouter = nxl, nxr, stridex
	751
	752	iend = MIN( iouter+stridex-1, nxr ) ! Upper bound for inner i loop
	753
	754	DO k = 1, nz
	755
	756	DO i = iouter, iend
	757
	758	ir = i-iouter+1 ! counter within a stride
	759	DO j = 0, ny
	760	work_ffty(j,ir) = f_in(k,j,i)
	761	ENDDO
	762	!
	763	!-- FFT along y
	764	CALL fft_y( work_ffty(:,ir), 'forward' )
	765
	766	ENDDO
	767
	768	!
	769	!-- Resort
	770	DO j = 0, ny
	771	DO i = iouter, iend
	772	work(i,k,j) = work_ffty(j,i-iouter+1)
	773	ENDDO
	774	ENDDO
	775
	776	ENDDO
	777
	778	ENDDO
	779	!$OMP END PARALLEL
	780
	781	ENDIF
	782	CALL cpu_log( log_point_s(7), 'fft_y', 'pause' )
	783
	784	!
	785	!-- Transpose array
	786	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
[622]	787	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
[1]	788	CALL MPI_ALLTOALL( work(nxl,1,0), sendrecvcount_xy, MPI_REAL, &
	789	f_out(1,1,nys_x,1), sendrecvcount_xy, MPI_REAL, &
	790	comm1dx, ierr )
	791	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
	792
	793	END SUBROUTINE ffty_tr_yx
	794
	795
	796	SUBROUTINE tr_xy_ffty( f_in, work, f_out )
	797
	798	!------------------------------------------------------------------------------!
	799	! Transposition x --> y with a subsequent backward Fourier transformation for
	800	! a 1d-decomposition along x
	801	!------------------------------------------------------------------------------!
	802
	803	USE control_parameters
	804	USE cpulog
	805	USE indices
	806	USE interfaces
	807	USE pegrid
	808	USE transpose_indices
	809
	810	IMPLICIT NONE
	811
	812	INTEGER :: i, iend, iouter, ir, j, k
	813	INTEGER, PARAMETER :: stridex = 4
	814
	815	REAL, DIMENSION(0:ny,stridex) :: work_ffty
	816	#if defined( __nec )
	817	REAL, DIMENSION(0:ny+1,1:nz,nxl:nxr) :: work_ffty_vec
	818	#endif
[1003]	819	REAL, DIMENSION(nnx,1:nz,nys_x:nyn_x,pdims(1)) :: f_in
	820	REAL, DIMENSION(1:nz,0:ny,nxl:nxr) :: f_out
	821	REAL, DIMENSION(nxl:nxr,1:nz,0:ny) :: work
[1]	822
	823	!
	824	!-- Transpose array
	825	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
[622]	826	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
[1]	827	CALL MPI_ALLTOALL( f_in(1,1,nys_x,1), sendrecvcount_xy, MPI_REAL, &
	828	work(nxl,1,0), sendrecvcount_xy, MPI_REAL, &
	829	comm1dx, ierr )
	830	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
	831
	832	!
	833	!-- Resort the data in a way that y becomes the first index and carry out the
	834	!-- backward fft along y.
	835	CALL cpu_log( log_point_s(7), 'fft_y', 'continue' )
	836
	837	IF ( host(1:3) == 'nec' ) THEN
	838	#if defined( __nec )
	839	!
	840	!-- Code optimized for vector processors
[85]	841	!$OMP PARALLEL PRIVATE ( i, j, k )
[1]	842	!$OMP DO
	843	DO k = 1, nz
	844	DO j = 0, ny
	845	DO i = nxl, nxr
	846	work_ffty_vec(j,k,i) = work(i,k,j)
	847	ENDDO
	848	ENDDO
	849	ENDDO
	850
	851	!$OMP DO
	852	DO i = nxl, nxr
	853
	854	CALL fft_y_m( work_ffty_vec(:,:,i), ny+1, 'backward' )
	855
	856	DO j = 0, ny
	857	DO k = 1, nz
	858	f_out(k,j,i) = work_ffty_vec(j,k,i)
	859	ENDDO
	860	ENDDO
	861
	862	ENDDO
	863	!$OMP END PARALLEL
	864	#endif
	865
	866	ELSE
	867
	868	!
	869	!-- Cache optimized code.
	870	!-- The i-(x-)direction is split into a strided outer loop and an inner
	871	!-- loop for better cache performance
	872	!$OMP PARALLEL PRIVATE ( i, iend, iouter, ir, j, k, work_ffty )
	873	!$OMP DO
	874	DO iouter = nxl, nxr, stridex
	875
	876	iend = MIN( iouter+stridex-1, nxr ) ! Upper bound for inner i loop
	877
	878	DO k = 1, nz
	879	!
	880	!-- Resort
	881	DO j = 0, ny
	882	DO i = iouter, iend
	883	work_ffty(j,i-iouter+1) = work(i,k,j)
	884	ENDDO
	885	ENDDO
	886
	887	DO i = iouter, iend
	888
	889	!
	890	!-- FFT along y
	891	ir = i-iouter+1 ! counter within a stride
	892	CALL fft_y( work_ffty(:,ir), 'backward' )
	893
	894	DO j = 0, ny
	895	f_out(k,j,i) = work_ffty(j,ir)
	896	ENDDO
	897	ENDDO
	898
	899	ENDDO
	900
	901	ENDDO
	902	!$OMP END PARALLEL
	903
	904	ENDIF
	905
	906	CALL cpu_log( log_point_s(7), 'fft_y', 'stop' )
	907
	908	END SUBROUTINE tr_xy_ffty
	909
	910
	911	SUBROUTINE fftx_tri_fftx( ar )
	912
	913	!------------------------------------------------------------------------------!
	914	! FFT along x, solution of the tridiagonal system and backward FFT for
	915	! a 1d-decomposition along x
	916	!
	917	! WARNING: this subroutine may still not work for hybrid parallelization
	918	! with OpenMP (for possible necessary changes see the original
	919	! routine poisfft_hybrid, developed by Klaus Ketelsen, May 2002)
	920	!------------------------------------------------------------------------------!
	921
	922	USE control_parameters
	923	USE cpulog
	924	USE grid_variables
	925	USE indices
	926	USE interfaces
	927	USE pegrid
	928	USE transpose_indices
	929
	930	IMPLICIT NONE
	931
	932	character(len=3) :: myth_char
	933
	934	INTEGER :: i, j, k, m, n, omp_get_thread_num, tn
	935
[1003]	936	REAL, DIMENSION(0:nx) :: work_fftx
	937	REAL, DIMENSION(0:nx,1:nz) :: work_trix
	938	REAL, DIMENSION(nnx,1:nz,nys_x:nyn_x,pdims(1)) :: ar
	939	REAL, DIMENSION(:,:,:,:), ALLOCATABLE :: tri
[1]	940
	941
	942	CALL cpu_log( log_point_s(33), 'fft_x + tridia', 'start' )
	943
	944	ALLOCATE( tri(5,0:nx,0:nz-1,0:threads_per_task-1) )
	945
	946	tn = 0 ! Default thread number in case of one thread
	947	!$OMP PARALLEL DO PRIVATE ( i, j, k, m, n, tn, work_fftx, work_trix )
	948	DO j = nys_x, nyn_x
	949
	950	!$ tn = omp_get_thread_num()
	951
	952	IF ( host(1:3) == 'nec' ) THEN
	953	!
	954	!-- Code optimized for vector processors
	955	DO k = 1, nz
	956
	957	m = 0
	958	DO n = 1, pdims(1)
[1003]	959	DO i = 1, nnx
[1]	960	work_trix(m,k) = ar(i,k,j,n)
	961	m = m + 1
	962	ENDDO
	963	ENDDO
	964
	965	ENDDO
	966
	967	CALL fft_x_m( work_trix, 'forward' )
	968
	969	ELSE
	970	!
	971	!-- Cache optimized code
	972	DO k = 1, nz
	973
	974	m = 0
	975	DO n = 1, pdims(1)
[1003]	976	DO i = 1, nnx
[1]	977	work_fftx(m) = ar(i,k,j,n)
	978	m = m + 1
	979	ENDDO
	980	ENDDO
	981
	982	CALL fft_x( work_fftx, 'forward' )
	983
	984	DO i = 0, nx
	985	work_trix(i,k) = work_fftx(i)
	986	ENDDO
	987
	988	ENDDO
	989
	990	ENDIF
	991
	992	!
	993	!-- Solve the linear equation system
	994	CALL tridia_1dd( ddx2, ddy2, nx, ny, j, work_trix, tri(:,:,:,tn) )
	995
	996	IF ( host(1:3) == 'nec' ) THEN
	997	!
	998	!-- Code optimized for vector processors
	999	CALL fft_x_m( work_trix, 'backward' )
	1000
	1001	DO k = 1, nz
	1002
	1003	m = 0
	1004	DO n = 1, pdims(1)
[1003]	1005	DO i = 1, nnx
[1]	1006	ar(i,k,j,n) = work_trix(m,k)
	1007	m = m + 1
	1008	ENDDO
	1009	ENDDO
	1010
	1011	ENDDO
	1012
	1013	ELSE
	1014	!
	1015	!-- Cache optimized code
	1016	DO k = 1, nz
	1017
	1018	DO i = 0, nx
	1019	work_fftx(i) = work_trix(i,k)
	1020	ENDDO
	1021
	1022	CALL fft_x( work_fftx, 'backward' )
	1023
	1024	m = 0
	1025	DO n = 1, pdims(1)
[1003]	1026	DO i = 1, nnx
[1]	1027	ar(i,k,j,n) = work_fftx(m)
	1028	m = m + 1
	1029	ENDDO
	1030	ENDDO
	1031
	1032	ENDDO
	1033
	1034	ENDIF
	1035
	1036	ENDDO
	1037
	1038	DEALLOCATE( tri )
	1039
	1040	CALL cpu_log( log_point_s(33), 'fft_x + tridia', 'stop' )
	1041
	1042	END SUBROUTINE fftx_tri_fftx
	1043
	1044
	1045	SUBROUTINE fftx_tr_xy( f_in, work, f_out )
	1046
	1047	!------------------------------------------------------------------------------!
	1048	! Fourier-transformation along x with subsequent transposition x --> y for
	1049	! a 1d-decomposition along y
	1050	!
	1051	! ATTENTION: The NEC-branch of this routine may significantly profit from
	1052	! further optimizations. So far, performance is much worse than
	1053	! for routine ffty_tr_yx (more than three times slower).
	1054	!------------------------------------------------------------------------------!
	1055
	1056	USE control_parameters
	1057	USE cpulog
	1058	USE indices
	1059	USE interfaces
	1060	USE pegrid
	1061	USE transpose_indices
	1062
	1063	IMPLICIT NONE
	1064
	1065	INTEGER :: i, j, k
	1066
[1003]	1067	REAL, DIMENSION(0:nx,1:nz,nys:nyn) :: work_fftx
	1068	REAL, DIMENSION(1:nz,nys:nyn,0:nx) :: f_in
	1069	REAL, DIMENSION(nny,1:nz,nxl_y:nxr_y,pdims(2)) :: f_out
	1070	REAL, DIMENSION(nys:nyn,1:nz,0:nx) :: work
[1]	1071
	1072	!
	1073	!-- Carry out the FFT along x, where all data are present due to the
	1074	!-- 1d-decomposition along y. Resort the data in a way that y becomes
	1075	!-- the first index.
	1076	CALL cpu_log( log_point_s(4), 'fft_x', 'start' )
	1077
	1078	IF ( host(1:3) == 'nec' ) THEN
	1079	!
	1080	!-- Code for vector processors
[85]	1081	!$OMP PARALLEL PRIVATE ( i, j, k )
[1]	1082	!$OMP DO
	1083	DO i = 0, nx
	1084
	1085	DO j = nys, nyn
	1086	DO k = 1, nz
	1087	work_fftx(i,k,j) = f_in(k,j,i)
	1088	ENDDO
	1089	ENDDO
	1090
	1091	ENDDO
	1092
	1093	!$OMP DO
	1094	DO j = nys, nyn
	1095
	1096	CALL fft_x_m( work_fftx(:,:,j), 'forward' )
	1097
	1098	DO k = 1, nz
	1099	DO i = 0, nx
	1100	work(j,k,i) = work_fftx(i,k,j)
	1101	ENDDO
	1102	ENDDO
	1103
	1104	ENDDO
	1105	!$OMP END PARALLEL
	1106
	1107	ELSE
	1108
	1109	!
	1110	!-- Cache optimized code (there might be still a potential for better
	1111	!-- optimization).
[696]	1112	!$OMP PARALLEL PRIVATE (i,j,k)
[1]	1113	!$OMP DO
	1114	DO i = 0, nx
	1115
	1116	DO j = nys, nyn
	1117	DO k = 1, nz
	1118	work_fftx(i,k,j) = f_in(k,j,i)
	1119	ENDDO
	1120	ENDDO
	1121
	1122	ENDDO
	1123
	1124	!$OMP DO
	1125	DO j = nys, nyn
	1126	DO k = 1, nz
	1127
	1128	CALL fft_x( work_fftx(0:nx,k,j), 'forward' )
	1129
	1130	DO i = 0, nx
	1131	work(j,k,i) = work_fftx(i,k,j)
	1132	ENDDO
	1133	ENDDO
	1134
	1135	ENDDO
	1136	!$OMP END PARALLEL
	1137
	1138	ENDIF
	1139	CALL cpu_log( log_point_s(4), 'fft_x', 'pause' )
	1140
	1141	!
	1142	!-- Transpose array
	1143	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
[622]	1144	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
[1]	1145	CALL MPI_ALLTOALL( work(nys,1,0), sendrecvcount_xy, MPI_REAL, &
	1146	f_out(1,1,nxl_y,1), sendrecvcount_xy, MPI_REAL, &
	1147	comm1dy, ierr )
	1148	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
	1149
	1150	END SUBROUTINE fftx_tr_xy
	1151
	1152
	SUBROUTINE tr_yx_fftx( f_in, work, f_out )

	!------------------------------------------------------------------------------!
	! Transposition y --> x followed by a backward Fourier transformation along x.
	!
	! The data are first redistributed with MPI_ALLTOALL (f_in --> work). Each
	! line along the x-index (0:nx) is then copied to work_fftx, transformed
	! back, and finally stored in f_out.
	!
	! f_in  : send buffer of the all-to-all exchange
	! work  : receive buffer of the exchange, index order (j,k,i)
	! f_out : back-transformed data, index order (k,j,i)
	!
	! NOTE(review): the former header named a 1d-decomposition along x, whereas
	! the array bounds used here (nys:nyn together with the full range 0:nx)
	! suggest that y is the decomposed direction -- please confirm.
	!------------------------------------------------------------------------------!

	   USE control_parameters
	   USE cpulog
	   USE indices
	   USE interfaces
	   USE pegrid
	   USE transpose_indices

	   IMPLICIT NONE

	   INTEGER :: i, j, k

	   REAL, DIMENSION(nny,1:nz,nxl_y:nxr_y,pdims(2)) :: f_in
	   REAL, DIMENSION(1:nz,nys:nyn,0:nx) :: f_out
	   REAL, DIMENSION(nys:nyn,1:nz,0:nx) :: work
	   REAL, DIMENSION(0:nx,1:nz,nys:nyn) :: work_fftx

	!
	!-- Redistribute the data among the processes (optionally preceded by a
	!-- barrier on comm2d if collective_wait is set)
	   CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
	   IF ( collective_wait )  CALL MPI_BARRIER( comm2d, ierr )
	   CALL MPI_ALLTOALL( f_in(1,1,nxl_y,1), sendrecvcount_xy, MPI_REAL, &
	                      work(nys,1,0),     sendrecvcount_xy, MPI_REAL, &
	                      comm1dy, ierr )
	   CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )

	!
	!-- Backward FFT along x. The input is resorted so that x becomes the
	!-- first index of work_fftx; the result is written to f_out with z as
	!-- first index.
	   CALL cpu_log( log_point_s(4), 'fft_x', 'continue' )

	   IF ( host(1:3) /= 'nec' )  THEN
	!
	!--   Cache optimized code (there might be still a potential for better
	!--   optimization): one FFT call per x-line
	!$OMP PARALLEL PRIVATE (i,j,k)
	!$OMP DO
	      DO  j = nys, nyn
	         DO  k = 1, nz

	            work_fftx(0:nx,k,j) = work(j,k,0:nx)

	            CALL fft_x( work_fftx(0:nx,k,j), 'backward' )

	         ENDDO
	      ENDDO

	!$OMP DO
	      DO  i = 0, nx
	         DO  j = nys, nyn
	            f_out(1:nz,j,i) = work_fftx(i,1:nz,j)
	         ENDDO
	      ENDDO
	!$OMP END PARALLEL

	   ELSE
	!
	!--   Code optimized for vector processors: one multi-line FFT call per
	!--   xz-plane
	!$OMP PARALLEL PRIVATE ( i, j, k )
	!$OMP DO
	      DO  j = nys, nyn

	         DO  k = 1, nz
	            work_fftx(0:nx,k,j) = work(j,k,0:nx)
	         ENDDO

	         CALL fft_x_m( work_fftx(:,:,j), 'backward' )

	      ENDDO

	!$OMP DO
	      DO  i = 0, nx
	         DO  j = nys, nyn
	            f_out(1:nz,j,i) = work_fftx(i,1:nz,j)
	         ENDDO
	      ENDDO
	!$OMP END PARALLEL

	   ENDIF

	   CALL cpu_log( log_point_s(4), 'fft_x', 'stop' )

	END SUBROUTINE tr_yx_fftx
	1251
	1252
	SUBROUTINE ffty_tri_ffty( ar )

	!------------------------------------------------------------------------------!
	! FFT along y, solution of the tridiagonal system and backward FFT for
	! a 1d-decomposition along y
	!
	! For every index i in nxl_y:nxr_y the pdims(2) blocks of length nny stored
	! in ar(:,k,i,:) are joined to one line along y, transformed forward, handed
	! to tridia_1dd, transformed backward and written back to ar in place.
	!
	! WARNING: this subroutine may still not work for hybrid parallelization
	!          with OpenMP (for possible necessary changes see the original
	!          routine poisfft_hybrid, developed by Klaus Ketelsen, May 2002)
	!------------------------------------------------------------------------------!

	   USE control_parameters
	   USE cpulog
	   USE grid_variables
	   USE indices
	   USE interfaces
	   USE pegrid
	   USE transpose_indices

	   IMPLICIT NONE

	   INTEGER :: i, k, m, n, omp_get_thread_num, tn

	   REAL, DIMENSION(nny,1:nz,nxl_y:nxr_y,pdims(2)) :: ar
	   REAL, DIMENSION(:,:,:,:), ALLOCATABLE :: tri
	   REAL, DIMENSION(0:ny) :: work_ffty
	   REAL, DIMENSION(0:ny,1:nz) :: work_triy


	   CALL cpu_log( log_point_s(39), 'fft_y + tridia', 'start' )

	!
	!-- The last dimension of tri provides a separate set of matrix
	!-- coefficients for every thread
	   ALLOCATE( tri(5,0:ny,0:nz-1,0:threads_per_task-1) )

	   tn = 0   ! Default thread number in case of one thread
	!$OMP PARALLEL DO PRIVATE ( i, k, m, n, tn, work_ffty, work_triy )
	   DO  i = nxl_y, nxr_y

	!$    tn = omp_get_thread_num()

	!
	!--   Collect the y-line(s) and carry out the forward FFT. m is the
	!--   offset of block n within the joined line (0, nny, 2*nny, ...).
	      IF ( host(1:3) /= 'nec' )  THEN
	!
	!--      Cache optimized code
	         DO  k = 1, nz

	            DO  n = 1, pdims(2)
	               m = ( n - 1 ) * nny
	               work_ffty(m:m+nny-1) = ar(1:nny,k,i,n)
	            ENDDO

	            CALL fft_y( work_ffty, 'forward' )

	            work_triy(0:ny,k) = work_ffty(0:ny)

	         ENDDO

	      ELSE
	!
	!--      Code optimized for vector processors
	         DO  k = 1, nz
	            DO  n = 1, pdims(2)
	               m = ( n - 1 ) * nny
	               work_triy(m:m+nny-1,k) = ar(1:nny,k,i,n)
	            ENDDO
	         ENDDO

	         CALL fft_y_m( work_triy, ny, 'forward' )

	      ENDIF

	!
	!--   Solve the linear equation system. Note that the x- and y-related
	!--   arguments are handed over in swapped order.
	      CALL tridia_1dd( ddy2, ddx2, ny, nx, i, work_triy, tri(:,:,:,tn) )

	!
	!--   Backward FFT and redistribution of the joined line to the blocks of ar
	      IF ( host(1:3) /= 'nec' )  THEN
	!
	!--      Cache optimized code
	         DO  k = 1, nz

	            work_ffty(0:ny) = work_triy(0:ny,k)

	            CALL fft_y( work_ffty, 'backward' )

	            DO  n = 1, pdims(2)
	               m = ( n - 1 ) * nny
	               ar(1:nny,k,i,n) = work_ffty(m:m+nny-1)
	            ENDDO

	         ENDDO

	      ELSE
	!
	!--      Code optimized for vector processors
	         CALL fft_y_m( work_triy, ny, 'backward' )

	         DO  k = 1, nz
	            DO  n = 1, pdims(2)
	               m = ( n - 1 ) * nny
	               ar(1:nny,k,i,n) = work_triy(m:m+nny-1,k)
	            ENDDO
	         ENDDO

	      ENDIF

	   ENDDO

	   DEALLOCATE( tri )

	   CALL cpu_log( log_point_s(39), 'fft_y + tridia', 'stop' )

	END SUBROUTINE ffty_tri_ffty
	1383
	1384
	SUBROUTINE tridia_1dd( ddx2, ddy2, nx, ny, j, ar, tri )

	!------------------------------------------------------------------------------!
	! Solves the linear system of equations for a 1d-decomposition along x (see
	! tridia)
	!
	! ddx2, ddy2 : factors of the i- and j-dependent cosine terms of the matrix
	! nx, ny     : upper index bounds of the two horizontal directions as seen
	!              by this routine (nx dimensions ar and tri); callers may hand
	!              over the x- and y-quantities in swapped order
	! j          : index in the second horizontal direction for which the
	!              matrix is set up
	! ar         : right hand side on input, solution on output
	! tri        : work space for the matrix coefficients (1-3) and the
	!              quantities of the Thomas algorithm (4-5)
	!
	! Attention: when using the intel compilers older than 12.0, array tri must
	!            be passed as an argument to the contained subroutines. Otherwise
	!            address faults will occur. This feature can be activated with
	!            cpp-switch __intel11
	!            On NEC, tri should not be passed (except for routine substi_1dd)
	!            because this causes very bad performance.
	!------------------------------------------------------------------------------!

	   USE arrays_3d
	   USE control_parameters

	   USE pegrid

	   IMPLICIT NONE

	   INTEGER, INTENT(IN) :: j, nx, ny
	   INTEGER :: i, k, nnyh

	   REAL, INTENT(IN) :: ddx2, ddy2

	   REAL, DIMENSION(0:nx,1:nz), INTENT(INOUT) :: ar
	   REAL, DIMENSION(5,0:nx,0:nz-1), INTENT(INOUT) :: tri


	   nnyh = ( ny + 1 ) / 2

	!
	!-- Define constant elements of the tridiagonal matrix.
	!-- The compiler on SX6 does loop exchange. If 0:nx is a high power of 2,
	!-- the exchanged loops create bank conflicts. The following directive
	!-- prohibits loop exchange and the loops perform much better.
!CDIR NOLOOPCHG
	   DO  k = 0, nz-1
	      DO  i = 0,nx
	         tri(2,i,k) = ddzu_pres(k+1) * ddzw(k+1)
	         tri(3,i,k) = ddzu_pres(k+2) * ddzw(k+1)
	      ENDDO
	   ENDDO

	!
	!-- Indices above nnyh are mapped to ny+1-j before the j-dependent part of
	!-- the matrix is computed
	   IF ( j <= nnyh )  THEN
#if defined( __intel11 )
	      CALL maketri_1dd( j, tri )
#else
	      CALL maketri_1dd( j )
#endif
	   ELSE
#if defined( __intel11 )
	      CALL maketri_1dd( ny+1-j, tri )
#else
	      CALL maketri_1dd( ny+1-j )
#endif
	   ENDIF
#if defined( __intel11 )
	   CALL split_1dd( tri )
#else
	   CALL split_1dd
#endif
	   CALL substi_1dd( ar, tri )

	CONTAINS

#if defined( __intel11 )
	   SUBROUTINE maketri_1dd( j, tri )
#else
	   SUBROUTINE maketri_1dd( j )
#endif

	!------------------------------------------------------------------------------!
	! computes the i- and j-dependent component of the matrix
	!------------------------------------------------------------------------------!

	      USE constants

	      IMPLICIT NONE

	      INTEGER, INTENT(IN) :: j
	      INTEGER :: i, k, nnxh
	      REAL    :: a, c

	      REAL, DIMENSION(0:nx) :: l

#if defined( __intel11 )
	      REAL, DIMENSION(5,0:nx,0:nz-1), INTENT(INOUT) :: tri
#endif


	      nnxh = ( nx + 1 ) / 2
	!
	!--   Provide the tridiagonal matrix for solution of the Poisson equation in
	!--   Fourier space. The coefficients are computed following the method of
	!--   Schmidt et al. (DFVLR-Mitteilung 84-15), which departs from Stephan
	!--   Siano's original version by discretizing the Poisson equation,
	!--   before it is Fourier-transformed
	      DO  i = 0, nx
	         IF ( i <= nnxh )  THEN
	            l(i) = 2.0 * ( 1.0 - COS( ( 2.0 * pi * i ) / &
	                                      FLOAT( nx+1 ) ) ) * ddx2 + &
	                   2.0 * ( 1.0 - COS( ( 2.0 * pi * j ) / &
	                                      FLOAT( ny+1 ) ) ) * ddy2
	         ELSE
	            l(i) = 2.0 * ( 1.0 - COS( ( 2.0 * pi * ( nx+1-i ) ) / &
	                                      FLOAT( nx+1 ) ) ) * ddx2 + &
	                   2.0 * ( 1.0 - COS( ( 2.0 * pi * j ) / &
	                                      FLOAT( ny+1 ) ) ) * ddy2
	         ENDIF
	      ENDDO

	!
	!--   a and c depend on k only, so they are evaluated once per level
	      DO  k = 0, nz-1
	         a = -1.0 * ddzu_pres(k+2) * ddzw(k+1)
	         c = -1.0 * ddzu_pres(k+1) * ddzw(k+1)
	         DO  i = 0, nx
	            tri(1,i,k) = a + c - l(i)
	         ENDDO
	      ENDDO
	!
	!--   Modification of the main diagonal at the bottom (ibc_p_b = 1 or 2)
	!--   and at the top (ibc_p_t = 1)
	      IF ( ibc_p_b == 1  .OR.  ibc_p_b == 2 )  THEN
	         DO  i = 0, nx
	            tri(1,i,0) = tri(1,i,0) + tri(2,i,0)
	         ENDDO
	      ENDIF
	      IF ( ibc_p_t == 1 )  THEN
	         DO  i = 0, nx
	            tri(1,i,nz-1) = tri(1,i,nz-1) + tri(3,i,nz-1)
	         ENDDO
	      ENDIF

	   END SUBROUTINE maketri_1dd


#if defined( __intel11 )
	   SUBROUTINE split_1dd( tri )
#else
	   SUBROUTINE split_1dd
#endif

	!------------------------------------------------------------------------------!
	! Splitting of the tridiagonal matrix (Thomas algorithm)
	!------------------------------------------------------------------------------!

	      IMPLICIT NONE

	      INTEGER :: i, k

#if defined( __intel11 )
	      REAL, DIMENSION(5,0:nx,0:nz-1), INTENT(INOUT) :: tri
#endif


	!
	!--   Splitting: tri(4,:,:) receives the modified main diagonal and
	!--   tri(5,:,:) the elimination factors used by substi_1dd
	      DO  i = 0, nx
	         tri(4,i,0) = tri(1,i,0)
	      ENDDO
	      DO  k = 1, nz-1
	         DO  i = 0, nx
	            tri(5,i,k) = tri(2,i,k) / tri(4,i,k-1)
	            tri(4,i,k) = tri(1,i,k) - tri(3,i,k-1) * tri(5,i,k)
	         ENDDO
	      ENDDO

	   END SUBROUTINE split_1dd


	   SUBROUTINE substi_1dd( ar, tri )

	!------------------------------------------------------------------------------!
	! Substitution (Forward and Backward) (Thomas algorithm)
	!------------------------------------------------------------------------------!

	      IMPLICIT NONE

	      INTEGER :: i, k

	      REAL, DIMENSION(0:nx,nz), INTENT(INOUT)       :: ar
	      REAL, DIMENSION(0:nx,0:nz-1)                  :: ar1
	      REAL, DIMENSION(5,0:nx,0:nz-1), INTENT(IN)    :: tri

	!
	!--   Forward substitution
	      DO  i = 0, nx
	         ar1(i,0) = ar(i,1)
	      ENDDO
	      DO  k = 1, nz-1
	         DO  i = 0, nx
	            ar1(i,k) = ar(i,k+1) - tri(5,i,k) * ar1(i,k-1)
	         ENDDO
	      ENDDO

	!
	!--   Backward substitution
	!--   Note: 1.0E-20 is added to the denominator in order to avoid divisions
	!--   by zero, which appear if the pressure bc is set to neumann at the top
	!--   of the model domain.
	      DO  i = 0, nx
	         ar(i,nz) = ar1(i,nz-1) / ( tri(4,i,nz-1) + 1.0E-20 )
	      ENDDO
	      DO  k = nz-2, 0, -1
	         DO  i = 0, nx
	            ar(i,k+1) = ( ar1(i,k) - tri(3,i,k) * ar(i,k+2) ) &
	                        / tri(4,i,k)
	         ENDDO
	      ENDDO

	!
	!--   Indices i=0, j=0 correspond to horizontally averaged pressure.
	!--   The respective values of ar should be zero at all k-levels if
	!--   acceleration of horizontally averaged vertical velocity is zero.
	!--   (j is the dummy argument of the host routine tridia_1dd)
	      IF ( ibc_p_b == 1  .AND.  ibc_p_t == 1 )  THEN
	         IF ( j == 0 )  THEN
	            DO  k = 1, nz
	               ar(0,k) = 0.0
	            ENDDO
	         ENDIF
	      ENDIF

	   END SUBROUTINE substi_1dd

	END SUBROUTINE tridia_1dd
	1612
	1613	#endif
[807]	1614	#endif
[1]	1615	END MODULE poisfft_mod

Note: See TracBrowser for help on using the repository browser.

Download in other formats:

| Impressum | ©Leibniz Universität Hannover |