Home

Context Navigation

source: palm/trunk/SOURCE/poisfft.f90 @ 568

Last change on this file since 568 was 484, checked in by raasch, 15 years ago
typo in file headers removed
Property svn:keywords set to `Id`
File size: 44.6 KB

Rev	Line
[1]	1	MODULE poisfft_mod
	2
	3	!------------------------------------------------------------------------------!
[484]	4	! Current revisions:
[1]	5	! -----------------
[392]	6	!
[1]	7	!
	8	! Former revisions:
	9	! -----------------
[3]	10	! $Id: poisfft.f90 484 2010-02-05 07:36:54Z helmke $
[77]	11	!
[392]	12	! 377 2009-09-04 11:09:00Z raasch
	13	! __lcmuk changed to __lc to avoid problems with Intel compiler on sgi-ice
	14	!
[198]	15	! 164 2008-05-15 08:46:15Z raasch
	16	! Arguments removed from transpose routines
	17	!
[139]	18	! 128 2007-10-26 13:11:14Z raasch
	19	! Bugfix: wavenumber calculation for even nx in routines maketri
	20	!
[90]	21	! 85 2007-05-11 09:35:14Z raasch
	22	! Bugfix: work_fft*_vec removed from some PRIVATE-declarations
	23	!
[77]	24	! 76 2007-03-29 00:58:32Z raasch
	25	! Tridiagonal coefficients adjusted for Neumann boundary conditions both at
	26	! the bottom and the top.
	27	!
[3]	28	! RCS Log replace by Id keyword, revision history cleaned up
	29	!
[1]	30	! Revision 1.24 2006/08/04 15:00:24 raasch
	31	! Default setting of the thread number tn in case of not using OpenMP
	32	!
	33	! Revision 1.23 2006/02/23 12:48:38 raasch
	34	! Additional compiler directive in routine tridia_1dd for preventing loop
	35	! exchange on NEC-SX6
	36	!
	37	! Revision 1.20 2004/04/30 12:38:09 raasch
	38	! Parts of former poisfft_hybrid moved to this subroutine,
	39	! former subroutine changed to a module, renaming of FFT-subroutines and
	40	! -module, FFTs completely substituted by calls of fft_x and fft_y,
	41	! NAG fft used in the non-parallel case completely removed, l in maketri
	42	! is now a 1d-array, variables passed by modules instead of using parameter
	43	! lists, enlarged transposition arrays introduced
	44	!
	45	! Revision 1.1 1997/07/24 11:24:14 raasch
	46	! Initial revision
	47	!
	48	!
	49	! Description:
	50	! ------------
	51	! See below.
	52	!------------------------------------------------------------------------------!
	53
	54	!--------------------------------------------------------------------------!
	55	! poisfft !
	56	! !
	57	! Original version: Stephan Siano (pois3d) !
	58	! !
	59	! Institute of Meteorology and Climatology, University of Hannover !
	60	! Germany !
	61	! !
	62	! Version as of July 23,1996 !
	63	! !
	64	! !
	65	! Version for parallel computers: Siegfried Raasch !
	66	! !
	67	! Version as of July 03,1997 !
	68	! !
	69	! Solves the Poisson equation with a 2D spectral method !
	70	! d^2 p / dx^2 + d^2 p / dy^2 + d^2 p / dz^2 = s !
	71	! !
	72	! Input: !
	73	! real ar contains in the (nnx,nny,nnz) elements, !
	74	! starting from the element (1,nys,nxl), the !
	75	! values for s !
	76	! real work Temporary array !
	77	! !
	78	! Output: !
	79	! real ar contains the solution for p !
	80	!--------------------------------------------------------------------------!
	81
	82	USE fft_xy
	83	USE indices
	84	USE transpose_indices
	85
	86	IMPLICIT NONE
	87
	88	PRIVATE
	89	PUBLIC poisfft, poisfft_init
	90
	91	INTERFACE poisfft
	92	MODULE PROCEDURE poisfft
	93	END INTERFACE poisfft
	94
	95	INTERFACE poisfft_init
	96	MODULE PROCEDURE poisfft_init
	97	END INTERFACE poisfft_init
	98
	99	CONTAINS
	100
	101	SUBROUTINE poisfft_init
	102
	103	CALL fft_init
	104
	105	END SUBROUTINE poisfft_init
	106
	107
	108	SUBROUTINE poisfft( ar, work )
	109
	110	USE cpulog
	111	USE interfaces
	112	USE pegrid
	113
	114	IMPLICIT NONE
	115
	116	REAL, DIMENSION(1:nza,nys:nyna,nxl:nxra) :: ar, work
	117
	118
	119	CALL cpu_log( log_point_s(3), 'poisfft', 'start' )
	120
	121	!
	122	!-- Two-dimensional Fourier Transformation in x- and y-direction.
	123	#if defined( __parallel )
	124	IF ( pdims(2) == 1 ) THEN
	125
	126	!
	127	!-- 1d-domain-decomposition along x:
	128	!-- FFT along y and transposition y --> x
	129	CALL ffty_tr_yx( ar, work, ar )
	130
	131	!
	132	!-- FFT along x, solving the tridiagonal system and backward FFT
	133	CALL fftx_tri_fftx( ar )
	134
	135	!
	136	!-- Transposition x --> y and backward FFT along y
	137	CALL tr_xy_ffty( ar, work, ar )
	138
	139	ELSEIF ( pdims(1) == 1 ) THEN
	140
	141	!
	142	!-- 1d-domain-decomposition along y:
	143	!-- FFT along x and transposition x --> y
	144	CALL fftx_tr_xy( ar, work, ar )
	145
	146	!
	147	!-- FFT along y, solving the tridiagonal system and backward FFT
	148	CALL ffty_tri_ffty( ar )
	149
	150	!
	151	!-- Transposition y --> x and backward FFT along x
	152	CALL tr_yx_fftx( ar, work, ar )
	153
	154	ELSE
	155
	156	!
	157	!-- 2d-domain-decomposition
	158	!-- Transposition z --> x
	159	CALL cpu_log( log_point_s(5), 'transpo forward', 'start' )
[164]	160	CALL transpose_zx( ar, work, ar )
[1]	161	CALL cpu_log( log_point_s(5), 'transpo forward', 'pause' )
	162
	163	CALL cpu_log( log_point_s(4), 'fft_x', 'start' )
	164	CALL fftxp( ar, 'forward' )
	165	CALL cpu_log( log_point_s(4), 'fft_x', 'pause' )
	166
	167	!
	168	!-- Transposition x --> y
	169	CALL cpu_log( log_point_s(5), 'transpo forward', 'continue' )
[164]	170	CALL transpose_xy( ar, work, ar )
[1]	171	CALL cpu_log( log_point_s(5), 'transpo forward', 'pause' )
	172
	173	CALL cpu_log( log_point_s(7), 'fft_y', 'start' )
	174	CALL fftyp( ar, 'forward' )
	175	CALL cpu_log( log_point_s(7), 'fft_y', 'pause' )
	176
	177	!
	178	!-- Transposition y --> z
	179	CALL cpu_log( log_point_s(5), 'transpo forward', 'continue' )
[164]	180	CALL transpose_yz( ar, work, ar )
[1]	181	CALL cpu_log( log_point_s(5), 'transpo forward', 'stop' )
	182
	183	!
	184	!-- Solve the Poisson equation in z-direction in cartesian space.
	185	CALL cpu_log( log_point_s(6), 'tridia', 'start' )
	186	CALL tridia( ar )
	187	CALL cpu_log( log_point_s(6), 'tridia', 'stop' )
	188
	189	!
	190	!-- Inverse Fourier Transformation
	191	!-- Transposition z --> y
	192	CALL cpu_log( log_point_s(8), 'transpo invers', 'start' )
[164]	193	CALL transpose_zy( ar, work, ar )
[1]	194	CALL cpu_log( log_point_s(8), 'transpo invers', 'pause' )
	195
	196	CALL cpu_log( log_point_s(7), 'fft_y', 'continue' )
	197	CALL fftyp( ar, 'backward' )
	198	CALL cpu_log( log_point_s(7), 'fft_y', 'stop' )
	199
	200	!
	201	!-- Transposition y --> x
	202	CALL cpu_log( log_point_s(8), 'transpo invers', 'continue' )
[164]	203	CALL transpose_yx( ar, work, ar )
[1]	204	CALL cpu_log( log_point_s(8), 'transpo invers', 'pause' )
	205
	206	CALL cpu_log( log_point_s(4), 'fft_x', 'continue' )
	207	CALL fftxp( ar, 'backward' )
	208	CALL cpu_log( log_point_s(4), 'fft_x', 'stop' )
	209
	210	!
	211	!-- Transposition x --> z
	212	CALL cpu_log( log_point_s(8), 'transpo invers', 'continue' )
[164]	213	CALL transpose_xz( ar, work, ar )
[1]	214	CALL cpu_log( log_point_s(8), 'transpo invers', 'stop' )
	215
	216	ENDIF
	217
	218	#else
	219
	220	!
	221	!-- Two-dimensional Fourier Transformation along x- and y-direction.
	222	CALL cpu_log( log_point_s(4), 'fft_x', 'start' )
	223	CALL fftx( ar, 'forward' )
	224	CALL cpu_log( log_point_s(4), 'fft_x', 'pause' )
	225	CALL cpu_log( log_point_s(7), 'fft_y', 'start' )
	226	CALL ffty( ar, 'forward' )
	227	CALL cpu_log( log_point_s(7), 'fft_y', 'pause' )
	228
	229	!
	230	!-- Solve the Poisson equation in z-direction in cartesian space.
	231	CALL cpu_log( log_point_s(6), 'tridia', 'start' )
	232	CALL tridia( ar )
	233	CALL cpu_log( log_point_s(6), 'tridia', 'stop' )
	234
	235	!
	236	!-- Inverse Fourier Transformation.
	237	CALL cpu_log( log_point_s(7), 'fft_y', 'continue' )
	238	CALL ffty( ar, 'backward' )
	239	CALL cpu_log( log_point_s(7), 'fft_y', 'stop' )
	240	CALL cpu_log( log_point_s(4), 'fft_x', 'continue' )
	241	CALL fftx( ar, 'backward' )
	242	CALL cpu_log( log_point_s(4), 'fft_x', 'stop' )
	243
	244	#endif
	245
	246	CALL cpu_log( log_point_s(3), 'poisfft', 'stop' )
	247
	248	END SUBROUTINE poisfft
	249
	250
	251
	252	SUBROUTINE tridia( ar )
	253
	254	!------------------------------------------------------------------------------!
	255	! solves the linear system of equations:
	256	!
	257	! -(4 pi^2(i^2/(dx^2nnx^2)+j^2/(dy^2nny^2))+
	258	! 1/(dzu(k)dzw(k))+1/(dzu(k-1)dzw(k)))*p(i,j,k)+
	259	! 1/(dzu(k)dzw(k))p(i,j,k+1)+1/(dzu(k-1)dzw(k))p(i,j,k-1)=d(i,j,k)
	260	!
	261	! by using the Thomas algorithm
	262	!------------------------------------------------------------------------------!
	263
	264	USE arrays_3d
	265
	266	IMPLICIT NONE
	267
	268	INTEGER :: i, j, k, nnyh
	269
	270	REAL, DIMENSION(nxl_z:nxr_z,0:nz-1) :: ar1
	271	REAL, DIMENSION(5,nxl_z:nxr_z,0:nz-1) :: tri
	272
	273	#if defined( __parallel )
	274	REAL :: ar(nxl_z:nxr_za,nys_z:nyn_za,1:nza)
	275	#else
	276	REAL :: ar(1:nz,nys_z:nyn_z,nxl_z:nxr_z)
	277	#endif
	278
	279
	280	nnyh = (ny+1) / 2
	281
	282	!
	283	!-- Define constant elements of the tridiagonal matrix.
	284	DO k = 0, nz-1
	285	DO i = nxl_z, nxr_z
	286	tri(2,i,k) = ddzu(k+1) * ddzw(k+1)
	287	tri(3,i,k) = ddzu(k+2) * ddzw(k+1)
	288	ENDDO
	289	ENDDO
	290
	291	#if defined( __parallel )
	292	!
	293	!-- Repeat for all y-levels.
	294	DO j = nys_z, nyn_z
	295	IF ( j <= nnyh ) THEN
	296	CALL maketri( tri, j )
	297	ELSE
	298	CALL maketri( tri, ny+1-j )
	299	ENDIF
	300	CALL split( tri )
	301	CALL substi( ar, ar1, tri, j )
	302	ENDDO
	303	#else
	304	!
	305	!-- First y-level.
	306	CALL maketri( tri, nys_z )
	307	CALL split( tri )
	308	CALL substi( ar, ar1, tri, 0 )
	309
	310	!
	311	!-- Further y-levels.
	312	DO j = 1, nnyh - 1
	313	CALL maketri( tri, j )
	314	CALL split( tri )
	315	CALL substi( ar, ar1, tri, j )
	316	CALL substi( ar, ar1, tri, ny+1-j )
	317	ENDDO
	318	CALL maketri( tri, nnyh )
	319	CALL split( tri )
	320	CALL substi( ar, ar1, tri, nnyh+nys )
	321	#endif
	322
	323	CONTAINS
	324
	325	SUBROUTINE maketri( tri, j )
	326
	327	!------------------------------------------------------------------------------!
	328	! Computes the i- and j-dependent component of the matrix
	329	!------------------------------------------------------------------------------!
	330
	331	USE arrays_3d
	332	USE constants
	333	USE control_parameters
	334	USE grid_variables
	335
	336	IMPLICIT NONE
	337
	338	INTEGER :: i, j, k, nnxh
	339	REAL :: a, c
	340	REAL :: ll(nxl_z:nxr_z)
	341	REAL :: tri(5,nxl_z:nxr_z,0:nz-1)
	342
	343
	344	nnxh = ( nx + 1 ) / 2
	345
	346	!
	347	!-- Provide the tridiagonal matrix for solution of the Poisson equation in
	348	!-- Fourier space. The coefficients are computed following the method of
	349	!-- Schmidt et al. (DFVLR-Mitteilung 84-15), which departs from Stephan
	350	!-- Siano's original version by discretizing the Poisson equation,
	351	!-- before it is Fourier-transformed
	352	#if defined( __parallel )
	353	DO i = nxl_z, nxr_z
[128]	354	IF ( i >= 0 .AND. i <= nnxh ) THEN
[1]	355	ll(i) = 2.0 * ( 1.0 - COS( ( 2.0 * pi * i ) / &
	356	FLOAT( nx+1 ) ) ) / ( dx * dx ) + &
	357	2.0 * ( 1.0 - COS( ( 2.0 * pi * j ) / &
	358	FLOAT( ny+1 ) ) ) / ( dy * dy )
	359	ELSE
	360	ll(i) = 2.0 * ( 1.0 - COS( ( 2.0 * pi * ( nx+1-i ) ) / &
	361	FLOAT( nx+1 ) ) ) / ( dx * dx ) + &
	362	2.0 * ( 1.0 - COS( ( 2.0 * pi * j ) / &
	363	FLOAT( ny+1 ) ) ) / ( dy * dy )
	364	ENDIF
	365	DO k = 0,nz-1
	366	a = -1.0 * ddzu(k+2) * ddzw(k+1)
	367	c = -1.0 * ddzu(k+1) * ddzw(k+1)
	368	tri(1,i,k) = a + c - ll(i)
	369	ENDDO
	370	ENDDO
	371	#else
	372	DO i = 0, nnxh
	373	ll(i) = 2.0 * ( 1.0 - COS( ( 2.0 * pi * i ) / FLOAT( nx+1 ) ) ) / &
	374	( dx * dx ) + &
	375	2.0 * ( 1.0 - COS( ( 2.0 * pi * j ) / FLOAT( ny+1 ) ) ) / &
	376	( dy * dy )
	377	DO k = 0, nz-1
	378	a = -1.0 * ddzu(k+2) * ddzw(k+1)
	379	c = -1.0 * ddzu(k+1) * ddzw(k+1)
	380	tri(1,i,k) = a + c - ll(i)
	381	IF ( i >= 1 .and. i < nnxh ) THEN
	382	tri(1,nx+1-i,k) = tri(1,i,k)
	383	ENDIF
	384	ENDDO
	385	ENDDO
	386	#endif
	387	IF ( ibc_p_b == 1 .OR. ibc_p_b == 2 ) THEN
	388	DO i = nxl_z, nxr_z
	389	tri(1,i,0) = tri(1,i,0) + tri(2,i,0)
	390	ENDDO
	391	ENDIF
	392	IF ( ibc_p_t == 1 ) THEN
	393	DO i = nxl_z, nxr_z
	394	tri(1,i,nz-1) = tri(1,i,nz-1) + tri(3,i,nz-1)
	395	ENDDO
	396	ENDIF
	397
	398	END SUBROUTINE maketri
	399
	400
	401	SUBROUTINE substi( ar, ar1, tri, j )
	402
	403	!------------------------------------------------------------------------------!
	404	! Substitution (Forward and Backward) (Thomas algorithm)
	405	!------------------------------------------------------------------------------!
	406
[76]	407	USE control_parameters
	408
[1]	409	IMPLICIT NONE
	410
	411	INTEGER :: i, j, k
	412	REAL :: ar1(nxl_z:nxr_z,0:nz-1)
	413	REAL :: tri(5,nxl_z:nxr_z,0:nz-1)
	414	#if defined( __parallel )
	415	REAL :: ar(nxl_z:nxr_za,nys_z:nyn_za,1:nza)
	416	#else
	417	REAL :: ar(1:nz,nys_z:nyn_z,nxl_z:nxr_z)
	418	#endif
	419
	420	!
	421	!-- Forward substitution.
	422	DO i = nxl_z, nxr_z
	423	#if defined( __parallel )
	424	ar1(i,0) = ar(i,j,1)
	425	#else
	426	ar1(i,0) = ar(1,j,i)
	427	#endif
	428	ENDDO
	429	DO k = 1, nz - 1
	430	DO i = nxl_z, nxr_z
	431	#if defined( __parallel )
	432	ar1(i,k) = ar(i,j,k+1) - tri(5,i,k) * ar1(i,k-1)
	433	#else
	434	ar1(i,k) = ar(k+1,j,i) - tri(5,i,k) * ar1(i,k-1)
	435	#endif
	436	ENDDO
	437	ENDDO
	438
	439	!
	440	!-- Backward substitution.
	441	DO i = nxl_z, nxr_z
	442	#if defined( __parallel )
	443	ar(i,j,nz) = ar1(i,nz-1) / tri(4,i,nz-1)
	444	#else
	445	ar(nz,j,i) = ar1(i,nz-1) / tri(4,i,nz-1)
	446	#endif
	447	ENDDO
	448	DO k = nz-2, 0, -1
	449	DO i = nxl_z, nxr_z
	450	#if defined( __parallel )
	451	ar(i,j,k+1) = ( ar1(i,k) - tri(3,i,k) * ar(i,j,k+2) ) &
	452	/ tri(4,i,k)
	453	#else
	454	ar(k+1,j,i) = ( ar1(i,k) - tri(3,i,k) * ar(k+2,j,i) ) &
	455	/ tri(4,i,k)
	456	#endif
	457	ENDDO
	458	ENDDO
	459
[76]	460	!
	461	!-- Indices i=0, j=0 correspond to horizontally averaged pressure.
	462	!-- The respective values of ar should be zero at all k-levels if
	463	!-- acceleration of horizontally averaged vertical velocity is zero.
	464	IF ( ibc_p_b == 1 .AND. ibc_p_t == 1 ) THEN
	465	IF ( j == 0 .AND. nxl_z == 0 ) THEN
	466	#if defined( __parallel )
	467	DO k = 1, nz
	468	ar(nxl_z,j,k) = 0.0
	469	ENDDO
	470	#else
	471	DO k = 1, nz
	472	ar(k,j,nxl_z) = 0.0
	473	ENDDO
	474	#endif
	475	ENDIF
	476	ENDIF
	477
[1]	478	END SUBROUTINE substi
	479
	480
	481	SUBROUTINE split( tri )
	482
	483	!------------------------------------------------------------------------------!
	484	! Splitting of the tridiagonal matrix (Thomas algorithm)
	485	!------------------------------------------------------------------------------!
	486
	487	IMPLICIT NONE
	488
	489	INTEGER :: i, k
	490	REAL :: tri(5,nxl_z:nxr_z,0:nz-1)
	491
	492	!
	493	!-- Splitting.
	494	DO i = nxl_z, nxr_z
	495	tri(4,i,0) = tri(1,i,0)
	496	ENDDO
	497	DO k = 1, nz-1
	498	DO i = nxl_z, nxr_z
	499	tri(5,i,k) = tri(2,i,k) / tri(4,i,k-1)
	500	tri(4,i,k) = tri(1,i,k) - tri(3,i,k-1) * tri(5,i,k)
	501	ENDDO
	502	ENDDO
	503
	504	END SUBROUTINE split
	505
	506	END SUBROUTINE tridia
	507
	508
	509	#if defined( __parallel )
	510	SUBROUTINE fftxp( ar, direction )
	511
	512	!------------------------------------------------------------------------------!
	513	! Fourier-transformation along x-direction Parallelized version
	514	!------------------------------------------------------------------------------!
	515
	516	IMPLICIT NONE
	517
	518	CHARACTER (LEN=*) :: direction
	519	INTEGER :: j, k
	520	REAL :: ar(0:nxa,nys_x:nyn_xa,nzb_x:nzt_xa)
	521
	522	!
	523	!-- Performing the fft with one of the methods implemented
	524	DO k = nzb_x, nzt_x
	525	DO j = nys_x, nyn_x
	526	CALL fft_x( ar(0:nx,j,k), direction )
	527	ENDDO
	528	ENDDO
	529
	530	END SUBROUTINE fftxp
	531
	532	#else
	533	SUBROUTINE fftx( ar, direction )
	534
	535	!------------------------------------------------------------------------------!
	536	! Fourier-transformation along x-direction Non parallel version
	537	!------------------------------------------------------------------------------!
	538
	539	IMPLICIT NONE
	540
	541	CHARACTER (LEN=*) :: direction
	542	INTEGER :: i, j, k
	543	REAL :: ar(1:nz,0:ny,0:nx)
	544
	545	!
	546	!-- Performing the fft with one of the methods implemented
	547	DO k = 1, nz
	548	DO j = 0, ny
	549	CALL fft_x( ar(k,j,0:nx), direction )
	550	ENDDO
	551	ENDDO
	552
	553	END SUBROUTINE fftx
	554	#endif
	555
	556
	557	#if defined( __parallel )
	558	SUBROUTINE fftyp( ar, direction )
	559
	560	!------------------------------------------------------------------------------!
	561	! Fourier-transformation along y-direction Parallelized version
	562	!------------------------------------------------------------------------------!
	563
	564	IMPLICIT NONE
	565
	566	CHARACTER (LEN=*) :: direction
	567	INTEGER :: i, k
	568	REAL :: ar(0:nya,nxl_y:nxr_ya,nzb_y:nzt_ya)
	569
	570	!
	571	!-- Performing the fft with one of the methods implemented
	572	DO k = nzb_y, nzt_y
	573	DO i = nxl_y, nxr_y
	574	CALL fft_y( ar(0:ny,i,k), direction )
	575	ENDDO
	576	ENDDO
	577
	578	END SUBROUTINE fftyp
	579
	580	#else
	581	SUBROUTINE ffty( ar, direction )
	582
	583	!------------------------------------------------------------------------------!
	584	! Fourier-transformation along y-direction Non parallel version
	585	!------------------------------------------------------------------------------!
	586
	587	IMPLICIT NONE
	588
	589	CHARACTER (LEN=*) :: direction
	590	INTEGER :: i, k
	591	REAL :: ar(1:nz,0:ny,0:nx)
	592
	593	!
	594	!-- Performing the fft with one of the methods implemented
	595	DO k = 1, nz
	596	DO i = 0, nx
	597	CALL fft_y( ar(k,0:ny,i), direction )
	598	ENDDO
	599	ENDDO
	600
	601	END SUBROUTINE ffty
	602	#endif
	603
	604	#if defined( __parallel )
	605	SUBROUTINE ffty_tr_yx( f_in, work, f_out )
	606
	607	!------------------------------------------------------------------------------!
	608	! Fourier-transformation along y with subsequent transposition y --> x for
	609	! a 1d-decomposition along x
	610	!
	611	! ATTENTION: The performance of this routine is much faster on the NEC-SX6,
	612	! if the first index of work_ffty_vec is odd. Otherwise
	613	! memory bank conflicts may occur (especially if the index is a
	614	! multiple of 128). That's why work_ffty_vec is dimensioned as
	615	! 0:ny+1.
	616	! Of course, this will not work if users are using an odd number
	617	! of gridpoints along y.
	618	!------------------------------------------------------------------------------!
	619
	620	USE control_parameters
	621	USE cpulog
	622	USE indices
	623	USE interfaces
	624	USE pegrid
	625	USE transpose_indices
	626
	627	IMPLICIT NONE
	628
	629	INTEGER :: i, iend, iouter, ir, j, k
	630	INTEGER, PARAMETER :: stridex = 4
	631
	632	REAL, DIMENSION(0:ny,stridex) :: work_ffty
	633	#if defined( __nec )
	634	REAL, DIMENSION(0:ny+1,1:nz,nxl:nxr) :: work_ffty_vec
	635	#endif
	636	REAL, DIMENSION(1:nza,0:nya,nxl:nxra) :: f_in
	637	REAL, DIMENSION(nnx,1:nza,nys_x:nyn_xa,pdims(1)) :: f_out
	638	REAL, DIMENSION(nxl:nxra,1:nza,0:nya) :: work
	639
	640	!
	641	!-- Carry out the FFT along y, where all data are present due to the
	642	!-- 1d-decomposition along x. Resort the data in a way that x becomes
	643	!-- the first index.
	644	CALL cpu_log( log_point_s(7), 'fft_y', 'start' )
	645
	646	IF ( host(1:3) == 'nec' ) THEN
	647	#if defined( __nec )
	648	!
	649	!-- Code optimized for vector processors
[85]	650	!$OMP PARALLEL PRIVATE ( i, j, k )
[1]	651	!$OMP DO
	652	DO i = nxl, nxr
	653
	654	DO j = 0, ny
	655	DO k = 1, nz
	656	work_ffty_vec(j,k,i) = f_in(k,j,i)
	657	ENDDO
	658	ENDDO
	659
	660	CALL fft_y_m( work_ffty_vec(:,:,i), ny+1, 'forward' )
	661
	662	ENDDO
	663
	664	!$OMP DO
	665	DO k = 1, nz
	666	DO j = 0, ny
	667	DO i = nxl, nxr
	668	work(i,k,j) = work_ffty_vec(j,k,i)
	669	ENDDO
	670	ENDDO
	671	ENDDO
	672	!$OMP END PARALLEL
	673	#endif
	674
	675	ELSE
	676
	677	!
	678	!-- Cache optimized code.
	679	!-- The i-(x-)direction is split into a strided outer loop and an inner
	680	!-- loop for better cache performance
	681	!$OMP PARALLEL PRIVATE (i,iend,iouter,ir,j,k,work_ffty)
	682	!$OMP DO
	683	DO iouter = nxl, nxr, stridex
	684
	685	iend = MIN( iouter+stridex-1, nxr ) ! Upper bound for inner i loop
	686
	687	DO k = 1, nz
	688
	689	DO i = iouter, iend
	690
	691	ir = i-iouter+1 ! counter within a stride
	692	DO j = 0, ny
	693	work_ffty(j,ir) = f_in(k,j,i)
	694	ENDDO
	695	!
	696	!-- FFT along y
	697	CALL fft_y( work_ffty(:,ir), 'forward' )
	698
	699	ENDDO
	700
	701	!
	702	!-- Resort
	703	DO j = 0, ny
	704	DO i = iouter, iend
	705	work(i,k,j) = work_ffty(j,i-iouter+1)
	706	ENDDO
	707	ENDDO
	708
	709	ENDDO
	710
	711	ENDDO
	712	!$OMP END PARALLEL
	713
	714	ENDIF
	715	CALL cpu_log( log_point_s(7), 'fft_y', 'pause' )
	716
	717	!
	718	!-- Transpose array
	719	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
	720	CALL MPI_ALLTOALL( work(nxl,1,0), sendrecvcount_xy, MPI_REAL, &
	721	f_out(1,1,nys_x,1), sendrecvcount_xy, MPI_REAL, &
	722	comm1dx, ierr )
	723	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
	724
	725	END SUBROUTINE ffty_tr_yx
	726
	727
	728	SUBROUTINE tr_xy_ffty( f_in, work, f_out )
	729
	730	!------------------------------------------------------------------------------!
	731	! Transposition x --> y with a subsequent backward Fourier transformation for
	732	! a 1d-decomposition along x
	733	!------------------------------------------------------------------------------!
	734
	735	USE control_parameters
	736	USE cpulog
	737	USE indices
	738	USE interfaces
	739	USE pegrid
	740	USE transpose_indices
	741
	742	IMPLICIT NONE
	743
	744	INTEGER :: i, iend, iouter, ir, j, k
	745	INTEGER, PARAMETER :: stridex = 4
	746
	747	REAL, DIMENSION(0:ny,stridex) :: work_ffty
	748	#if defined( __nec )
	749	REAL, DIMENSION(0:ny+1,1:nz,nxl:nxr) :: work_ffty_vec
	750	#endif
	751	REAL, DIMENSION(nnx,1:nza,nys_x:nyn_xa,pdims(1)) :: f_in
	752	REAL, DIMENSION(1:nza,0:nya,nxl:nxra) :: f_out
	753	REAL, DIMENSION(nxl:nxra,1:nza,0:nya) :: work
	754
	755	!
	756	!-- Transpose array
	757	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
	758	CALL MPI_ALLTOALL( f_in(1,1,nys_x,1), sendrecvcount_xy, MPI_REAL, &
	759	work(nxl,1,0), sendrecvcount_xy, MPI_REAL, &
	760	comm1dx, ierr )
	761	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
	762
	763	!
	764	!-- Resort the data in a way that y becomes the first index and carry out the
	765	!-- backward fft along y.
	766	CALL cpu_log( log_point_s(7), 'fft_y', 'continue' )
	767
	768	IF ( host(1:3) == 'nec' ) THEN
	769	#if defined( __nec )
	770	!
	771	!-- Code optimized for vector processors
[85]	772	!$OMP PARALLEL PRIVATE ( i, j, k )
[1]	773	!$OMP DO
	774	DO k = 1, nz
	775	DO j = 0, ny
	776	DO i = nxl, nxr
	777	work_ffty_vec(j,k,i) = work(i,k,j)
	778	ENDDO
	779	ENDDO
	780	ENDDO
	781
	782	!$OMP DO
	783	DO i = nxl, nxr
	784
	785	CALL fft_y_m( work_ffty_vec(:,:,i), ny+1, 'backward' )
	786
	787	DO j = 0, ny
	788	DO k = 1, nz
	789	f_out(k,j,i) = work_ffty_vec(j,k,i)
	790	ENDDO
	791	ENDDO
	792
	793	ENDDO
	794	!$OMP END PARALLEL
	795	#endif
	796
	797	ELSE
	798
	799	!
	800	!-- Cache optimized code.
	801	!-- The i-(x-)direction is split into a strided outer loop and an inner
	802	!-- loop for better cache performance
	803	!$OMP PARALLEL PRIVATE ( i, iend, iouter, ir, j, k, work_ffty )
	804	!$OMP DO
	805	DO iouter = nxl, nxr, stridex
	806
	807	iend = MIN( iouter+stridex-1, nxr ) ! Upper bound for inner i loop
	808
	809	DO k = 1, nz
	810	!
	811	!-- Resort
	812	DO j = 0, ny
	813	DO i = iouter, iend
	814	work_ffty(j,i-iouter+1) = work(i,k,j)
	815	ENDDO
	816	ENDDO
	817
	818	DO i = iouter, iend
	819
	820	!
	821	!-- FFT along y
	822	ir = i-iouter+1 ! counter within a stride
	823	CALL fft_y( work_ffty(:,ir), 'backward' )
	824
	825	DO j = 0, ny
	826	f_out(k,j,i) = work_ffty(j,ir)
	827	ENDDO
	828	ENDDO
	829
	830	ENDDO
	831
	832	ENDDO
	833	!$OMP END PARALLEL
	834
	835	ENDIF
	836
	837	CALL cpu_log( log_point_s(7), 'fft_y', 'stop' )
	838
	839	END SUBROUTINE tr_xy_ffty
	840
	841
	842	SUBROUTINE fftx_tri_fftx( ar )
	843
	844	!------------------------------------------------------------------------------!
	845	! FFT along x, solution of the tridiagonal system and backward FFT for
	846	! a 1d-decomposition along x
	847	!
	848	! WARNING: this subroutine may still not work for hybrid parallelization
	849	! with OpenMP (for possible necessary changes see the original
	850	! routine poisfft_hybrid, developed by Klaus Ketelsen, May 2002)
	851	!------------------------------------------------------------------------------!
	852
	853	USE control_parameters
	854	USE cpulog
	855	USE grid_variables
	856	USE indices
	857	USE interfaces
	858	USE pegrid
	859	USE transpose_indices
	860
	861	IMPLICIT NONE
	862
	863	character(len=3) :: myth_char
	864
	865	INTEGER :: i, j, k, m, n, omp_get_thread_num, tn
	866
	867	REAL, DIMENSION(0:nx) :: work_fftx
	868	REAL, DIMENSION(0:nx,1:nz) :: work_trix
	869	REAL, DIMENSION(nnx,1:nza,nys_x:nyn_xa,pdims(1)) :: ar
	870	REAL, DIMENSION(:,:,:,:), ALLOCATABLE :: tri
	871
	872
	873	CALL cpu_log( log_point_s(33), 'fft_x + tridia', 'start' )
	874
	875	ALLOCATE( tri(5,0:nx,0:nz-1,0:threads_per_task-1) )
	876
	877	tn = 0 ! Default thread number in case of one thread
	878	!$OMP PARALLEL DO PRIVATE ( i, j, k, m, n, tn, work_fftx, work_trix )
	879	DO j = nys_x, nyn_x
	880
	881	!$ tn = omp_get_thread_num()
	882
	883	IF ( host(1:3) == 'nec' ) THEN
	884	!
	885	!-- Code optimized for vector processors
	886	DO k = 1, nz
	887
	888	m = 0
	889	DO n = 1, pdims(1)
	890	DO i = 1, nnx_pe( n-1 ) ! WARN: pcoord(i) should be used!!
	891	work_trix(m,k) = ar(i,k,j,n)
	892	m = m + 1
	893	ENDDO
	894	ENDDO
	895
	896	ENDDO
	897
	898	CALL fft_x_m( work_trix, 'forward' )
	899
	900	ELSE
	901	!
	902	!-- Cache optimized code
	903	DO k = 1, nz
	904
	905	m = 0
	906	DO n = 1, pdims(1)
	907	DO i = 1, nnx_pe( n-1 ) ! WARN: pcoord(i) should be used!!
	908	work_fftx(m) = ar(i,k,j,n)
	909	m = m + 1
	910	ENDDO
	911	ENDDO
	912
	913	CALL fft_x( work_fftx, 'forward' )
	914
	915	DO i = 0, nx
	916	work_trix(i,k) = work_fftx(i)
	917	ENDDO
	918
	919	ENDDO
	920
	921	ENDIF
	922
	923	!
	924	!-- Solve the linear equation system
	925	CALL tridia_1dd( ddx2, ddy2, nx, ny, j, work_trix, tri(:,:,:,tn) )
	926
	927	IF ( host(1:3) == 'nec' ) THEN
	928	!
	929	!-- Code optimized for vector processors
	930	CALL fft_x_m( work_trix, 'backward' )
	931
	932	DO k = 1, nz
	933
	934	m = 0
	935	DO n = 1, pdims(1)
	936	DO i = 1, nnx_pe( n-1 ) ! WARN: pcoord(i) should be used!!
	937	ar(i,k,j,n) = work_trix(m,k)
	938	m = m + 1
	939	ENDDO
	940	ENDDO
	941
	942	ENDDO
	943
	944	ELSE
	945	!
	946	!-- Cache optimized code
	947	DO k = 1, nz
	948
	949	DO i = 0, nx
	950	work_fftx(i) = work_trix(i,k)
	951	ENDDO
	952
	953	CALL fft_x( work_fftx, 'backward' )
	954
	955	m = 0
	956	DO n = 1, pdims(1)
	957	DO i = 1, nnx_pe( n-1 ) ! WARN: pcoord(i) should be used!!
	958	ar(i,k,j,n) = work_fftx(m)
	959	m = m + 1
	960	ENDDO
	961	ENDDO
	962
	963	ENDDO
	964
	965	ENDIF
	966
	967	ENDDO
	968
	969	DEALLOCATE( tri )
	970
	971	CALL cpu_log( log_point_s(33), 'fft_x + tridia', 'stop' )
	972
	973	END SUBROUTINE fftx_tri_fftx
	974
	975
	976	SUBROUTINE fftx_tr_xy( f_in, work, f_out )
	977
	978	!------------------------------------------------------------------------------!
	979	! Fourier-transformation along x with subsequent transposition x --> y for
	980	! a 1d-decomposition along y
	981	!
	982	! ATTENTION: The NEC-branch of this routine may significantly profit from
	983	! further optimizations. So far, performance is much worse than
	984	! for routine ffty_tr_yx (more than three times slower).
	985	!------------------------------------------------------------------------------!
	986
	987	USE control_parameters
	988	USE cpulog
	989	USE indices
	990	USE interfaces
	991	USE pegrid
	992	USE transpose_indices
	993
	994	IMPLICIT NONE
	995
	996	INTEGER :: i, j, k
	997
	998	REAL, DIMENSION(0:nx,1:nz,nys:nyn) :: work_fftx
	999	REAL, DIMENSION(1:nza,nys:nyna,0:nxa) :: f_in
	1000	REAL, DIMENSION(nny,1:nza,nxl_y:nxr_ya,pdims(2)) :: f_out
	1001	REAL, DIMENSION(nys:nyna,1:nza,0:nxa) :: work
	1002
	1003	!
	1004	!-- Carry out the FFT along x, where all data are present due to the
	1005	!-- 1d-decomposition along y. Resort the data in a way that y becomes
	1006	!-- the first index.
	1007	CALL cpu_log( log_point_s(4), 'fft_x', 'start' )
	1008
	1009	IF ( host(1:3) == 'nec' ) THEN
	1010	!
	1011	!-- Code for vector processors
[85]	1012	!$OMP PARALLEL PRIVATE ( i, j, k )
[1]	1013	!$OMP DO
	1014	DO i = 0, nx
	1015
	1016	DO j = nys, nyn
	1017	DO k = 1, nz
	1018	work_fftx(i,k,j) = f_in(k,j,i)
	1019	ENDDO
	1020	ENDDO
	1021
	1022	ENDDO
	1023
	1024	!$OMP DO
	1025	DO j = nys, nyn
	1026
	1027	CALL fft_x_m( work_fftx(:,:,j), 'forward' )
	1028
	1029	DO k = 1, nz
	1030	DO i = 0, nx
	1031	work(j,k,i) = work_fftx(i,k,j)
	1032	ENDDO
	1033	ENDDO
	1034
	1035	ENDDO
	1036	!$OMP END PARALLEL
	1037
	1038	ELSE
	1039
	1040	!
	1041	!-- Cache optimized code (there might be still a potential for better
	1042	!-- optimization).
	1043	!$OMP PARALLEL PRIVATE (i,j,k,work_fftx)
	1044	!$OMP DO
	1045	DO i = 0, nx
	1046
	1047	DO j = nys, nyn
	1048	DO k = 1, nz
	1049	work_fftx(i,k,j) = f_in(k,j,i)
	1050	ENDDO
	1051	ENDDO
	1052
	1053	ENDDO
	1054
	1055	!$OMP DO
	1056	DO j = nys, nyn
	1057	DO k = 1, nz
	1058
	1059	CALL fft_x( work_fftx(0:nx,k,j), 'forward' )
	1060
	1061	DO i = 0, nx
	1062	work(j,k,i) = work_fftx(i,k,j)
	1063	ENDDO
	1064	ENDDO
	1065
	1066	ENDDO
	1067	!$OMP END PARALLEL
	1068
	1069	ENDIF
	1070	CALL cpu_log( log_point_s(4), 'fft_x', 'pause' )
	1071
	1072	!
	1073	!-- Transpose array
	1074	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
	1075	CALL MPI_ALLTOALL( work(nys,1,0), sendrecvcount_xy, MPI_REAL, &
	1076	f_out(1,1,nxl_y,1), sendrecvcount_xy, MPI_REAL, &
	1077	comm1dy, ierr )
	1078	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
	1079
	1080	END SUBROUTINE fftx_tr_xy
	1081
	1082
	1083	SUBROUTINE tr_yx_fftx( f_in, work, f_out )
	1084
	1085	!------------------------------------------------------------------------------!
	1086	! Transposition y --> x with a subsequent backward Fourier transformation for
	1087	! a 1d-decomposition along x
	1088	!------------------------------------------------------------------------------!
	1089
	1090	USE control_parameters
	1091	USE cpulog
	1092	USE indices
	1093	USE interfaces
	1094	USE pegrid
	1095	USE transpose_indices
	1096
	1097	IMPLICIT NONE
	1098
	1099	INTEGER :: i, j, k
	1100
	1101	REAL, DIMENSION(0:nx,1:nz,nys:nyn) :: work_fftx
	1102	REAL, DIMENSION(nny,1:nza,nxl_y:nxr_ya,pdims(2)) :: f_in
	1103	REAL, DIMENSION(1:nza,nys:nyna,0:nxa) :: f_out
	1104	REAL, DIMENSION(nys:nyna,1:nza,0:nxa) :: work
	1105
	1106	!
	1107	!-- Transpose array
	1108	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
	1109	CALL MPI_ALLTOALL( f_in(1,1,nxl_y,1), sendrecvcount_xy, MPI_REAL, &
	1110	work(nys,1,0), sendrecvcount_xy, MPI_REAL, &
	1111	comm1dy, ierr )
	1112	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
	1113
	1114	!
	1115	!-- Carry out the FFT along x, where all data are present due to the
	1116	!-- 1d-decomposition along y. Resort the data in a way that y becomes
	1117	!-- the first index.
	1118	CALL cpu_log( log_point_s(4), 'fft_x', 'continue' )
	1119
	1120	IF ( host(1:3) == 'nec' ) THEN
	1121	!
	1122	!-- Code optimized for vector processors
[85]	1123	!$OMP PARALLEL PRIVATE ( i, j, k )
[1]	1124	!$OMP DO
	1125	DO j = nys, nyn
	1126
	1127	DO k = 1, nz
	1128	DO i = 0, nx
	1129	work_fftx(i,k,j) = work(j,k,i)
	1130	ENDDO
	1131	ENDDO
	1132
	1133	CALL fft_x_m( work_fftx(:,:,j), 'backward' )
	1134
	1135	ENDDO
	1136
	1137	!$OMP DO
	1138	DO i = 0, nx
	1139	DO j = nys, nyn
	1140	DO k = 1, nz
	1141	f_out(k,j,i) = work_fftx(i,k,j)
	1142	ENDDO
	1143	ENDDO
	1144	ENDDO
	1145	!$OMP END PARALLEL
	1146
	1147	ELSE
	1148
	1149	!
	1150	!-- Cache optimized code (there might be still a potential for better
	1151	!-- optimization).
	1152	!$OMP PARALLEL PRIVATE (i,j,k,work_fftx)
	1153	!$OMP DO
	1154	DO j = nys, nyn
	1155	DO k = 1, nz
	1156
	1157	DO i = 0, nx
	1158	work_fftx(i,k,j) = work(j,k,i)
	1159	ENDDO
	1160
	1161	CALL fft_x( work_fftx(0:nx,k,j), 'backward' )
	1162
	1163	ENDDO
	1164	ENDDO
	1165
	1166	!$OMP DO
	1167	DO i = 0, nx
	1168	DO j = nys, nyn
	1169	DO k = 1, nz
	1170	f_out(k,j,i) = work_fftx(i,k,j)
	1171	ENDDO
	1172	ENDDO
	1173	ENDDO
	1174	!$OMP END PARALLEL
	1175
	1176	ENDIF
	1177	CALL cpu_log( log_point_s(4), 'fft_x', 'stop' )
	1178
	1179	END SUBROUTINE tr_yx_fftx
	1180
	1181
	1182	SUBROUTINE ffty_tri_ffty( ar )
	1183
	1184	!------------------------------------------------------------------------------!
	1185	! FFT along y, solution of the tridiagonal system and backward FFT for
	1186	! a 1d-decomposition along y
	1187	!
	1188	! WARNING: this subroutine may still not work for hybrid parallelization
	1189	! with OpenMP (for possible necessary changes see the original
	1190	! routine poisfft_hybrid, developed by Klaus Ketelsen, May 2002)
	1191	!------------------------------------------------------------------------------!
	1192
	1193	USE control_parameters
	1194	USE cpulog
	1195	USE grid_variables
	1196	USE indices
	1197	USE interfaces
	1198	USE pegrid
	1199	USE transpose_indices
	1200
	1201	IMPLICIT NONE
	1202
	1203	INTEGER :: i, j, k, m, n, omp_get_thread_num, tn
	1204
	1205	REAL, DIMENSION(0:ny) :: work_ffty
	1206	REAL, DIMENSION(0:ny,1:nz) :: work_triy
	1207	REAL, DIMENSION(nny,1:nza,nxl_y:nxr_ya,pdims(2)) :: ar
	1208	REAL, DIMENSION(:,:,:,:), ALLOCATABLE :: tri
	1209
	1210
	1211	CALL cpu_log( log_point_s(39), 'fft_y + tridia', 'start' )
	1212
	1213	ALLOCATE( tri(5,0:ny,0:nz-1,0:threads_per_task-1) )
	1214
	1215	tn = 0 ! Default thread number in case of one thread
	1216	!$OMP PARALLEL PRIVATE ( i, j, k, m, n, tn, work_ffty, work_triy )
	1217	!$OMP DO
	1218	DO i = nxl_y, nxr_y
	1219
	1220	!$ tn = omp_get_thread_num()
	1221
	1222	IF ( host(1:3) == 'nec' ) THEN
	1223	!
	1224	!-- Code optimized for vector processors
	1225	DO k = 1, nz
	1226
	1227	m = 0
	1228	DO n = 1, pdims(2)
	1229	DO j = 1, nny_pe( n-1 ) ! WARN: pcoord(j) should be used!!
	1230	work_triy(m,k) = ar(j,k,i,n)
	1231	m = m + 1
	1232	ENDDO
	1233	ENDDO
	1234
	1235	ENDDO
	1236
	1237	CALL fft_y_m( work_triy, ny, 'forward' )
	1238
	1239	ELSE
	1240	!
	1241	!-- Cache optimized code
	1242	DO k = 1, nz
	1243
	1244	m = 0
	1245	DO n = 1, pdims(2)
	1246	DO j = 1, nny_pe( n-1 ) ! WARN: pcoord(j) should be used!!
	1247	work_ffty(m) = ar(j,k,i,n)
	1248	m = m + 1
	1249	ENDDO
	1250	ENDDO
	1251
	1252	CALL fft_y( work_ffty, 'forward' )
	1253
	1254	DO j = 0, ny
	1255	work_triy(j,k) = work_ffty(j)
	1256	ENDDO
	1257
	1258	ENDDO
	1259
	1260	ENDIF
	1261
	1262	!
	1263	!-- Solve the linear equation system
	1264	CALL tridia_1dd( ddy2, ddx2, ny, nx, i, work_triy, tri(:,:,:,tn) )
	1265
	1266	IF ( host(1:3) == 'nec' ) THEN
	1267	!
	1268	!-- Code optimized for vector processors
	1269	CALL fft_y_m( work_triy, ny, 'backward' )
	1270
	1271	DO k = 1, nz
	1272
	1273	m = 0
	1274	DO n = 1, pdims(2)
	1275	DO j = 1, nny_pe( n-1 ) ! WARN: pcoord(j) should be used!!
	1276	ar(j,k,i,n) = work_triy(m,k)
	1277	m = m + 1
	1278	ENDDO
	1279	ENDDO
	1280
	1281	ENDDO
	1282
	1283	ELSE
	1284	!
	1285	!-- Cache optimized code
	1286	DO k = 1, nz
	1287
	1288	DO j = 0, ny
	1289	work_ffty(j) = work_triy(j,k)
	1290	ENDDO
	1291
	1292	CALL fft_y( work_ffty, 'backward' )
	1293
	1294	m = 0
	1295	DO n = 1, pdims(2)
	1296	DO j = 1, nny_pe( n-1 ) ! WARN: pcoord(j) should be used!!
	1297	ar(j,k,i,n) = work_ffty(m)
	1298	m = m + 1
	1299	ENDDO
	1300	ENDDO
	1301
	1302	ENDDO
	1303
	1304	ENDIF
	1305
	1306	ENDDO
	1307	!$OMP END PARALLEL
	1308
	1309	DEALLOCATE( tri )
	1310
	1311	CALL cpu_log( log_point_s(39), 'fft_y + tridia', 'stop' )
	1312
	1313	END SUBROUTINE ffty_tri_ffty
	1314
	1315
	1316	SUBROUTINE tridia_1dd( ddx2, ddy2, nx, ny, j, ar, tri )
	1317
	1318	!------------------------------------------------------------------------------!
	1319	! Solves the linear system of equations for a 1d-decomposition along x (see
	1320	! tridia)
	1321	!
	1322	! Attention: when using the intel compiler, array tri must be passed as an
	1323	! argument to the contained subroutines. Otherwise addres faults
	1324	! will occur.
	1325	! On NEC, tri should not be passed (except for routine substi_1dd)
	1326	! because this causes very bad performance.
	1327	!------------------------------------------------------------------------------!
	1328
	1329	USE arrays_3d
	1330	USE control_parameters
	1331
	1332	USE pegrid
	1333
	1334	IMPLICIT NONE
	1335
	1336	INTEGER :: i, j, k, nnyh, nx, ny, omp_get_thread_num, tn
	1337
	1338	REAL :: ddx2, ddy2
	1339
	1340	REAL, DIMENSION(0:nx,1:nz) :: ar
	1341	REAL, DIMENSION(0:nx,0:nz-1) :: ar1
	1342	REAL, DIMENSION(5,0:nx,0:nz-1) :: tri
	1343
	1344
	1345	nnyh = ( ny + 1 ) / 2
	1346
	1347	!
	1348	!-- Define constant elements of the tridiagonal matrix.
	1349	!-- The compiler on SX6 does loop exchange. If 0:nx is a high power of 2,
	1350	!-- the exchanged loops create bank conflicts. The following directive
	1351	!-- prohibits loop exchange and the loops perform much better.
	1352	! tn = omp_get_thread_num()
	1353	! WRITE( 120+tn, * ) '+++ id=',myid,' nx=',nx,' thread=', omp_get_thread_num()
[82]	1354	! CALL local_flush( 120+tn )
[1]	1355	!CDIR NOLOOPCHG
	1356	DO k = 0, nz-1
	1357	DO i = 0,nx
	1358	tri(2,i,k) = ddzu(k+1) * ddzw(k+1)
	1359	tri(3,i,k) = ddzu(k+2) * ddzw(k+1)
	1360	ENDDO
	1361	ENDDO
	1362	! WRITE( 120+tn, * ) '+++ id=',myid,' end of first tridia loop thread=', omp_get_thread_num()
[82]	1363	! CALL local_flush( 120+tn )
[1]	1364
	1365	IF ( j <= nnyh ) THEN
[377]	1366	#if defined( __lc )
[1]	1367	CALL maketri_1dd( j, tri )
	1368	#else
	1369	CALL maketri_1dd( j )
	1370	#endif
	1371	ELSE
[377]	1372	#if defined( __lc )
[1]	1373	CALL maketri_1dd( ny+1-j, tri )
	1374	#else
	1375	CALL maketri_1dd( ny+1-j )
	1376	#endif
	1377	ENDIF
[377]	1378	#if defined( __lc )
[1]	1379	CALL split_1dd( tri )
	1380	#else
	1381	CALL split_1dd
	1382	#endif
	1383	CALL substi_1dd( ar, tri )
	1384
	1385	CONTAINS
	1386
[377]	1387	#if defined( __lc )
[1]	1388	SUBROUTINE maketri_1dd( j, tri )
	1389	#else
	1390	SUBROUTINE maketri_1dd( j )
	1391	#endif
	1392
	1393	!------------------------------------------------------------------------------!
	1394	! computes the i- and j-dependent component of the matrix
	1395	!------------------------------------------------------------------------------!
	1396
	1397	USE constants
	1398
	1399	IMPLICIT NONE
	1400
	1401	INTEGER :: i, j, k, nnxh
	1402	REAL :: a, c
	1403
	1404	REAL, DIMENSION(0:nx) :: l
	1405
[377]	1406	#if defined( __lc )
[1]	1407	REAL, DIMENSION(5,0:nx,0:nz-1) :: tri
	1408	#endif
	1409
	1410
	1411	nnxh = ( nx + 1 ) / 2
	1412	!
	1413	!-- Provide the tridiagonal matrix for solution of the Poisson equation in
	1414	!-- Fourier space. The coefficients are computed following the method of
	1415	!-- Schmidt et al. (DFVLR-Mitteilung 84-15), which departs from Stephan
	1416	!-- Siano's original version by discretizing the Poisson equation,
	1417	!-- before it is Fourier-transformed
	1418	DO i = 0, nx
[128]	1419	IF ( i >= 0 .AND. i <= nnxh ) THEN
[1]	1420	l(i) = 2.0 * ( 1.0 - COS( ( 2.0 * pi * i ) / &
	1421	FLOAT( nx+1 ) ) ) * ddx2 + &
	1422	2.0 * ( 1.0 - COS( ( 2.0 * pi * j ) / &
	1423	FLOAT( ny+1 ) ) ) * ddy2
	1424	ELSE
	1425	l(i) = 2.0 * ( 1.0 - COS( ( 2.0 * pi * ( nx+1-i ) ) / &
	1426	FLOAT( nx+1 ) ) ) * ddx2 + &
	1427	2.0 * ( 1.0 - COS( ( 2.0 * pi * j ) / &
	1428	FLOAT( ny+1 ) ) ) * ddy2
	1429	ENDIF
	1430	ENDDO
	1431
	1432	DO k = 0, nz-1
	1433	DO i = 0, nx
	1434	a = -1.0 * ddzu(k+2) * ddzw(k+1)
	1435	c = -1.0 * ddzu(k+1) * ddzw(k+1)
	1436	tri(1,i,k) = a + c - l(i)
	1437	ENDDO
	1438	ENDDO
	1439	IF ( ibc_p_b == 1 .OR. ibc_p_b == 2 ) THEN
	1440	DO i = 0, nx
	1441	tri(1,i,0) = tri(1,i,0) + tri(2,i,0)
	1442	ENDDO
	1443	ENDIF
	1444	IF ( ibc_p_t == 1 ) THEN
	1445	DO i = 0, nx
	1446	tri(1,i,nz-1) = tri(1,i,nz-1) + tri(3,i,nz-1)
	1447	ENDDO
	1448	ENDIF
	1449
	1450	END SUBROUTINE maketri_1dd
	1451
	1452
[377]	1453	#if defined( __lc )
[1]	1454	SUBROUTINE split_1dd( tri )
	1455	#else
	1456	SUBROUTINE split_1dd
	1457	#endif
	1458
	1459	!------------------------------------------------------------------------------!
	1460	! Splitting of the tridiagonal matrix (Thomas algorithm)
	1461	!------------------------------------------------------------------------------!
	1462
	1463	IMPLICIT NONE
	1464
	1465	INTEGER :: i, k
	1466
[377]	1467	#if defined( __lc )
[1]	1468	REAL, DIMENSION(5,0:nx,0:nz-1) :: tri
	1469	#endif
	1470
	1471
	1472	!
	1473	!-- Splitting
	1474	DO i = 0, nx
	1475	tri(4,i,0) = tri(1,i,0)
	1476	ENDDO
	1477	DO k = 1, nz-1
	1478	DO i = 0, nx
	1479	tri(5,i,k) = tri(2,i,k) / tri(4,i,k-1)
	1480	tri(4,i,k) = tri(1,i,k) - tri(3,i,k-1) * tri(5,i,k)
	1481	ENDDO
	1482	ENDDO
	1483
	1484	END SUBROUTINE split_1dd
	1485
	1486
	1487	SUBROUTINE substi_1dd( ar, tri )
	1488
	1489	!------------------------------------------------------------------------------!
	1490	! Substitution (Forward and Backward) (Thomas algorithm)
	1491	!------------------------------------------------------------------------------!
	1492
	1493	IMPLICIT NONE
	1494
[76]	1495	INTEGER :: i, k
[1]	1496
	1497	REAL, DIMENSION(0:nx,nz) :: ar
	1498	REAL, DIMENSION(0:nx,0:nz-1) :: ar1
	1499	REAL, DIMENSION(5,0:nx,0:nz-1) :: tri
	1500
	1501	!
	1502	!-- Forward substitution
	1503	DO i = 0, nx
	1504	ar1(i,0) = ar(i,1)
	1505	ENDDO
	1506	DO k = 1, nz-1
	1507	DO i = 0, nx
	1508	ar1(i,k) = ar(i,k+1) - tri(5,i,k) * ar1(i,k-1)
	1509	ENDDO
	1510	ENDDO
	1511
	1512	!
	1513	!-- Backward substitution
	1514	DO i = 0, nx
	1515	ar(i,nz) = ar1(i,nz-1) / tri(4,i,nz-1)
	1516	ENDDO
	1517	DO k = nz-2, 0, -1
	1518	DO i = 0, nx
	1519	ar(i,k+1) = ( ar1(i,k) - tri(3,i,k) * ar(i,k+2) ) &
	1520	/ tri(4,i,k)
	1521	ENDDO
	1522	ENDDO
	1523
[76]	1524	!
	1525	!-- Indices i=0, j=0 correspond to horizontally averaged pressure.
	1526	!-- The respective values of ar should be zero at all k-levels if
	1527	!-- acceleration of horizontally averaged vertical velocity is zero.
	1528	IF ( ibc_p_b == 1 .AND. ibc_p_t == 1 ) THEN
	1529	IF ( j == 0 ) THEN
	1530	DO k = 1, nz
	1531	ar(0,k) = 0.0
	1532	ENDDO
	1533	ENDIF
	1534	ENDIF
	1535
[1]	1536	END SUBROUTINE substi_1dd
	1537
	1538	END SUBROUTINE tridia_1dd
	1539
	1540	#endif
	1541
	1542	END MODULE poisfft_mod

Note: See TracBrowser for help on using the repository browser.

Download in other formats:

| Impressum | ©Leibniz Universität Hannover |