Home

Context Navigation

source: palm/trunk/SOURCE/poisfft.f90 @ 993

Last change on this file since 993 was 941, checked in by raasch, 12 years ago
last commit documented
Property svn:keywords set to `Id`
File size: 47.2 KB

Rev	Line
[1]	1	MODULE poisfft_mod
	2
	3	!------------------------------------------------------------------------------!
[484]	4	! Current revisions:
[1]	5	! -----------------
	6	!
[941]	7	!
[1]	8	! Former revisions:
	9	! -----------------
[3]	10	! $Id: poisfft.f90 941 2012-07-09 14:50:21Z hoffmann $
[77]	11	!
[941]	12	! 940 2012-07-09 14:31:00Z raasch
	13	! special handling of tri-array as an argument in tridia_1dd routines switched
	14	! off because it caused segmentation faults with intel 12.1 compiler
	15	!
[878]	16	! 877 2012-04-03 11:21:44Z suehring
	17	! Bugfix: Avoid divisions by zero in case of using a 'neumann' bc for the
	18	! pressure at the top of the model domain.
	19	!
[810]	20	! 809 2012-01-30 13:32:58Z maronga
	21	! Bugfix: replaced .AND. and .NOT. with && and ! in the preprocessor directives
	22	!
[808]	23	! 807 2012-01-25 11:53:51Z maronga
	24	! New cpp directive "__check" implemented which is used by check_namelist_files
	25	! (most of the code is unneeded by check_namelist_files).
	26	!
[764]	27	! 763 2011-10-06 09:32:09Z suehring
	28	! Comment added concerning the last change.
	29	!
[762]	30	! 761 2011-10-05 17:58:52Z suehring
	31	! Bugfix: Avoid divisions by zero in case of using a 'neumann' bc for the
	32	! pressure at the top of the model domain.
	33	!
[697]	34	! 696 2011-03-18 07:03:49Z raasch
	35	! work_fftx removed from PRIVATE clauses in fftx_tr_xy and tr_yx_fftx
	36	!
[684]	37	! 683 2011-02-09 14:25:15Z raasch
	38	! openMP parallelization for 2d-domain-decomposition
	39	!
[668]	40	! 667 2010-12-23 12:06:00Z suehring/gryschka
	41	! ddzu replaced by ddzu_pres due to changes in zu(0)
	42	!
[623]	43	! 622 2010-12-10 08:08:13Z raasch
	44	! optional barriers included in order to speed up collective operations
	45	!
[392]	46	! 377 2009-09-04 11:09:00Z raasch
	47	! __lcmuk changed to __lc to avoid problems with Intel compiler on sgi-ice
	48	!
[198]	49	! 164 2008-05-15 08:46:15Z raasch
	50	! Arguments removed from transpose routines
	51	!
[139]	52	! 128 2007-10-26 13:11:14Z raasch
	53	! Bugfix: wavenumber calculation for even nx in routines maketri
	54	!
[90]	55	! 85 2007-05-11 09:35:14Z raasch
	56	! Bugfix: work_fft*_vec removed from some PRIVATE-declarations
	57	!
[77]	58	! 76 2007-03-29 00:58:32Z raasch
	59	! Tridiagonal coefficients adjusted for Neumann boundary conditions both at
	60	! the bottom and the top.
	61	!
[3]	62	! RCS Log replaced by Id keyword, revision history cleaned up
	63	!
[1]	64	! Revision 1.24 2006/08/04 15:00:24 raasch
	65	! Default setting of the thread number tn in case of not using OpenMP
	66	!
	67	! Revision 1.23 2006/02/23 12:48:38 raasch
	68	! Additional compiler directive in routine tridia_1dd for preventing loop
	69	! exchange on NEC-SX6
	70	!
	71	! Revision 1.20 2004/04/30 12:38:09 raasch
	72	! Parts of former poisfft_hybrid moved to this subroutine,
	73	! former subroutine changed to a module, renaming of FFT-subroutines and
	74	! -module, FFTs completely substituted by calls of fft_x and fft_y,
	75	! NAG fft used in the non-parallel case completely removed, l in maketri
	76	! is now a 1d-array, variables passed by modules instead of using parameter
	77	! lists, enlarged transposition arrays introduced
	78	!
	79	! Revision 1.1 1997/07/24 11:24:14 raasch
	80	! Initial revision
	81	!
	82	!
	83	! Description:
	84	! ------------
	85	! See below.
	86	!------------------------------------------------------------------------------!
	87
	88	!--------------------------------------------------------------------------!
	89	! poisfft !
	90	! !
	91	! Original version: Stephan Siano (pois3d) !
	92	! !
	93	! Institute of Meteorology and Climatology, University of Hannover !
	94	! Germany !
	95	! !
	96	! Version as of July 23,1996 !
	97	! !
	98	! !
	99	! Version for parallel computers: Siegfried Raasch !
	100	! !
	101	! Version as of July 03,1997 !
	102	! !
	103	! Solves the Poisson equation with a 2D spectral method !
	104	! d^2 p / dx^2 + d^2 p / dy^2 + d^2 p / dz^2 = s !
	105	! !
	106	! Input: !
	107	! real ar contains in the (nnx,nny,nnz) elements, !
	108	! starting from the element (1,nys,nxl), the !
	109	! values for s !
	110	! real work Temporary array !
	111	! !
	112	! Output: !
	113	! real ar contains the solution for p !
	114	!--------------------------------------------------------------------------!
	115
	116	USE fft_xy
	117	USE indices
	118	USE transpose_indices
	119
	120	IMPLICIT NONE
	121
	122	PRIVATE
[807]	123
[809]	124	#if ! defined ( __check )
[1]	125	PUBLIC poisfft, poisfft_init
	126
	127	INTERFACE poisfft
	128	MODULE PROCEDURE poisfft
	129	END INTERFACE poisfft
	130
	131	INTERFACE poisfft_init
	132	MODULE PROCEDURE poisfft_init
	133	END INTERFACE poisfft_init
[807]	134	#else
	135	PUBLIC poisfft_init
[1]	136
[807]	137	INTERFACE poisfft_init
	138	MODULE PROCEDURE poisfft_init
	139	END INTERFACE poisfft_init
	140	#endif
	141
[1]	142	CONTAINS
	143
	SUBROUTINE poisfft_init

	!------------------------------------------------------------------------------!
	! Initialisation of the Poisson solver. The only work done here is the call
	! of fft_init (not defined in this file; presumably provided by module
	! fft_xy, which is used by this module).
	!------------------------------------------------------------------------------!

	   IMPLICIT NONE

	   CALL fft_init

	END SUBROUTINE poisfft_init
	149
[809]	150	#if ! defined ( __check )
SUBROUTINE poisfft( ar, work )

!------------------------------------------------------------------------------!
! Driver of the FFT-based Poisson solver.
!
! ar   : on entry the right-hand side s, on return the solution p
! work : temporary array, handed to the transposition routines
!
! The data are Fourier-transformed along x and y, the remaining tridiagonal
! systems along z are solved, and the result is transformed back. In the
! parallel case the sequence of transforms and transpositions depends on the
! virtual processor grid pdims.
!------------------------------------------------------------------------------!

   USE cpulog
   USE interfaces
   USE pegrid

   IMPLICIT NONE

   REAL ::  ar(1:nza,nys:nyna,nxl:nxra)
   REAL ::  work(1:nza,nys:nyna,nxl:nxra)


   CALL cpu_log( log_point_s(3), 'poisfft', 'start' )

#if defined( __parallel )
   IF ( pdims(2) == 1 )  THEN

!
!--   Domain decomposed along x only.
!--   Step 1: FFT along y, followed by the transposition y --> x
      CALL ffty_tr_yx( ar, work, ar )

!
!--   Step 2: FFT along x, tridiagonal solve, backward FFT along x
      CALL fftx_tri_fftx( ar )

!
!--   Step 3: transposition x --> y, followed by the backward FFT along y
      CALL tr_xy_ffty( ar, work, ar )

   ELSE IF ( pdims(1) == 1 )  THEN

!
!--   Domain decomposed along y only.
!--   Step 1: FFT along x, followed by the transposition x --> y
      CALL fftx_tr_xy( ar, work, ar )

!
!--   Step 2: FFT along y, tridiagonal solve, backward FFT along y
      CALL ffty_tri_ffty( ar )

!
!--   Step 3: transposition y --> x, followed by the backward FFT along x
      CALL tr_yx_fftx( ar, work, ar )

   ELSE

!
!--   Domain decomposed along both x and y.
!--   Forward part: z --> x, FFT along x
      CALL cpu_log( log_point_s(5), 'transpo forward', 'start' )
      CALL transpose_zx( ar, work, ar )
      CALL cpu_log( log_point_s(5), 'transpo forward', 'pause' )

      CALL cpu_log( log_point_s(4), 'fft_x', 'start' )
      CALL fftxp( ar, 'forward' )
      CALL cpu_log( log_point_s(4), 'fft_x', 'pause' )

!
!--   x --> y, FFT along y
      CALL cpu_log( log_point_s(5), 'transpo forward', 'continue' )
      CALL transpose_xy( ar, work, ar )
      CALL cpu_log( log_point_s(5), 'transpo forward', 'pause' )

      CALL cpu_log( log_point_s(7), 'fft_y', 'start' )
      CALL fftyp( ar, 'forward' )
      CALL cpu_log( log_point_s(7), 'fft_y', 'pause' )

!
!--   y --> z
      CALL cpu_log( log_point_s(5), 'transpo forward', 'continue' )
      CALL transpose_yz( ar, work, ar )
      CALL cpu_log( log_point_s(5), 'transpo forward', 'stop' )

!
!--   Tridiagonal systems along z
      CALL cpu_log( log_point_s(6), 'tridia', 'start' )
      CALL tridia( ar )
      CALL cpu_log( log_point_s(6), 'tridia', 'stop' )

!
!--   Backward part: z --> y, backward FFT along y
      CALL cpu_log( log_point_s(8), 'transpo invers', 'start' )
      CALL transpose_zy( ar, work, ar )
      CALL cpu_log( log_point_s(8), 'transpo invers', 'pause' )

      CALL cpu_log( log_point_s(7), 'fft_y', 'continue' )
      CALL fftyp( ar, 'backward' )
      CALL cpu_log( log_point_s(7), 'fft_y', 'stop' )

!
!--   y --> x, backward FFT along x
      CALL cpu_log( log_point_s(8), 'transpo invers', 'continue' )
      CALL transpose_yx( ar, work, ar )
      CALL cpu_log( log_point_s(8), 'transpo invers', 'pause' )

      CALL cpu_log( log_point_s(4), 'fft_x', 'continue' )
      CALL fftxp( ar, 'backward' )
      CALL cpu_log( log_point_s(4), 'fft_x', 'stop' )

!
!--   x --> z
      CALL cpu_log( log_point_s(8), 'transpo invers', 'continue' )
      CALL transpose_xz( ar, work, ar )
      CALL cpu_log( log_point_s(8), 'transpo invers', 'stop' )

   END IF

#else

!
!-- Serial case: forward FFT along x, then along y
   CALL cpu_log( log_point_s(4), 'fft_x', 'start' )
   CALL fftx( ar, 'forward' )
   CALL cpu_log( log_point_s(4), 'fft_x', 'pause' )
   CALL cpu_log( log_point_s(7), 'fft_y', 'start' )
   CALL ffty( ar, 'forward' )
   CALL cpu_log( log_point_s(7), 'fft_y', 'pause' )

!
!-- Tridiagonal systems along z
   CALL cpu_log( log_point_s(6), 'tridia', 'start' )
   CALL tridia( ar )
   CALL cpu_log( log_point_s(6), 'tridia', 'stop' )

!
!-- Backward FFT along y, then along x
   CALL cpu_log( log_point_s(7), 'fft_y', 'continue' )
   CALL ffty( ar, 'backward' )
   CALL cpu_log( log_point_s(7), 'fft_y', 'stop' )
   CALL cpu_log( log_point_s(4), 'fft_x', 'continue' )
   CALL fftx( ar, 'backward' )
   CALL cpu_log( log_point_s(4), 'fft_x', 'stop' )

#endif

   CALL cpu_log( log_point_s(3), 'poisfft', 'stop' )

END SUBROUTINE poisfft
	292
	293
	294
	SUBROUTINE tridia( ar )

	!------------------------------------------------------------------------------!
	! Solves, for every pair of horizontal wavenumber indices (i,j), the linear
	! system of equations:
	!
	! -(4 pi^2(i^2/(dx^2nnx^2)+j^2/(dy^2nny^2))+
	! 1/(dzu(k)dzw(k))+1/(dzu(k-1)dzw(k)))*p(i,j,k)+
	! 1/(dzu(k)dzw(k))p(i,j,k+1)+1/(dzu(k-1)dzw(k))p(i,j,k-1)=d(i,j,k)
	!
	! by using the Thomas algorithm. ar contains d on entry and is overwritten
	! with p.
	!
	! Layout of the work array tri:
	!   tri(1,:,:)  main diagonal (set by maketri)
	!   tri(2,:,:)  sub-diagonal coefficient (constant, set below)
	!   tri(3,:,:)  super-diagonal coefficient (constant, set below)
	!   tri(4,:,:)  pivots of the factorisation (set by split)
	!   tri(5,:,:)  elimination multipliers (set by split)
	!------------------------------------------------------------------------------!

	   USE arrays_3d

	   IMPLICIT NONE

	   INTEGER ::  i, j, k, nnyh

	   REAL, DIMENSION(nxl_z:nxr_z,0:nz-1)   ::  ar1
	   REAL, DIMENSION(5,nxl_z:nxr_z,0:nz-1) ::  tri

#if defined( __parallel )
	   REAL ::  ar(nxl_z:nxr_za,nys_z:nyn_za,1:nza)
#else
	   REAL ::  ar(1:nz,nys_z:nyn_z,nxl_z:nxr_z)
#endif


	   nnyh = (ny+1) / 2

	!
	!-- Define constant elements of the tridiagonal matrix.
	   !$OMP PARALLEL PRIVATE ( k, i )
	   !$OMP DO
	   DO  k = 0, nz-1
	      DO  i = nxl_z, nxr_z
	         tri(2,i,k) = ddzu_pres(k+1) * ddzw(k+1)
	         tri(3,i,k) = ddzu_pres(k+2) * ddzw(k+1)
	      ENDDO
	   ENDDO
	   !$OMP END PARALLEL

#if defined( __parallel )
	!
	!-- Repeat for all y-levels. Rows above nnyh use the matrix of the mirrored
	!-- wavenumber ny+1-j.
	   !$OMP PARALLEL FIRSTPRIVATE( tri ) PRIVATE ( ar1, j )
	   !$OMP DO
	   DO  j = nys_z, nyn_z
	      IF ( j <= nnyh )  THEN
	         CALL maketri( tri, j )
	      ELSE
	         CALL maketri( tri, ny+1-j )
	      ENDIF
	      CALL split( tri )
	      CALL substi( ar, ar1, tri, j )
	   ENDDO
	   !$OMP END PARALLEL
#else
	!
	!-- First y-level.
	   CALL maketri( tri, nys_z )
	   CALL split( tri )
	   CALL substi( ar, ar1, tri, 0 )

	!
	!-- Further y-levels. The factorised matrix of wavenumber j is reused for
	!-- the mirrored row ny+1-j.
	   DO  j = 1, nnyh - 1
	      CALL maketri( tri, j )
	      CALL split( tri )
	      CALL substi( ar, ar1, tri, j )
	      CALL substi( ar, ar1, tri, ny+1-j )
	   ENDDO

	!
	!-- Last wavenumber nnyh. For an odd number of gridpoints along y (ny+1 odd)
	!-- the mirrored row ny+1-nnyh = nnyh+1 differs from nnyh and is not reached
	!-- by the loop above, so it has to be solved here as well (same treatment
	!-- as in the parallel branch, where rows j > nnyh use maketri( ny+1-j )).
	   CALL maketri( tri, nnyh )
	   CALL split( tri )
	   CALL substi( ar, ar1, tri, nnyh+nys )
	   IF ( MOD( ny+1, 2 ) == 1  .AND.  nnyh >= 1 )  THEN
	      CALL substi( ar, ar1, tri, ny+1-nnyh )
	   ENDIF
#endif

	CONTAINS

	   SUBROUTINE maketri( tri, j )

	   !---------------------------------------------------------------------------!
	   ! Computes the i- and j-dependent component of the matrix, i.e. the main
	   ! diagonal tri(1,:,:) for the wavenumber index j along y and all
	   ! wavenumber indices i along x, including the adjustments for Neumann
	   ! boundary conditions at the bottom (ibc_p_b) and top (ibc_p_t).
	   !---------------------------------------------------------------------------!

	      USE arrays_3d
	      USE constants
	      USE control_parameters
	      USE grid_variables

	      IMPLICIT NONE

	      INTEGER, INTENT(IN) ::  j
	      REAL, INTENT(INOUT) ::  tri(5,nxl_z:nxr_z,0:nz-1)

	      INTEGER ::  i, k, nnxh
	      REAL    ::  a, c
	      REAL    ::  ll(nxl_z:nxr_z)


	      nnxh = ( nx + 1 ) / 2

	!
	!--   Provide the tridiagonal matrix for solution of the Poisson equation in
	!--   Fourier space. The coefficients are computed following the method of
	!--   Schmidt et al. (DFVLR-Mitteilung 84-15), which departs from Stephan
	!--   Siano's original version by discretizing the Poisson equation,
	!--   before it is Fourier-transformed
#if defined( __parallel )
	      DO  i = nxl_z, nxr_z
	         IF ( i >= 0 .AND. i <= nnxh )  THEN
	            ll(i) = 2.0 * ( 1.0 - COS( ( 2.0 * pi * i ) / &
	                                       FLOAT( nx+1 ) ) ) / ( dx * dx ) + &
	                    2.0 * ( 1.0 - COS( ( 2.0 * pi * j ) / &
	                                       FLOAT( ny+1 ) ) ) / ( dy * dy )
	         ELSE
	            ll(i) = 2.0 * ( 1.0 - COS( ( 2.0 * pi * ( nx+1-i ) ) / &
	                                       FLOAT( nx+1 ) ) ) / ( dx * dx ) + &
	                    2.0 * ( 1.0 - COS( ( 2.0 * pi * j ) / &
	                                       FLOAT( ny+1 ) ) ) / ( dy * dy )
	         ENDIF
	         DO  k = 0,nz-1
	            a = -1.0 * ddzu_pres(k+2) * ddzw(k+1)
	            c = -1.0 * ddzu_pres(k+1) * ddzw(k+1)
	            tri(1,i,k) = a + c - ll(i)
	         ENDDO
	      ENDDO
#else
	      DO  i = 0, nnxh
	         ll(i) = 2.0 * ( 1.0 - COS( ( 2.0 * pi * i ) / FLOAT( nx+1 ) ) ) / &
	                       ( dx * dx ) + &
	                 2.0 * ( 1.0 - COS( ( 2.0 * pi * j ) / FLOAT( ny+1 ) ) ) / &
	                       ( dy * dy )
	         DO  k = 0, nz-1
	            a = -1.0 * ddzu_pres(k+2) * ddzw(k+1)
	            c = -1.0 * ddzu_pres(k+1) * ddzw(k+1)
	            tri(1,i,k) = a + c - ll(i)
	!
	!--         Copy to the mirrored index nx+1-i. i = nnxh has to be included:
	!--         for even nx its mirror is nnxh+1, which would otherwise stay
	!--         undefined; for odd nx the mirror of nnxh is nnxh itself and the
	!--         assignment is a harmless self-copy.
	            IF ( i >= 1 )  THEN
	               tri(1,nx+1-i,k) = tri(1,i,k)
	            ENDIF
	         ENDDO
	      ENDDO
#endif
	!
	!--   Neumann condition at the bottom: fold the sub-diagonal coefficient into
	!--   the main diagonal of the lowest level
	      IF ( ibc_p_b == 1  .OR.  ibc_p_b == 2 )  THEN
	         DO  i = nxl_z, nxr_z
	            tri(1,i,0) = tri(1,i,0) + tri(2,i,0)
	         ENDDO
	      ENDIF
	!
	!--   Neumann condition at the top: fold the super-diagonal coefficient into
	!--   the main diagonal of the uppermost level
	      IF ( ibc_p_t == 1 )  THEN
	         DO  i = nxl_z, nxr_z
	            tri(1,i,nz-1) = tri(1,i,nz-1) + tri(3,i,nz-1)
	         ENDDO
	      ENDIF

	   END SUBROUTINE maketri


	   SUBROUTINE substi( ar, ar1, tri, j )

	   !---------------------------------------------------------------------------!
	   ! Substitution (Forward and Backward) (Thomas algorithm) for row j of ar.
	   ! ar1 is scratch space for the intermediate right-hand side; tri must have
	   ! been factorised by split beforehand.
	   !---------------------------------------------------------------------------!

	      USE control_parameters

	      IMPLICIT NONE

	      INTEGER, INTENT(IN) ::  j
	      REAL, INTENT(INOUT) ::  ar1(nxl_z:nxr_z,0:nz-1)
	      REAL, INTENT(IN)    ::  tri(5,nxl_z:nxr_z,0:nz-1)
#if defined( __parallel )
	      REAL, INTENT(INOUT) ::  ar(nxl_z:nxr_za,nys_z:nyn_za,1:nza)
#else
	      REAL, INTENT(INOUT) ::  ar(1:nz,nys_z:nyn_z,nxl_z:nxr_z)
#endif

	      INTEGER ::  i, k

	!
	!--   Forward substitution.
	      DO  i = nxl_z, nxr_z
#if defined( __parallel )
	         ar1(i,0) = ar(i,j,1)
#else
	         ar1(i,0) = ar(1,j,i)
#endif
	      ENDDO
	      DO  k = 1, nz - 1
	         DO  i = nxl_z, nxr_z
#if defined( __parallel )
	            ar1(i,k) = ar(i,j,k+1) - tri(5,i,k) * ar1(i,k-1)
#else
	            ar1(i,k) = ar(k+1,j,i) - tri(5,i,k) * ar1(i,k-1)
#endif
	         ENDDO
	      ENDDO

	!
	!--   Backward substitution
	!--   Note, the 1.0E-20 in the denominator is due to avoid divisions
	!--   by zero appearing if the pressure bc is set to neumann at the top of
	!--   the model domain.
	      DO  i = nxl_z, nxr_z
#if defined( __parallel )
	         ar(i,j,nz) = ar1(i,nz-1) / ( tri(4,i,nz-1) + 1.0E-20 )
#else
	         ar(nz,j,i) = ar1(i,nz-1) / ( tri(4,i,nz-1) + 1.0E-20 )
#endif
	      ENDDO
	      DO  k = nz-2, 0, -1
	         DO  i = nxl_z, nxr_z
#if defined( __parallel )
	            ar(i,j,k+1) = ( ar1(i,k) - tri(3,i,k) * ar(i,j,k+2) ) &
	                          / tri(4,i,k)
#else
	            ar(k+1,j,i) = ( ar1(i,k) - tri(3,i,k) * ar(k+2,j,i) ) &
	                          / tri(4,i,k)
#endif
	         ENDDO
	      ENDDO

	!
	!--   Indices i=0, j=0 correspond to horizontally averaged pressure.
	!--   The respective values of ar should be zero at all k-levels if
	!--   acceleration of horizontally averaged vertical velocity is zero.
	      IF ( ibc_p_b == 1  .AND.  ibc_p_t == 1 )  THEN
	         IF ( j == 0  .AND.  nxl_z == 0 )  THEN
#if defined( __parallel )
	            DO  k = 1, nz
	               ar(nxl_z,j,k) = 0.0
	            ENDDO
#else
	            DO  k = 1, nz
	               ar(k,j,nxl_z) = 0.0
	            ENDDO
#endif
	         ENDIF
	      ENDIF

	   END SUBROUTINE substi


	   SUBROUTINE split( tri )

	   !---------------------------------------------------------------------------!
	   ! Splitting of the tridiagonal matrix (Thomas algorithm): computes the
	   ! pivots tri(4,:,:) and the multipliers tri(5,:,:) from tri(1:3,:,:).
	   !---------------------------------------------------------------------------!

	      IMPLICIT NONE

	      REAL, INTENT(INOUT) ::  tri(5,nxl_z:nxr_z,0:nz-1)

	      INTEGER ::  i, k

	!
	!--   Splitting.
	      DO  i = nxl_z, nxr_z
	         tri(4,i,0) = tri(1,i,0)
	      ENDDO
	      DO  k = 1, nz-1
	         DO  i = nxl_z, nxr_z
	            tri(5,i,k) = tri(2,i,k) / tri(4,i,k-1)
	            tri(4,i,k) = tri(1,i,k) - tri(3,i,k-1) * tri(5,i,k)
	         ENDDO
	      ENDDO

	   END SUBROUTINE split

	END SUBROUTINE tridia
	559
	560
	561	#if defined( __parallel )
	SUBROUTINE fftxp( ar, direction )

	!------------------------------------------------------------------------------!
	! Fourier-transformation along x-direction, parallelized version.
	! Every x-line ar(0:nx,j,k) of the local array is passed to fft_x together
	! with the requested direction string.
	!------------------------------------------------------------------------------!

	   IMPLICIT NONE

	   CHARACTER (LEN=*) ::  direction
	   REAL ::  ar(0:nxa,nys_x:nyn_xa,nzb_x:nzt_xa)

	   INTEGER ::  j, k

	!
	!-- Transform line by line; the outer k loop is shared among the OpenMP
	!-- threads
	   !$OMP PARALLEL DO PRIVATE ( j, k )
	   DO  k = nzb_x, nzt_x
	      DO  j = nys_x, nyn_x
	         CALL fft_x( ar(0:nx,j,k), direction )
	      END DO
	   END DO
	   !$OMP END PARALLEL DO

	END SUBROUTINE fftxp
	586
	587	#else
	SUBROUTINE fftx( ar, direction )

	!------------------------------------------------------------------------------!
	! Fourier-transformation along x-direction, non parallel version.
	!
	! ar        : array to be transformed in place; every x-line ar(k,j,0:nx)
	!             is passed to fft_x
	! direction : direction string handed through to fft_x (the caller in this
	!             file uses 'forward' and 'backward')
	!------------------------------------------------------------------------------!

	   IMPLICIT NONE

	   CHARACTER (LEN=*), INTENT(IN) ::  direction
	   REAL, INTENT(INOUT)           ::  ar(1:nz,0:ny,0:nx)

	   INTEGER ::  j, k

	!
	!-- Performing the fft with one of the methods implemented
	   !$OMP PARALLEL PRIVATE ( j, k )
	   !$OMP DO
	   DO  k = 1, nz
	      DO  j = 0, ny
	         CALL fft_x( ar(k,j,0:nx), direction )
	      ENDDO
	   ENDDO
	   !$OMP END PARALLEL

	END SUBROUTINE fftx
	612	#endif
	613
	614
	615	#if defined( __parallel )
	SUBROUTINE fftyp( ar, direction )

	!------------------------------------------------------------------------------!
	! Fourier-transformation along y-direction, parallelized version.
	! Every y-line ar(0:ny,i,k) of the local array is passed to fft_y together
	! with the requested direction string.
	!------------------------------------------------------------------------------!

	   IMPLICIT NONE

	   CHARACTER (LEN=*) ::  direction
	   REAL ::  ar(0:nya,nxl_y:nxr_ya,nzb_y:nzt_ya)

	   INTEGER ::  i, k

	!
	!-- Transform line by line; the outer k loop is shared among the OpenMP
	!-- threads
	   !$OMP PARALLEL DO PRIVATE ( i, k )
	   DO  k = nzb_y, nzt_y
	      DO  i = nxl_y, nxr_y
	         CALL fft_y( ar(0:ny,i,k), direction )
	      END DO
	   END DO
	   !$OMP END PARALLEL DO

	END SUBROUTINE fftyp
	640
	641	#else
	SUBROUTINE ffty( ar, direction )

	!------------------------------------------------------------------------------!
	! Fourier-transformation along y-direction, non parallel version.
	! Every y-line ar(k,0:ny,i) is passed to fft_y together with the requested
	! direction string.
	!------------------------------------------------------------------------------!

	   IMPLICIT NONE

	   CHARACTER (LEN=*) ::  direction
	   REAL ::  ar(1:nz,0:ny,0:nx)

	   INTEGER ::  i, k

	!
	!-- Transform line by line; the outer k loop is shared among the OpenMP
	!-- threads
	   !$OMP PARALLEL DO PRIVATE ( i, k )
	   DO  k = 1, nz
	      DO  i = 0, nx
	         CALL fft_y( ar(k,0:ny,i), direction )
	      END DO
	   END DO
	   !$OMP END PARALLEL DO

	END SUBROUTINE ffty
	666	#endif
	667
	668	#if defined( __parallel )
	SUBROUTINE ffty_tr_yx( f_in, work, f_out )

	!------------------------------------------------------------------------------!
	! Forward Fourier-transformation along y, followed by the transposition
	! y --> x, for a 1d-decomposition along x.
	!
	! f_in  : input data, indexed (k,j,i)
	! work  : buffer holding the transformed data resorted to (i,k,j); it is the
	!         send buffer of the MPI_ALLTOALL
	! f_out : receive buffer of the MPI_ALLTOALL
	!
	! ATTENTION: On the NEC-SX6 this routine runs much faster if the first
	!            dimension of work_ffty_vec is odd, because memory bank
	!            conflicts may occur otherwise (especially if the dimension is
	!            a multiple of 128). For that reason work_ffty_vec is
	!            dimensioned as 0:ny+1.
	!            Of course, this will not work if users are using an odd number
	!            of gridpoints along y.
	!------------------------------------------------------------------------------!

	   USE control_parameters
	   USE cpulog
	   USE indices
	   USE interfaces
	   USE pegrid
	   USE transpose_indices

	   IMPLICIT NONE

	   INTEGER, PARAMETER ::  stridex = 4

	   INTEGER ::  i, iend, iouter, ir, j, k

	   REAL ::  f_in(1:nza,0:nya,nxl:nxra)
	   REAL ::  f_out(nnx,1:nza,nys_x:nyn_xa,pdims(1))
	   REAL ::  work(nxl:nxra,1:nza,0:nya)
	   REAL ::  work_ffty(0:ny,stridex)
#if defined( __nec )
	   REAL ::  work_ffty_vec(0:ny+1,1:nz,nxl:nxr)
#endif

	!
	!-- All data along y are locally available because of the 1d-decomposition
	!-- along x, so the FFT along y can be done directly. Afterwards the data
	!-- are resorted so that x becomes the first index.
	   CALL cpu_log( log_point_s(7), 'fft_y', 'start' )

	   IF ( host(1:3) == 'nec' )  THEN
#if defined( __nec )
	!
	!--   Variant for vector processors: transform whole (j,k) planes at once
	      !$OMP PARALLEL PRIVATE ( i, j, k )
	      !$OMP DO
	      DO  i = nxl, nxr

	         DO  j = 0, ny
	            DO  k = 1, nz
	               work_ffty_vec(j,k,i) = f_in(k,j,i)
	            END DO
	         END DO

	         CALL fft_y_m( work_ffty_vec(:,:,i), ny+1, 'forward' )

	      END DO

	      !$OMP DO
	      DO  k = 1, nz
	         DO  j = 0, ny
	            DO  i = nxl, nxr
	               work(i,k,j) = work_ffty_vec(j,k,i)
	            END DO
	         END DO
	      END DO
	      !$OMP END PARALLEL
#endif

	   ELSE

	!
	!--   Cache optimized variant: the x-direction is processed in chunks of
	!--   stridex columns (outer loop), each chunk being handled by the inner
	!--   loops
	      !$OMP PARALLEL PRIVATE (i,iend,iouter,ir,j,k,work_ffty)
	      !$OMP DO
	      DO  iouter = nxl, nxr, stridex

	!
	!--      Last column of the current chunk
	         iend = MIN( iouter+stridex-1, nxr )

	         DO  k = 1, nz

	!
	!--         Gather one y-line per column of the chunk and transform it
	            DO  i = iouter, iend
	               ir = i - iouter + 1
	               work_ffty(0:ny,ir) = f_in(k,0:ny,i)
	               CALL fft_y( work_ffty(:,ir), 'forward' )
	            END DO

	!
	!--         Resort
	            DO  j = 0, ny
	               work(iouter:iend,k,j) = work_ffty(j,1:iend-iouter+1)
	            END DO

	         END DO

	      END DO
	      !$OMP END PARALLEL

	   END IF
	   CALL cpu_log( log_point_s(7), 'fft_y', 'pause' )

	!
	!-- Transpose array
	   CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
	   IF ( collective_wait )  CALL MPI_BARRIER( comm2d, ierr )
	   CALL MPI_ALLTOALL( work(nxl,1,0),      sendrecvcount_xy, MPI_REAL, &
	                      f_out(1,1,nys_x,1), sendrecvcount_xy, MPI_REAL, &
	                      comm1dx, ierr )
	   CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )

	END SUBROUTINE ffty_tr_yx
	791
	792
	SUBROUTINE tr_xy_ffty( f_in, work, f_out )

	!------------------------------------------------------------------------------!
	! Transposition x --> y, followed by the backward Fourier-transformation
	! along y, for a 1d-decomposition along x.
	!
	! f_in  : send buffer of the MPI_ALLTOALL
	! work  : receive buffer of the MPI_ALLTOALL, indexed (i,k,j)
	! f_out : back-transformed data, indexed (k,j,i)
	!------------------------------------------------------------------------------!

	   USE control_parameters
	   USE cpulog
	   USE indices
	   USE interfaces
	   USE pegrid
	   USE transpose_indices

	   IMPLICIT NONE

	   INTEGER, PARAMETER ::  stridex = 4

	   INTEGER ::  i, iend, iouter, ir, j, k

	   REAL ::  f_in(nnx,1:nza,nys_x:nyn_xa,pdims(1))
	   REAL ::  f_out(1:nza,0:nya,nxl:nxra)
	   REAL ::  work(nxl:nxra,1:nza,0:nya)
	   REAL ::  work_ffty(0:ny,stridex)
#if defined( __nec )
	   REAL ::  work_ffty_vec(0:ny+1,1:nz,nxl:nxr)
#endif

	!
	!-- Transpose array
	   CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
	   IF ( collective_wait )  CALL MPI_BARRIER( comm2d, ierr )
	   CALL MPI_ALLTOALL( f_in(1,1,nys_x,1), sendrecvcount_xy, MPI_REAL, &
	                      work(nxl,1,0),     sendrecvcount_xy, MPI_REAL, &
	                      comm1dx, ierr )
	   CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )

	!
	!-- Resort the received data so that y becomes the first index, then do the
	!-- backward FFT along y.
	   CALL cpu_log( log_point_s(7), 'fft_y', 'continue' )

	   IF ( host(1:3) == 'nec' )  THEN
#if defined( __nec )
	!
	!--   Variant for vector processors: transform whole (j,k) planes at once
	      !$OMP PARALLEL PRIVATE ( i, j, k )
	      !$OMP DO
	      DO  k = 1, nz
	         DO  j = 0, ny
	            DO  i = nxl, nxr
	               work_ffty_vec(j,k,i) = work(i,k,j)
	            END DO
	         END DO
	      END DO

	      !$OMP DO
	      DO  i = nxl, nxr

	         CALL fft_y_m( work_ffty_vec(:,:,i), ny+1, 'backward' )

	         DO  j = 0, ny
	            DO  k = 1, nz
	               f_out(k,j,i) = work_ffty_vec(j,k,i)
	            END DO
	         END DO

	      END DO
	      !$OMP END PARALLEL
#endif

	   ELSE

	!
	!--   Cache optimized variant: the x-direction is processed in chunks of
	!--   stridex columns (outer loop), each chunk being handled by the inner
	!--   loops
	      !$OMP PARALLEL PRIVATE ( i, iend, iouter, ir, j, k, work_ffty )
	      !$OMP DO
	      DO  iouter = nxl, nxr, stridex

	!
	!--      Last column of the current chunk
	         iend = MIN( iouter+stridex-1, nxr )

	         DO  k = 1, nz

	!
	!--         Resort
	            DO  j = 0, ny
	               work_ffty(j,1:iend-iouter+1) = work(iouter:iend,k,j)
	            END DO

	!
	!--         Transform every column of the chunk and store the result
	            DO  i = iouter, iend
	               ir = i - iouter + 1
	               CALL fft_y( work_ffty(:,ir), 'backward' )
	               f_out(k,0:ny,i) = work_ffty(0:ny,ir)
	            END DO

	         END DO

	      END DO
	      !$OMP END PARALLEL

	   END IF

	   CALL cpu_log( log_point_s(7), 'fft_y', 'stop' )

	END SUBROUTINE tr_xy_ffty
	906
	907
	SUBROUTINE fftx_tri_fftx( ar )

	!------------------------------------------------------------------------------!
	! FFT along x, solution of the tridiagonal system and backward FFT for
	! a 1d-decomposition along x
	!
	! ar is processed in place: for every local y-index j the data of all
	! pdims(1) blocks are gathered into contiguous x-lines, transformed, passed
	! to tridia_1dd, transformed back and scattered into ar again.
	!
	! WARNING: this subroutine may still not work for hybrid parallelization
	!          with OpenMP (for possible necessary changes see the original
	!          routine poisfft_hybrid, developed by Klaus Ketelsen, May 2002)
	!------------------------------------------------------------------------------!

	   USE control_parameters
	   USE cpulog
	   USE grid_variables
	   USE indices
	   USE interfaces
	   USE pegrid
	   USE transpose_indices

	   IMPLICIT NONE

	   INTEGER ::  i, j, k, m, n, omp_get_thread_num, tn

	   REAL, DIMENSION(0:nx)                            ::  work_fftx
	   REAL, DIMENSION(0:nx,1:nz)                       ::  work_trix
	   REAL, DIMENSION(nnx,1:nza,nys_x:nyn_xa,pdims(1)) ::  ar
	   REAL, DIMENSION(:,:,:,:), ALLOCATABLE            ::  tri


	   CALL cpu_log( log_point_s(33), 'fft_x + tridia', 'start' )

	!
	!-- One slice of tri per thread, so that every thread has its own workspace
	!-- for tridia_1dd
	   ALLOCATE( tri(5,0:nx,0:nz-1,0:threads_per_task-1) )

	   tn = 0           ! Default thread number in case of one thread
	   !$OMP PARALLEL DO PRIVATE ( i, j, k, m, n, tn, work_fftx, work_trix )
	   DO  j = nys_x, nyn_x

	!
	!--   The following line is only compiled if OpenMP is switched on
	      !$ tn = omp_get_thread_num()

	      IF ( host(1:3) == 'nec' )  THEN
	!
	!--      Code optimized for vector processors
	         DO  k = 1, nz

	            m = 0
	            DO  n = 1, pdims(1)
	               DO  i = 1, nnx_pe( n-1 ) ! WARN: pcoord(i) should be used!!
	                  work_trix(m,k) = ar(i,k,j,n)
	                  m = m + 1
	               ENDDO
	            ENDDO

	         ENDDO

	         CALL fft_x_m( work_trix, 'forward' )

	      ELSE
	!
	!--      Cache optimized code
	         DO  k = 1, nz

	            m = 0
	            DO  n = 1, pdims(1)
	               DO  i = 1, nnx_pe( n-1 ) ! WARN: pcoord(i) should be used!!
	                  work_fftx(m) = ar(i,k,j,n)
	                  m = m + 1
	               ENDDO
	            ENDDO

	            CALL fft_x( work_fftx, 'forward' )

	            DO  i = 0, nx
	               work_trix(i,k) = work_fftx(i)
	            ENDDO

	         ENDDO

	      ENDIF

	!
	!--   Solve the linear equation system
	      CALL tridia_1dd( ddx2, ddy2, nx, ny, j, work_trix, tri(:,:,:,tn) )

	      IF ( host(1:3) == 'nec' )  THEN
	!
	!--      Code optimized for vector processors
	         CALL fft_x_m( work_trix, 'backward' )

	         DO  k = 1, nz

	            m = 0
	            DO  n = 1, pdims(1)
	               DO  i = 1, nnx_pe( n-1 ) ! WARN: pcoord(i) should be used!!
	                  ar(i,k,j,n) = work_trix(m,k)
	                  m = m + 1
	               ENDDO
	            ENDDO

	         ENDDO

	      ELSE
	!
	!--      Cache optimized code
	         DO  k = 1, nz

	            DO  i = 0, nx
	               work_fftx(i) = work_trix(i,k)
	            ENDDO

	            CALL fft_x( work_fftx, 'backward' )

	            m = 0
	            DO  n = 1, pdims(1)
	               DO  i = 1, nnx_pe( n-1 ) ! WARN: pcoord(i) should be used!!
	                  ar(i,k,j,n) = work_fftx(m)
	                  m = m + 1
	               ENDDO
	            ENDDO

	         ENDDO

	      ENDIF

	   ENDDO

	   DEALLOCATE( tri )

	   CALL cpu_log( log_point_s(33), 'fft_x + tridia', 'stop' )

	END SUBROUTINE fftx_tri_fftx
	1040
	1041
	SUBROUTINE fftx_tr_xy( f_in, work, f_out )

	!------------------------------------------------------------------------------!
	! Forward Fourier-transformation along x, followed by the transposition
	! x --> y, for a 1d-decomposition along y.
	!
	! f_in  : input data, indexed (k,j,i)
	! work  : buffer holding the transformed data resorted to (j,k,i); it is the
	!         send buffer of the MPI_ALLTOALL
	! f_out : receive buffer of the MPI_ALLTOALL
	!
	! ATTENTION: The NEC-branch of this routine may significantly profit from
	!            further optimizations. So far, performance is much worse than
	!            for routine ffty_tr_yx (more than three times slower).
	!------------------------------------------------------------------------------!

	   USE control_parameters
	   USE cpulog
	   USE indices
	   USE interfaces
	   USE pegrid
	   USE transpose_indices

	   IMPLICIT NONE

	   INTEGER ::  i, j, k

	   REAL ::  f_in(1:nza,nys:nyna,0:nxa)
	   REAL ::  f_out(nny,1:nza,nxl_y:nxr_ya,pdims(2))
	   REAL ::  work(nys:nyna,1:nza,0:nxa)
	   REAL ::  work_fftx(0:nx,1:nz,nys:nyn)

	!
	!-- All data along x are locally available because of the 1d-decomposition
	!-- along y, so the FFT along x can be done directly. Afterwards the data
	!-- are resorted so that y becomes the first index.
	   CALL cpu_log( log_point_s(4), 'fft_x', 'start' )

	   IF ( host(1:3) == 'nec' )  THEN
	!
	!--   Variant for vector processors: transform whole (i,k) planes at once
	      !$OMP PARALLEL PRIVATE ( i, j, k )
	      !$OMP DO
	      DO  i = 0, nx

	         DO  j = nys, nyn
	            DO  k = 1, nz
	               work_fftx(i,k,j) = f_in(k,j,i)
	            END DO
	         END DO

	      END DO

	      !$OMP DO
	      DO  j = nys, nyn

	         CALL fft_x_m( work_fftx(:,:,j), 'forward' )

	         DO  k = 1, nz
	            DO  i = 0, nx
	               work(j,k,i) = work_fftx(i,k,j)
	            END DO
	         END DO

	      END DO
	      !$OMP END PARALLEL

	   ELSE

	!
	!--   Cache optimized variant (there might be still a potential for better
	!--   optimization): x-lines are transformed one at a time
	      !$OMP PARALLEL PRIVATE (i,j,k)
	      !$OMP DO
	      DO  i = 0, nx

	         DO  j = nys, nyn
	            work_fftx(i,1:nz,j) = f_in(1:nz,j,i)
	         END DO

	      END DO

	      !$OMP DO
	      DO  j = nys, nyn
	         DO  k = 1, nz

	            CALL fft_x( work_fftx(0:nx,k,j), 'forward' )

	            work(j,k,0:nx) = work_fftx(0:nx,k,j)

	         END DO
	      END DO
	      !$OMP END PARALLEL

	   END IF
	   CALL cpu_log( log_point_s(4), 'fft_x', 'pause' )

	!
	!-- Transpose array
	   CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
	   IF ( collective_wait )  CALL MPI_BARRIER( comm2d, ierr )
	   CALL MPI_ALLTOALL( work(nys,1,0),      sendrecvcount_xy, MPI_REAL, &
	                      f_out(1,1,nxl_y,1), sendrecvcount_xy, MPI_REAL, &
	                      comm1dy, ierr )
	   CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )

	END SUBROUTINE fftx_tr_xy
	1148
	1149
	SUBROUTINE tr_yx_fftx( f_in, work, f_out )

	!------------------------------------------------------------------------------!
	! Transposition y --> x with a subsequent backward Fourier transformation
	! along x for a 1d-decomposition.
	!
	! Arguments:
	!   f_in  - data before the transposition (send buffer of MPI_ALLTOALL)
	!   work  - receives the transposed data, index order (j,k,i)
	!   f_out - backward transformed data, index order (k,j,i)
	!
	! work_fftx is a local buffer with index order (i,k,j), i.e. x is the first
	! index of the lines handed to the FFT routines. It is shared by all OpenMP
	! threads.
	!------------------------------------------------------------------------------!

	    USE control_parameters
	    USE cpulog
	    USE indices
	    USE interfaces
	    USE pegrid
	    USE transpose_indices

	    IMPLICIT NONE

	    REAL, DIMENSION(nny,1:nza,nxl_y:nxr_ya,pdims(2)) :: f_in
	    REAL, DIMENSION(nys:nyna,1:nza,0:nxa)            :: work
	    REAL, DIMENSION(1:nza,nys:nyna,0:nxa)            :: f_out

	    REAL, DIMENSION(0:nx,1:nz,nys:nyn) :: work_fftx

	    INTEGER :: i, j, k

	!
	!-- Step 1: transpose the array (f_in --> work). The optional barrier is
	!-- executed before the collective operation if collective_wait is set.
	    CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
	    IF ( collective_wait )  CALL MPI_BARRIER( comm2d, ierr )
	    CALL MPI_ALLTOALL( f_in(1,1,nxl_y,1), sendrecvcount_xy, MPI_REAL,         &
	                       work(nys,1,0),     sendrecvcount_xy, MPI_REAL,         &
	                       comm1dy, ierr )
	    CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )

	!
	!-- Step 2: resort the data in a way that x becomes the first index, carry
	!-- out the backward FFT along x and store the result in f_out with k as
	!-- the first index.
	    CALL cpu_log( log_point_s(4), 'fft_x', 'continue' )

	    IF ( host(1:3) == 'nec' )  THEN
	!
	!--    Code optimized for vector processors: all x-lines of one j-slab are
	!--    transformed with a single call of fft_x_m
	       !$OMP PARALLEL PRIVATE ( i, j, k )
	       !$OMP DO
	       DO  j = nys, nyn

	          DO  k = 1, nz
	             work_fftx(0:nx,k,j) = work(j,k,0:nx)
	          ENDDO

	          CALL fft_x_m( work_fftx(:,:,j), 'backward' )

	       ENDDO

	       !$OMP DO
	       DO  i = 0, nx
	          DO  j = nys, nyn
	             f_out(1:nz,j,i) = work_fftx(i,1:nz,j)
	          ENDDO
	       ENDDO
	       !$OMP END PARALLEL

	    ELSE
	!
	!--    Cache optimized code (there might be still a potential for better
	!--    optimization): every x-line is transformed on its own by fft_x
	       !$OMP PARALLEL PRIVATE (i,j,k)
	       !$OMP DO
	       DO  j = nys, nyn
	          DO  k = 1, nz

	             work_fftx(0:nx,k,j) = work(j,k,0:nx)

	             CALL fft_x( work_fftx(0:nx,k,j), 'backward' )

	          ENDDO
	       ENDDO

	       !$OMP DO
	       DO  i = 0, nx
	          DO  j = nys, nyn
	             f_out(1:nz,j,i) = work_fftx(i,1:nz,j)
	          ENDDO
	       ENDDO
	       !$OMP END PARALLEL

	    ENDIF

	    CALL cpu_log( log_point_s(4), 'fft_x', 'stop' )

	END SUBROUTINE tr_yx_fftx
	1248
	1249
	SUBROUTINE ffty_tri_ffty( ar )

	!------------------------------------------------------------------------------!
	! FFT along y, solution of the tridiagonal system and backward FFT for
	! a 1d-decomposition along y
	!
	! For every i in nxl_y:nxr_y the y-lines stored blockwise in ar(:,k,i,n),
	! n = 1, pdims(2), are gathered into one contiguous line 0:ny, transformed
	! forward, passed to tridia_1dd, transformed backward and scattered back to
	! ar in the same blockwise layout.
	!
	! WARNING: this subroutine may still not work for hybrid parallelization
	!          with OpenMP (for possible necessary changes see the original
	!          routine poisfft_hybrid, developed by Klaus Ketelsen, May 2002)
	!------------------------------------------------------------------------------!

	    USE control_parameters
	    USE cpulog
	    USE grid_variables
	    USE indices
	    USE interfaces
	    USE pegrid
	    USE transpose_indices

	    IMPLICIT NONE

	    REAL, DIMENSION(nny,1:nza,nxl_y:nxr_ya,pdims(2)) :: ar

	    INTEGER :: i, k, m, n, omp_get_thread_num, tn

	    REAL, DIMENSION(0:ny)                 :: work_ffty
	    REAL, DIMENSION(0:ny,1:nz)            :: work_triy
	    REAL, DIMENSION(:,:,:,:), ALLOCATABLE :: tri


	    CALL cpu_log( log_point_s(39), 'fft_y + tridia', 'start' )

	!
	!-- One slice of tri per thread; the slice is selected by the thread number
	    ALLOCATE( tri(5,0:ny,0:nz-1,0:threads_per_task-1) )

	!
	!-- Default thread number in case of one thread. With OpenMP enabled, tn is
	!-- private and is set at the beginning of every loop iteration.
	    tn = 0
	    !$OMP PARALLEL DO PRIVATE ( i, k, m, n, tn, work_ffty, work_triy )
	    DO  i = nxl_y, nxr_y

	       !$ tn = omp_get_thread_num()

	!
	!--    Gather the y-lines and carry out the forward FFT. m is the offset of
	!--    block n within the gathered line.
	!--    WARN: pcoord(j) should be used (instead of n-1 as index of nny_pe)!!
	       IF ( host(1:3) == 'nec' )  THEN
	!
	!--       Code optimized for vector processors: all k-levels are transformed
	!--       by one call of fft_y_m
	          DO  k = 1, nz
	             m = 0
	             DO  n = 1, pdims(2)
	                work_triy(m:m+nny_pe(n-1)-1,k) = ar(1:nny_pe(n-1),k,i,n)
	                m = m + nny_pe(n-1)
	             ENDDO
	          ENDDO

	          CALL fft_y_m( work_triy, ny, 'forward' )

	       ELSE
	!
	!--       Cache optimized code: each k-level is transformed separately
	          DO  k = 1, nz

	             m = 0
	             DO  n = 1, pdims(2)
	                work_ffty(m:m+nny_pe(n-1)-1) = ar(1:nny_pe(n-1),k,i,n)
	                m = m + nny_pe(n-1)
	             ENDDO

	             CALL fft_y( work_ffty, 'forward' )

	             work_triy(0:ny,k) = work_ffty(0:ny)

	          ENDDO

	       ENDIF

	!
	!--    Solve the linear equation system. Compared with the dummy argument
	!--    list of tridia_1dd ( ddx2, ddy2, nx, ny, j, ... ), the x- and
	!--    y-quantities are passed in exchanged order and i takes the place of j.
	       CALL tridia_1dd( ddy2, ddx2, ny, nx, i, work_triy, tri(:,:,:,tn) )

	!
	!--    Backward FFT and scattering of the result back to ar
	       IF ( host(1:3) == 'nec' )  THEN
	!
	!--       Code optimized for vector processors
	          CALL fft_y_m( work_triy, ny, 'backward' )

	          DO  k = 1, nz
	             m = 0
	             DO  n = 1, pdims(2)
	                ar(1:nny_pe(n-1),k,i,n) = work_triy(m:m+nny_pe(n-1)-1,k)
	                m = m + nny_pe(n-1)
	             ENDDO
	          ENDDO

	       ELSE
	!
	!--       Cache optimized code
	          DO  k = 1, nz

	             work_ffty(0:ny) = work_triy(0:ny,k)

	             CALL fft_y( work_ffty, 'backward' )

	             m = 0
	             DO  n = 1, pdims(2)
	                ar(1:nny_pe(n-1),k,i,n) = work_ffty(m:m+nny_pe(n-1)-1)
	                m = m + nny_pe(n-1)
	             ENDDO

	          ENDDO

	       ENDIF

	    ENDDO

	    DEALLOCATE( tri )

	    CALL cpu_log( log_point_s(39), 'fft_y + tridia', 'stop' )

	END SUBROUTINE ffty_tri_ffty
	1380
	1381
	SUBROUTINE tridia_1dd( ddx2, ddy2, nx, ny, j, ar, tri )

	!------------------------------------------------------------------------------!
	! Solves the linear system of equations for a 1d-decomposition along x (see
	! tridia)
	!
	! Arguments:
	!   ddx2, ddy2 - factors of the cosine terms which depend on i and j,
	!                respectively (see maketri_1dd)
	!   nx, ny     - upper index bounds; lines run from 0 to nx, ny is only used
	!                to fold j and as period of the j-dependent cosine term
	!   j          - index of the line in the second horizontal direction
	!   ar         - right hand side on entry, solution on exit (0:nx,1:nz)
	!   tri        - work array (5,0:nx,0:nz-1):
	!                tri(1,:,:) main diagonal (maketri_1dd)
	!                tri(2,:,:) sub-diagonal, tri(3,:,:) super-diagonal
	!                tri(4,:,:), tri(5,:,:) factors computed by split_1dd
	!
	! Attention: when using the intel compilers older than 12.0, array tri must
	!            be passed as an argument to the contained subroutines. Otherwise
	!            address faults will occur. This feature can be activated with
	!            cpp-switch __intel11
	!            On NEC, tri should not be passed (except for routine substi_1dd)
	!            because this causes very bad performance.
	!------------------------------------------------------------------------------!

	    USE arrays_3d
	    USE control_parameters

	    USE pegrid

	    IMPLICIT NONE

	!
	!-- omp_get_thread_num and tn are referenced only by the commented-out debug
	!-- output below
	    INTEGER ::  i, j, k, nnyh, nx, ny, omp_get_thread_num, tn

	    REAL    ::  ddx2, ddy2

	    REAL, DIMENSION(0:nx,1:nz)     ::  ar
	    REAL, DIMENSION(5,0:nx,0:nz-1) ::  tri


	    nnyh = ( ny + 1 ) / 2

	!
	!-- Define constant elements of the tridiagonal matrix.
	!-- The compiler on SX6 does loop exchange. If 0:nx is a high power of 2,
	!-- the exchanged loops create bank conflicts. The following directive
	!-- prohibits loop exchange and the loops perform much better.
	!    tn = omp_get_thread_num()
	!    WRITE( 120+tn, * ) '+++ id=',myid,' nx=',nx,' thread=', omp_get_thread_num()
	!    CALL local_flush( 120+tn )
!CDIR NOLOOPCHG
	    DO  k = 0, nz-1
	       DO  i = 0,nx
	          tri(2,i,k) = ddzu_pres(k+1) * ddzw(k+1)
	          tri(3,i,k) = ddzu_pres(k+2) * ddzw(k+1)
	       ENDDO
	    ENDDO
	!    WRITE( 120+tn, * ) '+++ id=',myid,' end of first tridia loop thread=', omp_get_thread_num()
	!    CALL local_flush( 120+tn )

	!
	!-- Indices j > nnyh are folded to ny+1-j before the diagonal is computed
#if defined( __intel11 )
	    CALL maketri_1dd( MERGE( j, ny+1-j, j <= nnyh ), tri )
	    CALL split_1dd( tri )
#else
	    CALL maketri_1dd( MERGE( j, ny+1-j, j <= nnyh ) )
	    CALL split_1dd
#endif
	    CALL substi_1dd( ar, tri )

	 CONTAINS

#if defined( __intel11 )
	    SUBROUTINE maketri_1dd( j, tri )
#else
	    SUBROUTINE maketri_1dd( j )
#endif

	!------------------------------------------------------------------------------!
	! computes the i- and j-dependent component of the matrix, i.e. the main
	! diagonal tri(1,:,:) including the modifications for ibc_p_b = 1 or 2 and
	! ibc_p_t = 1
	!------------------------------------------------------------------------------!

	       USE constants

	       IMPLICIT NONE

	       INTEGER ::  i, j, k
	       REAL    ::  a, c

	       REAL, DIMENSION(0:nx) ::  l

#if defined( __intel11 )
	       REAL, DIMENSION(5,0:nx,0:nz-1) ::  tri
#endif

	!
	!--    Provide the tridiagonal matrix for solution of the Poisson equation in
	!--    Fourier space. The coefficients are computed following the method of
	!--    Schmidt et al. (DFVLR-Mitteilung 84-15), which departs from Stephan
	!--    Siano's original version by discretizing the Poisson equation,
	!--    before it is Fourier-transformed.
	!--    MIN( i, nx+1-i ) yields i for i <= (nx+1)/2 and nx+1-i otherwise.
	       DO  i = 0, nx
	          l(i) = 2.0 * ( 1.0 - COS( ( 2.0 * pi * MIN( i, nx+1-i ) ) /       &
	                                    FLOAT( nx+1 ) ) ) * ddx2 +              &
	                 2.0 * ( 1.0 - COS( ( 2.0 * pi * j ) /                      &
	                                    FLOAT( ny+1 ) ) ) * ddy2
	       ENDDO

	!
	!--    a and c depend on k only and are therefore computed once per level
	       DO  k = 0, nz-1
	          a = -1.0 * ddzu_pres(k+2) * ddzw(k+1)
	          c = -1.0 * ddzu_pres(k+1) * ddzw(k+1)
	          DO  i = 0, nx
	             tri(1,i,k) = a + c - l(i)
	          ENDDO
	       ENDDO

	!
	!--    Modify the diagonal at the bottom and at the top level depending on
	!--    the pressure boundary conditions
	       IF ( ibc_p_b == 1  .OR.  ibc_p_b == 2 )  THEN
	          DO  i = 0, nx
	             tri(1,i,0) = tri(1,i,0) + tri(2,i,0)
	          ENDDO
	       ENDIF
	       IF ( ibc_p_t == 1 )  THEN
	          DO  i = 0, nx
	             tri(1,i,nz-1) = tri(1,i,nz-1) + tri(3,i,nz-1)
	          ENDDO
	       ENDIF

	    END SUBROUTINE maketri_1dd


#if defined( __intel11 )
	    SUBROUTINE split_1dd( tri )
#else
	    SUBROUTINE split_1dd
#endif

	!------------------------------------------------------------------------------!
	! Splitting of the tridiagonal matrix (Thomas algorithm): computes
	! tri(4,:,:) and tri(5,:,:) from tri(1,:,:), tri(2,:,:) and tri(3,:,:)
	!------------------------------------------------------------------------------!

	       IMPLICIT NONE

	       INTEGER ::  i, k

#if defined( __intel11 )
	       REAL, DIMENSION(5,0:nx,0:nz-1) ::  tri
#endif

	!
	!--    Splitting
	       DO  i = 0, nx
	          tri(4,i,0) = tri(1,i,0)
	       ENDDO
	       DO  k = 1, nz-1
	          DO  i = 0, nx
	             tri(5,i,k) = tri(2,i,k) / tri(4,i,k-1)
	             tri(4,i,k) = tri(1,i,k) - tri(3,i,k-1) * tri(5,i,k)
	          ENDDO
	       ENDDO

	    END SUBROUTINE split_1dd


	    SUBROUTINE substi_1dd( ar, tri )

	!------------------------------------------------------------------------------!
	! Substitution (Forward and Backward) (Thomas algorithm). ar holds the right
	! hand side on entry and the solution on exit; ar1 is the intermediate
	! result of the forward substitution.
	!------------------------------------------------------------------------------!

	       IMPLICIT NONE

	       INTEGER ::  i, k

	       REAL, DIMENSION(0:nx,nz)       ::  ar
	       REAL, DIMENSION(0:nx,0:nz-1)   ::  ar1
	       REAL, DIMENSION(5,0:nx,0:nz-1) ::  tri

	!
	!--    Forward substitution
	       DO  i = 0, nx
	          ar1(i,0) = ar(i,1)
	       ENDDO
	       DO  k = 1, nz-1
	          DO  i = 0, nx
	             ar1(i,k) = ar(i,k+1) - tri(5,i,k) * ar1(i,k-1)
	          ENDDO
	       ENDDO

	!
	!--    Backward substitution
	!--    Note, the add of 1.0E-20 in the denominator is due to avoid divisions
	!--    by zero appearing if the pressure bc is set to neumann at the top of
	!--    the model domain.
	       DO  i = 0, nx
	          ar(i,nz) = ar1(i,nz-1) / ( tri(4,i,nz-1) + 1.0E-20 )
	       ENDDO
	       DO  k = nz-2, 0, -1
	          DO  i = 0, nx
	             ar(i,k+1) = ( ar1(i,k) - tri(3,i,k) * ar(i,k+2) )              &
	                         / tri(4,i,k)
	          ENDDO
	       ENDDO

	!
	!--    Indices i=0, j=0 correspond to horizontally averaged pressure.
	!--    The respective values of ar should be zero at all k-levels if
	!--    acceleration of horizontally averaged vertical velocity is zero.
	!--    j is the host associated argument of tridia_1dd.
	       IF ( ibc_p_b == 1  .AND.  ibc_p_t == 1 )  THEN
	          IF ( j == 0 )  THEN
	             DO  k = 1, nz
	                ar(0,k) = 0.0
	             ENDDO
	          ENDIF
	       ENDIF

	    END SUBROUTINE substi_1dd

	END SUBROUTINE tridia_1dd
	1609
	1610	#endif
[807]	1611	#endif
[1]	1612	END MODULE poisfft_mod

Note: See TracBrowser for help on using the repository browser.

Download in other formats:

| Impressum | ©Leibniz Universität Hannover |