Home

Context Navigation

poisfft.f90 @ 90

Last change on this file since 90 was 90, checked in by raasch, 17 years ago

New:
---
Calculation and output of user-defined profiles. New &userpar parameters data_output_pr_user and max_pr_user.

check_parameters, flow_statistics, modules, parin, read_var_list, user_interface, write_var_list

Changed:

Division by dt_3d replaced by multiplication with its inverse. For performance optimisation, this is done in the loop calculating the divergence instead of using a separate loop. (pres.f90) var_hom and var_sum renamed pr_palm.

data_output_profiles, flow_statistics, init_3d_model, modules, parin, pres, read_var_list, run_control, time_integration

Errors:

Bugfix: work_fft*_vec removed from some PRIVATE-declarations (poisfft).

Bugfix: field_chr renamed field_char (user_interface).

Bugfix: output of use_upstream_for_tke (header).

header, poisfft, user_interface

Property svn:keywords set to Id

File size: 45.0 KB

Rev	Line
[1]	1	MODULE poisfft_mod
	2
	3	!------------------------------------------------------------------------------!
	4	! Actual revisions:
	5	! -----------------
[90]	6	!
[1]	7	!
	8	! Former revisions:
	9	! -----------------
[3]	10	! $Id: poisfft.f90 90 2007-05-30 09:18:47Z raasch $
[77]	11	!
[90]	12	! 85 2007-05-11 09:35:14Z raasch
	13	! Bugfix: work_fft*_vec removed from some PRIVATE-declarations
	14	!
[77]	15	! 76 2007-03-29 00:58:32Z raasch
	16	! Tridiagonal coefficients adjusted for Neumann boundary conditions both at
	17	! the bottom and the top.
	18	!
[3]	19	! RCS Log replaced by Id keyword, revision history cleaned up
	20	!
[1]	21	! Revision 1.24 2006/08/04 15:00:24 raasch
	22	! Default setting of the thread number tn in case of not using OpenMP
	23	!
	24	! Revision 1.23 2006/02/23 12:48:38 raasch
	25	! Additional compiler directive in routine tridia_1dd for preventing loop
	26	! exchange on NEC-SX6
	27	!
	28	! Revision 1.20 2004/04/30 12:38:09 raasch
	29	! Parts of former poisfft_hybrid moved to this subroutine,
	30	! former subroutine changed to a module, renaming of FFT-subroutines and
	31	! -module, FFTs completely substituted by calls of fft_x and fft_y,
	32	! NAG fft used in the non-parallel case completely removed, l in maketri
	33	! is now a 1d-array, variables passed by modules instead of using parameter
	34	! lists, enlarged transposition arrays introduced
	35	!
	36	! Revision 1.1 1997/07/24 11:24:14 raasch
	37	! Initial revision
	38	!
	39	!
	40	! Description:
	41	! ------------
	42	! See below.
	43	!------------------------------------------------------------------------------!
	44
	45	!--------------------------------------------------------------------------!
	46	! poisfft !
	47	! !
	48	! Original version: Stephan Siano (pois3d) !
	49	! !
	50	! Institute of Meteorology and Climatology, University of Hannover !
	51	! Germany !
	52	! !
	53	! Version as of July 23,1996 !
	54	! !
	55	! !
	56	! Version for parallel computers: Siegfried Raasch !
	57	! !
	58	! Version as of July 03,1997 !
	59	! !
	60	! Solves the Poisson equation with a 2D spectral method !
	61	! d^2 p / dx^2 + d^2 p / dy^2 + d^2 p / dz^2 = s !
	62	! !
	63	! Input: !
	64	! real ar contains in the (nnx,nny,nnz) elements, !
	65	! starting from the element (1,nys,nxl), the !
	66	! values for s !
	67	! real work Temporary array !
	68	! !
	69	! Output: !
	70	! real ar contains the solution for p !
	71	!--------------------------------------------------------------------------!
	72
	73	USE fft_xy
	74	USE indices
	75	USE transpose_indices
	76
	77	IMPLICIT NONE
	78
	79	PRIVATE
	80	PUBLIC poisfft, poisfft_init
	81
	82	INTERFACE poisfft
	83	MODULE PROCEDURE poisfft
	84	END INTERFACE poisfft
	85
	86	INTERFACE poisfft_init
	87	MODULE PROCEDURE poisfft_init
	88	END INTERFACE poisfft_init
	89
	90	CONTAINS
	91
	SUBROUTINE poisfft_init

	!
	!-- Initialisation entry point of the Poisson solver. The routine does nothing
	!-- but forward to fft_init, which is made available by one of the modules
	!-- used by poisfft_mod (presumably fft_xy -- to be confirmed there).
	   CALL fft_init()

	END SUBROUTINE poisfft_init
	97
	98
	SUBROUTINE poisfft( ar, work )

	!
	!-- Driver of the FFT based Poisson solver.
	!-- ar   : on entry the right hand side s, on return the solution p
	!-- work : scratch array handed on to the transposition / FFT helpers
	!-- The sequence is always: forward FFT along x and y, solution of the
	!-- tridiagonal systems along z, backward FFT along y and x. Which helper
	!-- routines are used depends on the processor grid (pdims) in the parallel
	!-- case. The complete call is timed with log_point_s(3).

	   USE cpulog
	   USE interfaces
	   USE pegrid

	   IMPLICIT NONE

	   REAL, DIMENSION(1:nza,nys:nyna,nxl:nxra) ::  ar
	   REAL, DIMENSION(1:nza,nys:nyna,nxl:nxra) ::  work


	   CALL cpu_log( log_point_s(3), 'poisfft', 'start' )

#if defined( __parallel )
	   IF ( pdims(2) == 1 )  THEN

	!
	!--    1d-domain-decomposition along x.
	!--    Step 1: FFT along y followed by the transposition y --> x
	      CALL ffty_tr_yx( ar, work, ar )

	!
	!--    Step 2: FFT along x, tridiagonal solver, backward FFT along x
	      CALL fftx_tri_fftx( ar )

	!
	!--    Step 3: transposition x --> y followed by the backward FFT along y
	      CALL tr_xy_ffty( ar, work, ar )

	   ELSEIF ( pdims(1) == 1 )  THEN

	!
	!--    1d-domain-decomposition along y.
	!--    Step 1: FFT along x followed by the transposition x --> y
	      CALL fftx_tr_xy( ar, work, ar )

	!
	!--    Step 2: FFT along y, tridiagonal solver, backward FFT along y
	      CALL ffty_tri_ffty( ar )

	!
	!--    Step 3: transposition y --> x followed by the backward FFT along x
	      CALL tr_yx_fftx( ar, work, ar )

	   ELSE

	!
	!--    2d-domain-decomposition. Transpositions and FFTs are separate steps
	!--    here, each of them with its own timer (transpositions: 5 and 8,
	!--    fft_x: 4, fft_y: 7, tridia: 6).
	!
	!--    z --> x, then forward FFT along x
	      CALL cpu_log( log_point_s(5), 'transpo forward', 'start' )
	      CALL transpose_zx( ar, work, ar, work, ar )
	      CALL cpu_log( log_point_s(5), 'transpo forward', 'pause' )

	      CALL cpu_log( log_point_s(4), 'fft_x', 'start' )
	      CALL fftxp( ar, 'forward' )
	      CALL cpu_log( log_point_s(4), 'fft_x', 'pause' )

	!
	!--    x --> y, then forward FFT along y
	      CALL cpu_log( log_point_s(5), 'transpo forward', 'continue' )
	      CALL transpose_xy( ar, work, ar, work, ar )
	      CALL cpu_log( log_point_s(5), 'transpo forward', 'pause' )

	      CALL cpu_log( log_point_s(7), 'fft_y', 'start' )
	      CALL fftyp( ar, 'forward' )
	      CALL cpu_log( log_point_s(7), 'fft_y', 'pause' )

	!
	!--    y --> z
	      CALL cpu_log( log_point_s(5), 'transpo forward', 'continue' )
	      CALL transpose_yz( ar, work, ar, work, ar )
	      CALL cpu_log( log_point_s(5), 'transpo forward', 'stop' )

	!
	!--    Tridiagonal systems along z
	      CALL cpu_log( log_point_s(6), 'tridia', 'start' )
	      CALL tridia( ar )
	      CALL cpu_log( log_point_s(6), 'tridia', 'stop' )

	!
	!--    Way back: z --> y, then backward FFT along y
	      CALL cpu_log( log_point_s(8), 'transpo invers', 'start' )
	      CALL transpose_zy( ar, work, ar, work, ar )
	      CALL cpu_log( log_point_s(8), 'transpo invers', 'pause' )

	      CALL cpu_log( log_point_s(7), 'fft_y', 'continue' )
	      CALL fftyp( ar, 'backward' )
	      CALL cpu_log( log_point_s(7), 'fft_y', 'stop' )

	!
	!--    y --> x, then backward FFT along x
	      CALL cpu_log( log_point_s(8), 'transpo invers', 'continue' )
	      CALL transpose_yx( ar, work, ar, work, ar )
	      CALL cpu_log( log_point_s(8), 'transpo invers', 'pause' )

	      CALL cpu_log( log_point_s(4), 'fft_x', 'continue' )
	      CALL fftxp( ar, 'backward' )
	      CALL cpu_log( log_point_s(4), 'fft_x', 'stop' )

	!
	!--    x --> z
	      CALL cpu_log( log_point_s(8), 'transpo invers', 'continue' )
	      CALL transpose_xz( ar, work, ar, work, ar )
	      CALL cpu_log( log_point_s(8), 'transpo invers', 'stop' )

	   ENDIF

#else

	!
	!-- Non-parallel case: no transpositions are needed.
	!-- Forward FFT along x and y
	   CALL cpu_log( log_point_s(4), 'fft_x', 'start' )
	   CALL fftx( ar, 'forward' )
	   CALL cpu_log( log_point_s(4), 'fft_x', 'pause' )
	   CALL cpu_log( log_point_s(7), 'fft_y', 'start' )
	   CALL ffty( ar, 'forward' )
	   CALL cpu_log( log_point_s(7), 'fft_y', 'pause' )

	!
	!-- Tridiagonal systems along z
	   CALL cpu_log( log_point_s(6), 'tridia', 'start' )
	   CALL tridia( ar )
	   CALL cpu_log( log_point_s(6), 'tridia', 'stop' )

	!
	!-- Backward FFT along y and x
	   CALL cpu_log( log_point_s(7), 'fft_y', 'continue' )
	   CALL ffty( ar, 'backward' )
	   CALL cpu_log( log_point_s(7), 'fft_y', 'stop' )
	   CALL cpu_log( log_point_s(4), 'fft_x', 'continue' )
	   CALL fftx( ar, 'backward' )
	   CALL cpu_log( log_point_s(4), 'fft_x', 'stop' )

#endif

	   CALL cpu_log( log_point_s(3), 'poisfft', 'stop' )

	END SUBROUTINE poisfft
	240
	241
	242
	SUBROUTINE tridia( ar )

	!------------------------------------------------------------------------------!
	! Solution of the tridiagonal systems along z for every (i,j) pair, i.e. of
	!
	! -(4 pi^2(i^2/(dx^2nnx^2)+j^2/(dy^2nny^2))+
	! 1/(dzu(k)dzw(k))+1/(dzu(k-1)dzw(k)))*p(i,j,k)+
	! 1/(dzu(k)dzw(k))p(i,j,k+1)+1/(dzu(k-1)dzw(k))p(i,j,k-1)=d(i,j,k)
	!
	! with the Thomas algorithm. ar holds the right hand side on entry and the
	! solution on return.
	!
	! Layout of the coefficient array tri(n,i,k):
	!   n = 1      main diagonal (set in maketri)
	!   n = 2, 3   the two off-diagonals (set here, they depend on k only)
	!   n = 4, 5   factors of the matrix splitting (set in split)
	!------------------------------------------------------------------------------!

	   USE arrays_3d

	   IMPLICIT NONE

	   INTEGER ::  j, k, nnyh

	   REAL, DIMENSION(nxl_z:nxr_z,0:nz-1)   ::  ar1
	   REAL, DIMENSION(5,nxl_z:nxr_z,0:nz-1) ::  tri

#if defined( __parallel )
	   REAL ::  ar(nxl_z:nxr_za,nys_z:nyn_za,1:nza)
#else
	   REAL ::  ar(1:nz,nys_z:nyn_z,nxl_z:nxr_z)
#endif


	   nnyh = (ny+1) / 2

	!
	!-- Off-diagonal elements: identical for all i, therefore assigned to the
	!-- complete i-range at once.
	   DO  k = 0, nz-1
	      tri(2,nxl_z:nxr_z,k) = ddzu(k+1) * ddzw(k+1)
	      tri(3,nxl_z:nxr_z,k) = ddzu(k+2) * ddzw(k+1)
	   ENDDO

#if defined( __parallel )
	!
	!-- Loop over all y-levels of this PE. For j beyond nnyh the matrix is built
	!-- with the mirrored index ny+1-j.
	   DO  j = nys_z, nyn_z
	      CALL maketri( tri, MERGE( j, ny+1-j, j <= nnyh ) )
	      CALL split( tri )
	      CALL substi( ar, ar1, tri, j )
	   ENDDO
#else
	!
	!-- y-level 0
	   CALL maketri( tri, nys_z )
	   CALL split( tri )
	   CALL substi( ar, ar1, tri, 0 )

	!
	!-- y-levels 1 ... nnyh-1: one matrix serves level j and level ny+1-j
	   DO  j = 1, nnyh - 1
	      CALL maketri( tri, j )
	      CALL split( tri )
	      CALL substi( ar, ar1, tri, j )
	      CALL substi( ar, ar1, tri, ny+1-j )
	   ENDDO

	!
	!-- Remaining y-level nnyh
	   CALL maketri( tri, nnyh )
	   CALL split( tri )
	   CALL substi( ar, ar1, tri, nnyh+nys )
#endif

	CONTAINS

	   SUBROUTINE maketri( tri, j )

	!------------------------------------------------------------------------------!
	! Fills the main diagonal tri(1,:,:), which is the only part of the matrix
	! depending on i and j, and adjusts it at the bottom / top for the pressure
	! boundary conditions selected by ibc_p_b / ibc_p_t.
	!------------------------------------------------------------------------------!

	      USE arrays_3d
	      USE constants
	      USE control_parameters
	      USE grid_variables

	      IMPLICIT NONE

	      INTEGER ::  i, j, k, nnxh
	      REAL    ::  a, c
	      REAL    ::  ll(nxl_z:nxr_z)
	      REAL    ::  tri(5,nxl_z:nxr_z,0:nz-1)


	      nnxh = ( nx + 1 ) / 2

	!
	!--   The coefficients follow the method of Schmidt et al.
	!--   (DFVLR-Mitteilung 84-15): in contrast to Stephan Siano's original
	!--   version, the Poisson equation is discretized first and
	!--   Fourier-transformed afterwards.
#if defined( __parallel )
	      DO  i = nxl_z, nxr_z

	!
	!--      Indices from nnxh onwards use the mirrored index nx+1-i
	         IF ( i >= 0 .AND. i < nnxh )  THEN
	            ll(i) = 2.0 * ( 1.0 - COS( ( 2.0 * pi * i ) /                   &
	                                       FLOAT( nx+1 ) ) ) / ( dx * dx ) +    &
	                    2.0 * ( 1.0 - COS( ( 2.0 * pi * j ) /                   &
	                                       FLOAT( ny+1 ) ) ) / ( dy * dy )
	         ELSE
	            ll(i) = 2.0 * ( 1.0 - COS( ( 2.0 * pi * ( nx+1-i ) ) /          &
	                                       FLOAT( nx+1 ) ) ) / ( dx * dx ) +    &
	                    2.0 * ( 1.0 - COS( ( 2.0 * pi * j ) /                   &
	                                       FLOAT( ny+1 ) ) ) / ( dy * dy )
	         ENDIF

	         DO  k = 0, nz-1
	            a = -1.0 * ddzu(k+2) * ddzw(k+1)
	            c = -1.0 * ddzu(k+1) * ddzw(k+1)
	            tri(1,i,k) = a + c - ll(i)
	         ENDDO

	      ENDDO
#else
	      DO  i = 0, nnxh

	         ll(i) = 2.0 * ( 1.0 - COS( ( 2.0 * pi * i ) / FLOAT( nx+1 ) ) ) /  &
	                       ( dx * dx ) +                                        &
	                 2.0 * ( 1.0 - COS( ( 2.0 * pi * j ) / FLOAT( ny+1 ) ) ) /  &
	                       ( dy * dy )

	         DO  k = 0, nz-1
	            a = -1.0 * ddzu(k+2) * ddzw(k+1)
	            c = -1.0 * ddzu(k+1) * ddzw(k+1)
	            tri(1,i,k) = a + c - ll(i)
	         ENDDO

	!
	!--      Copy the column to its mirror index nx+1-i
	         IF ( i >= 1 .AND. i < nnxh )  THEN
	            tri(1,nx+1-i,0:nz-1) = tri(1,i,0:nz-1)
	         ENDIF

	      ENDDO
#endif

	!
	!--   Boundary conditions: add the respective off-diagonal element to the
	!--   main diagonal at the lowest / uppermost level
	      IF ( ibc_p_b == 1  .OR.  ibc_p_b == 2 )  THEN
	         tri(1,nxl_z:nxr_z,0) = tri(1,nxl_z:nxr_z,0) + tri(2,nxl_z:nxr_z,0)
	      ENDIF
	      IF ( ibc_p_t == 1 )  THEN
	         tri(1,nxl_z:nxr_z,nz-1) = tri(1,nxl_z:nxr_z,nz-1) +                &
	                                   tri(3,nxl_z:nxr_z,nz-1)
	      ENDIF

	   END SUBROUTINE maketri


	   SUBROUTINE split( tri )

	!------------------------------------------------------------------------------!
	! Matrix splitting of the Thomas algorithm: computes tri(4,:,:) and
	! tri(5,:,:) from the diagonals tri(1:3,:,:). Level k depends on level k-1,
	! therefore the k-loop must run in ascending order.
	!------------------------------------------------------------------------------!

	      IMPLICIT NONE

	      INTEGER ::  i, k
	      REAL    ::  tri(5,nxl_z:nxr_z,0:nz-1)


	      tri(4,nxl_z:nxr_z,0) = tri(1,nxl_z:nxr_z,0)

	      DO  k = 1, nz-1
	         DO  i = nxl_z, nxr_z
	            tri(5,i,k) = tri(2,i,k) / tri(4,i,k-1)
	            tri(4,i,k) = tri(1,i,k) - tri(3,i,k-1) * tri(5,i,k)
	         ENDDO
	      ENDDO

	   END SUBROUTINE split


	   SUBROUTINE substi( ar, ar1, tri, j )

	!------------------------------------------------------------------------------!
	! Forward and backward substitution of the Thomas algorithm for y-level j.
	! ar1 is the intermediate result of the forward sweep. The index order of ar
	! differs between the parallel and the non-parallel case, so both sweeps are
	! coded once per case.
	!------------------------------------------------------------------------------!

	      USE control_parameters

	      IMPLICIT NONE

	      INTEGER ::  i, j, k
	      REAL    ::  ar1(nxl_z:nxr_z,0:nz-1)
	      REAL    ::  tri(5,nxl_z:nxr_z,0:nz-1)
#if defined( __parallel )
	      REAL    ::  ar(nxl_z:nxr_za,nys_z:nyn_za,1:nza)
#else
	      REAL    ::  ar(1:nz,nys_z:nyn_z,nxl_z:nxr_z)
#endif


#if defined( __parallel )
	!
	!--   Forward sweep
	      DO  i = nxl_z, nxr_z
	         ar1(i,0) = ar(i,j,1)
	      ENDDO
	      DO  k = 1, nz - 1
	         DO  i = nxl_z, nxr_z
	            ar1(i,k) = ar(i,j,k+1) - tri(5,i,k) * ar1(i,k-1)
	         ENDDO
	      ENDDO

	!
	!--   Backward sweep
	      DO  i = nxl_z, nxr_z
	         ar(i,j,nz) = ar1(i,nz-1) / tri(4,i,nz-1)
	      ENDDO
	      DO  k = nz-2, 0, -1
	         DO  i = nxl_z, nxr_z
	            ar(i,j,k+1) = ( ar1(i,k) - tri(3,i,k) * ar(i,j,k+2) )           &
	                          / tri(4,i,k)
	         ENDDO
	      ENDDO
#else
	!
	!--   Forward sweep
	      DO  i = nxl_z, nxr_z
	         ar1(i,0) = ar(1,j,i)
	      ENDDO
	      DO  k = 1, nz - 1
	         DO  i = nxl_z, nxr_z
	            ar1(i,k) = ar(k+1,j,i) - tri(5,i,k) * ar1(i,k-1)
	         ENDDO
	      ENDDO

	!
	!--   Backward sweep
	      DO  i = nxl_z, nxr_z
	         ar(nz,j,i) = ar1(i,nz-1) / tri(4,i,nz-1)
	      ENDDO
	      DO  k = nz-2, 0, -1
	         DO  i = nxl_z, nxr_z
	            ar(k+1,j,i) = ( ar1(i,k) - tri(3,i,k) * ar(k+2,j,i) )           &
	                          / tri(4,i,k)
	         ENDDO
	      ENDDO
#endif

	!
	!--   Indices i=0, j=0 correspond to the horizontally averaged pressure.
	!--   With ibc_p_b = ibc_p_t = 1 this component is reset to zero at all
	!--   k-levels (it should be zero anyhow if the acceleration of the
	!--   horizontally averaged vertical velocity is zero).
	      IF ( ibc_p_b == 1  .AND.  ibc_p_t == 1 )  THEN
	         IF ( j == 0  .AND.  nxl_z == 0 )  THEN
#if defined( __parallel )
	            ar(nxl_z,j,1:nz) = 0.0
#else
	            ar(1:nz,j,nxl_z) = 0.0
#endif
	         ENDIF
	      ENDIF

	   END SUBROUTINE substi

	END SUBROUTINE tridia
	503
	504
	505	#if defined( __parallel )
	SUBROUTINE fftxp( ar, direction )

	!------------------------------------------------------------------------------!
	! FFT along x, parallel version.
	! Every x-line ar(0:nx,j,k) of the local subdomain is passed to fft_x.
	! direction is handed through unchanged; poisfft calls this routine with
	! 'forward' and 'backward'.
	!------------------------------------------------------------------------------!

	   IMPLICIT NONE

	   CHARACTER (LEN=*) ::  direction
	   INTEGER           ::  j, k

	   REAL, DIMENSION(0:nxa,nys_x:nyn_xa,nzb_x:nzt_xa) ::  ar

	!
	!-- Only the range 0:nx of the first dimension (declared as 0:nxa) is
	!-- transformed
	   DO  k = nzb_x, nzt_x
	      DO  j = nys_x, nyn_x
	         CALL fft_x( ar(0:nx,j,k), direction )
	      ENDDO
	   ENDDO

	END SUBROUTINE fftxp
	527
	528	#else
	SUBROUTINE fftx( ar, direction )

	!------------------------------------------------------------------------------!
	! FFT along x, non-parallel version.
	! Every x-line ar(k,j,0:nx) is passed to fft_x. direction is handed through
	! unchanged; poisfft calls this routine with 'forward' and 'backward'.
	!------------------------------------------------------------------------------!

	   IMPLICIT NONE

	   CHARACTER (LEN=*) ::  direction
	   INTEGER           ::  i, j, k

	   REAL, DIMENSION(1:nz,0:ny,0:nx) ::  ar

	!
	!-- x is the last index here, so each line handed to fft_x is a strided
	!-- array section
	   DO  k = 1, nz
	      DO  j = 0, ny
	         CALL fft_x( ar(k,j,0:nx), direction )
	      ENDDO
	   ENDDO

	END SUBROUTINE fftx
	550	#endif
	551
	552
	553	#if defined( __parallel )
	SUBROUTINE fftyp( ar, direction )

	!------------------------------------------------------------------------------!
	! FFT along y, parallel version.
	! Every y-line ar(0:ny,i,k) of the local subdomain is passed to fft_y.
	! direction is handed through unchanged; poisfft calls this routine with
	! 'forward' and 'backward'.
	!------------------------------------------------------------------------------!

	   IMPLICIT NONE

	   CHARACTER (LEN=*) ::  direction
	   INTEGER           ::  i, k

	   REAL, DIMENSION(0:nya,nxl_y:nxr_ya,nzb_y:nzt_ya) ::  ar

	!
	!-- Only the range 0:ny of the first dimension (declared as 0:nya) is
	!-- transformed
	   DO  k = nzb_y, nzt_y
	      DO  i = nxl_y, nxr_y
	         CALL fft_y( ar(0:ny,i,k), direction )
	      ENDDO
	   ENDDO

	END SUBROUTINE fftyp
	575
	576	#else
	SUBROUTINE ffty( ar, direction )

	!------------------------------------------------------------------------------!
	! FFT along y, non-parallel version.
	! Every y-line ar(k,0:ny,i) is passed to fft_y. direction is handed through
	! unchanged; poisfft calls this routine with 'forward' and 'backward'.
	!------------------------------------------------------------------------------!

	   IMPLICIT NONE

	   CHARACTER (LEN=*) ::  direction
	   INTEGER           ::  i, k

	   REAL, DIMENSION(1:nz,0:ny,0:nx) ::  ar

	!
	!-- y is the middle index here, so each line handed to fft_y is a strided
	!-- array section
	   DO  k = 1, nz
	      DO  i = 0, nx
	         CALL fft_y( ar(k,0:ny,i), direction )
	      ENDDO
	   ENDDO

	END SUBROUTINE ffty
	598	#endif
	599
	600	#if defined( __parallel )
	SUBROUTINE ffty_tr_yx( f_in, work, f_out )

	!------------------------------------------------------------------------------!
	! Forward FFT along y with subsequent transposition y --> x for a
	! 1d-decomposition along x.
	!
	! f_in  : input data, index order (k,j,i)
	! work  : receives the transformed data resorted to (i,k,j); it is the send
	!         buffer of the transposition
	! f_out : receive buffer of the transposition
	! (poisfft passes the same actual array for f_in and f_out.)
	!
	! ATTENTION: On the NEC-SX6 this routine runs much faster if the extent of
	!            the first dimension of work_ffty_vec is odd. Otherwise memory
	!            bank conflicts may occur (especially if the extent is a
	!            multiple of 128). That's why work_ffty_vec is dimensioned as
	!            0:ny+1.
	!            Of course, this does not help if an odd number of gridpoints
	!            is used along y.
	!------------------------------------------------------------------------------!

	   USE control_parameters
	   USE cpulog
	   USE indices
	   USE interfaces
	   USE pegrid
	   USE transpose_indices

	   IMPLICIT NONE

	   INTEGER            ::  i, iend, iouter, ir, j, k
	   INTEGER, PARAMETER ::  stridex = 4

	   REAL, DIMENSION(0:ny,stridex)                    ::  work_ffty
#if defined( __nec )
	   REAL, DIMENSION(0:ny+1,1:nz,nxl:nxr)             ::  work_ffty_vec
#endif
	   REAL, DIMENSION(1:nza,0:nya,nxl:nxra)            ::  f_in
	   REAL, DIMENSION(nnx,1:nza,nys_x:nyn_xa,pdims(1)) ::  f_out
	   REAL, DIMENSION(nxl:nxra,1:nza,0:nya)            ::  work

	!
	!-- All data along y are locally available due to the 1d-decomposition
	!-- along x, so the FFT along y can be done right away. The result is
	!-- resorted in such a way that x becomes the first index.
	   CALL cpu_log( log_point_s(7), 'fft_y', 'start' )

	   IF ( host(1:3) == 'nec' )  THEN
#if defined( __nec )
	!
	!--    Vector processor branch: complete (j,k) planes are transformed with
	!--    one call of fft_y_m. work_ffty_vec is shared between the threads,
	!--    because the second loop reads what the first loop has written.
	!$OMP  PARALLEL PRIVATE ( i, j, k )
	!$OMP  DO
	      DO  i = nxl, nxr

	         DO  j = 0, ny
	            DO  k = 1, nz
	               work_ffty_vec(j,k,i) = f_in(k,j,i)
	            ENDDO
	         ENDDO

	         CALL fft_y_m( work_ffty_vec(:,:,i), ny+1, 'forward' )

	      ENDDO

	!$OMP  DO
	      DO  k = 1, nz
	         DO  j = 0, ny
	            DO  i = nxl, nxr
	               work(i,k,j) = work_ffty_vec(j,k,i)
	            ENDDO
	         ENDDO
	      ENDDO
	!$OMP  END PARALLEL
#endif

	   ELSE

	!
	!--    Cache optimized branch: the x-direction is processed in blocks of
	!--    stridex columns. work_ffty is filled and consumed within one block,
	!--    so every thread needs (and gets) its own private copy.
	!$OMP  PARALLEL PRIVATE (i,iend,iouter,ir,j,k,work_ffty)
	!$OMP  DO
	      DO  iouter = nxl, nxr, stridex

	!
	!--       Last column of the current block (the last block may be shorter)
	         iend = MIN( iouter+stridex-1, nxr )

	         DO  k = 1, nz

	            DO  i = iouter, iend

	!
	!--             ir is the position of column i within the block
	               ir = i-iouter+1
	               work_ffty(0:ny,ir) = f_in(k,0:ny,i)

	               CALL fft_y( work_ffty(:,ir), 'forward' )

	            ENDDO

	!
	!--          Resort the transformed block to index order (i,k,j)
	            DO  j = 0, ny
	               DO  i = iouter, iend
	                  work(i,k,j) = work_ffty(j,i-iouter+1)
	               ENDDO
	            ENDDO

	         ENDDO

	      ENDDO
	!$OMP  END PARALLEL

	   ENDIF
	   CALL cpu_log( log_point_s(7), 'fft_y', 'pause' )

	!
	!-- Transposition y --> x: exchange of sendrecvcount_xy values with every
	!-- process of communicator comm1dx
	   CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
	   CALL MPI_ALLTOALL( work(nxl,1,0),      sendrecvcount_xy, MPI_REAL, &
	                      f_out(1,1,nys_x,1), sendrecvcount_xy, MPI_REAL, &
	                      comm1dx, ierr )
	   CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )

	END SUBROUTINE ffty_tr_yx
	722
	723
	SUBROUTINE tr_xy_ffty( f_in, work, f_out )

	!------------------------------------------------------------------------------!
	! Transposition x --> y with subsequent backward FFT along y for a
	! 1d-decomposition along x. Counterpart of ffty_tr_yx.
	!
	! f_in  : send buffer of the transposition
	! work  : receive buffer of the transposition, index order (i,k,j)
	! f_out : result of the backward FFT, index order (k,j,i)
	! (poisfft passes the same actual array for f_in and f_out.)
	!------------------------------------------------------------------------------!

	   USE control_parameters
	   USE cpulog
	   USE indices
	   USE interfaces
	   USE pegrid
	   USE transpose_indices

	   IMPLICIT NONE

	   INTEGER            ::  i, iend, iouter, ir, j, k
	   INTEGER, PARAMETER ::  stridex = 4

	   REAL, DIMENSION(0:ny,stridex)                    ::  work_ffty
#if defined( __nec )
	   REAL, DIMENSION(0:ny+1,1:nz,nxl:nxr)             ::  work_ffty_vec
#endif
	   REAL, DIMENSION(nnx,1:nza,nys_x:nyn_xa,pdims(1)) ::  f_in
	   REAL, DIMENSION(1:nza,0:nya,nxl:nxra)            ::  f_out
	   REAL, DIMENSION(nxl:nxra,1:nza,0:nya)            ::  work

	!
	!-- Transposition x --> y: exchange of sendrecvcount_xy values with every
	!-- process of communicator comm1dx
	   CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
	   CALL MPI_ALLTOALL( f_in(1,1,nys_x,1), sendrecvcount_xy, MPI_REAL, &
	                      work(nxl,1,0),     sendrecvcount_xy, MPI_REAL, &
	                      comm1dx, ierr )
	   CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )

	!
	!-- Resort the data so that y becomes the first index, then do the backward
	!-- FFT along y.
	   CALL cpu_log( log_point_s(7), 'fft_y', 'continue' )

	   IF ( host(1:3) == 'nec' )  THEN
#if defined( __nec )
	!
	!--    Vector processor branch: complete (j,k) planes are transformed with
	!--    one call of fft_y_m. work_ffty_vec is shared between the threads,
	!--    because the second loop reads what the first loop has written.
	!$OMP  PARALLEL PRIVATE ( i, j, k )
	!$OMP  DO
	      DO  k = 1, nz
	         DO  j = 0, ny
	            DO  i = nxl, nxr
	               work_ffty_vec(j,k,i) = work(i,k,j)
	            ENDDO
	         ENDDO
	      ENDDO

	!$OMP  DO
	      DO  i = nxl, nxr

	         CALL fft_y_m( work_ffty_vec(:,:,i), ny+1, 'backward' )

	         DO  j = 0, ny
	            DO  k = 1, nz
	               f_out(k,j,i) = work_ffty_vec(j,k,i)
	            ENDDO
	         ENDDO

	      ENDDO
	!$OMP  END PARALLEL
#endif

	   ELSE

	!
	!--    Cache optimized branch: the x-direction is processed in blocks of
	!--    stridex columns. work_ffty is filled and consumed within one block,
	!--    so every thread needs (and gets) its own private copy.
	!$OMP  PARALLEL PRIVATE ( i, iend, iouter, ir, j, k, work_ffty )
	!$OMP  DO
	      DO  iouter = nxl, nxr, stridex

	!
	!--       Last column of the current block (the last block may be shorter)
	         iend = MIN( iouter+stridex-1, nxr )

	         DO  k = 1, nz

	!
	!--          Gather the block with y as first index
	            DO  j = 0, ny
	               DO  i = iouter, iend
	                  work_ffty(j,i-iouter+1) = work(i,k,j)
	               ENDDO
	            ENDDO

	            DO  i = iouter, iend

	!
	!--             ir is the position of column i within the block
	               ir = i-iouter+1
	               CALL fft_y( work_ffty(:,ir), 'backward' )

	               f_out(k,0:ny,i) = work_ffty(0:ny,ir)

	            ENDDO

	         ENDDO

	      ENDDO
	!$OMP  END PARALLEL

	   ENDIF

	   CALL cpu_log( log_point_s(7), 'fft_y', 'stop' )

	END SUBROUTINE tr_xy_ffty
	836
	837
	SUBROUTINE fftx_tri_fftx( ar )

	!------------------------------------------------------------------------------!
	! Forward FFT along x, solution of the tridiagonal systems and backward FFT
	! along x for a 1d-decomposition along x. All three steps are carried out
	! for one y-index j at a time; ar is overwritten with the result.
	!
	! The x-lines are stored in ar in pieces, one piece per PE (last index n),
	! piece n holding nnx_pe(n-1) values. They are gathered into a contiguous
	! line (index m = 0 ... nx) before the FFT and scattered back afterwards.
	!
	! WARNING: this subroutine may still not work for hybrid parallelization
	!          with OpenMP (for possible necessary changes see the original
	!          routine poisfft_hybrid, developed by Klaus Ketelsen, May 2002)
	!------------------------------------------------------------------------------!

	   USE control_parameters
	   USE cpulog
	   USE grid_variables
	   USE indices
	   USE interfaces
	   USE pegrid
	   USE transpose_indices

	   IMPLICIT NONE

	   CHARACTER (LEN=3) ::  myth_char

	   INTEGER ::  i, j, k, m, n, omp_get_thread_num, tn

	   REAL, DIMENSION(0:nx)                            ::  work_fftx
	   REAL, DIMENSION(0:nx,1:nz)                       ::  work_trix
	   REAL, DIMENSION(nnx,1:nza,nys_x:nyn_xa,pdims(1)) ::  ar
	   REAL, DIMENSION(:,:,:,:), ALLOCATABLE            ::  tri


	   CALL cpu_log( log_point_s(33), 'fft_x + tridia', 'start' )

	!
	!-- One coefficient array per thread (last index = thread number)
	   ALLOCATE( tri(5,0:nx,0:nz-1,0:threads_per_task-1) )

	!
	!-- Without OpenMP the thread number keeps this default value; with OpenMP
	!-- tn is private and set by the conditionally compiled line in the loop.
	   tn = 0
	!$OMP  PARALLEL DO PRIVATE ( i, j, k, m, n, tn, work_fftx, work_trix )
	   DO  j = nys_x, nyn_x

	!$    tn = omp_get_thread_num()

	!
	!--    Step 1: gather the x-lines and transform them forward
	      IF ( host(1:3) == 'nec' )  THEN
	!
	!--       Vector processor branch: all k-levels are transformed by a single
	!--       call of fft_x_m
	         DO  k = 1, nz

	            m = 0
	            DO  n = 1, pdims(1)
	               DO  i = 1, nnx_pe( n-1 ) ! WARN: pcoord(i) should be used!!
	                  work_trix(m,k) = ar(i,k,j,n)
	                  m = m + 1
	               ENDDO
	            ENDDO

	         ENDDO

	         CALL fft_x_m( work_trix, 'forward' )

	      ELSE
	!
	!--       Cache optimized branch: line by line via work_fftx
	         DO  k = 1, nz

	            m = 0
	            DO  n = 1, pdims(1)
	               DO  i = 1, nnx_pe( n-1 ) ! WARN: pcoord(i) should be used!!
	                  work_fftx(m) = ar(i,k,j,n)
	                  m = m + 1
	               ENDDO
	            ENDDO

	            CALL fft_x( work_fftx, 'forward' )

	            work_trix(0:nx,k) = work_fftx(0:nx)

	         ENDDO

	      ENDIF

	!
	!--    Step 2: tridiagonal systems for this j, using the coefficient array
	!--    of the current thread
	      CALL tridia_1dd( ddx2, ddy2, nx, ny, j, work_trix, tri(:,:,:,tn) )

	!
	!--    Step 3: transform backward and scatter the x-lines
	      IF ( host(1:3) == 'nec' )  THEN
	!
	!--       Vector processor branch
	         CALL fft_x_m( work_trix, 'backward' )

	         DO  k = 1, nz

	            m = 0
	            DO  n = 1, pdims(1)
	               DO  i = 1, nnx_pe( n-1 ) ! WARN: pcoord(i) should be used!!
	                  ar(i,k,j,n) = work_trix(m,k)
	                  m = m + 1
	               ENDDO
	            ENDDO

	         ENDDO

	      ELSE
	!
	!--       Cache optimized branch
	         DO  k = 1, nz

	            work_fftx(0:nx) = work_trix(0:nx,k)

	            CALL fft_x( work_fftx, 'backward' )

	            m = 0
	            DO  n = 1, pdims(1)
	               DO  i = 1, nnx_pe( n-1 ) ! WARN: pcoord(i) should be used!!
	                  ar(i,k,j,n) = work_fftx(m)
	                  m = m + 1
	               ENDDO
	            ENDDO

	         ENDDO

	      ENDIF

	   ENDDO

	   DEALLOCATE( tri )

	   CALL cpu_log( log_point_s(33), 'fft_x + tridia', 'stop' )

	END SUBROUTINE fftx_tri_fftx
	970
	971
	SUBROUTINE fftx_tr_xy( f_in, work, f_out )

	!------------------------------------------------------------------------------!
	! Forward FFT along x with subsequent transposition x --> y for a
	! 1d-decomposition along y.
	!
	! f_in  : input data, index order (k,j,i)
	! work  : receives the transformed data resorted to (j,k,i); it is the send
	!         buffer of the transposition
	! f_out : receive buffer of the transposition
	! (poisfft passes the same actual array for f_in and f_out.)
	!
	! ATTENTION: The NEC-branch of this routine may significantly profit from
	!            further optimizations. So far, performance is much worse than
	!            for routine ffty_tr_yx (more than three times slower).
	!------------------------------------------------------------------------------!

	   USE control_parameters
	   USE cpulog
	   USE indices
	   USE interfaces
	   USE pegrid
	   USE transpose_indices

	   IMPLICIT NONE

	   INTEGER ::  i, j, k

	   REAL, DIMENSION(0:nx,1:nz,nys:nyn)               ::  work_fftx
	   REAL, DIMENSION(1:nza,nys:nyna,0:nxa)            ::  f_in
	   REAL, DIMENSION(nny,1:nza,nxl_y:nxr_ya,pdims(2)) ::  f_out
	   REAL, DIMENSION(nys:nyna,1:nza,0:nxa)            ::  work

	!
	!-- Carry out the FFT along x, where all data are present due to the
	!-- 1d-decomposition along y. Resort the data in a way that y becomes
	!-- the first index.
	   CALL cpu_log( log_point_s(4), 'fft_x', 'start' )

	   IF ( host(1:3) == 'nec' )  THEN
	!
	!--    Code for vector processors: complete (i,k) planes are transformed
	!--    with one call of fft_x_m
	!$OMP  PARALLEL PRIVATE ( i, j, k )
	!$OMP  DO
	      DO  i = 0, nx

	         DO  j = nys, nyn
	            DO  k = 1, nz
	               work_fftx(i,k,j) = f_in(k,j,i)
	            ENDDO
	         ENDDO

	      ENDDO

	!$OMP  DO
	      DO  j = nys, nyn

	         CALL fft_x_m( work_fftx(:,:,j), 'forward' )

	         DO  k = 1, nz
	            DO  i = 0, nx
	               work(j,k,i) = work_fftx(i,k,j)
	            ENDDO
	         ENDDO

	      ENDDO
	!$OMP  END PARALLEL

	   ELSE

	!
	!--    Cache optimized code (there might be still a potential for better
	!--    optimization).
	!--    work_fftx has to be SHARED between the threads: the first loop is
	!--    distributed over i, the second one over j, so a thread transforms
	!--    lines work_fftx(0:nx,k,j) which have been filled by all threads.
	!--    With a private copy (as declared here before) each thread would
	!--    transform a line containing only its own, partial data. The implicit
	!--    barrier at the end of the first loop guarantees that the array is
	!--    complete before it is read.
	!$OMP  PARALLEL PRIVATE ( i, j, k )
	!$OMP  DO
	      DO  i = 0, nx

	         DO  j = nys, nyn
	            DO  k = 1, nz
	               work_fftx(i,k,j) = f_in(k,j,i)
	            ENDDO
	         ENDDO

	      ENDDO

	!$OMP  DO
	      DO  j = nys, nyn
	         DO  k = 1, nz

	            CALL fft_x( work_fftx(0:nx,k,j), 'forward' )

	            DO  i = 0, nx
	               work(j,k,i) = work_fftx(i,k,j)
	            ENDDO
	         ENDDO

	      ENDDO
	!$OMP  END PARALLEL

	   ENDIF
	   CALL cpu_log( log_point_s(4), 'fft_x', 'pause' )

	!
	!-- Transposition x --> y: exchange of sendrecvcount_xy values with every
	!-- process of communicator comm1dy
	   CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
	   CALL MPI_ALLTOALL( work(nys,1,0),      sendrecvcount_xy, MPI_REAL, &
	                      f_out(1,1,nxl_y,1), sendrecvcount_xy, MPI_REAL, &
	                      comm1dy, ierr )
	   CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )

	END SUBROUTINE fftx_tr_xy
	1077
	1078
	SUBROUTINE tr_yx_fftx( f_in, work, f_out )

	!------------------------------------------------------------------------------!
	! Transposition y --> x with subsequent backward FFT along x for a
	! 1d-decomposition along y. Counterpart of fftx_tr_xy.
	!
	! f_in  : send buffer of the transposition
	! work  : receive buffer of the transposition, index order (j,k,i)
	! f_out : result of the backward FFT, index order (k,j,i)
	! (poisfft passes the same actual array for f_in and f_out.)
	!------------------------------------------------------------------------------!

	   USE control_parameters
	   USE cpulog
	   USE indices
	   USE interfaces
	   USE pegrid
	   USE transpose_indices

	   IMPLICIT NONE

	   INTEGER ::  i, j, k

	   REAL, DIMENSION(0:nx,1:nz,nys:nyn)               ::  work_fftx
	   REAL, DIMENSION(nny,1:nza,nxl_y:nxr_ya,pdims(2)) ::  f_in
	   REAL, DIMENSION(1:nza,nys:nyna,0:nxa)            ::  f_out
	   REAL, DIMENSION(nys:nyna,1:nza,0:nxa)            ::  work

	!
	!-- Transposition y --> x: exchange of sendrecvcount_xy values with every
	!-- process of communicator comm1dy
	   CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
	   CALL MPI_ALLTOALL( f_in(1,1,nxl_y,1), sendrecvcount_xy, MPI_REAL, &
	                      work(nys,1,0),     sendrecvcount_xy, MPI_REAL, &
	                      comm1dy, ierr )
	   CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )

	!
	!-- Carry out the backward FFT along x, where all data are present due to
	!-- the 1d-decomposition along y. For this, the data are resorted so that x
	!-- becomes the first index; the result is stored in index order (k,j,i).
	   CALL cpu_log( log_point_s(4), 'fft_x', 'continue' )

	   IF ( host(1:3) == 'nec' )  THEN
	!
	!--    Code optimized for vector processors: complete (i,k) planes are
	!--    transformed with one call of fft_x_m
	!$OMP  PARALLEL PRIVATE ( i, j, k )
	!$OMP  DO
	      DO  j = nys, nyn

	         DO  k = 1, nz
	            DO  i = 0, nx
	               work_fftx(i,k,j) = work(j,k,i)
	            ENDDO
	         ENDDO

	         CALL fft_x_m( work_fftx(:,:,j), 'backward' )

	      ENDDO

	!$OMP  DO
	      DO  i = 0, nx
	         DO  j = nys, nyn
	            DO  k = 1, nz
	               f_out(k,j,i) = work_fftx(i,k,j)
	            ENDDO
	         ENDDO
	      ENDDO
	!$OMP  END PARALLEL

	   ELSE

	!
	!--    Cache optimized code (there might be still a potential for better
	!--    optimization).
	!--    work_fftx has to be SHARED between the threads: the first loop is
	!--    distributed over j, the second one over i, so a thread reads
	!--    work_fftx(i,k,j) for all j, i.e. also values transformed by other
	!--    threads. With a private copy (as declared here before) these values
	!--    would be undefined. The implicit barrier at the end of the first
	!--    loop guarantees that all lines are transformed before they are read.
	!$OMP  PARALLEL PRIVATE ( i, j, k )
	!$OMP  DO
	      DO  j = nys, nyn
	         DO  k = 1, nz

	            DO  i = 0, nx
	               work_fftx(i,k,j) = work(j,k,i)
	            ENDDO

	            CALL fft_x( work_fftx(0:nx,k,j), 'backward' )

	         ENDDO
	      ENDDO

	!$OMP  DO
	      DO  i = 0, nx
	         DO  j = nys, nyn
	            DO  k = 1, nz
	               f_out(k,j,i) = work_fftx(i,k,j)
	            ENDDO
	         ENDDO
	      ENDDO
	!$OMP  END PARALLEL

	   ENDIF
	   CALL cpu_log( log_point_s(4), 'fft_x', 'stop' )

	END SUBROUTINE tr_yx_fftx
	1176
	1177
	SUBROUTINE ffty_tri_ffty( ar )

	!---------------------------------------------------------------------------!
	! FFT along y, solution of the tridiagonal system and backward FFT for
	! a 1d-decomposition along y.
	!
	! For every i in nxl_y..nxr_y the y-lines stored blockwise in
	! ar(:,k,i,1:pdims(2)) are gathered into one contiguous line 0..ny,
	! transformed forward, passed to tridia_1dd, transformed backward and
	! scattered back into ar (ar is overwritten in place).
	!
	! WARNING: this subroutine may still not work for hybrid parallelization
	!          with OpenMP (for possible necessary changes see the original
	!          routine poisfft_hybrid, developed by Klaus Ketelsen, May 2002)
	!---------------------------------------------------------------------------!

	    USE control_parameters
	    USE cpulog
	    USE grid_variables
	    USE indices
	    USE interfaces
	    USE pegrid
	    USE transpose_indices

	    IMPLICIT NONE

	    INTEGER ::  i, k, nj, offset, omp_get_thread_num, pe, tn

	    LOGICAL ::  vector_host

	    REAL, DIMENSION(0:ny)                            ::  work_ffty
	    REAL, DIMENSION(0:ny,1:nz)                       ::  work_triy
	    REAL, DIMENSION(nny,1:nza,nxl_y:nxr_ya,pdims(2)) ::  ar
	    REAL, DIMENSION(:,:,:,:), ALLOCATABLE            ::  tri


	    CALL cpu_log( log_point_s(39), 'fft_y + tridia', 'start' )

	!
	!-- One set of tridiagonal coefficients per thread
	    ALLOCATE( tri(5,0:ny,0:nz-1,0:threads_per_task-1) )

	!
	!-- The host type does not change inside the loop, so test it only once.
	!-- The flag is shared and only read within the parallel region.
	    vector_host = ( host(1:3) == 'nec' )

	    tn = 0           ! Default thread number in case of one thread
	    !$OMP PARALLEL PRIVATE ( i, k, nj, offset, pe, tn, work_ffty, work_triy )
	    !$OMP DO
	    DO  i = nxl_y, nxr_y

	       !$ tn = omp_get_thread_num()

	!
	!--    Gather the y-line from the PE blocks and transform it forward
	       IF ( vector_host )  THEN
	!
	!--       Code optimized for vector processors: collect all levels first,
	!--       then transform them with a single call
	          DO  k = 1, nz

	             offset = 0
	             DO  pe = 1, pdims(2)
	                nj = nny_pe( pe-1 )   ! WARN: pcoord(j) should be used!!
	                work_triy(offset:offset+nj-1,k) = ar(1:nj,k,i,pe)
	                offset = offset + nj
	             ENDDO

	          ENDDO

	          CALL fft_y_m( work_triy, ny, 'forward' )

	       ELSE
	!
	!--       Cache optimized code: transform level by level
	          DO  k = 1, nz

	             offset = 0
	             DO  pe = 1, pdims(2)
	                nj = nny_pe( pe-1 )   ! WARN: pcoord(j) should be used!!
	                work_ffty(offset:offset+nj-1) = ar(1:nj,k,i,pe)
	                offset = offset + nj
	             ENDDO

	             CALL fft_y( work_ffty, 'forward' )

	             work_triy(:,k) = work_ffty

	          ENDDO

	       ENDIF

	!
	!--    Solve the linear equation system (x and y arguments are passed in
	!--    swapped order, the thread-specific slice of tri is used as workspace)
	       CALL tridia_1dd( ddy2, ddx2, ny, nx, i, work_triy, tri(:,:,:,tn) )

	!
	!--    Transform backward and scatter the y-line back to the PE blocks
	       IF ( vector_host )  THEN
	!
	!--       Code optimized for vector processors
	          CALL fft_y_m( work_triy, ny, 'backward' )

	          DO  k = 1, nz

	             offset = 0
	             DO  pe = 1, pdims(2)
	                nj = nny_pe( pe-1 )   ! WARN: pcoord(j) should be used!!
	                ar(1:nj,k,i,pe) = work_triy(offset:offset+nj-1,k)
	                offset = offset + nj
	             ENDDO

	          ENDDO

	       ELSE
	!
	!--       Cache optimized code
	          DO  k = 1, nz

	             work_ffty = work_triy(:,k)

	             CALL fft_y( work_ffty, 'backward' )

	             offset = 0
	             DO  pe = 1, pdims(2)
	                nj = nny_pe( pe-1 )   ! WARN: pcoord(j) should be used!!
	                ar(1:nj,k,i,pe) = work_ffty(offset:offset+nj-1)
	                offset = offset + nj
	             ENDDO

	          ENDDO

	       ENDIF

	    ENDDO
	    !$OMP END PARALLEL

	    DEALLOCATE( tri )

	    CALL cpu_log( log_point_s(39), 'fft_y + tridia', 'stop' )

	END SUBROUTINE ffty_tri_ffty
	1310
	1311
	SUBROUTINE tridia_1dd( ddx2, ddy2, nx, ny, j, ar, tri )

	!---------------------------------------------------------------------------!
	! Solves the linear system of equations for a 1d-decomposition along x (see
	! tridia)
	!
	! The right-hand side ar(0:nx,1:nz) is overwritten with the solution.
	! tri(5,0:nx,0:nz-1) is workspace holding the matrix coefficients
	! (1: main diagonal, 2 and 3: off-diagonal coefficients, 4 and 5: factors
	! produced by the splitting step).
	!
	! Attention: when using the intel compiler, array tri must be passed as an
	!            argument to the contained subroutines. Otherwise address faults
	!            will occur.
	!            On NEC, tri should not be passed (except for routine substi_1dd)
	!            because this causes very bad performance.
	!---------------------------------------------------------------------------!

	    USE arrays_3d
	    USE control_parameters

	    USE pegrid

	    IMPLICIT NONE

	    INTEGER ::  i, j, jm, k, nnyh, nx, ny, omp_get_thread_num, tn

	    REAL    ::  ddx2, ddy2

	    REAL, DIMENSION(0:nx,1:nz)     ::  ar
	    REAL, DIMENSION(0:nx,0:nz-1)   ::  ar1
	    REAL, DIMENSION(5,0:nx,0:nz-1) ::  tri


	    nnyh = ( ny + 1 ) / 2

	!
	!-- Define constant elements of the tridiagonal matrix.
	!-- The compiler on SX6 does loop exchange. If 0:nx is a high power of 2,
	!-- the exchanged loops create bank conflicts. The following directive
	!-- prohibits loop exchange and the loops perform much better.
	!    tn = omp_get_thread_num()
	!    WRITE( 120+tn, * ) '+++ id=',myid,' nx=',nx,' thread=', omp_get_thread_num()
	!    CALL local_flush( 120+tn )
	!CDIR NOLOOPCHG
	    DO  k = 0, nz-1
	       DO  i = 0,nx
	          tri(2,i,k) = ddzu(k+1) * ddzw(k+1)
	          tri(3,i,k) = ddzu(k+2) * ddzw(k+1)
	       ENDDO
	    ENDDO
	!    WRITE( 120+tn, * ) '+++ id=',myid,' end of first tridia loop thread=', omp_get_thread_num()
	!    CALL local_flush( 120+tn )

	!
	!-- Indices above nnyh are mapped to ny+1-j before the matrix is built
	    IF ( j <= nnyh )  THEN
	       jm = j
	    ELSE
	       jm = ny + 1 - j
	    ENDIF

	!
	!-- Build and split the matrix, then carry out the substitution
#if defined( __lcmuk )
	    CALL maketri_1dd( jm, tri )
	    CALL split_1dd( tri )
#else
	    CALL maketri_1dd( jm )
	    CALL split_1dd
#endif
	    CALL substi_1dd( ar, tri )

	 CONTAINS

#if defined( __lcmuk )
	    SUBROUTINE maketri_1dd( j, tri )
#else
	    SUBROUTINE maketri_1dd( j )
#endif

	!------------------------------------------------------------------------!
	! computes the i- and j-dependent component of the matrix
	! (main diagonal tri(1,:,:), including the boundary-condition adjustments
	! selected by ibc_p_b and ibc_p_t)
	!------------------------------------------------------------------------!

	       USE constants

	       IMPLICIT NONE

	       INTEGER ::  i, j, k, kx, nnxh
	       REAL    ::  a, c

	       REAL, DIMENSION(0:nx) ::  l

#if defined( __lcmuk )
	       REAL, DIMENSION(5,0:nx,0:nz-1) ::  tri
#endif


	       nnxh = ( nx + 1 ) / 2
	!
	!--    Provide the tridiagonal matrix for solution of the Poisson equation in
	!--    Fourier space. The coefficients are computed following the method of
	!--    Schmidt et al. (DFVLR-Mitteilung 84-15), which departs from Stephan
	!--    Siano's original version by discretizing the Poisson equation,
	!--    before it is Fourier-transformed
	       DO  i = 0, nx
	!
	!--       Indices below nnxh are used directly, all others (i == nnxh
	!--       included) enter as nx+1-i
	          IF ( i < nnxh )  THEN
	             kx = i
	          ELSE
	             kx = nx + 1 - i
	          ENDIF
	          l(i) = 2.0 * ( 1.0 - COS( ( 2.0 * pi * kx ) / &
	                                    FLOAT( nx+1 ) ) ) * ddx2 + &
	                 2.0 * ( 1.0 - COS( ( 2.0 * pi * j ) / &
	                                    FLOAT( ny+1 ) ) ) * ddy2
	       ENDDO

	       DO  k = 0, nz-1
	!
	!--       a and c depend on the level k only
	          a = -1.0 * ddzu(k+2) * ddzw(k+1)
	          c = -1.0 * ddzu(k+1) * ddzw(k+1)
	          DO  i = 0, nx
	             tri(1,i,k) = a + c - l(i)
	          ENDDO
	       ENDDO

	!
	!--    Adjust the main diagonal at the lowest / uppermost level
	       IF ( ibc_p_b == 1  .OR.  ibc_p_b == 2 )  THEN
	          tri(1,0:nx,0) = tri(1,0:nx,0) + tri(2,0:nx,0)
	       ENDIF
	       IF ( ibc_p_t == 1 )  THEN
	          tri(1,0:nx,nz-1) = tri(1,0:nx,nz-1) + tri(3,0:nx,nz-1)
	       ENDIF

	    END SUBROUTINE maketri_1dd


#if defined( __lcmuk )
	    SUBROUTINE split_1dd( tri )
#else
	    SUBROUTINE split_1dd
#endif

	!------------------------------------------------------------------------!
	! Splitting of the tridiagonal matrix (Thomas algorithm)
	! Fills tri(4,:,:) and tri(5,:,:) from tri(1:3,:,:).
	!------------------------------------------------------------------------!

	       IMPLICIT NONE

	       INTEGER ::  k

#if defined( __lcmuk )
	       REAL, DIMENSION(5,0:nx,0:nz-1) ::  tri
#endif


	!
	!--    Splitting (level k needs the result of level k-1, hence the
	!--    sequential loop over k; all i are independent of each other)
	       tri(4,0:nx,0) = tri(1,0:nx,0)
	       DO  k = 1, nz-1
	          tri(5,0:nx,k) = tri(2,0:nx,k) / tri(4,0:nx,k-1)
	          tri(4,0:nx,k) = tri(1,0:nx,k) - tri(3,0:nx,k-1) * tri(5,0:nx,k)
	       ENDDO

	    END SUBROUTINE split_1dd


	    SUBROUTINE substi_1dd( ar, tri )

	!------------------------------------------------------------------------!
	! Substitution (Forward and Backward) (Thomas algorithm)
	! ar holds the right-hand side on entry and the solution on exit.
	!------------------------------------------------------------------------!

	       IMPLICIT NONE

	       INTEGER ::  k

	       REAL, DIMENSION(0:nx,nz)       ::  ar
	       REAL, DIMENSION(0:nx,0:nz-1)   ::  ar1
	       REAL, DIMENSION(5,0:nx,0:nz-1) ::  tri

	!
	!--    Forward substitution
	       ar1(0:nx,0) = ar(0:nx,1)
	       DO  k = 1, nz-1
	          ar1(0:nx,k) = ar(0:nx,k+1) - tri(5,0:nx,k) * ar1(0:nx,k-1)
	       ENDDO

	!
	!--    Backward substitution
	       ar(0:nx,nz) = ar1(0:nx,nz-1) / tri(4,0:nx,nz-1)
	       DO  k = nz-2, 0, -1
	          ar(0:nx,k+1) = ( ar1(0:nx,k) - tri(3,0:nx,k) * ar(0:nx,k+2) ) &
	                         / tri(4,0:nx,k)
	       ENDDO

	!
	!--    Indices i=0, j=0 correspond to horizontally averaged pressure.
	!--    The respective values of ar should be zero at all k-levels if
	!--    acceleration of horizontally averaged vertical velocity is zero.
	!--    (j is the argument of the host routine tridia_1dd.)
	       IF ( ibc_p_b == 1  .AND.  ibc_p_t == 1  .AND.  j == 0 )  THEN
	          ar(0,1:nz) = 0.0
	       ENDIF

	    END SUBROUTINE substi_1dd

	END SUBROUTINE tridia_1dd
	1540
	1541	#endif
	1542
	1543	END MODULE poisfft_mod

Note: See TracBrowser for help on using the repository browser.

Context Navigation

source: palm/trunk/SOURCE/poisfft.f90 @ 90

Download in other formats: