Home

Context Navigation

source: palm/trunk/SOURCE/poisfft.f90 @ 1255

Last change on this file since 1255 was 1217, checked in by raasch, 11 years ago
last commit documented
Property svn:keywords set to `Id`
File size: 45.8 KB

Rev	Line
[1]	1	MODULE poisfft_mod
	2
[1036]	3	!--------------------------------------------------------------------------------!
	4	! This file is part of PALM.
	5	!
	6	! PALM is free software: you can redistribute it and/or modify it under the terms
	7	! of the GNU General Public License as published by the Free Software Foundation,
	8	! either version 3 of the License, or (at your option) any later version.
	9	!
	10	! PALM is distributed in the hope that it will be useful, but WITHOUT ANY
	11	! WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
	12	! A PARTICULAR PURPOSE. See the GNU General Public License for more details.
	13	!
	14	! You should have received a copy of the GNU General Public License along with
	15	! PALM. If not, see <http://www.gnu.org/licenses/>.
	16	!
	17	! Copyright 1997-2012 Leibniz University Hannover
	18	!--------------------------------------------------------------------------------!
	19	!
[484]	20	! Current revisions:
[1]	21	! -----------------
[1217]	22	!
	23	!
	24	! Former revisions:
	25	! -----------------
	26	! $Id: poisfft.f90 1217 2013-08-26 11:30:05Z raasch $
	27	!
	28	! 1216 2013-08-26 09:31:42Z raasch
[1216]	29	! resorting of arrays moved to separate routines resort_for_...,
	30	! one argument, used as temporary work array, removed from all transpose
	31	! routines
	32	! overlapping fft / transposition implemented
[1112]	33	!
[1213]	34	! 1212 2013-08-15 08:46:27Z raasch
	35	! tridia routines moved to seperate module tridia_solver
	36	!
[1209]	37	! 1208 2013-08-13 06:41:49Z raasch
	38	! acc-update clauses added for "ar" so that ffts other than cufft can also be
	39	! used (although they are not ported and will give a poor performance)
	40	!
[1112]	41	! 1111 2013-03-08 23:54:10Z raasch
[1111]	42	! further openACC porting of non-parallel (MPI) branch:
	43	! tridiagonal routines split into extermal subroutines (instead using CONTAINS),
	44	! no distinction between parallel/non-parallel in poisfft and tridia any more,
[1112]	45	! tridia routines moved to end of file because of probable bug in PGI compiler 12.5
[1111]	46	! (otherwise "invalid device function" is indicated during runtime),
	47	! optimization of tridia routines: constant elements and coefficients of tri are
	48	! stored in seperate arrays ddzuw and tric, last dimension of tri reduced from 5
	49	! to 2,
	50	! poisfft_init is now called internally from poisfft, maketri is called from
	51	! poisfft_init,
	52	! ibc_p_b = 2 removed
[1]	53	!
[1107]	54	! 1106 2013-03-04 05:31:38Z raasch
	55	! routines fftx, ffty, fftxp, fftyp removed, calls replaced by fft_x, fft_y,
	56	! in the 1D-decomposition routines fft_x, ffty are replaced by fft_x_1d,
	57	! fft_y_1d
	58	!
[1104]	59	! 1103 2013-02-20 02:15:53Z raasch
	60	! tri, ar, and ar1 arguments in tridia-routines (2d) are removed because they
	61	! sometimes cause segmentation faults with intel 12.1 compiler
	62	!
[1093]	63	! 1092 2013-02-02 11:24:22Z raasch
	64	! unused variables removed
	65	!
[1037]	66	! 1036 2012-10-22 13:43:42Z raasch
	67	! code put under GPL (PALM 3.9)
	68	!
[1014]	69	! 2012-09-21 07:03:55Z raasch
	70	! FLOAT type conversion replaced by REAL
	71	!
[1004]	72	! 1003 2012-09-14 14:35:53Z raasch
	73	! indices nxa, nya, etc. replaced by nx, ny, etc.
	74	!
[941]	75	! 940 2012-07-09 14:31:00Z raasch
	76	! special handling of tri-array as an argument in tridia_1dd routines switched
	77	! off because it caused segmentation faults with intel 12.1 compiler
	78	!
[878]	79	! 877 2012-04-03 11:21:44Z suehring
	80	! Bugfix: Avoid divisions by zero in case of using a 'neumann' bc for the
	81	! pressure at the top of the model domain.
	82	!
[810]	83	! 809 2012-01-30 13:32:58Z maronga
	84	! Bugfix: replaced .AND. and .NOT. with && and ! in the preprocessor directives
	85	!
[808]	86	! 807 2012-01-25 11:53:51Z maronga
	87	! New cpp directive "__check" implemented which is used by check_namelist_files
	88	! (most of the code is unneeded by check_namelist_files).
	89	!
[764]	90	! 763 2011-10-06 09:32:09Z suehring
	91	! Comment added concerning the last change.
	92	!
[762]	93	! 761 2011-10-05 17:58:52Z suehring
	94	! Bugfix: Avoid divisions by zero in case of using a 'neumann' bc for the
	95	! pressure at the top of the model domain.
	96	!
[697]	97	! 696 2011-03-18 07:03:49Z raasch
	98	! work_fftx removed from PRIVATE clauses in fftx_tr_xy and tr_yx_fftx
	99	!
[684]	100	! 683 2011-02-09 14:25:15Z raasch
	101	! openMP parallelization for 2d-domain-decomposition
	102	!
[668]	103	! 667 2010-12-23 12:06:00Z suehring/gryschka
	104	! ddzu replaced by ddzu_pres due to changes in zu(0)
	105	!
[623]	106	! 622 2010-12-10 08:08:13Z raasch
	107	! optional barriers included in order to speed up collective operations
	108	!
[392]	109	! 377 2009-09-04 11:09:00Z raasch
	110	! __lcmuk changed to __lc to avoid problems with Intel compiler on sgi-ice
	111	!
[198]	112	! 164 2008-05-15 08:46:15Z raasch
	113	! Arguments removed from transpose routines
	114	!
[139]	115	! 128 2007-10-26 13:11:14Z raasch
	116	! Bugfix: wavenumber calculation for even nx in routines maketri
	117	!
[90]	118	! 85 2007-05-11 09:35:14Z raasch
	119	! Bugfix: work_fft*_vec removed from some PRIVATE-declarations
	120	!
[77]	121	! 76 2007-03-29 00:58:32Z raasch
	122	! Tridiagonal coefficients adjusted for Neumann boundary conditions both at
	123	! the bottom and the top.
	124	!
[3]	125	! RCS Log replace by Id keyword, revision history cleaned up
	126	!
[1]	127	! Revision 1.24 2006/08/04 15:00:24 raasch
	128	! Default setting of the thread number tn in case of not using OpenMP
	129	!
	130	! Revision 1.23 2006/02/23 12:48:38 raasch
	131	! Additional compiler directive in routine tridia_1dd for preventing loop
	132	! exchange on NEC-SX6
	133	!
	134	! Revision 1.20 2004/04/30 12:38:09 raasch
	135	! Parts of former poisfft_hybrid moved to this subroutine,
	136	! former subroutine changed to a module, renaming of FFT-subroutines and
	137	! -module, FFTs completely substituted by calls of fft_x and fft_y,
	138	! NAG fft used in the non-parallel case completely removed, l in maketri
	139	! is now a 1d-array, variables passed by modules instead of using parameter
	140	! lists, enlarged transposition arrays introduced
	141	!
	142	! Revision 1.1 1997/07/24 11:24:14 raasch
	143	! Initial revision
	144	!
	145	!
	146	! Description:
	147	! ------------
	148	! See below.
	149	!------------------------------------------------------------------------------!
	150
	151	!--------------------------------------------------------------------------!
	152	! poisfft !
	153	! !
	154	! Original version: Stephan Siano (pois3d) !
	155	! !
	156	! Institute of Meteorology and Climatology, University of Hannover !
	157	! Germany !
	158	! !
	159	! Version as of July 23,1996 !
	160	! !
	161	! !
	162	! Version for parallel computers: Siegfried Raasch !
	163	! !
	164	! Version as of July 03,1997 !
	165	! !
	166	! Solves the Poisson equation with a 2D spectral method !
	167	! d^2 p / dx^2 + d^2 p / dy^2 + d^2 p / dz^2 = s !
	168	! !
	169	! Input: !
	170	! real ar contains in the (nnx,nny,nnz) elements, !
	171	! starting from the element (1,nys,nxl), the !
	172	! values for s !
	173	! real work Temporary array !
	174	! !
	175	! Output: !
	176	! real ar contains the solution for p !
	177	!--------------------------------------------------------------------------!
	178
	179	USE fft_xy
	180	USE indices
	181	USE transpose_indices
[1212]	182	USE tridia_solver
[1]	183
	184	IMPLICIT NONE
	185
[1111]	186	LOGICAL, SAVE :: poisfft_initialized = .FALSE.
	187
[1]	188	PRIVATE
[807]	189
[809]	190	#if ! defined ( __check )
[1]	191	PUBLIC poisfft, poisfft_init
	192
	193	INTERFACE poisfft
	194	MODULE PROCEDURE poisfft
	195	END INTERFACE poisfft
	196
	197	INTERFACE poisfft_init
	198	MODULE PROCEDURE poisfft_init
	199	END INTERFACE poisfft_init
[807]	200	#else
	201	PUBLIC poisfft_init
[1]	202
[807]	203	INTERFACE poisfft_init
	204	MODULE PROCEDURE poisfft_init
	205	END INTERFACE poisfft_init
	206	#endif
	207
[1]	208	CONTAINS
	209
	210	SUBROUTINE poisfft_init
	211
[1111]	212	USE arrays_3d, ONLY: ddzu_pres, ddzw
	213
	214	IMPLICIT NONE
	215
	216	INTEGER :: k
	217
	218
[1]	219	CALL fft_init
	220
[1212]	221	CALL tridia_init
[1111]	222
	223	poisfft_initialized = .TRUE.
	224
[1]	225	END SUBROUTINE poisfft_init
	226
[1111]	227
[809]	228	#if ! defined ( __check )
[1216]	229	SUBROUTINE poisfft( ar, ar_inv_test )
[1]	230
[1216]	231	USE control_parameters, ONLY : fft_method, transpose_compute_overlap
[1]	232	USE cpulog
	233	USE interfaces
	234	USE pegrid
	235
	236	IMPLICIT NONE
	237
[1216]	238	INTEGER :: ind_even, ind_odd, ind_third, ii, iind, inew, jj, jind, &
	239	jnew, ki, kk, knew, n, nblk, nnx_y, nny_z, nnz_t, nnz_x, &
	240	nxl_y_bound, nxr_y_bound
	241	INTEGER, DIMENSION(4) :: isave
[1]	242
[1216]	243	REAL, DIMENSION(1:nz,nys:nyn,nxl:nxr) :: ar
	244	REAL, DIMENSION(nys:nyn,nxl:nxr,1:nz) :: ar_inv_test ! work array tend from pres
	245	!$acc declare create( ar_inv )
	246	REAL, DIMENSION(nys:nyn,nxl:nxr,1:nz) :: ar_inv
[1]	247
[1216]	248	REAL, DIMENSION(:,:,:), ALLOCATABLE :: f_in, f_inv, f_out_y, f_out_z
	249	REAL, DIMENSION(:,:,:,:), ALLOCATABLE :: ar1
	250
	251
[1]	252	CALL cpu_log( log_point_s(3), 'poisfft', 'start' )
	253
[1111]	254	IF ( .NOT. poisfft_initialized ) CALL poisfft_init
	255
[1]	256	!
	257	!-- Two-dimensional Fourier Transformation in x- and y-direction.
[1111]	258	IF ( pdims(2) == 1 .AND. pdims(1) > 1 ) THEN
[1]	259
	260	!
	261	!-- 1d-domain-decomposition along x:
	262	!-- FFT along y and transposition y --> x
[1216]	263	CALL ffty_tr_yx( ar, ar )
[1]	264
	265	!
	266	!-- FFT along x, solving the tridiagonal system and backward FFT
	267	CALL fftx_tri_fftx( ar )
	268
	269	!
	270	!-- Transposition x --> y and backward FFT along y
[1216]	271	CALL tr_xy_ffty( ar, ar )
[1]	272
[1111]	273	ELSEIF ( pdims(1) == 1 .AND. pdims(2) > 1 ) THEN
[1]	274
	275	!
	276	!-- 1d-domain-decomposition along y:
	277	!-- FFT along x and transposition x --> y
[1216]	278	CALL fftx_tr_xy( ar, ar )
[1]	279
	280	!
	281	!-- FFT along y, solving the tridiagonal system and backward FFT
	282	CALL ffty_tri_ffty( ar )
	283
	284	!
	285	!-- Transposition y --> x and backward FFT along x
[1216]	286	CALL tr_yx_fftx( ar, ar )
[1]	287
[1216]	288	ELSEIF ( .NOT. transpose_compute_overlap ) THEN
[1]	289
	290	!
[1111]	291	!-- 2d-domain-decomposition or no decomposition (1 PE run)
[1]	292	!-- Transposition z --> x
	293	CALL cpu_log( log_point_s(5), 'transpo forward', 'start' )
[1216]	294	CALL resort_for_zx( ar, ar_inv )
	295	CALL transpose_zx( ar_inv, ar )
[1]	296	CALL cpu_log( log_point_s(5), 'transpo forward', 'pause' )
	297
	298	CALL cpu_log( log_point_s(4), 'fft_x', 'start' )
[1208]	299	IF ( fft_method /= 'system-specific' ) THEN
	300	!$acc update host( ar )
	301	ENDIF
[1106]	302	CALL fft_x( ar, 'forward' )
[1208]	303	IF ( fft_method /= 'system-specific' ) THEN
	304	!$acc update device( ar )
	305	ENDIF
[1]	306	CALL cpu_log( log_point_s(4), 'fft_x', 'pause' )
	307
	308	!
	309	!-- Transposition x --> y
	310	CALL cpu_log( log_point_s(5), 'transpo forward', 'continue' )
[1216]	311	CALL resort_for_xy( ar, ar_inv )
	312	CALL transpose_xy( ar_inv, ar )
[1]	313	CALL cpu_log( log_point_s(5), 'transpo forward', 'pause' )
	314
	315	CALL cpu_log( log_point_s(7), 'fft_y', 'start' )
[1208]	316	IF ( fft_method /= 'system-specific' ) THEN
	317	!$acc update host( ar )
	318	ENDIF
[1216]	319	CALL fft_y( ar, 'forward', ar_tr = ar, &
	320	nxl_y_bound = nxl_y, nxr_y_bound = nxr_y, &
	321	nxl_y_l = nxl_y, nxr_y_l = nxr_y )
[1208]	322	IF ( fft_method /= 'system-specific' ) THEN
	323	!$acc update device( ar )
	324	ENDIF
[1]	325	CALL cpu_log( log_point_s(7), 'fft_y', 'pause' )
	326
	327	!
	328	!-- Transposition y --> z
	329	CALL cpu_log( log_point_s(5), 'transpo forward', 'continue' )
[1216]	330	CALL resort_for_yz( ar, ar_inv )
	331	CALL transpose_yz( ar_inv, ar )
[1]	332	CALL cpu_log( log_point_s(5), 'transpo forward', 'stop' )
	333
	334	!
[1106]	335	!-- Solve the tridiagonal equation system along z
[1]	336	CALL cpu_log( log_point_s(6), 'tridia', 'start' )
[1212]	337	CALL tridia_substi( ar )
[1]	338	CALL cpu_log( log_point_s(6), 'tridia', 'stop' )
	339
	340	!
	341	!-- Inverse Fourier Transformation
	342	!-- Transposition z --> y
	343	CALL cpu_log( log_point_s(8), 'transpo invers', 'start' )
[1216]	344	CALL transpose_zy( ar, ar_inv )
	345	CALL resort_for_zy( ar_inv, ar )
[1]	346	CALL cpu_log( log_point_s(8), 'transpo invers', 'pause' )
	347
	348	CALL cpu_log( log_point_s(7), 'fft_y', 'continue' )
[1208]	349	IF ( fft_method /= 'system-specific' ) THEN
	350	!$acc update host( ar )
	351	ENDIF
[1216]	352	CALL fft_y( ar, 'backward', ar_tr = ar, &
	353	nxl_y_bound = nxl_y, nxr_y_bound = nxr_y, &
	354	nxl_y_l = nxl_y, nxr_y_l = nxr_y )
[1208]	355	IF ( fft_method /= 'system-specific' ) THEN
	356	!$acc update device( ar )
	357	ENDIF
[1]	358	CALL cpu_log( log_point_s(7), 'fft_y', 'stop' )
	359
	360	!
	361	!-- Transposition y --> x
	362	CALL cpu_log( log_point_s(8), 'transpo invers', 'continue' )
[1216]	363	CALL transpose_yx( ar, ar_inv )
	364	CALL resort_for_yx( ar_inv, ar )
[1]	365	CALL cpu_log( log_point_s(8), 'transpo invers', 'pause' )
	366
	367	CALL cpu_log( log_point_s(4), 'fft_x', 'continue' )
[1208]	368	IF ( fft_method /= 'system-specific' ) THEN
	369	!$acc update host( ar )
	370	ENDIF
[1106]	371	CALL fft_x( ar, 'backward' )
[1208]	372	IF ( fft_method /= 'system-specific' ) THEN
	373	!$acc update device( ar )
	374	ENDIF
[1]	375	CALL cpu_log( log_point_s(4), 'fft_x', 'stop' )
	376
	377	!
	378	!-- Transposition x --> z
	379	CALL cpu_log( log_point_s(8), 'transpo invers', 'continue' )
[1216]	380	CALL transpose_xz( ar, ar_inv )
	381	CALL resort_for_xz( ar_inv, ar )
[1]	382	CALL cpu_log( log_point_s(8), 'transpo invers', 'stop' )
	383
[1216]	384	ELSE
	385
	386	!
	387	!-- 2d-domain-decomposition or no decomposition (1 PE run) with
	388	!-- overlapping transposition / fft
	389	ALLOCATE( f_out_y(0:ny,nxl_y:nxr_y,nzb_y:nzt_y), &
	390	f_out_z(0:nx,nys_x:nyn_x,nzb_x:nzt_x) )
	391	!
	392	!-- Transposition z --> x + subsequent fft along x
	393	ALLOCATE( f_inv(nys:nyn,nxl:nxr,1:nz) )
	394	CALL resort_for_zx( ar, f_inv )
	395	!
	396	!-- Save original indices and gridpoint counter
	397	isave(1) = nz
	398	isave(2) = nzb_x
	399	isave(3) = nzt_x
	400	isave(4) = sendrecvcount_zx
	401	!
	402	!-- Set new indices for transformation
	403	nblk = nz / pdims(1)
	404	nz = pdims(1)
	405	nnz_x = 1
	406	nzb_x = 1 + myidx * nnz_x
	407	nzt_x = ( myidx + 1 ) * nnz_x
	408	sendrecvcount_zx = nnx * nny * nnz_x
	409
	410	ALLOCATE( ar1(0:nx,nys_x:nyn_x,nzb_x:nzt_x,2) )
	411	ALLOCATE( f_in(nys:nyn,nxl:nxr,1:nz) )
	412
	413	DO kk = 1, nblk+1
	414	ind_odd = MOD( kk, 2 ) + 1
	415	ind_even = MOD( kk+1, 2 ) + 1
	416	!$OMP sections private(ki,knew,n)
	417	!$OMP section
	418	IF ( kk <= nblk ) THEN
	419
	420	IF ( kk == 1 ) THEN
	421	CALL cpu_log( log_point_s(5), 'transpo forward', 'start' )
	422	ELSE
	423	CALL cpu_log( log_point_s(5), 'transpo forward', 'continue' )
	424	ENDIF
	425
	426	DO knew = 1, nz
	427	ki = kk + nblk * ( knew - 1 )
	428	f_in(:,:,knew) = f_inv(:,:,ki)
	429	ENDDO
	430
	431	CALL transpose_zx( f_in, ar1(:,:,:,ind_odd))
	432	CALL cpu_log( log_point_s(5), 'transpo forward', 'pause' )
	433
	434	ENDIF
	435
	436	!$OMP section
	437	IF ( kk >= 2 ) THEN
	438
	439	IF ( kk == 2 ) THEN
	440	CALL cpu_log( log_point_s(4), 'fft_x', 'start' )
	441	ELSE
	442	CALL cpu_log( log_point_s(4), 'fft_x', 'continue' )
	443	ENDIF
	444
	445	n = isave(2) + kk - 2
	446	CALL fft_x( ar1(:,:,:,ind_even), 'forward', ar_2d = f_out_z(:,:,n))
	447	CALL cpu_log( log_point_s(4), 'fft_x', 'pause' )
	448
	449	ENDIF
	450	!$OMP end sections
	451
	452	ENDDO
	453	!
	454	!-- Restore original indices/counters
	455	nz = isave(1)
	456	nzb_x = isave(2)
	457	nzt_x = isave(3)
	458	sendrecvcount_zx = isave(4)
	459
	460	DEALLOCATE( ar1, f_in, f_inv )
	461
	462	!
	463	!-- Transposition x --> y + subsequent fft along y
	464	ALLOCATE( f_inv(nys_x:nyn_x,nzb_x:nzt_x,0:nx) )
	465	CALL resort_for_xy( f_out_z, f_inv )
	466	!
	467	!-- Save original indices and gridpoint counter
	468	isave(1) = nx
	469	isave(2) = nxl_y
	470	isave(3) = nxr_y
	471	isave(4) = sendrecvcount_xy
	472	!
	473	!-- Set new indices for transformation
	474	nblk = ( ( nx+1 ) / pdims(2) ) - 1
	475	nx = pdims(2)
	476	nnx_y = 1
	477	nxl_y = myidy * nnx_y
	478	nxr_y = ( myidy + 1 ) * nnx_y - 1
	479	sendrecvcount_xy = nnx_y * ( nyn_x-nys_x+1 ) * ( nzt_x-nzb_x+1 )
	480
	481	ALLOCATE( ar1(0:ny,nxl_y:nxr_y,nzb_y:nzt_y,2) )
	482	ALLOCATE( f_in(nys_x:nyn_x,nzb_x:nzt_x,0:nx) )
	483
	484	DO ii = 0, nblk+1
	485	ind_odd = MOD( ii+1, 2 ) + 1
	486	ind_even = MOD( ii+2, 2 ) + 1
	487	!$OMP sections private(ki,knew,n)
	488	!$OMP section
	489	IF ( ii <= nblk ) THEN
	490
	491	CALL cpu_log( log_point_s(5), 'transpo forward', 'continue' )
	492
	493	DO inew = 0, nx-1
	494	iind = ii + ( nblk + 1 ) * inew
	495	f_in(:,:,inew) = f_inv(:,:,iind)
	496	ENDDO
	497
	498	CALL transpose_xy( f_in, ar1(:,:,:,ind_odd) )
	499
	500	CALL cpu_log( log_point_s(5), 'transpo forward', 'pause' )
	501
	502	ENDIF
	503
	504	!$OMP section
	505	IF ( ii >= 1 ) THEN
	506
	507	IF ( ii == 1 ) THEN
	508	CALL cpu_log( log_point_s(7), 'fft_y', 'start' )
	509	ELSE
	510	CALL cpu_log( log_point_s(7), 'fft_y', 'continue' )
	511	ENDIF
	512
	513	nxl_y_bound = isave(2)
	514	nxr_y_bound = isave(3)
	515	n = isave(2) + ii - 1
	516	! CALL fft_y( ar1(:,:,:,ind_even), 'forward', ar_3d = f_out_y, &
	517	! ni = n )
	518	CALL fft_y( ar1(:,:,:,ind_even), 'forward', ar_tr = f_out_y, &
	519	nxl_y_bound = nxl_y_bound, nxr_y_bound = nxr_y_bound, &
	520	nxl_y_l = n, nxr_y_l = n )
	521
	522	CALL cpu_log( log_point_s(7), 'fft_y', 'pause' )
	523
	524	ENDIF
	525	!$OMP end sections
	526
	527	ENDDO
	528	!
	529	!-- Restore original indices/counters
	530	nx = isave(1)
	531	nxl_y = isave(2)
	532	nxr_y = isave(3)
	533	sendrecvcount_xy = isave(4)
	534
	535	DEALLOCATE( ar1, f_in, f_inv )
	536
	537	!
	538	!-- Transposition y --> z + subsequent tridia + resort for z --> y
	539	ALLOCATE( f_inv(nxl_y:nxr_y,nzb_y:nzt_y,0:ny) )
	540	CALL resort_for_yz( f_out_y, f_inv )
	541	!
	542	!-- Save original indices and gridpoint counter
	543	isave(1) = ny
	544	isave(2) = nys_z
	545	isave(3) = nyn_z
	546	isave(4) = sendrecvcount_yz
	547	!
	548	!-- Set new indices for transformation
	549	nblk = ( ( ny+1 ) / pdims(1) ) - 1
	550	ny = pdims(1)
	551	nny_z = 1
	552	nys_z = myidx * nny_z
	553	nyn_z = ( myidx + 1 ) * nny_z - 1
	554	sendrecvcount_yz = ( nxr_y-nxl_y+1 ) * nny_z * ( nzt_y-nzb_y+1 )
	555
	556	ALLOCATE( ar1(nxl_z:nxr_z,nys_z:nyn_z,1:nz,3) )
	557	ALLOCATE( f_in(nxl_y:nxr_y,nzb_y:nzt_y,0:ny) )
	558
	559	DO jj = 0, nblk+2
	560	ind_odd = MOD( jj+3, 3 ) + 1
	561	ind_even = MOD( jj+2, 3 ) + 1
	562	ind_third = MOD( jj+1, 3 ) + 1
	563	!$OMP sections private(ki,knew,n)
	564	!$OMP section
	565	IF ( jj <= nblk ) THEN
	566	!
	567	!-- Forward Fourier Transformation
	568	!-- Transposition y --> z
	569	CALL cpu_log( log_point_s(5), 'transpo forward', 'continue' )
	570
	571	DO jnew = 0, ny-1
	572	jind = jj + ( nblk + 1 ) * jnew
	573	f_in(:,:,jnew) =f_inv(:,:,jind)
	574	ENDDO
	575
	576	CALL transpose_yz( f_in, ar1(:,:,:,ind_odd) )
	577
	578	IF ( jj == nblk ) THEN
	579	CALL cpu_log( log_point_s(5), 'transpo forward', 'stop' )
	580	ELSE
	581	CALL cpu_log( log_point_s(5), 'transpo forward', 'pause' )
	582	ENDIF
	583
	584	ENDIF
	585
	586	IF ( jj >= 2 ) THEN
	587	!
	588	!-- Inverse Fourier Transformation
	589	!-- Transposition z --> y
	590	!-- Only one thread should call MPI routines, therefore forward and
	591	!-- backward tranpose are in the same section
	592	IF ( jj == 2 ) THEN
	593	CALL cpu_log( log_point_s(8), 'transpo invers', 'start' )
	594	ELSE
	595	CALL cpu_log( log_point_s(8), 'transpo invers', 'continue' )
	596	ENDIF
	597
	598	CALL transpose_zy( ar1(:,:,:,ind_third), f_in )
	599
	600	DO jnew = 0, ny-1
	601	jind = jj-2 + ( nblk + 1 ) * jnew
	602	f_inv(:,:,jind) = f_in(:,:,jnew)
	603	ENDDO
	604
	605	CALL cpu_log( log_point_s(8), 'transpo invers', 'pause' )
	606
	607	ENDIF
	608
	609	!$OMP section
	610	IF ( jj >= 1 .AND. jj <= nblk+1 ) THEN
	611	!
	612	!-- Solve the tridiagonal equation system along z
	613	CALL cpu_log( log_point_s(6), 'tridia', 'start' )
	614
	615	n = isave(2) + jj - 1
	616	CALL tridia_substi_overlap( ar1(:,:,:,ind_even), n )
	617
	618	CALL cpu_log( log_point_s(6), 'tridia', 'stop' )
	619	ENDIF
	620	!$OMP end sections
	621
	622	ENDDO
	623	!
	624	!-- Restore original indices/counters
	625	ny = isave(1)
	626	nys_z = isave(2)
	627	nyn_z = isave(3)
	628	sendrecvcount_yz = isave(4)
	629
	630	CALL resort_for_zy( f_inv, f_out_y )
	631
	632	DEALLOCATE( ar1, f_in, f_inv )
	633
	634	!
	635	!-- fft along y backward + subsequent transposition y --> x
	636	ALLOCATE( f_inv(nys_x:nyn_x,nzb_x:nzt_x,0:nx) )
	637	!
	638	!-- Save original indices and gridpoint counter
	639	isave(1) = nx
	640	isave(2) = nxl_y
	641	isave(3) = nxr_y
	642	isave(4) = sendrecvcount_xy
	643	!
	644	!-- Set new indices for transformation
	645	nblk = (( nx+1 ) / pdims(2) ) - 1
	646	nx = pdims(2)
	647	nnx_y = 1
	648	nxl_y = myidy * nnx_y
	649	nxr_y = ( myidy + 1 ) * nnx_y - 1
	650	sendrecvcount_xy = nnx_y * ( nyn_x-nys_x+1 ) * ( nzt_x-nzb_x+1 )
	651
	652	ALLOCATE( ar1(0:ny,nxl_y:nxr_y,nzb_y:nzt_y,2) )
	653	ALLOCATE( f_in(nys_x:nyn_x,nzb_x:nzt_x,0:nx) )
	654
	655	DO ii = 0, nblk+1
	656	ind_odd = MOD( ii+1, 2 ) + 1
	657	ind_even = MOD( ii+2, 2 ) + 1
	658	!$OMP sections private(ki,knew,n)
	659	!$OMP section
	660	IF ( ii <= nblk ) THEN
	661
	662	CALL cpu_log( log_point_s(7), 'fft_y', 'continue' )
	663
	664	n = isave(2) + ii
	665	nxl_y_bound = isave(2)
	666	nxr_y_bound = isave(3)
	667
	668	! CALL fft_y( ar1(:,:,:,ind_even), 'backward', ar_3d = f_out_y, &
	669	! ni = n )
	670	CALL fft_y( ar1(:,:,:,ind_even), 'backward', ar_tr = f_out_y, &
	671	nxl_y_bound = nxl_y_bound, nxr_y_bound = nxr_y_bound, &
	672	nxl_y_l = n, nxr_y_l = n )
	673
	674	IF ( ii == nblk ) THEN
	675	CALL cpu_log( log_point_s(7), 'fft_y', 'stop' )
	676	ELSE
	677	CALL cpu_log( log_point_s(7), 'fft_y', 'pause' )
	678	ENDIF
	679
	680	ENDIF
	681
	682	!$OMP section
	683	IF ( ii >= 1 ) THEN
	684
	685	CALL cpu_log( log_point_s(8), 'transpo invers', 'continue' )
	686
	687	CALL transpose_yx( ar1(:,:,:,ind_odd), f_in )
	688
	689	DO inew = 0, nx-1
	690	iind = ii-1 + (nblk+1) * inew
	691	f_inv(:,:,iind) = f_in(:,:,inew)
	692	ENDDO
	693
	694	CALL cpu_log( log_point_s(8), 'transpo invers', 'pause' )
	695
	696	ENDIF
	697	!$OMP end sections
	698
	699	ENDDO
	700	!
	701	!-- Restore original indices/counters
	702	nx = isave(1)
	703	nxl_y = isave(2)
	704	nxr_y = isave(3)
	705	sendrecvcount_xy = isave(4)
	706
	707	CALL resort_for_yx( f_inv, f_out_z )
	708
	709	DEALLOCATE( ar1, f_in, f_inv )
	710
	711	!
	712	!-- fft along x backward + subsequent final transposition x --> z
	713	ALLOCATE( f_inv(nys:nyn,nxl:nxr,1:nz) )
	714	!
	715	!-- Save original indices and gridpoint counter
	716	isave(1) = nz
	717	isave(2) = nzb_x
	718	isave(3) = nzt_x
	719	isave(4) = sendrecvcount_zx
	720	!
	721	!-- Set new indices for transformation
	722	nblk = nz / pdims(1)
	723	nz = pdims(1)
	724	nnz_x = 1
	725	nzb_x = 1 + myidx * nnz_x
	726	nzt_x = ( myidx + 1 ) * nnz_x
	727	sendrecvcount_zx = nnx * nny * nnz_x
	728
	729	ALLOCATE( ar1(0:nx,nys_x:nyn_x,nzb_x:nzt_x,2) )
	730	ALLOCATE( f_in(nys:nyn,nxl:nxr,1:nz) )
	731
	732	DO kk = 1, nblk+1
	733	ind_odd = MOD( kk, 2 ) + 1
	734	ind_even = MOD( kk+1, 2 ) + 1
	735	!$OMP sections private(ki,knew,n)
	736	!$OMP section
	737	IF ( kk <= nblk ) THEN
	738
	739	CALL cpu_log( log_point_s(4), 'fft_x', 'continue' )
	740
	741	n = isave(2) + kk - 1
	742	CALL fft_x( ar1(:,:,:,ind_even), 'backward', f_out_z(:,:,n))
	743
	744	IF ( kk == nblk ) THEN
	745	CALL cpu_log( log_point_s(4), 'fft_x', 'stop' )
	746	ELSE
	747	CALL cpu_log( log_point_s(4), 'fft_x', 'pause' )
	748	ENDIF
	749
	750	ENDIF
	751
	752	!$OMP section
	753	IF ( kk >= 2 ) THEN
	754
	755	CALL cpu_log( log_point_s(8), 'transpo invers', 'continue' )
	756
	757	CALL transpose_xz( ar1(:,:,:,ind_odd), f_in )
	758
	759	DO knew = 1, nz
	760	ki = kk-1 + nblk * (knew-1)
	761	f_inv(:,:,ki) = f_in(:,:,knew)
	762	ENDDO
	763
	764	IF ( kk == nblk+1 ) THEN
	765	CALL cpu_log( log_point_s(8), 'transpo invers', 'stop' )
	766	ELSE
	767	CALL cpu_log( log_point_s(8), 'transpo invers', 'pause' )
	768	ENDIF
	769
	770	ENDIF
	771	!$OMP end sections
	772
	773	ENDDO
	774	!
	775	!-- Restore original indices/counters
	776	nz = isave(1)
	777	nzb_x = isave(2)
	778	nzt_x = isave(3)
	779	sendrecvcount_zx = isave(4)
	780
	781	CALL resort_for_xz( f_inv, ar )
	782
	783	DEALLOCATE( ar1, f_in, f_inv )
	784
[1]	785	ENDIF
	786
	787	CALL cpu_log( log_point_s(3), 'poisfft', 'stop' )
	788
	789	END SUBROUTINE poisfft
	790
	791
	792
[1216]	793	SUBROUTINE ffty_tr_yx( f_in, f_out )
[1]	794
	795	!------------------------------------------------------------------------------!
	796	! Fourier-transformation along y with subsequent transposition y --> x for
	797	! a 1d-decomposition along x
	798	!
	799	! ATTENTION: The performance of this routine is much faster on the NEC-SX6,
	800	! if the first index of work_ffty_vec is odd. Otherwise
	801	! memory bank conflicts may occur (especially if the index is a
	802	! multiple of 128). That's why work_ffty_vec is dimensioned as
	803	! 0:ny+1.
	804	! Of course, this will not work if users are using an odd number
	805	! of gridpoints along y.
	806	!------------------------------------------------------------------------------!
	807
	808	USE control_parameters
	809	USE cpulog
	810	USE indices
	811	USE interfaces
	812	USE pegrid
	813	USE transpose_indices
	814
	815	IMPLICIT NONE
	816
	817	INTEGER :: i, iend, iouter, ir, j, k
	818	INTEGER, PARAMETER :: stridex = 4
	819
	820	REAL, DIMENSION(0:ny,stridex) :: work_ffty
	821	#if defined( __nec )
	822	REAL, DIMENSION(0:ny+1,1:nz,nxl:nxr) :: work_ffty_vec
	823	#endif
[1003]	824	REAL, DIMENSION(1:nz,0:ny,nxl:nxr) :: f_in
	825	REAL, DIMENSION(nnx,1:nz,nys_x:nyn_x,pdims(1)) :: f_out
	826	REAL, DIMENSION(nxl:nxr,1:nz,0:ny) :: work
[1]	827
	828	!
	829	!-- Carry out the FFT along y, where all data are present due to the
	830	!-- 1d-decomposition along x. Resort the data in a way that x becomes
	831	!-- the first index.
[1106]	832	CALL cpu_log( log_point_s(7), 'fft_y_1d', 'start' )
[1]	833
	834	IF ( host(1:3) == 'nec' ) THEN
	835	#if defined( __nec )
	836	!
	837	!-- Code optimized for vector processors
[85]	838	!$OMP PARALLEL PRIVATE ( i, j, k )
[1]	839	!$OMP DO
	840	DO i = nxl, nxr
	841
	842	DO j = 0, ny
	843	DO k = 1, nz
	844	work_ffty_vec(j,k,i) = f_in(k,j,i)
	845	ENDDO
	846	ENDDO
	847
	848	CALL fft_y_m( work_ffty_vec(:,:,i), ny+1, 'forward' )
	849
	850	ENDDO
	851
	852	!$OMP DO
	853	DO k = 1, nz
	854	DO j = 0, ny
	855	DO i = nxl, nxr
	856	work(i,k,j) = work_ffty_vec(j,k,i)
	857	ENDDO
	858	ENDDO
	859	ENDDO
	860	!$OMP END PARALLEL
	861	#endif
	862
	863	ELSE
	864
	865	!
	866	!-- Cache optimized code.
	867	!-- The i-(x-)direction is split into a strided outer loop and an inner
	868	!-- loop for better cache performance
	869	!$OMP PARALLEL PRIVATE (i,iend,iouter,ir,j,k,work_ffty)
	870	!$OMP DO
	871	DO iouter = nxl, nxr, stridex
	872
	873	iend = MIN( iouter+stridex-1, nxr ) ! Upper bound for inner i loop
	874
	875	DO k = 1, nz
	876
	877	DO i = iouter, iend
	878
	879	ir = i-iouter+1 ! counter within a stride
	880	DO j = 0, ny
	881	work_ffty(j,ir) = f_in(k,j,i)
	882	ENDDO
	883	!
	884	!-- FFT along y
[1106]	885	CALL fft_y_1d( work_ffty(:,ir), 'forward' )
[1]	886
	887	ENDDO
	888
	889	!
	890	!-- Resort
	891	DO j = 0, ny
	892	DO i = iouter, iend
	893	work(i,k,j) = work_ffty(j,i-iouter+1)
	894	ENDDO
	895	ENDDO
	896
	897	ENDDO
	898
	899	ENDDO
	900	!$OMP END PARALLEL
	901
	902	ENDIF
[1106]	903	CALL cpu_log( log_point_s(7), 'fft_y_1d', 'pause' )
[1]	904
	905	!
	906	!-- Transpose array
[1111]	907	#if defined( __parallel )
[1]	908	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
[622]	909	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
[1]	910	CALL MPI_ALLTOALL( work(nxl,1,0), sendrecvcount_xy, MPI_REAL, &
	911	f_out(1,1,nys_x,1), sendrecvcount_xy, MPI_REAL, &
	912	comm1dx, ierr )
	913	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
[1111]	914	#endif
[1]	915
	916	END SUBROUTINE ffty_tr_yx
	917
	918
[1216]	919	SUBROUTINE tr_xy_ffty( f_in, f_out )
[1]	920
	921	!------------------------------------------------------------------------------!
	922	! Transposition x --> y with a subsequent backward Fourier transformation for
	923	! a 1d-decomposition along x
	924	!------------------------------------------------------------------------------!
	925
	926	USE control_parameters
	927	USE cpulog
	928	USE indices
	929	USE interfaces
	930	USE pegrid
	931	USE transpose_indices
	932
	933	IMPLICIT NONE
	934
	935	INTEGER :: i, iend, iouter, ir, j, k
	936	INTEGER, PARAMETER :: stridex = 4
	937
	938	REAL, DIMENSION(0:ny,stridex) :: work_ffty
	939	#if defined( __nec )
	940	REAL, DIMENSION(0:ny+1,1:nz,nxl:nxr) :: work_ffty_vec
	941	#endif
[1003]	942	REAL, DIMENSION(nnx,1:nz,nys_x:nyn_x,pdims(1)) :: f_in
	943	REAL, DIMENSION(1:nz,0:ny,nxl:nxr) :: f_out
	944	REAL, DIMENSION(nxl:nxr,1:nz,0:ny) :: work
[1]	945
	946	!
	947	!-- Transpose array
[1111]	948	#if defined( __parallel )
[1]	949	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
[622]	950	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
[1]	951	CALL MPI_ALLTOALL( f_in(1,1,nys_x,1), sendrecvcount_xy, MPI_REAL, &
	952	work(nxl,1,0), sendrecvcount_xy, MPI_REAL, &
	953	comm1dx, ierr )
	954	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
[1111]	955	#endif
[1]	956
	957	!
	958	!-- Resort the data in a way that y becomes the first index and carry out the
	959	!-- backward fft along y.
[1106]	960	CALL cpu_log( log_point_s(7), 'fft_y_1d', 'continue' )
[1]	961
	962	IF ( host(1:3) == 'nec' ) THEN
	963	#if defined( __nec )
	964	!
	965	!-- Code optimized for vector processors
[85]	966	!$OMP PARALLEL PRIVATE ( i, j, k )
[1]	967	!$OMP DO
	968	DO k = 1, nz
	969	DO j = 0, ny
	970	DO i = nxl, nxr
	971	work_ffty_vec(j,k,i) = work(i,k,j)
	972	ENDDO
	973	ENDDO
	974	ENDDO
	975
	976	!$OMP DO
	977	DO i = nxl, nxr
	978
	979	CALL fft_y_m( work_ffty_vec(:,:,i), ny+1, 'backward' )
	980
	981	DO j = 0, ny
	982	DO k = 1, nz
	983	f_out(k,j,i) = work_ffty_vec(j,k,i)
	984	ENDDO
	985	ENDDO
	986
	987	ENDDO
	988	!$OMP END PARALLEL
	989	#endif
	990
	991	ELSE
	992
	993	!
	994	!-- Cache optimized code.
	995	!-- The i-(x-)direction is split into a strided outer loop and an inner
	996	!-- loop for better cache performance
	997	!$OMP PARALLEL PRIVATE ( i, iend, iouter, ir, j, k, work_ffty )
	998	!$OMP DO
	999	DO iouter = nxl, nxr, stridex
	1000
	1001	iend = MIN( iouter+stridex-1, nxr ) ! Upper bound for inner i loop
	1002
	1003	DO k = 1, nz
	1004	!
	1005	!-- Resort
	1006	DO j = 0, ny
	1007	DO i = iouter, iend
	1008	work_ffty(j,i-iouter+1) = work(i,k,j)
	1009	ENDDO
	1010	ENDDO
	1011
	1012	DO i = iouter, iend
	1013
	1014	!
	1015	!-- FFT along y
	1016	ir = i-iouter+1 ! counter within a stride
[1106]	1017	CALL fft_y_1d( work_ffty(:,ir), 'backward' )
[1]	1018
	1019	DO j = 0, ny
	1020	f_out(k,j,i) = work_ffty(j,ir)
	1021	ENDDO
	1022	ENDDO
	1023
	1024	ENDDO
	1025
	1026	ENDDO
	1027	!$OMP END PARALLEL
	1028
	1029	ENDIF
	1030
[1106]	1031	CALL cpu_log( log_point_s(7), 'fft_y_1d', 'stop' )
[1]	1032
	1033	END SUBROUTINE tr_xy_ffty
	1034
	1035
	1036	SUBROUTINE fftx_tri_fftx( ar )
	1037
	1038	!------------------------------------------------------------------------------!
	1039	! FFT along x, solution of the tridiagonal system and backward FFT for
	1040	! a 1d-decomposition along x
	1041	!
	1042	! WARNING: this subroutine may still not work for hybrid parallelization
	1043	! with OpenMP (for possible necessary changes see the original
	1044	! routine poisfft_hybrid, developed by Klaus Ketelsen, May 2002)
	1045	!------------------------------------------------------------------------------!
	1046
	1047	USE control_parameters
	1048	USE cpulog
	1049	USE grid_variables
	1050	USE indices
	1051	USE interfaces
	1052	USE pegrid
	1053	USE transpose_indices
	1054
	1055	IMPLICIT NONE
	1056
	1057	INTEGER :: i, j, k, m, n, omp_get_thread_num, tn
	1058
[1003]	1059	REAL, DIMENSION(0:nx) :: work_fftx
	1060	REAL, DIMENSION(0:nx,1:nz) :: work_trix
	1061	REAL, DIMENSION(nnx,1:nz,nys_x:nyn_x,pdims(1)) :: ar
	1062	REAL, DIMENSION(:,:,:,:), ALLOCATABLE :: tri
[1]	1063
	1064
[1106]	1065	CALL cpu_log( log_point_s(33), 'fft_x_1d + tridia', 'start' )
[1]	1066
	1067	ALLOCATE( tri(5,0:nx,0:nz-1,0:threads_per_task-1) )
	1068
	1069	tn = 0 ! Default thread number in case of one thread
	1070	!$OMP PARALLEL DO PRIVATE ( i, j, k, m, n, tn, work_fftx, work_trix )
	1071	DO j = nys_x, nyn_x
	1072
	1073	!$ tn = omp_get_thread_num()
	1074
	1075	IF ( host(1:3) == 'nec' ) THEN
	1076	!
	1077	!-- Code optimized for vector processors
	1078	DO k = 1, nz
	1079
	1080	m = 0
	1081	DO n = 1, pdims(1)
[1003]	1082	DO i = 1, nnx
[1]	1083	work_trix(m,k) = ar(i,k,j,n)
	1084	m = m + 1
	1085	ENDDO
	1086	ENDDO
	1087
	1088	ENDDO
	1089
	1090	CALL fft_x_m( work_trix, 'forward' )
	1091
	1092	ELSE
	1093	!
	1094	!-- Cache optimized code
	1095	DO k = 1, nz
	1096
	1097	m = 0
	1098	DO n = 1, pdims(1)
[1003]	1099	DO i = 1, nnx
[1]	1100	work_fftx(m) = ar(i,k,j,n)
	1101	m = m + 1
	1102	ENDDO
	1103	ENDDO
	1104
[1106]	1105	CALL fft_x_1d( work_fftx, 'forward' )
[1]	1106
	1107	DO i = 0, nx
	1108	work_trix(i,k) = work_fftx(i)
	1109	ENDDO
	1110
	1111	ENDDO
	1112
	1113	ENDIF
	1114
	1115	!
	1116	!-- Solve the linear equation system
	1117	CALL tridia_1dd( ddx2, ddy2, nx, ny, j, work_trix, tri(:,:,:,tn) )
	1118
	1119	IF ( host(1:3) == 'nec' ) THEN
	1120	!
	1121	!-- Code optimized for vector processors
	1122	CALL fft_x_m( work_trix, 'backward' )
	1123
	1124	DO k = 1, nz
	1125
	1126	m = 0
	1127	DO n = 1, pdims(1)
[1003]	1128	DO i = 1, nnx
[1]	1129	ar(i,k,j,n) = work_trix(m,k)
	1130	m = m + 1
	1131	ENDDO
	1132	ENDDO
	1133
	1134	ENDDO
	1135
	1136	ELSE
	1137	!
	1138	!-- Cache optimized code
	1139	DO k = 1, nz
	1140
	1141	DO i = 0, nx
	1142	work_fftx(i) = work_trix(i,k)
	1143	ENDDO
	1144
[1106]	1145	CALL fft_x_1d( work_fftx, 'backward' )
[1]	1146
	1147	m = 0
	1148	DO n = 1, pdims(1)
[1003]	1149	DO i = 1, nnx
[1]	1150	ar(i,k,j,n) = work_fftx(m)
	1151	m = m + 1
	1152	ENDDO
	1153	ENDDO
	1154
	1155	ENDDO
	1156
	1157	ENDIF
	1158
	1159	ENDDO
	1160
	1161	DEALLOCATE( tri )
	1162
[1106]	1163	CALL cpu_log( log_point_s(33), 'fft_x_1d + tridia', 'stop' )
[1]	1164
	1165	END SUBROUTINE fftx_tri_fftx
	1166
	1167
[1216]	1168	SUBROUTINE fftx_tr_xy( f_in, f_out )
[1]	1169
	1170	!------------------------------------------------------------------------------!
	1171	! Fourier-transformation along x with subsequent transposition x --> y for
	1172	! a 1d-decomposition along y
	1173	!
	1174	! ATTENTION: The NEC-branch of this routine may significantly profit from
	1175	! further optimizations. So far, performance is much worse than
	1176	! for routine ffty_tr_yx (more than three times slower).
	1177	!------------------------------------------------------------------------------!
	1178
	1179	USE control_parameters
	1180	USE cpulog
	1181	USE indices
	1182	USE interfaces
	1183	USE pegrid
	1184	USE transpose_indices
	1185
	1186	IMPLICIT NONE
	1187
	1188	INTEGER :: i, j, k
	1189
[1003]	1190	REAL, DIMENSION(0:nx,1:nz,nys:nyn) :: work_fftx
	1191	REAL, DIMENSION(1:nz,nys:nyn,0:nx) :: f_in
	1192	REAL, DIMENSION(nny,1:nz,nxl_y:nxr_y,pdims(2)) :: f_out
	1193	REAL, DIMENSION(nys:nyn,1:nz,0:nx) :: work
[1]	1194
	1195	!
	1196	!-- Carry out the FFT along x, where all data are present due to the
	1197	!-- 1d-decomposition along y. Resort the data in a way that y becomes
	1198	!-- the first index.
[1106]	1199	CALL cpu_log( log_point_s(4), 'fft_x_1d', 'start' )
[1]	1200
	1201	IF ( host(1:3) == 'nec' ) THEN
	1202	!
	1203	!-- Code for vector processors
[85]	1204	!$OMP PARALLEL PRIVATE ( i, j, k )
[1]	1205	!$OMP DO
	1206	DO i = 0, nx
	1207
	1208	DO j = nys, nyn
	1209	DO k = 1, nz
	1210	work_fftx(i,k,j) = f_in(k,j,i)
	1211	ENDDO
	1212	ENDDO
	1213
	1214	ENDDO
	1215
	1216	!$OMP DO
	1217	DO j = nys, nyn
	1218
	1219	CALL fft_x_m( work_fftx(:,:,j), 'forward' )
	1220
	1221	DO k = 1, nz
	1222	DO i = 0, nx
	1223	work(j,k,i) = work_fftx(i,k,j)
	1224	ENDDO
	1225	ENDDO
	1226
	1227	ENDDO
	1228	!$OMP END PARALLEL
	1229
	1230	ELSE
	1231
	1232	!
	1233	!-- Cache optimized code (there might be still a potential for better
	1234	!-- optimization).
[696]	1235	!$OMP PARALLEL PRIVATE (i,j,k)
[1]	1236	!$OMP DO
	1237	DO i = 0, nx
	1238
	1239	DO j = nys, nyn
	1240	DO k = 1, nz
	1241	work_fftx(i,k,j) = f_in(k,j,i)
	1242	ENDDO
	1243	ENDDO
	1244
	1245	ENDDO
	1246
	1247	!$OMP DO
	1248	DO j = nys, nyn
	1249	DO k = 1, nz
	1250
[1106]	1251	CALL fft_x_1d( work_fftx(0:nx,k,j), 'forward' )
[1]	1252
	1253	DO i = 0, nx
	1254	work(j,k,i) = work_fftx(i,k,j)
	1255	ENDDO
	1256	ENDDO
	1257
	1258	ENDDO
	1259	!$OMP END PARALLEL
	1260
	1261	ENDIF
[1106]	1262	CALL cpu_log( log_point_s(4), 'fft_x_1d', 'pause' )
[1]	1263
	1264	!
	1265	!-- Transpose array
[1111]	1266	#if defined( __parallel )
[1]	1267	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
[622]	1268	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
[1]	1269	CALL MPI_ALLTOALL( work(nys,1,0), sendrecvcount_xy, MPI_REAL, &
	1270	f_out(1,1,nxl_y,1), sendrecvcount_xy, MPI_REAL, &
	1271	comm1dy, ierr )
	1272	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
[1111]	1273	#endif
[1]	1274
	1275	END SUBROUTINE fftx_tr_xy
	1276
	1277
[1216]	1278	SUBROUTINE tr_yx_fftx( f_in, f_out )
[1]	1279
	1280	!------------------------------------------------------------------------------!
	1281	! Transposition y --> x with a subsequent backward Fourier transformation for
	1282	! a 1d-decomposition along x
	1283	!------------------------------------------------------------------------------!
	1284
	1285	USE control_parameters
	1286	USE cpulog
	1287	USE indices
	1288	USE interfaces
	1289	USE pegrid
	1290	USE transpose_indices
	1291
	1292	IMPLICIT NONE
	1293
	1294	INTEGER :: i, j, k
	1295
[1003]	1296	REAL, DIMENSION(0:nx,1:nz,nys:nyn) :: work_fftx
	1297	REAL, DIMENSION(nny,1:nz,nxl_y:nxr_y,pdims(2)) :: f_in
	1298	REAL, DIMENSION(1:nz,nys:nyn,0:nx) :: f_out
	1299	REAL, DIMENSION(nys:nyn,1:nz,0:nx) :: work
[1]	1300
	1301	!
	1302	!-- Transpose array
[1111]	1303	#if defined( __parallel )
[1]	1304	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
[622]	1305	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
[1]	1306	CALL MPI_ALLTOALL( f_in(1,1,nxl_y,1), sendrecvcount_xy, MPI_REAL, &
	1307	work(nys,1,0), sendrecvcount_xy, MPI_REAL, &
	1308	comm1dy, ierr )
	1309	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
[1111]	1310	#endif
[1]	1311
	1312	!
	1313	!-- Carry out the FFT along x, where all data are present due to the
	1314	!-- 1d-decomposition along y. Resort the data in a way that y becomes
	1315	!-- the first index.
[1106]	1316	CALL cpu_log( log_point_s(4), 'fft_x_1d', 'continue' )
[1]	1317
	1318	IF ( host(1:3) == 'nec' ) THEN
	1319	!
	1320	!-- Code optimized for vector processors
[85]	1321	!$OMP PARALLEL PRIVATE ( i, j, k )
[1]	1322	!$OMP DO
	1323	DO j = nys, nyn
	1324
	1325	DO k = 1, nz
	1326	DO i = 0, nx
	1327	work_fftx(i,k,j) = work(j,k,i)
	1328	ENDDO
	1329	ENDDO
	1330
	1331	CALL fft_x_m( work_fftx(:,:,j), 'backward' )
	1332
	1333	ENDDO
	1334
	1335	!$OMP DO
	1336	DO i = 0, nx
	1337	DO j = nys, nyn
	1338	DO k = 1, nz
	1339	f_out(k,j,i) = work_fftx(i,k,j)
	1340	ENDDO
	1341	ENDDO
	1342	ENDDO
	1343	!$OMP END PARALLEL
	1344
	1345	ELSE
	1346
	1347	!
	1348	!-- Cache optimized code (there might be still a potential for better
	1349	!-- optimization).
[696]	1350	!$OMP PARALLEL PRIVATE (i,j,k)
[1]	1351	!$OMP DO
	1352	DO j = nys, nyn
	1353	DO k = 1, nz
	1354
	1355	DO i = 0, nx
	1356	work_fftx(i,k,j) = work(j,k,i)
	1357	ENDDO
	1358
[1106]	1359	CALL fft_x_1d( work_fftx(0:nx,k,j), 'backward' )
[1]	1360
	1361	ENDDO
	1362	ENDDO
	1363
	1364	!$OMP DO
	1365	DO i = 0, nx
	1366	DO j = nys, nyn
	1367	DO k = 1, nz
	1368	f_out(k,j,i) = work_fftx(i,k,j)
	1369	ENDDO
	1370	ENDDO
	1371	ENDDO
	1372	!$OMP END PARALLEL
	1373
	1374	ENDIF
[1106]	1375	CALL cpu_log( log_point_s(4), 'fft_x_1d', 'stop' )
[1]	1376
	1377	END SUBROUTINE tr_yx_fftx
	1378
	1379
	1380	SUBROUTINE ffty_tri_ffty( ar )
	1381
	1382	!------------------------------------------------------------------------------!
	1383	! FFT along y, solution of the tridiagonal system and backward FFT for
	1384	! a 1d-decomposition along y
	1385	!
	1386	! WARNING: this subroutine may still not work for hybrid parallelization
	1387	! with OpenMP (for possible necessary changes see the original
	1388	! routine poisfft_hybrid, developed by Klaus Ketelsen, May 2002)
	1389	!------------------------------------------------------------------------------!
	1390
	1391	USE control_parameters
	1392	USE cpulog
	1393	USE grid_variables
	1394	USE indices
	1395	USE interfaces
	1396	USE pegrid
	1397	USE transpose_indices
	1398
	1399	IMPLICIT NONE
	1400
	1401	INTEGER :: i, j, k, m, n, omp_get_thread_num, tn
	1402
[1003]	1403	REAL, DIMENSION(0:ny) :: work_ffty
	1404	REAL, DIMENSION(0:ny,1:nz) :: work_triy
	1405	REAL, DIMENSION(nny,1:nz,nxl_y:nxr_y,pdims(2)) :: ar
	1406	REAL, DIMENSION(:,:,:,:), ALLOCATABLE :: tri
[1]	1407
	1408
[1106]	1409	CALL cpu_log( log_point_s(39), 'fft_y_1d + tridia', 'start' )
[1]	1410
	1411	ALLOCATE( tri(5,0:ny,0:nz-1,0:threads_per_task-1) )
	1412
	1413	tn = 0 ! Default thread number in case of one thread
[696]	1414	!$OMP PARALLEL DO PRIVATE ( i, j, k, m, n, tn, work_ffty, work_triy )
[1]	1415	DO i = nxl_y, nxr_y
	1416
	1417	!$ tn = omp_get_thread_num()
	1418
	1419	IF ( host(1:3) == 'nec' ) THEN
	1420	!
	1421	!-- Code optimized for vector processors
	1422	DO k = 1, nz
	1423
	1424	m = 0
	1425	DO n = 1, pdims(2)
[1003]	1426	DO j = 1, nny
[1]	1427	work_triy(m,k) = ar(j,k,i,n)
	1428	m = m + 1
	1429	ENDDO
	1430	ENDDO
	1431
	1432	ENDDO
	1433
	1434	CALL fft_y_m( work_triy, ny, 'forward' )
	1435
	1436	ELSE
	1437	!
	1438	!-- Cache optimized code
	1439	DO k = 1, nz
	1440
	1441	m = 0
	1442	DO n = 1, pdims(2)
[1003]	1443	DO j = 1, nny
[1]	1444	work_ffty(m) = ar(j,k,i,n)
	1445	m = m + 1
	1446	ENDDO
	1447	ENDDO
	1448
[1106]	1449	CALL fft_y_1d( work_ffty, 'forward' )
[1]	1450
	1451	DO j = 0, ny
	1452	work_triy(j,k) = work_ffty(j)
	1453	ENDDO
	1454
	1455	ENDDO
	1456
	1457	ENDIF
	1458
	1459	!
	1460	!-- Solve the linear equation system
	1461	CALL tridia_1dd( ddy2, ddx2, ny, nx, i, work_triy, tri(:,:,:,tn) )
	1462
	1463	IF ( host(1:3) == 'nec' ) THEN
	1464	!
	1465	!-- Code optimized for vector processors
	1466	CALL fft_y_m( work_triy, ny, 'backward' )
	1467
	1468	DO k = 1, nz
	1469
	1470	m = 0
	1471	DO n = 1, pdims(2)
[1003]	1472	DO j = 1, nny
[1]	1473	ar(j,k,i,n) = work_triy(m,k)
	1474	m = m + 1
	1475	ENDDO
	1476	ENDDO
	1477
	1478	ENDDO
	1479
	1480	ELSE
	1481	!
	1482	!-- Cache optimized code
	1483	DO k = 1, nz
	1484
	1485	DO j = 0, ny
	1486	work_ffty(j) = work_triy(j,k)
	1487	ENDDO
	1488
[1106]	1489	CALL fft_y_1d( work_ffty, 'backward' )
[1]	1490
	1491	m = 0
	1492	DO n = 1, pdims(2)
[1003]	1493	DO j = 1, nny
[1]	1494	ar(j,k,i,n) = work_ffty(m)
	1495	m = m + 1
	1496	ENDDO
	1497	ENDDO
	1498
	1499	ENDDO
	1500
	1501	ENDIF
	1502
	1503	ENDDO
	1504
	1505	DEALLOCATE( tri )
	1506
[1106]	1507	CALL cpu_log( log_point_s(39), 'fft_y_1d + tridia', 'stop' )
[1]	1508
	1509	END SUBROUTINE ffty_tri_ffty
	1510
	1511	#endif
	1512
	1513	END MODULE poisfft_mod

Note: See TracBrowser for help on using the repository browser.

Download in other formats:

| Impressum | ©Leibniz Universität Hannover |