Home

Context Navigation

source: palm/trunk/SOURCE/poisfft.f90 @ 1310

Last change on this file since 1310 was 1310, checked in by raasch, 10 years ago
update of GPL copyright
Property svn:keywords set to `Id`
File size: 42.1 KB

Rev	Line
[1]	1	MODULE poisfft_mod
	2
[1036]	3	!--------------------------------------------------------------------------------!
	4	! This file is part of PALM.
	5	!
	6	! PALM is free software: you can redistribute it and/or modify it under the terms
	7	! of the GNU General Public License as published by the Free Software Foundation,
	8	! either version 3 of the License, or (at your option) any later version.
	9	!
	10	! PALM is distributed in the hope that it will be useful, but WITHOUT ANY
	11	! WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
	12	! A PARTICULAR PURPOSE. See the GNU General Public License for more details.
	13	!
	14	! You should have received a copy of the GNU General Public License along with
	15	! PALM. If not, see <http://www.gnu.org/licenses/>.
	16	!
[1310]	17	! Copyright 1997-2014 Leibniz Universitaet Hannover
[1036]	18	!--------------------------------------------------------------------------------!
	19	!
[484]	20	! Current revisions:
[1]	21	! -----------------
[1217]	22	!
[1307]	23	!
[1217]	24	! Former revisions:
	25	! -----------------
	26	! $Id: poisfft.f90 1310 2014-03-14 08:01:56Z raasch $
	27	!
[1307]	28	! 1306 2014-03-13 14:30:59Z raasch
	29	! openmp sections removed from the overlap branch,
	30	! second argument removed from parameter list
	31	!
[1217]	32	! 1216 2013-08-26 09:31:42Z raasch
[1216]	33	! resorting of arrays moved to separate routines resort_for_...,
	34	! one argument, used as temporary work array, removed from all transpose
	35	! routines
	36	! overlapping fft / transposition implemented
[1112]	37	!
[1213]	38	! 1212 2013-08-15 08:46:27Z raasch
	39	! tridia routines moved to seperate module tridia_solver
	40	!
[1209]	41	! 1208 2013-08-13 06:41:49Z raasch
	42	! acc-update clauses added for "ar" so that ffts other than cufft can also be
	43	! used (although they are not ported and will give a poor performance)
	44	!
[1112]	45	! 1111 2013-03-08 23:54:10Z raasch
[1111]	46	! further openACC porting of non-parallel (MPI) branch:
	47	! tridiagonal routines split into extermal subroutines (instead using CONTAINS),
	48	! no distinction between parallel/non-parallel in poisfft and tridia any more,
[1112]	49	! tridia routines moved to end of file because of probable bug in PGI compiler 12.5
[1111]	50	! (otherwise "invalid device function" is indicated during runtime),
	51	! optimization of tridia routines: constant elements and coefficients of tri are
	52	! stored in seperate arrays ddzuw and tric, last dimension of tri reduced from 5
	53	! to 2,
	54	! poisfft_init is now called internally from poisfft, maketri is called from
	55	! poisfft_init,
	56	! ibc_p_b = 2 removed
[1]	57	!
[1107]	58	! 1106 2013-03-04 05:31:38Z raasch
	59	! routines fftx, ffty, fftxp, fftyp removed, calls replaced by fft_x, fft_y,
	60	! in the 1D-decomposition routines fft_x, ffty are replaced by fft_x_1d,
	61	! fft_y_1d
	62	!
[1104]	63	! 1103 2013-02-20 02:15:53Z raasch
	64	! tri, ar, and ar1 arguments in tridia-routines (2d) are removed because they
	65	! sometimes cause segmentation faults with intel 12.1 compiler
	66	!
[1093]	67	! 1092 2013-02-02 11:24:22Z raasch
	68	! unused variables removed
	69	!
[1037]	70	! 1036 2012-10-22 13:43:42Z raasch
	71	! code put under GPL (PALM 3.9)
	72	!
[1014]	73	! 2012-09-21 07:03:55Z raasch
	74	! FLOAT type conversion replaced by REAL
	75	!
[1004]	76	! 1003 2012-09-14 14:35:53Z raasch
	77	! indices nxa, nya, etc. replaced by nx, ny, etc.
	78	!
[941]	79	! 940 2012-07-09 14:31:00Z raasch
	80	! special handling of tri-array as an argument in tridia_1dd routines switched
	81	! off because it caused segmentation faults with intel 12.1 compiler
	82	!
[878]	83	! 877 2012-04-03 11:21:44Z suehring
	84	! Bugfix: Avoid divisions by zero in case of using a 'neumann' bc for the
	85	! pressure at the top of the model domain.
	86	!
[810]	87	! 809 2012-01-30 13:32:58Z maronga
	88	! Bugfix: replaced .AND. and .NOT. with && and ! in the preprocessor directives
	89	!
[808]	90	! 807 2012-01-25 11:53:51Z maronga
	91	! New cpp directive "__check" implemented which is used by check_namelist_files
	92	! (most of the code is unneeded by check_namelist_files).
	93	!
[764]	94	! 763 2011-10-06 09:32:09Z suehring
	95	! Comment added concerning the last change.
	96	!
[762]	97	! 761 2011-10-05 17:58:52Z suehring
	98	! Bugfix: Avoid divisions by zero in case of using a 'neumann' bc for the
	99	! pressure at the top of the model domain.
	100	!
[697]	101	! 696 2011-03-18 07:03:49Z raasch
	102	! work_fftx removed from PRIVATE clauses in fftx_tr_xy and tr_yx_fftx
	103	!
[684]	104	! 683 2011-02-09 14:25:15Z raasch
	105	! openMP parallelization for 2d-domain-decomposition
	106	!
[668]	107	! 667 2010-12-23 12:06:00Z suehring/gryschka
	108	! ddzu replaced by ddzu_pres due to changes in zu(0)
	109	!
[623]	110	! 622 2010-12-10 08:08:13Z raasch
	111	! optional barriers included in order to speed up collective operations
	112	!
[392]	113	! 377 2009-09-04 11:09:00Z raasch
	114	! __lcmuk changed to __lc to avoid problems with Intel compiler on sgi-ice
	115	!
[198]	116	! 164 2008-05-15 08:46:15Z raasch
	117	! Arguments removed from transpose routines
	118	!
[139]	119	! 128 2007-10-26 13:11:14Z raasch
	120	! Bugfix: wavenumber calculation for even nx in routines maketri
	121	!
[90]	122	! 85 2007-05-11 09:35:14Z raasch
	123	! Bugfix: work_fft*_vec removed from some PRIVATE-declarations
	124	!
[77]	125	! 76 2007-03-29 00:58:32Z raasch
	126	! Tridiagonal coefficients adjusted for Neumann boundary conditions both at
	127	! the bottom and the top.
	128	!
[3]	129	! RCS Log replace by Id keyword, revision history cleaned up
	130	!
[1]	131	! Revision 1.24 2006/08/04 15:00:24 raasch
	132	! Default setting of the thread number tn in case of not using OpenMP
	133	!
	134	! Revision 1.23 2006/02/23 12:48:38 raasch
	135	! Additional compiler directive in routine tridia_1dd for preventing loop
	136	! exchange on NEC-SX6
	137	!
	138	! Revision 1.20 2004/04/30 12:38:09 raasch
	139	! Parts of former poisfft_hybrid moved to this subroutine,
	140	! former subroutine changed to a module, renaming of FFT-subroutines and
	141	! -module, FFTs completely substituted by calls of fft_x and fft_y,
	142	! NAG fft used in the non-parallel case completely removed, l in maketri
	143	! is now a 1d-array, variables passed by modules instead of using parameter
	144	! lists, enlarged transposition arrays introduced
	145	!
	146	! Revision 1.1 1997/07/24 11:24:14 raasch
	147	! Initial revision
	148	!
	149	!
	150	! Description:
	151	! ------------
[1306]	152	! Original version by Stephan Siano (pois3d), as of July 23, 1996
	153	! Adapted for 2D-domain-decomposition by Siegfried Raasch, July 3, 1997
	154	!
	155	! Solves the Poisson equation with a 2D spectral method
	156	! d^2 p / dx^2 + d^2 p / dy^2 + d^2 p / dz^2 = s
	157	!
	158	! Input:
	159	! real ar contains (nnz,nny,nnx) elements of the velocity divergence,
	160	! starting from (1,nys,nxl)
	161	!
	162	! Output:
	163	! real ar contains the solution for perturbation pressure p
[1]	164	!------------------------------------------------------------------------------!
	165
	166	USE fft_xy
	167	USE indices
	168	USE transpose_indices
[1212]	169	USE tridia_solver
[1]	170
	171	IMPLICIT NONE
	172
[1111]	173	LOGICAL, SAVE :: poisfft_initialized = .FALSE.
	174
[1]	175	PRIVATE
[807]	176
[809]	177	#if ! defined ( __check )
[1]	178	PUBLIC poisfft, poisfft_init
	179
	180	INTERFACE poisfft
	181	MODULE PROCEDURE poisfft
	182	END INTERFACE poisfft
	183
	184	INTERFACE poisfft_init
	185	MODULE PROCEDURE poisfft_init
	186	END INTERFACE poisfft_init
[807]	187	#else
	188	PUBLIC poisfft_init
[1]	189
[807]	190	INTERFACE poisfft_init
	191	MODULE PROCEDURE poisfft_init
	192	END INTERFACE poisfft_init
	193	#endif
	194
[1]	195	CONTAINS
	196
	197	SUBROUTINE poisfft_init
	198
[1111]	199	USE arrays_3d, ONLY: ddzu_pres, ddzw
	200
	201	IMPLICIT NONE
	202
	203	INTEGER :: k
	204
	205
[1]	206	CALL fft_init
	207
[1212]	208	CALL tridia_init
[1111]	209
	210	poisfft_initialized = .TRUE.
	211
[1]	212	END SUBROUTINE poisfft_init
	213
[1111]	214
[809]	215	#if ! defined ( __check )
[1306]	216	SUBROUTINE poisfft( ar )
[1]	217
[1216]	218	USE control_parameters, ONLY : fft_method, transpose_compute_overlap
[1]	219	USE cpulog
	220	USE interfaces
	221	USE pegrid
	222
	223	IMPLICIT NONE
	224
[1306]	225	INTEGER :: ii, iind, inew, jj, jind, jnew, ki, kk, knew, n, nblk, &
	226	nnx_y, nny_z, nnz_t, nnz_x, nxl_y_bound, nxr_y_bound
[1216]	227	INTEGER, DIMENSION(4) :: isave
[1]	228
[1216]	229	REAL, DIMENSION(1:nz,nys:nyn,nxl:nxr) :: ar
	230	!$acc declare create( ar_inv )
	231	REAL, DIMENSION(nys:nyn,nxl:nxr,1:nz) :: ar_inv
[1]	232
[1306]	233	REAL, DIMENSION(:,:,:), ALLOCATABLE :: ar1, f_in, f_inv, f_out_y, &
	234	f_out_z
[1216]	235
	236
[1]	237	CALL cpu_log( log_point_s(3), 'poisfft', 'start' )
	238
[1111]	239	IF ( .NOT. poisfft_initialized ) CALL poisfft_init
	240
[1]	241	!
	242	!-- Two-dimensional Fourier Transformation in x- and y-direction.
[1111]	243	IF ( pdims(2) == 1 .AND. pdims(1) > 1 ) THEN
[1]	244
	245	!
	246	!-- 1d-domain-decomposition along x:
	247	!-- FFT along y and transposition y --> x
[1216]	248	CALL ffty_tr_yx( ar, ar )
[1]	249
	250	!
	251	!-- FFT along x, solving the tridiagonal system and backward FFT
	252	CALL fftx_tri_fftx( ar )
	253
	254	!
	255	!-- Transposition x --> y and backward FFT along y
[1216]	256	CALL tr_xy_ffty( ar, ar )
[1]	257
[1111]	258	ELSEIF ( pdims(1) == 1 .AND. pdims(2) > 1 ) THEN
[1]	259
	260	!
	261	!-- 1d-domain-decomposition along y:
	262	!-- FFT along x and transposition x --> y
[1216]	263	CALL fftx_tr_xy( ar, ar )
[1]	264
	265	!
	266	!-- FFT along y, solving the tridiagonal system and backward FFT
	267	CALL ffty_tri_ffty( ar )
	268
	269	!
	270	!-- Transposition y --> x and backward FFT along x
[1216]	271	CALL tr_yx_fftx( ar, ar )
[1]	272
[1216]	273	ELSEIF ( .NOT. transpose_compute_overlap ) THEN
[1]	274
	275	!
[1111]	276	!-- 2d-domain-decomposition or no decomposition (1 PE run)
[1]	277	!-- Transposition z --> x
	278	CALL cpu_log( log_point_s(5), 'transpo forward', 'start' )
[1216]	279	CALL resort_for_zx( ar, ar_inv )
	280	CALL transpose_zx( ar_inv, ar )
[1]	281	CALL cpu_log( log_point_s(5), 'transpo forward', 'pause' )
	282
	283	CALL cpu_log( log_point_s(4), 'fft_x', 'start' )
[1208]	284	IF ( fft_method /= 'system-specific' ) THEN
	285	!$acc update host( ar )
	286	ENDIF
[1106]	287	CALL fft_x( ar, 'forward' )
[1208]	288	IF ( fft_method /= 'system-specific' ) THEN
	289	!$acc update device( ar )
	290	ENDIF
[1]	291	CALL cpu_log( log_point_s(4), 'fft_x', 'pause' )
	292
	293	!
	294	!-- Transposition x --> y
	295	CALL cpu_log( log_point_s(5), 'transpo forward', 'continue' )
[1216]	296	CALL resort_for_xy( ar, ar_inv )
	297	CALL transpose_xy( ar_inv, ar )
[1]	298	CALL cpu_log( log_point_s(5), 'transpo forward', 'pause' )
	299
	300	CALL cpu_log( log_point_s(7), 'fft_y', 'start' )
[1208]	301	IF ( fft_method /= 'system-specific' ) THEN
	302	!$acc update host( ar )
	303	ENDIF
[1216]	304	CALL fft_y( ar, 'forward', ar_tr = ar, &
	305	nxl_y_bound = nxl_y, nxr_y_bound = nxr_y, &
	306	nxl_y_l = nxl_y, nxr_y_l = nxr_y )
[1208]	307	IF ( fft_method /= 'system-specific' ) THEN
	308	!$acc update device( ar )
	309	ENDIF
[1]	310	CALL cpu_log( log_point_s(7), 'fft_y', 'pause' )
	311
	312	!
	313	!-- Transposition y --> z
	314	CALL cpu_log( log_point_s(5), 'transpo forward', 'continue' )
[1216]	315	CALL resort_for_yz( ar, ar_inv )
	316	CALL transpose_yz( ar_inv, ar )
[1]	317	CALL cpu_log( log_point_s(5), 'transpo forward', 'stop' )
	318
	319	!
[1106]	320	!-- Solve the tridiagonal equation system along z
[1]	321	CALL cpu_log( log_point_s(6), 'tridia', 'start' )
[1212]	322	CALL tridia_substi( ar )
[1]	323	CALL cpu_log( log_point_s(6), 'tridia', 'stop' )
	324
	325	!
	326	!-- Inverse Fourier Transformation
	327	!-- Transposition z --> y
	328	CALL cpu_log( log_point_s(8), 'transpo invers', 'start' )
[1216]	329	CALL transpose_zy( ar, ar_inv )
	330	CALL resort_for_zy( ar_inv, ar )
[1]	331	CALL cpu_log( log_point_s(8), 'transpo invers', 'pause' )
	332
	333	CALL cpu_log( log_point_s(7), 'fft_y', 'continue' )
[1208]	334	IF ( fft_method /= 'system-specific' ) THEN
	335	!$acc update host( ar )
	336	ENDIF
[1216]	337	CALL fft_y( ar, 'backward', ar_tr = ar, &
	338	nxl_y_bound = nxl_y, nxr_y_bound = nxr_y, &
	339	nxl_y_l = nxl_y, nxr_y_l = nxr_y )
[1208]	340	IF ( fft_method /= 'system-specific' ) THEN
	341	!$acc update device( ar )
	342	ENDIF
[1]	343	CALL cpu_log( log_point_s(7), 'fft_y', 'stop' )
	344
	345	!
	346	!-- Transposition y --> x
	347	CALL cpu_log( log_point_s(8), 'transpo invers', 'continue' )
[1216]	348	CALL transpose_yx( ar, ar_inv )
	349	CALL resort_for_yx( ar_inv, ar )
[1]	350	CALL cpu_log( log_point_s(8), 'transpo invers', 'pause' )
	351
	352	CALL cpu_log( log_point_s(4), 'fft_x', 'continue' )
[1208]	353	IF ( fft_method /= 'system-specific' ) THEN
	354	!$acc update host( ar )
	355	ENDIF
[1106]	356	CALL fft_x( ar, 'backward' )
[1208]	357	IF ( fft_method /= 'system-specific' ) THEN
	358	!$acc update device( ar )
	359	ENDIF
[1]	360	CALL cpu_log( log_point_s(4), 'fft_x', 'stop' )
	361
	362	!
	363	!-- Transposition x --> z
	364	CALL cpu_log( log_point_s(8), 'transpo invers', 'continue' )
[1216]	365	CALL transpose_xz( ar, ar_inv )
	366	CALL resort_for_xz( ar_inv, ar )
[1]	367	CALL cpu_log( log_point_s(8), 'transpo invers', 'stop' )
	368
[1216]	369	ELSE
	370
	371	!
	372	!-- 2d-domain-decomposition or no decomposition (1 PE run) with
	373	!-- overlapping transposition / fft
	374	ALLOCATE( f_out_y(0:ny,nxl_y:nxr_y,nzb_y:nzt_y), &
	375	f_out_z(0:nx,nys_x:nyn_x,nzb_x:nzt_x) )
	376	!
	377	!-- Transposition z --> x + subsequent fft along x
	378	ALLOCATE( f_inv(nys:nyn,nxl:nxr,1:nz) )
	379	CALL resort_for_zx( ar, f_inv )
	380	!
	381	!-- Save original indices and gridpoint counter
	382	isave(1) = nz
	383	isave(2) = nzb_x
	384	isave(3) = nzt_x
	385	isave(4) = sendrecvcount_zx
	386	!
	387	!-- Set new indices for transformation
	388	nblk = nz / pdims(1)
	389	nz = pdims(1)
	390	nnz_x = 1
	391	nzb_x = 1 + myidx * nnz_x
	392	nzt_x = ( myidx + 1 ) * nnz_x
	393	sendrecvcount_zx = nnx * nny * nnz_x
	394
[1306]	395	ALLOCATE( ar1(0:nx,nys_x:nyn_x,nzb_x:nzt_x) )
[1216]	396	ALLOCATE( f_in(nys:nyn,nxl:nxr,1:nz) )
	397
[1306]	398	DO kk = 1, nblk
[1216]	399
[1306]	400	IF ( kk == 1 ) THEN
	401	CALL cpu_log( log_point_s(5), 'transpo forward', 'start' )
	402	ELSE
	403	CALL cpu_log( log_point_s(5), 'transpo forward', 'continue' )
	404	ENDIF
[1216]	405
[1306]	406	DO knew = 1, nz
	407	ki = kk + nblk * ( knew - 1 )
	408	f_in(:,:,knew) = f_inv(:,:,ki)
	409	ENDDO
[1216]	410
[1306]	411	CALL transpose_zx( f_in, ar1(:,:,:))
	412	CALL cpu_log( log_point_s(5), 'transpo forward', 'pause' )
[1216]	413
[1306]	414	IF ( kk == 1 ) THEN
	415	CALL cpu_log( log_point_s(4), 'fft_x', 'start' )
	416	ELSE
	417	CALL cpu_log( log_point_s(4), 'fft_x', 'continue' )
[1216]	418	ENDIF
	419
[1306]	420	n = isave(2) + kk - 1
	421	CALL fft_x( ar1(:,:,:), 'forward', ar_2d = f_out_z(:,:,n))
	422	CALL cpu_log( log_point_s(4), 'fft_x', 'pause' )
[1216]	423
	424	ENDDO
	425	!
	426	!-- Restore original indices/counters
	427	nz = isave(1)
	428	nzb_x = isave(2)
	429	nzt_x = isave(3)
	430	sendrecvcount_zx = isave(4)
	431
	432	DEALLOCATE( ar1, f_in, f_inv )
	433
	434	!
	435	!-- Transposition x --> y + subsequent fft along y
	436	ALLOCATE( f_inv(nys_x:nyn_x,nzb_x:nzt_x,0:nx) )
	437	CALL resort_for_xy( f_out_z, f_inv )
	438	!
	439	!-- Save original indices and gridpoint counter
	440	isave(1) = nx
	441	isave(2) = nxl_y
	442	isave(3) = nxr_y
	443	isave(4) = sendrecvcount_xy
	444	!
	445	!-- Set new indices for transformation
	446	nblk = ( ( nx+1 ) / pdims(2) ) - 1
	447	nx = pdims(2)
	448	nnx_y = 1
	449	nxl_y = myidy * nnx_y
	450	nxr_y = ( myidy + 1 ) * nnx_y - 1
	451	sendrecvcount_xy = nnx_y * ( nyn_x-nys_x+1 ) * ( nzt_x-nzb_x+1 )
	452
[1306]	453	ALLOCATE( ar1(0:ny,nxl_y:nxr_y,nzb_y:nzt_y) )
[1216]	454	ALLOCATE( f_in(nys_x:nyn_x,nzb_x:nzt_x,0:nx) )
	455
[1306]	456	DO ii = 0, nblk
[1216]	457
[1306]	458	CALL cpu_log( log_point_s(5), 'transpo forward', 'continue' )
[1216]	459
[1306]	460	DO inew = 0, nx-1
	461	iind = ii + ( nblk + 1 ) * inew
	462	f_in(:,:,inew) = f_inv(:,:,iind)
	463	ENDDO
[1216]	464
[1306]	465	CALL transpose_xy( f_in, ar1(:,:,:) )
[1216]	466
[1306]	467	CALL cpu_log( log_point_s(5), 'transpo forward', 'pause' )
[1216]	468
[1306]	469	IF ( ii == 1 ) THEN
	470	CALL cpu_log( log_point_s(7), 'fft_y', 'start' )
	471	ELSE
	472	CALL cpu_log( log_point_s(7), 'fft_y', 'continue' )
[1216]	473	ENDIF
	474
[1306]	475	nxl_y_bound = isave(2)
	476	nxr_y_bound = isave(3)
	477	n = isave(2) + ii
	478	CALL fft_y( ar1(:,:,:), 'forward', ar_tr = f_out_y, &
	479	nxl_y_bound = nxl_y_bound, nxr_y_bound = nxr_y_bound, &
	480	nxl_y_l = n, nxr_y_l = n )
[1216]	481
[1306]	482	CALL cpu_log( log_point_s(7), 'fft_y', 'pause' )
[1216]	483
	484	ENDDO
	485	!
	486	!-- Restore original indices/counters
	487	nx = isave(1)
	488	nxl_y = isave(2)
	489	nxr_y = isave(3)
	490	sendrecvcount_xy = isave(4)
	491
	492	DEALLOCATE( ar1, f_in, f_inv )
	493
	494	!
	495	!-- Transposition y --> z + subsequent tridia + resort for z --> y
	496	ALLOCATE( f_inv(nxl_y:nxr_y,nzb_y:nzt_y,0:ny) )
	497	CALL resort_for_yz( f_out_y, f_inv )
	498	!
	499	!-- Save original indices and gridpoint counter
	500	isave(1) = ny
	501	isave(2) = nys_z
	502	isave(3) = nyn_z
	503	isave(4) = sendrecvcount_yz
	504	!
	505	!-- Set new indices for transformation
	506	nblk = ( ( ny+1 ) / pdims(1) ) - 1
	507	ny = pdims(1)
	508	nny_z = 1
	509	nys_z = myidx * nny_z
	510	nyn_z = ( myidx + 1 ) * nny_z - 1
	511	sendrecvcount_yz = ( nxr_y-nxl_y+1 ) * nny_z * ( nzt_y-nzb_y+1 )
	512
[1306]	513	ALLOCATE( ar1(nxl_z:nxr_z,nys_z:nyn_z,1:nz) )
[1216]	514	ALLOCATE( f_in(nxl_y:nxr_y,nzb_y:nzt_y,0:ny) )
	515
[1306]	516	DO jj = 0, nblk
[1216]	517	!
[1306]	518	!-- Forward Fourier Transformation
	519	!-- Transposition y --> z
	520	CALL cpu_log( log_point_s(5), 'transpo forward', 'continue' )
[1216]	521
[1306]	522	DO jnew = 0, ny-1
	523	jind = jj + ( nblk + 1 ) * jnew
	524	f_in(:,:,jnew) = f_inv(:,:,jind)
	525	ENDDO
[1216]	526
[1306]	527	CALL transpose_yz( f_in, ar1(:,:,:) )
[1216]	528
[1306]	529	IF ( jj == nblk ) THEN
	530	CALL cpu_log( log_point_s(5), 'transpo forward', 'stop' )
	531	ELSE
	532	CALL cpu_log( log_point_s(5), 'transpo forward', 'pause' )
[1216]	533	ENDIF
	534
	535	!
[1306]	536	!-- Solve the tridiagonal equation system along z
	537	CALL cpu_log( log_point_s(6), 'tridia', 'start' )
[1216]	538
[1306]	539	n = isave(2) + jj
	540	CALL tridia_substi_overlap( ar1(:,:,:), n )
[1216]	541
[1306]	542	CALL cpu_log( log_point_s(6), 'tridia', 'stop' )
[1216]	543
[1306]	544	!
	545	!-- Inverse Fourier Transformation
	546	!-- Transposition z --> y
	547	!-- Only one thread should call MPI routines, therefore forward and
	548	!-- backward tranpose are in the same section
	549	IF ( jj == 0 ) THEN
	550	CALL cpu_log( log_point_s(8), 'transpo invers', 'start' )
	551	ELSE
	552	CALL cpu_log( log_point_s(8), 'transpo invers', 'continue' )
[1216]	553	ENDIF
	554
[1306]	555	CALL transpose_zy( ar1(:,:,:), f_in )
[1216]	556
[1306]	557	DO jnew = 0, ny-1
	558	jind = jj + ( nblk + 1 ) * jnew
	559	f_inv(:,:,jind) = f_in(:,:,jnew)
	560	ENDDO
[1216]	561
[1306]	562	CALL cpu_log( log_point_s(8), 'transpo invers', 'pause' )
[1216]	563
	564	ENDDO
	565	!
	566	!-- Restore original indices/counters
	567	ny = isave(1)
	568	nys_z = isave(2)
	569	nyn_z = isave(3)
	570	sendrecvcount_yz = isave(4)
	571
	572	CALL resort_for_zy( f_inv, f_out_y )
	573
	574	DEALLOCATE( ar1, f_in, f_inv )
	575
	576	!
	577	!-- fft along y backward + subsequent transposition y --> x
	578	ALLOCATE( f_inv(nys_x:nyn_x,nzb_x:nzt_x,0:nx) )
	579	!
	580	!-- Save original indices and gridpoint counter
	581	isave(1) = nx
	582	isave(2) = nxl_y
	583	isave(3) = nxr_y
	584	isave(4) = sendrecvcount_xy
	585	!
	586	!-- Set new indices for transformation
	587	nblk = (( nx+1 ) / pdims(2) ) - 1
	588	nx = pdims(2)
	589	nnx_y = 1
	590	nxl_y = myidy * nnx_y
	591	nxr_y = ( myidy + 1 ) * nnx_y - 1
	592	sendrecvcount_xy = nnx_y * ( nyn_x-nys_x+1 ) * ( nzt_x-nzb_x+1 )
	593
[1306]	594	ALLOCATE( ar1(0:ny,nxl_y:nxr_y,nzb_y:nzt_y) )
[1216]	595	ALLOCATE( f_in(nys_x:nyn_x,nzb_x:nzt_x,0:nx) )
	596
[1306]	597	DO ii = 0, nblk
[1216]	598
[1306]	599	CALL cpu_log( log_point_s(7), 'fft_y', 'continue' )
[1216]	600
[1306]	601	n = isave(2) + ii
	602	nxl_y_bound = isave(2)
	603	nxr_y_bound = isave(3)
[1216]	604
[1306]	605	CALL fft_y( ar1(:,:,:), 'backward', ar_tr = f_out_y, &
	606	nxl_y_bound = nxl_y_bound, nxr_y_bound = nxr_y_bound, &
	607	nxl_y_l = n, nxr_y_l = n )
[1216]	608
[1306]	609	IF ( ii == nblk ) THEN
	610	CALL cpu_log( log_point_s(7), 'fft_y', 'stop' )
	611	ELSE
	612	CALL cpu_log( log_point_s(7), 'fft_y', 'pause' )
[1216]	613	ENDIF
	614
[1306]	615	CALL cpu_log( log_point_s(8), 'transpo invers', 'continue' )
[1216]	616
[1306]	617	CALL transpose_yx( ar1(:,:,:), f_in )
[1216]	618
[1306]	619	DO inew = 0, nx-1
	620	iind = ii + (nblk+1) * inew
	621	f_inv(:,:,iind) = f_in(:,:,inew)
	622	ENDDO
[1216]	623
[1306]	624	CALL cpu_log( log_point_s(8), 'transpo invers', 'pause' )
[1216]	625
	626	ENDDO
	627	!
	628	!-- Restore original indices/counters
	629	nx = isave(1)
	630	nxl_y = isave(2)
	631	nxr_y = isave(3)
	632	sendrecvcount_xy = isave(4)
	633
	634	CALL resort_for_yx( f_inv, f_out_z )
	635
	636	DEALLOCATE( ar1, f_in, f_inv )
	637
	638	!
	639	!-- fft along x backward + subsequent final transposition x --> z
	640	ALLOCATE( f_inv(nys:nyn,nxl:nxr,1:nz) )
	641	!
	642	!-- Save original indices and gridpoint counter
	643	isave(1) = nz
	644	isave(2) = nzb_x
	645	isave(3) = nzt_x
	646	isave(4) = sendrecvcount_zx
	647	!
	648	!-- Set new indices for transformation
	649	nblk = nz / pdims(1)
	650	nz = pdims(1)
	651	nnz_x = 1
	652	nzb_x = 1 + myidx * nnz_x
	653	nzt_x = ( myidx + 1 ) * nnz_x
	654	sendrecvcount_zx = nnx * nny * nnz_x
	655
[1306]	656	ALLOCATE( ar1(0:nx,nys_x:nyn_x,nzb_x:nzt_x) )
[1216]	657	ALLOCATE( f_in(nys:nyn,nxl:nxr,1:nz) )
	658
[1306]	659	DO kk = 1, nblk
[1216]	660
[1306]	661	CALL cpu_log( log_point_s(4), 'fft_x', 'continue' )
[1216]	662
[1306]	663	n = isave(2) + kk - 1
	664	CALL fft_x( ar1(:,:,:), 'backward', f_out_z(:,:,n))
[1216]	665
[1306]	666	IF ( kk == nblk ) THEN
	667	CALL cpu_log( log_point_s(4), 'fft_x', 'stop' )
	668	ELSE
	669	CALL cpu_log( log_point_s(4), 'fft_x', 'pause' )
[1216]	670	ENDIF
	671
[1306]	672	CALL cpu_log( log_point_s(8), 'transpo invers', 'continue' )
[1216]	673
[1306]	674	CALL transpose_xz( ar1(:,:,:), f_in )
[1216]	675
[1306]	676	DO knew = 1, nz
	677	ki = kk + nblk * (knew-1)
	678	f_inv(:,:,ki) = f_in(:,:,knew)
	679	ENDDO
[1216]	680
[1306]	681	IF ( kk == nblk ) THEN
	682	CALL cpu_log( log_point_s(8), 'transpo invers', 'stop' )
	683	ELSE
	684	CALL cpu_log( log_point_s(8), 'transpo invers', 'pause' )
[1216]	685	ENDIF
	686
	687	ENDDO
	688	!
	689	!-- Restore original indices/counters
	690	nz = isave(1)
	691	nzb_x = isave(2)
	692	nzt_x = isave(3)
	693	sendrecvcount_zx = isave(4)
	694
	695	CALL resort_for_xz( f_inv, ar )
	696
	697	DEALLOCATE( ar1, f_in, f_inv )
	698
[1]	699	ENDIF
	700
	701	CALL cpu_log( log_point_s(3), 'poisfft', 'stop' )
	702
	703	END SUBROUTINE poisfft
	704
	705
	706
[1216]	707	SUBROUTINE ffty_tr_yx( f_in, f_out )
[1]	708
	709	!------------------------------------------------------------------------------!
	710	! Fourier-transformation along y with subsequent transposition y --> x for
	711	! a 1d-decomposition along x
	712	!
	713	! ATTENTION: The performance of this routine is much faster on the NEC-SX6,
	714	! if the first index of work_ffty_vec is odd. Otherwise
	715	! memory bank conflicts may occur (especially if the index is a
	716	! multiple of 128). That's why work_ffty_vec is dimensioned as
	717	! 0:ny+1.
	718	! Of course, this will not work if users are using an odd number
	719	! of gridpoints along y.
	720	!------------------------------------------------------------------------------!
	721
	722	USE control_parameters
	723	USE cpulog
	724	USE indices
	725	USE interfaces
	726	USE pegrid
	727	USE transpose_indices
	728
	729	IMPLICIT NONE
	730
	731	INTEGER :: i, iend, iouter, ir, j, k
	732	INTEGER, PARAMETER :: stridex = 4
	733
	734	REAL, DIMENSION(0:ny,stridex) :: work_ffty
	735	#if defined( __nec )
	736	REAL, DIMENSION(0:ny+1,1:nz,nxl:nxr) :: work_ffty_vec
	737	#endif
[1003]	738	REAL, DIMENSION(1:nz,0:ny,nxl:nxr) :: f_in
	739	REAL, DIMENSION(nnx,1:nz,nys_x:nyn_x,pdims(1)) :: f_out
	740	REAL, DIMENSION(nxl:nxr,1:nz,0:ny) :: work
[1]	741
	742	!
	743	!-- Carry out the FFT along y, where all data are present due to the
	744	!-- 1d-decomposition along x. Resort the data in a way that x becomes
	745	!-- the first index.
[1106]	746	CALL cpu_log( log_point_s(7), 'fft_y_1d', 'start' )
[1]	747
	748	IF ( host(1:3) == 'nec' ) THEN
	749	#if defined( __nec )
	750	!
	751	!-- Code optimized for vector processors
[85]	752	!$OMP PARALLEL PRIVATE ( i, j, k )
[1]	753	!$OMP DO
	754	DO i = nxl, nxr
	755
	756	DO j = 0, ny
	757	DO k = 1, nz
	758	work_ffty_vec(j,k,i) = f_in(k,j,i)
	759	ENDDO
	760	ENDDO
	761
	762	CALL fft_y_m( work_ffty_vec(:,:,i), ny+1, 'forward' )
	763
	764	ENDDO
	765
	766	!$OMP DO
	767	DO k = 1, nz
	768	DO j = 0, ny
	769	DO i = nxl, nxr
	770	work(i,k,j) = work_ffty_vec(j,k,i)
	771	ENDDO
	772	ENDDO
	773	ENDDO
	774	!$OMP END PARALLEL
	775	#endif
	776
	777	ELSE
	778
	779	!
	780	!-- Cache optimized code.
	781	!-- The i-(x-)direction is split into a strided outer loop and an inner
	782	!-- loop for better cache performance
	783	!$OMP PARALLEL PRIVATE (i,iend,iouter,ir,j,k,work_ffty)
	784	!$OMP DO
	785	DO iouter = nxl, nxr, stridex
	786
	787	iend = MIN( iouter+stridex-1, nxr ) ! Upper bound for inner i loop
	788
	789	DO k = 1, nz
	790
	791	DO i = iouter, iend
	792
	793	ir = i-iouter+1 ! counter within a stride
	794	DO j = 0, ny
	795	work_ffty(j,ir) = f_in(k,j,i)
	796	ENDDO
	797	!
	798	!-- FFT along y
[1106]	799	CALL fft_y_1d( work_ffty(:,ir), 'forward' )
[1]	800
	801	ENDDO
	802
	803	!
	804	!-- Resort
	805	DO j = 0, ny
	806	DO i = iouter, iend
	807	work(i,k,j) = work_ffty(j,i-iouter+1)
	808	ENDDO
	809	ENDDO
	810
	811	ENDDO
	812
	813	ENDDO
	814	!$OMP END PARALLEL
	815
	816	ENDIF
[1106]	817	CALL cpu_log( log_point_s(7), 'fft_y_1d', 'pause' )
[1]	818
	819	!
	820	!-- Transpose array
[1111]	821	#if defined( __parallel )
[1]	822	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
[622]	823	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
[1]	824	CALL MPI_ALLTOALL( work(nxl,1,0), sendrecvcount_xy, MPI_REAL, &
	825	f_out(1,1,nys_x,1), sendrecvcount_xy, MPI_REAL, &
	826	comm1dx, ierr )
	827	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
[1111]	828	#endif
[1]	829
	830	END SUBROUTINE ffty_tr_yx
	831
	832
[1216]	833	SUBROUTINE tr_xy_ffty( f_in, f_out )
[1]	834
	835	!------------------------------------------------------------------------------!
	836	! Transposition x --> y with a subsequent backward Fourier transformation for
	837	! a 1d-decomposition along x
	838	!------------------------------------------------------------------------------!
	839
	840	USE control_parameters
	841	USE cpulog
	842	USE indices
	843	USE interfaces
	844	USE pegrid
	845	USE transpose_indices
	846
	847	IMPLICIT NONE
	848
	849	INTEGER :: i, iend, iouter, ir, j, k
	850	INTEGER, PARAMETER :: stridex = 4
	851
	852	REAL, DIMENSION(0:ny,stridex) :: work_ffty
	853	#if defined( __nec )
	854	REAL, DIMENSION(0:ny+1,1:nz,nxl:nxr) :: work_ffty_vec
	855	#endif
[1003]	856	REAL, DIMENSION(nnx,1:nz,nys_x:nyn_x,pdims(1)) :: f_in
	857	REAL, DIMENSION(1:nz,0:ny,nxl:nxr) :: f_out
	858	REAL, DIMENSION(nxl:nxr,1:nz,0:ny) :: work
[1]	859
	860	!
	861	!-- Transpose array
[1111]	862	#if defined( __parallel )
[1]	863	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
[622]	864	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
[1]	865	CALL MPI_ALLTOALL( f_in(1,1,nys_x,1), sendrecvcount_xy, MPI_REAL, &
	866	work(nxl,1,0), sendrecvcount_xy, MPI_REAL, &
	867	comm1dx, ierr )
	868	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
[1111]	869	#endif
[1]	870
	871	!
	872	!-- Resort the data in a way that y becomes the first index and carry out the
	873	!-- backward fft along y.
[1106]	874	CALL cpu_log( log_point_s(7), 'fft_y_1d', 'continue' )
[1]	875
	876	IF ( host(1:3) == 'nec' ) THEN
	877	#if defined( __nec )
	878	!
	879	!-- Code optimized for vector processors
[85]	880	!$OMP PARALLEL PRIVATE ( i, j, k )
[1]	881	!$OMP DO
	882	DO k = 1, nz
	883	DO j = 0, ny
	884	DO i = nxl, nxr
	885	work_ffty_vec(j,k,i) = work(i,k,j)
	886	ENDDO
	887	ENDDO
	888	ENDDO
	889
	890	!$OMP DO
	891	DO i = nxl, nxr
	892
	893	CALL fft_y_m( work_ffty_vec(:,:,i), ny+1, 'backward' )
	894
	895	DO j = 0, ny
	896	DO k = 1, nz
	897	f_out(k,j,i) = work_ffty_vec(j,k,i)
	898	ENDDO
	899	ENDDO
	900
	901	ENDDO
	902	!$OMP END PARALLEL
	903	#endif
	904
	905	ELSE
	906
	907	!
	908	!-- Cache optimized code.
	909	!-- The i-(x-)direction is split into a strided outer loop and an inner
	910	!-- loop for better cache performance
	911	!$OMP PARALLEL PRIVATE ( i, iend, iouter, ir, j, k, work_ffty )
	912	!$OMP DO
	913	DO iouter = nxl, nxr, stridex
	914
	915	iend = MIN( iouter+stridex-1, nxr ) ! Upper bound for inner i loop
	916
	917	DO k = 1, nz
	918	!
	919	!-- Resort
	920	DO j = 0, ny
	921	DO i = iouter, iend
	922	work_ffty(j,i-iouter+1) = work(i,k,j)
	923	ENDDO
	924	ENDDO
	925
	926	DO i = iouter, iend
	927
	928	!
	929	!-- FFT along y
	930	ir = i-iouter+1 ! counter within a stride
[1106]	931	CALL fft_y_1d( work_ffty(:,ir), 'backward' )
[1]	932
	933	DO j = 0, ny
	934	f_out(k,j,i) = work_ffty(j,ir)
	935	ENDDO
	936	ENDDO
	937
	938	ENDDO
	939
	940	ENDDO
	941	!$OMP END PARALLEL
	942
	943	ENDIF
	944
[1106]	945	CALL cpu_log( log_point_s(7), 'fft_y_1d', 'stop' )
[1]	946
	947	END SUBROUTINE tr_xy_ffty
	948
	949
	950	SUBROUTINE fftx_tri_fftx( ar )
	951
	952	!------------------------------------------------------------------------------!
	953	! FFT along x, solution of the tridiagonal system and backward FFT for
	954	! a 1d-decomposition along x
	955	!
	956	! WARNING: this subroutine may still not work for hybrid parallelization
	957	! with OpenMP (for possible necessary changes see the original
	958	! routine poisfft_hybrid, developed by Klaus Ketelsen, May 2002)
	959	!------------------------------------------------------------------------------!
	960
	961	USE control_parameters
	962	USE cpulog
	963	USE grid_variables
	964	USE indices
	965	USE interfaces
	966	USE pegrid
	967	USE transpose_indices
	968
	969	IMPLICIT NONE
	970
	971	INTEGER :: i, j, k, m, n, omp_get_thread_num, tn
	972
[1003]	973	REAL, DIMENSION(0:nx) :: work_fftx
	974	REAL, DIMENSION(0:nx,1:nz) :: work_trix
	975	REAL, DIMENSION(nnx,1:nz,nys_x:nyn_x,pdims(1)) :: ar
	976	REAL, DIMENSION(:,:,:,:), ALLOCATABLE :: tri
[1]	977
	978
[1106]	979	CALL cpu_log( log_point_s(33), 'fft_x_1d + tridia', 'start' )
[1]	980
	981	ALLOCATE( tri(5,0:nx,0:nz-1,0:threads_per_task-1) )
	982
	983	tn = 0 ! Default thread number in case of one thread
	984	!$OMP PARALLEL DO PRIVATE ( i, j, k, m, n, tn, work_fftx, work_trix )
	985	DO j = nys_x, nyn_x
	986
	987	!$ tn = omp_get_thread_num()
	988
	989	IF ( host(1:3) == 'nec' ) THEN
	990	!
	991	!-- Code optimized for vector processors
	992	DO k = 1, nz
	993
	994	m = 0
	995	DO n = 1, pdims(1)
[1003]	996	DO i = 1, nnx
[1]	997	work_trix(m,k) = ar(i,k,j,n)
	998	m = m + 1
	999	ENDDO
	1000	ENDDO
	1001
	1002	ENDDO
	1003
	1004	CALL fft_x_m( work_trix, 'forward' )
	1005
	1006	ELSE
	1007	!
	1008	!-- Cache optimized code
	1009	DO k = 1, nz
	1010
	1011	m = 0
	1012	DO n = 1, pdims(1)
[1003]	1013	DO i = 1, nnx
[1]	1014	work_fftx(m) = ar(i,k,j,n)
	1015	m = m + 1
	1016	ENDDO
	1017	ENDDO
	1018
[1106]	1019	CALL fft_x_1d( work_fftx, 'forward' )
[1]	1020
	1021	DO i = 0, nx
	1022	work_trix(i,k) = work_fftx(i)
	1023	ENDDO
	1024
	1025	ENDDO
	1026
	1027	ENDIF
	1028
	1029	!
	1030	!-- Solve the linear equation system
	1031	CALL tridia_1dd( ddx2, ddy2, nx, ny, j, work_trix, tri(:,:,:,tn) )
	1032
	1033	IF ( host(1:3) == 'nec' ) THEN
	1034	!
	1035	!-- Code optimized for vector processors
	1036	CALL fft_x_m( work_trix, 'backward' )
	1037
	1038	DO k = 1, nz
	1039
	1040	m = 0
	1041	DO n = 1, pdims(1)
[1003]	1042	DO i = 1, nnx
[1]	1043	ar(i,k,j,n) = work_trix(m,k)
	1044	m = m + 1
	1045	ENDDO
	1046	ENDDO
	1047
	1048	ENDDO
	1049
	1050	ELSE
	1051	!
	1052	!-- Cache optimized code
	1053	DO k = 1, nz
	1054
	1055	DO i = 0, nx
	1056	work_fftx(i) = work_trix(i,k)
	1057	ENDDO
	1058
[1106]	1059	CALL fft_x_1d( work_fftx, 'backward' )
[1]	1060
	1061	m = 0
	1062	DO n = 1, pdims(1)
[1003]	1063	DO i = 1, nnx
[1]	1064	ar(i,k,j,n) = work_fftx(m)
	1065	m = m + 1
	1066	ENDDO
	1067	ENDDO
	1068
	1069	ENDDO
	1070
	1071	ENDIF
	1072
	1073	ENDDO
	1074
	1075	DEALLOCATE( tri )
	1076
[1106]	1077	CALL cpu_log( log_point_s(33), 'fft_x_1d + tridia', 'stop' )
[1]	1078
	1079	END SUBROUTINE fftx_tri_fftx
	1080
	1081
[1216]	1082	SUBROUTINE fftx_tr_xy( f_in, f_out )
[1]	1083
	1084	!------------------------------------------------------------------------------!
	1085	! Fourier-transformation along x with subsequent transposition x --> y for
	1086	! a 1d-decomposition along y
	1087	!
	1088	! ATTENTION: The NEC-branch of this routine may significantly profit from
	1089	! further optimizations. So far, performance is much worse than
	1090	! for routine ffty_tr_yx (more than three times slower).
	1091	!------------------------------------------------------------------------------!
	1092
	1093	USE control_parameters
	1094	USE cpulog
	1095	USE indices
	1096	USE interfaces
	1097	USE pegrid
	1098	USE transpose_indices
	1099
	1100	IMPLICIT NONE
	1101
	1102	INTEGER :: i, j, k
	1103
[1003]	1104	REAL, DIMENSION(0:nx,1:nz,nys:nyn) :: work_fftx
	1105	REAL, DIMENSION(1:nz,nys:nyn,0:nx) :: f_in
	1106	REAL, DIMENSION(nny,1:nz,nxl_y:nxr_y,pdims(2)) :: f_out
	1107	REAL, DIMENSION(nys:nyn,1:nz,0:nx) :: work
[1]	1108
	1109	!
	1110	!-- Carry out the FFT along x, where all data are present due to the
	1111	!-- 1d-decomposition along y. Resort the data in a way that y becomes
	1112	!-- the first index.
[1106]	1113	CALL cpu_log( log_point_s(4), 'fft_x_1d', 'start' )
[1]	1114
	1115	IF ( host(1:3) == 'nec' ) THEN
	1116	!
	1117	!-- Code for vector processors
[85]	1118	!$OMP PARALLEL PRIVATE ( i, j, k )
[1]	1119	!$OMP DO
	1120	DO i = 0, nx
	1121
	1122	DO j = nys, nyn
	1123	DO k = 1, nz
	1124	work_fftx(i,k,j) = f_in(k,j,i)
	1125	ENDDO
	1126	ENDDO
	1127
	1128	ENDDO
	1129
	1130	!$OMP DO
	1131	DO j = nys, nyn
	1132
	1133	CALL fft_x_m( work_fftx(:,:,j), 'forward' )
	1134
	1135	DO k = 1, nz
	1136	DO i = 0, nx
	1137	work(j,k,i) = work_fftx(i,k,j)
	1138	ENDDO
	1139	ENDDO
	1140
	1141	ENDDO
	1142	!$OMP END PARALLEL
	1143
	1144	ELSE
	1145
	1146	!
	1147	!-- Cache optimized code (there might be still a potential for better
	1148	!-- optimization).
[696]	1149	!$OMP PARALLEL PRIVATE (i,j,k)
[1]	1150	!$OMP DO
	1151	DO i = 0, nx
	1152
	1153	DO j = nys, nyn
	1154	DO k = 1, nz
	1155	work_fftx(i,k,j) = f_in(k,j,i)
	1156	ENDDO
	1157	ENDDO
	1158
	1159	ENDDO
	1160
	1161	!$OMP DO
	1162	DO j = nys, nyn
	1163	DO k = 1, nz
	1164
[1106]	1165	CALL fft_x_1d( work_fftx(0:nx,k,j), 'forward' )
[1]	1166
	1167	DO i = 0, nx
	1168	work(j,k,i) = work_fftx(i,k,j)
	1169	ENDDO
	1170	ENDDO
	1171
	1172	ENDDO
	1173	!$OMP END PARALLEL
	1174
	1175	ENDIF
[1106]	1176	CALL cpu_log( log_point_s(4), 'fft_x_1d', 'pause' )
[1]	1177
	1178	!
	1179	!-- Transpose array
[1111]	1180	#if defined( __parallel )
[1]	1181	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
[622]	1182	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
[1]	1183	CALL MPI_ALLTOALL( work(nys,1,0), sendrecvcount_xy, MPI_REAL, &
	1184	f_out(1,1,nxl_y,1), sendrecvcount_xy, MPI_REAL, &
	1185	comm1dy, ierr )
	1186	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
[1111]	1187	#endif
[1]	1188
	1189	END SUBROUTINE fftx_tr_xy
	1190
	1191
[1216]	1192	SUBROUTINE tr_yx_fftx( f_in, f_out )
[1]	1193
	1194	!------------------------------------------------------------------------------!
	1195	! Transposition y --> x with a subsequent backward Fourier transformation for
	1196	! a 1d-decomposition along x
	1197	!------------------------------------------------------------------------------!
	1198
	1199	USE control_parameters
	1200	USE cpulog
	1201	USE indices
	1202	USE interfaces
	1203	USE pegrid
	1204	USE transpose_indices
	1205
	1206	IMPLICIT NONE
	1207
	1208	INTEGER :: i, j, k
	1209
[1003]	1210	REAL, DIMENSION(0:nx,1:nz,nys:nyn) :: work_fftx
	1211	REAL, DIMENSION(nny,1:nz,nxl_y:nxr_y,pdims(2)) :: f_in
	1212	REAL, DIMENSION(1:nz,nys:nyn,0:nx) :: f_out
	1213	REAL, DIMENSION(nys:nyn,1:nz,0:nx) :: work
[1]	1214
	1215	!
	1216	!-- Transpose array
[1111]	1217	#if defined( __parallel )
[1]	1218	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
[622]	1219	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
[1]	1220	CALL MPI_ALLTOALL( f_in(1,1,nxl_y,1), sendrecvcount_xy, MPI_REAL, &
	1221	work(nys,1,0), sendrecvcount_xy, MPI_REAL, &
	1222	comm1dy, ierr )
	1223	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
[1111]	1224	#endif
[1]	1225
	1226	!
	1227	!-- Carry out the FFT along x, where all data are present due to the
	1228	!-- 1d-decomposition along y. Resort the data in a way that y becomes
	1229	!-- the first index.
[1106]	1230	CALL cpu_log( log_point_s(4), 'fft_x_1d', 'continue' )
[1]	1231
	1232	IF ( host(1:3) == 'nec' ) THEN
	1233	!
	1234	!-- Code optimized for vector processors
[85]	1235	!$OMP PARALLEL PRIVATE ( i, j, k )
[1]	1236	!$OMP DO
	1237	DO j = nys, nyn
	1238
	1239	DO k = 1, nz
	1240	DO i = 0, nx
	1241	work_fftx(i,k,j) = work(j,k,i)
	1242	ENDDO
	1243	ENDDO
	1244
	1245	CALL fft_x_m( work_fftx(:,:,j), 'backward' )
	1246
	1247	ENDDO
	1248
	1249	!$OMP DO
	1250	DO i = 0, nx
	1251	DO j = nys, nyn
	1252	DO k = 1, nz
	1253	f_out(k,j,i) = work_fftx(i,k,j)
	1254	ENDDO
	1255	ENDDO
	1256	ENDDO
	1257	!$OMP END PARALLEL
	1258
	1259	ELSE
	1260
	1261	!
	1262	!-- Cache optimized code (there might be still a potential for better
	1263	!-- optimization).
[696]	1264	!$OMP PARALLEL PRIVATE (i,j,k)
[1]	1265	!$OMP DO
	1266	DO j = nys, nyn
	1267	DO k = 1, nz
	1268
	1269	DO i = 0, nx
	1270	work_fftx(i,k,j) = work(j,k,i)
	1271	ENDDO
	1272
[1106]	1273	CALL fft_x_1d( work_fftx(0:nx,k,j), 'backward' )
[1]	1274
	1275	ENDDO
	1276	ENDDO
	1277
	1278	!$OMP DO
	1279	DO i = 0, nx
	1280	DO j = nys, nyn
	1281	DO k = 1, nz
	1282	f_out(k,j,i) = work_fftx(i,k,j)
	1283	ENDDO
	1284	ENDDO
	1285	ENDDO
	1286	!$OMP END PARALLEL
	1287
	1288	ENDIF
[1106]	1289	CALL cpu_log( log_point_s(4), 'fft_x_1d', 'stop' )
[1]	1290
	1291	END SUBROUTINE tr_yx_fftx
	1292
	1293
	1294	SUBROUTINE ffty_tri_ffty( ar )
	1295
	1296	!------------------------------------------------------------------------------!
	1297	! FFT along y, solution of the tridiagonal system and backward FFT for
	1298	! a 1d-decomposition along y
	1299	!
	1300	! WARNING: this subroutine may still not work for hybrid parallelization
	1301	! with OpenMP (for possible necessary changes see the original
	1302	! routine poisfft_hybrid, developed by Klaus Ketelsen, May 2002)
	1303	!------------------------------------------------------------------------------!
	1304
	1305	USE control_parameters
	1306	USE cpulog
	1307	USE grid_variables
	1308	USE indices
	1309	USE interfaces
	1310	USE pegrid
	1311	USE transpose_indices
	1312
	1313	IMPLICIT NONE
	1314
	1315	INTEGER :: i, j, k, m, n, omp_get_thread_num, tn
	1316
[1003]	1317	REAL, DIMENSION(0:ny) :: work_ffty
	1318	REAL, DIMENSION(0:ny,1:nz) :: work_triy
	1319	REAL, DIMENSION(nny,1:nz,nxl_y:nxr_y,pdims(2)) :: ar
	1320	REAL, DIMENSION(:,:,:,:), ALLOCATABLE :: tri
[1]	1321
	1322
[1106]	1323	CALL cpu_log( log_point_s(39), 'fft_y_1d + tridia', 'start' )
[1]	1324
	1325	ALLOCATE( tri(5,0:ny,0:nz-1,0:threads_per_task-1) )
	1326
	1327	tn = 0 ! Default thread number in case of one thread
[696]	1328	!$OMP PARALLEL DO PRIVATE ( i, j, k, m, n, tn, work_ffty, work_triy )
[1]	1329	DO i = nxl_y, nxr_y
	1330
	1331	!$ tn = omp_get_thread_num()
	1332
	1333	IF ( host(1:3) == 'nec' ) THEN
	1334	!
	1335	!-- Code optimized for vector processors
	1336	DO k = 1, nz
	1337
	1338	m = 0
	1339	DO n = 1, pdims(2)
[1003]	1340	DO j = 1, nny
[1]	1341	work_triy(m,k) = ar(j,k,i,n)
	1342	m = m + 1
	1343	ENDDO
	1344	ENDDO
	1345
	1346	ENDDO
	1347
	1348	CALL fft_y_m( work_triy, ny, 'forward' )
	1349
	1350	ELSE
	1351	!
	1352	!-- Cache optimized code
	1353	DO k = 1, nz
	1354
	1355	m = 0
	1356	DO n = 1, pdims(2)
[1003]	1357	DO j = 1, nny
[1]	1358	work_ffty(m) = ar(j,k,i,n)
	1359	m = m + 1
	1360	ENDDO
	1361	ENDDO
	1362
[1106]	1363	CALL fft_y_1d( work_ffty, 'forward' )
[1]	1364
	1365	DO j = 0, ny
	1366	work_triy(j,k) = work_ffty(j)
	1367	ENDDO
	1368
	1369	ENDDO
	1370
	1371	ENDIF
	1372
	1373	!
	1374	!-- Solve the linear equation system
	1375	CALL tridia_1dd( ddy2, ddx2, ny, nx, i, work_triy, tri(:,:,:,tn) )
	1376
	1377	IF ( host(1:3) == 'nec' ) THEN
	1378	!
	1379	!-- Code optimized for vector processors
	1380	CALL fft_y_m( work_triy, ny, 'backward' )
	1381
	1382	DO k = 1, nz
	1383
	1384	m = 0
	1385	DO n = 1, pdims(2)
[1003]	1386	DO j = 1, nny
[1]	1387	ar(j,k,i,n) = work_triy(m,k)
	1388	m = m + 1
	1389	ENDDO
	1390	ENDDO
	1391
	1392	ENDDO
	1393
	1394	ELSE
	1395	!
	1396	!-- Cache optimized code
	1397	DO k = 1, nz
	1398
	1399	DO j = 0, ny
	1400	work_ffty(j) = work_triy(j,k)
	1401	ENDDO
	1402
[1106]	1403	CALL fft_y_1d( work_ffty, 'backward' )
[1]	1404
	1405	m = 0
	1406	DO n = 1, pdims(2)
[1003]	1407	DO j = 1, nny
[1]	1408	ar(j,k,i,n) = work_ffty(m)
	1409	m = m + 1
	1410	ENDDO
	1411	ENDDO
	1412
	1413	ENDDO
	1414
	1415	ENDIF
	1416
	1417	ENDDO
	1418
	1419	DEALLOCATE( tri )
	1420
[1106]	1421	CALL cpu_log( log_point_s(39), 'fft_y_1d + tridia', 'stop' )
[1]	1422
	1423	END SUBROUTINE ffty_tri_ffty
	1424
	1425	#endif
	1426
	1427	END MODULE poisfft_mod

Note: See TracBrowser for help on using the repository browser.

Download in other formats:

| Impressum | ©Leibniz Universität Hannover |