Home

Context Navigation

source: palm/trunk/SOURCE/poisfft.f90 @ 1350

Last change on this file since 1350 was 1321, checked in by raasch, 11 years ago
last commit documented
Property svn:keywords set to `Id`
File size: 44.8 KB

Rev	Line
[1]	1	MODULE poisfft_mod
	2
[1036]	3	!--------------------------------------------------------------------------------!
	4	! This file is part of PALM.
	5	!
	6	! PALM is free software: you can redistribute it and/or modify it under the terms
	7	! of the GNU General Public License as published by the Free Software Foundation,
	8	! either version 3 of the License, or (at your option) any later version.
	9	!
	10	! PALM is distributed in the hope that it will be useful, but WITHOUT ANY
	11	! WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
	12	! A PARTICULAR PURPOSE. See the GNU General Public License for more details.
	13	!
	14	! You should have received a copy of the GNU General Public License along with
	15	! PALM. If not, see <http://www.gnu.org/licenses/>.
	16	!
[1310]	17	! Copyright 1997-2014 Leibniz Universitaet Hannover
[1036]	18	!--------------------------------------------------------------------------------!
	19	!
[484]	20	! Current revisions:
[1]	21	! -----------------
[1321]	22	!
	23	!
	24	! Former revisions:
	25	! -----------------
	26	! $Id: poisfft.f90 1321 2014-03-20 09:40:40Z maronga $
	27	!
	28	! 1320 2014-03-20 08:40:49Z raasch
[1320]	29	! ONLY-attribute added to USE-statements,
	30	! kind-parameters added to all INTEGER and REAL declaration statements,
	31	! kinds are defined in new module kinds,
	32	! old module precision_kind is removed,
	33	! revision history before 2012 removed,
	34	! comment fields (!:) to be used for variable explanations added to
	35	! all variable declaration statements
[1217]	36	!
[1319]	37	! 1318 2014-03-17 13:35:16Z raasch
	38	! module interfaces removed
	39	!
[1307]	40	! 1306 2014-03-13 14:30:59Z raasch
	41	! openmp sections removed from the overlap branch,
	42	! second argument removed from parameter list
	43	!
[1217]	44	! 1216 2013-08-26 09:31:42Z raasch
[1216]	45	! resorting of arrays moved to separate routines resort_for_...,
	46	! one argument, used as temporary work array, removed from all transpose
	47	! routines
	48	! overlapping fft / transposition implemented
[1112]	49	!
[1213]	50	! 1212 2013-08-15 08:46:27Z raasch
	51	! tridia routines moved to seperate module tridia_solver
	52	!
[1209]	53	! 1208 2013-08-13 06:41:49Z raasch
	54	! acc-update clauses added for "ar" so that ffts other than cufft can also be
	55	! used (although they are not ported and will give a poor performance)
	56	!
[1112]	57	! 1111 2013-03-08 23:54:10Z raasch
[1111]	58	! further openACC porting of non-parallel (MPI) branch:
	59	! tridiagonal routines split into extermal subroutines (instead using CONTAINS),
	60	! no distinction between parallel/non-parallel in poisfft and tridia any more,
[1112]	61	! tridia routines moved to end of file because of probable bug in PGI compiler 12.5
[1111]	62	! (otherwise "invalid device function" is indicated during runtime),
	63	! optimization of tridia routines: constant elements and coefficients of tri are
	64	! stored in seperate arrays ddzuw and tric, last dimension of tri reduced from 5
	65	! to 2,
	66	! poisfft_init is now called internally from poisfft, maketri is called from
	67	! poisfft_init,
	68	! ibc_p_b = 2 removed
[1]	69	!
[1107]	70	! 1106 2013-03-04 05:31:38Z raasch
	71	! routines fftx, ffty, fftxp, fftyp removed, calls replaced by fft_x, fft_y,
	72	! in the 1D-decomposition routines fft_x, ffty are replaced by fft_x_1d,
	73	! fft_y_1d
	74	!
[1104]	75	! 1103 2013-02-20 02:15:53Z raasch
	76	! tri, ar, and ar1 arguments in tridia-routines (2d) are removed because they
	77	! sometimes cause segmentation faults with intel 12.1 compiler
	78	!
[1093]	79	! 1092 2013-02-02 11:24:22Z raasch
	80	! unused variables removed
	81	!
[1037]	82	! 1036 2012-10-22 13:43:42Z raasch
	83	! code put under GPL (PALM 3.9)
	84	!
[1014]	85	! 2012-09-21 07:03:55Z raasch
	86	! FLOAT type conversion replaced by REAL
	87	!
[1004]	88	! 1003 2012-09-14 14:35:53Z raasch
	89	! indices nxa, nya, etc. replaced by nx, ny, etc.
	90	!
[941]	91	! 940 2012-07-09 14:31:00Z raasch
	92	! special handling of tri-array as an argument in tridia_1dd routines switched
	93	! off because it caused segmentation faults with intel 12.1 compiler
	94	!
[878]	95	! 877 2012-04-03 11:21:44Z suehring
	96	! Bugfix: Avoid divisions by zero in case of using a 'neumann' bc for the
	97	! pressure at the top of the model domain.
	98	!
[810]	99	! 809 2012-01-30 13:32:58Z maronga
	100	! Bugfix: replaced .AND. and .NOT. with && and ! in the preprocessor directives
	101	!
[808]	102	! 807 2012-01-25 11:53:51Z maronga
	103	! New cpp directive "__check" implemented which is used by check_namelist_files
	104	! (most of the code is unneeded by check_namelist_files).
	105	!
[1]	106	! Revision 1.1 1997/07/24 11:24:14 raasch
	107	! Initial revision
	108	!
	109	!
	110	! Description:
	111	! ------------
[1306]	112	! Original version by Stephan Siano (pois3d), as of July 23, 1996
	113	! Adapted for 2D-domain-decomposition by Siegfried Raasch, July 3, 1997
	114	!
	115	! Solves the Poisson equation with a 2D spectral method
	116	! d^2 p / dx^2 + d^2 p / dy^2 + d^2 p / dz^2 = s
	117	!
	118	! Input:
	119	! real ar contains (nnz,nny,nnx) elements of the velocity divergence,
	120	! starting from (1,nys,nxl)
	121	!
	122	! Output:
	123	! real ar contains the solution for perturbation pressure p
[1]	124	!------------------------------------------------------------------------------!
	125
[1320]	126	USE fft_xy, &
	127	ONLY: fft_init, fft_y, fft_y_1d, fft_y_m, fft_x, fft_x_1d, fft_x_m
[1]	128
[1320]	129	USE indices, &
	130	ONLY: nnx, nny, nx, nxl, nxr, ny, nys, nyn, nz
	131
	132	USE transpose_indices, &
	133	ONLY: nxl_y, nxl_z, nxr_y, nxr_z, nys_x, nys_z, nyn_x, nyn_z, nzb_x, &
	134	nzb_y, nzt_x, nzt_y
	135
	136	USE tridia_solver, &
	137	ONLY: tridia_1dd, tridia_init, tridia_substi, tridia_substi_overlap
	138
[1]	139	IMPLICIT NONE
	140
[1111]	141	LOGICAL, SAVE :: poisfft_initialized = .FALSE.
	142
[1]	143	PRIVATE
[807]	144
[809]	145	#if ! defined ( __check )
[1]	146	PUBLIC poisfft, poisfft_init
	147
	148	INTERFACE poisfft
	149	MODULE PROCEDURE poisfft
	150	END INTERFACE poisfft
	151
	152	INTERFACE poisfft_init
	153	MODULE PROCEDURE poisfft_init
	154	END INTERFACE poisfft_init
[807]	155	#else
	156	PUBLIC poisfft_init
[1]	157
[807]	158	INTERFACE poisfft_init
	159	MODULE PROCEDURE poisfft_init
	160	END INTERFACE poisfft_init
	161	#endif
	162
[1]	163	CONTAINS
	164
	165	SUBROUTINE poisfft_init
	166
[1320]	167	USE arrays_3d, &
	168	ONLY: ddzu_pres, ddzw
[1111]	169
[1320]	170	USE kinds
	171
[1111]	172	IMPLICIT NONE
	173
[1320]	174	INTEGER(iwp) :: k !:
[1111]	175
	176
[1]	177	CALL fft_init
	178
[1212]	179	CALL tridia_init
[1111]	180
	181	poisfft_initialized = .TRUE.
	182
[1]	183	END SUBROUTINE poisfft_init
	184
[1111]	185
[809]	186	#if ! defined ( __check )
[1306]	187	SUBROUTINE poisfft( ar )
[1]	188
[1320]	189	USE control_parameters, &
	190	ONLY: fft_method, transpose_compute_overlap
	191
	192	USE cpulog, &
	193	ONLY: cpu_log, cpu_log_nowait, log_point_s
	194
	195	USE kinds
	196
[1]	197	USE pegrid
	198
	199	IMPLICIT NONE
	200
[1320]	201	INTEGER(iwp) :: ii !:
	202	INTEGER(iwp) :: iind !:
	203	INTEGER(iwp) :: inew !:
	204	INTEGER(iwp) :: jj !:
	205	INTEGER(iwp) :: jind !:
	206	INTEGER(iwp) :: jnew !:
	207	INTEGER(iwp) :: ki !:
	208	INTEGER(iwp) :: kk !:
	209	INTEGER(iwp) :: knew !:
	210	INTEGER(iwp) :: n !:
	211	INTEGER(iwp) :: nblk !:
	212	INTEGER(iwp) :: nnx_y !:
	213	INTEGER(iwp) :: nny_z !:
	214	INTEGER(iwp) :: nnz_t !:
	215	INTEGER(iwp) :: nnz_x !:
	216	INTEGER(iwp) :: nxl_y_bound !:
	217	INTEGER(iwp) :: nxr_y_bound !:
[1]	218
[1320]	219	INTEGER(iwp), DIMENSION(4) :: isave !:
	220
	221	REAL(wp), DIMENSION(1:nz,nys:nyn,nxl:nxr) :: ar !:
[1216]	222	!$acc declare create( ar_inv )
[1320]	223	REAL(wp), DIMENSION(nys:nyn,nxl:nxr,1:nz) :: ar_inv !:
[1]	224
[1320]	225	REAL(wp), DIMENSION(:,:,:), ALLOCATABLE :: ar1 !:
	226	REAL(wp), DIMENSION(:,:,:), ALLOCATABLE :: f_in !:
	227	REAL(wp), DIMENSION(:,:,:), ALLOCATABLE :: f_inv !:
	228	REAL(wp), DIMENSION(:,:,:), ALLOCATABLE :: f_out_y !:
	229	REAL(wp), DIMENSION(:,:,:), ALLOCATABLE :: f_out_z !:
[1216]	230
	231
[1]	232	CALL cpu_log( log_point_s(3), 'poisfft', 'start' )
	233
[1111]	234	IF ( .NOT. poisfft_initialized ) CALL poisfft_init
	235
[1]	236	!
	237	!-- Two-dimensional Fourier Transformation in x- and y-direction.
[1111]	238	IF ( pdims(2) == 1 .AND. pdims(1) > 1 ) THEN
[1]	239
	240	!
	241	!-- 1d-domain-decomposition along x:
	242	!-- FFT along y and transposition y --> x
[1216]	243	CALL ffty_tr_yx( ar, ar )
[1]	244
	245	!
	246	!-- FFT along x, solving the tridiagonal system and backward FFT
	247	CALL fftx_tri_fftx( ar )
	248
	249	!
	250	!-- Transposition x --> y and backward FFT along y
[1216]	251	CALL tr_xy_ffty( ar, ar )
[1]	252
[1111]	253	ELSEIF ( pdims(1) == 1 .AND. pdims(2) > 1 ) THEN
[1]	254
	255	!
	256	!-- 1d-domain-decomposition along y:
	257	!-- FFT along x and transposition x --> y
[1216]	258	CALL fftx_tr_xy( ar, ar )
[1]	259
	260	!
	261	!-- FFT along y, solving the tridiagonal system and backward FFT
	262	CALL ffty_tri_ffty( ar )
	263
	264	!
	265	!-- Transposition y --> x and backward FFT along x
[1216]	266	CALL tr_yx_fftx( ar, ar )
[1]	267
[1216]	268	ELSEIF ( .NOT. transpose_compute_overlap ) THEN
[1]	269
	270	!
[1111]	271	!-- 2d-domain-decomposition or no decomposition (1 PE run)
[1]	272	!-- Transposition z --> x
	273	CALL cpu_log( log_point_s(5), 'transpo forward', 'start' )
[1216]	274	CALL resort_for_zx( ar, ar_inv )
	275	CALL transpose_zx( ar_inv, ar )
[1]	276	CALL cpu_log( log_point_s(5), 'transpo forward', 'pause' )
	277
	278	CALL cpu_log( log_point_s(4), 'fft_x', 'start' )
[1208]	279	IF ( fft_method /= 'system-specific' ) THEN
	280	!$acc update host( ar )
	281	ENDIF
[1106]	282	CALL fft_x( ar, 'forward' )
[1208]	283	IF ( fft_method /= 'system-specific' ) THEN
	284	!$acc update device( ar )
	285	ENDIF
[1]	286	CALL cpu_log( log_point_s(4), 'fft_x', 'pause' )
	287
	288	!
	289	!-- Transposition x --> y
	290	CALL cpu_log( log_point_s(5), 'transpo forward', 'continue' )
[1216]	291	CALL resort_for_xy( ar, ar_inv )
	292	CALL transpose_xy( ar_inv, ar )
[1]	293	CALL cpu_log( log_point_s(5), 'transpo forward', 'pause' )
	294
	295	CALL cpu_log( log_point_s(7), 'fft_y', 'start' )
[1208]	296	IF ( fft_method /= 'system-specific' ) THEN
	297	!$acc update host( ar )
	298	ENDIF
[1216]	299	CALL fft_y( ar, 'forward', ar_tr = ar, &
	300	nxl_y_bound = nxl_y, nxr_y_bound = nxr_y, &
	301	nxl_y_l = nxl_y, nxr_y_l = nxr_y )
[1208]	302	IF ( fft_method /= 'system-specific' ) THEN
	303	!$acc update device( ar )
	304	ENDIF
[1]	305	CALL cpu_log( log_point_s(7), 'fft_y', 'pause' )
	306
	307	!
	308	!-- Transposition y --> z
	309	CALL cpu_log( log_point_s(5), 'transpo forward', 'continue' )
[1216]	310	CALL resort_for_yz( ar, ar_inv )
	311	CALL transpose_yz( ar_inv, ar )
[1]	312	CALL cpu_log( log_point_s(5), 'transpo forward', 'stop' )
	313
	314	!
[1106]	315	!-- Solve the tridiagonal equation system along z
[1]	316	CALL cpu_log( log_point_s(6), 'tridia', 'start' )
[1212]	317	CALL tridia_substi( ar )
[1]	318	CALL cpu_log( log_point_s(6), 'tridia', 'stop' )
	319
	320	!
	321	!-- Inverse Fourier Transformation
	322	!-- Transposition z --> y
	323	CALL cpu_log( log_point_s(8), 'transpo invers', 'start' )
[1216]	324	CALL transpose_zy( ar, ar_inv )
	325	CALL resort_for_zy( ar_inv, ar )
[1]	326	CALL cpu_log( log_point_s(8), 'transpo invers', 'pause' )
	327
	328	CALL cpu_log( log_point_s(7), 'fft_y', 'continue' )
[1208]	329	IF ( fft_method /= 'system-specific' ) THEN
	330	!$acc update host( ar )
	331	ENDIF
[1216]	332	CALL fft_y( ar, 'backward', ar_tr = ar, &
	333	nxl_y_bound = nxl_y, nxr_y_bound = nxr_y, &
	334	nxl_y_l = nxl_y, nxr_y_l = nxr_y )
[1208]	335	IF ( fft_method /= 'system-specific' ) THEN
	336	!$acc update device( ar )
	337	ENDIF
[1]	338	CALL cpu_log( log_point_s(7), 'fft_y', 'stop' )
	339
	340	!
	341	!-- Transposition y --> x
	342	CALL cpu_log( log_point_s(8), 'transpo invers', 'continue' )
[1216]	343	CALL transpose_yx( ar, ar_inv )
	344	CALL resort_for_yx( ar_inv, ar )
[1]	345	CALL cpu_log( log_point_s(8), 'transpo invers', 'pause' )
	346
	347	CALL cpu_log( log_point_s(4), 'fft_x', 'continue' )
[1208]	348	IF ( fft_method /= 'system-specific' ) THEN
	349	!$acc update host( ar )
	350	ENDIF
[1106]	351	CALL fft_x( ar, 'backward' )
[1208]	352	IF ( fft_method /= 'system-specific' ) THEN
	353	!$acc update device( ar )
	354	ENDIF
[1]	355	CALL cpu_log( log_point_s(4), 'fft_x', 'stop' )
	356
	357	!
	358	!-- Transposition x --> z
	359	CALL cpu_log( log_point_s(8), 'transpo invers', 'continue' )
[1216]	360	CALL transpose_xz( ar, ar_inv )
	361	CALL resort_for_xz( ar_inv, ar )
[1]	362	CALL cpu_log( log_point_s(8), 'transpo invers', 'stop' )
	363
[1216]	364	ELSE
	365
	366	!
	367	!-- 2d-domain-decomposition or no decomposition (1 PE run) with
	368	!-- overlapping transposition / fft
[1318]	369	!-- cputime logging must not use barriers, which would prevent overlapping
[1216]	370	ALLOCATE( f_out_y(0:ny,nxl_y:nxr_y,nzb_y:nzt_y), &
	371	f_out_z(0:nx,nys_x:nyn_x,nzb_x:nzt_x) )
	372	!
	373	!-- Transposition z --> x + subsequent fft along x
	374	ALLOCATE( f_inv(nys:nyn,nxl:nxr,1:nz) )
	375	CALL resort_for_zx( ar, f_inv )
	376	!
	377	!-- Save original indices and gridpoint counter
	378	isave(1) = nz
	379	isave(2) = nzb_x
	380	isave(3) = nzt_x
	381	isave(4) = sendrecvcount_zx
	382	!
	383	!-- Set new indices for transformation
	384	nblk = nz / pdims(1)
	385	nz = pdims(1)
	386	nnz_x = 1
	387	nzb_x = 1 + myidx * nnz_x
	388	nzt_x = ( myidx + 1 ) * nnz_x
	389	sendrecvcount_zx = nnx * nny * nnz_x
	390
[1306]	391	ALLOCATE( ar1(0:nx,nys_x:nyn_x,nzb_x:nzt_x) )
[1216]	392	ALLOCATE( f_in(nys:nyn,nxl:nxr,1:nz) )
	393
[1306]	394	DO kk = 1, nblk
[1216]	395
[1306]	396	IF ( kk == 1 ) THEN
[1318]	397	CALL cpu_log( log_point_s(5), 'transpo forward', 'start', cpu_log_nowait )
[1306]	398	ELSE
[1318]	399	CALL cpu_log( log_point_s(5), 'transpo forward', 'continue', cpu_log_nowait )
[1306]	400	ENDIF
[1216]	401
[1306]	402	DO knew = 1, nz
	403	ki = kk + nblk * ( knew - 1 )
	404	f_in(:,:,knew) = f_inv(:,:,ki)
	405	ENDDO
[1216]	406
[1306]	407	CALL transpose_zx( f_in, ar1(:,:,:))
	408	CALL cpu_log( log_point_s(5), 'transpo forward', 'pause' )
[1216]	409
[1306]	410	IF ( kk == 1 ) THEN
[1318]	411	CALL cpu_log( log_point_s(4), 'fft_x', 'start', cpu_log_nowait )
[1306]	412	ELSE
[1318]	413	CALL cpu_log( log_point_s(4), 'fft_x', 'continue', cpu_log_nowait )
[1216]	414	ENDIF
	415
[1306]	416	n = isave(2) + kk - 1
	417	CALL fft_x( ar1(:,:,:), 'forward', ar_2d = f_out_z(:,:,n))
	418	CALL cpu_log( log_point_s(4), 'fft_x', 'pause' )
[1216]	419
	420	ENDDO
	421	!
	422	!-- Restore original indices/counters
	423	nz = isave(1)
	424	nzb_x = isave(2)
	425	nzt_x = isave(3)
	426	sendrecvcount_zx = isave(4)
	427
	428	DEALLOCATE( ar1, f_in, f_inv )
	429
	430	!
	431	!-- Transposition x --> y + subsequent fft along y
	432	ALLOCATE( f_inv(nys_x:nyn_x,nzb_x:nzt_x,0:nx) )
	433	CALL resort_for_xy( f_out_z, f_inv )
	434	!
	435	!-- Save original indices and gridpoint counter
	436	isave(1) = nx
	437	isave(2) = nxl_y
	438	isave(3) = nxr_y
	439	isave(4) = sendrecvcount_xy
	440	!
	441	!-- Set new indices for transformation
	442	nblk = ( ( nx+1 ) / pdims(2) ) - 1
	443	nx = pdims(2)
	444	nnx_y = 1
	445	nxl_y = myidy * nnx_y
	446	nxr_y = ( myidy + 1 ) * nnx_y - 1
	447	sendrecvcount_xy = nnx_y * ( nyn_x-nys_x+1 ) * ( nzt_x-nzb_x+1 )
	448
[1306]	449	ALLOCATE( ar1(0:ny,nxl_y:nxr_y,nzb_y:nzt_y) )
[1216]	450	ALLOCATE( f_in(nys_x:nyn_x,nzb_x:nzt_x,0:nx) )
	451
[1306]	452	DO ii = 0, nblk
[1216]	453
[1318]	454	CALL cpu_log( log_point_s(5), 'transpo forward', 'continue', cpu_log_nowait )
[1216]	455
[1306]	456	DO inew = 0, nx-1
	457	iind = ii + ( nblk + 1 ) * inew
	458	f_in(:,:,inew) = f_inv(:,:,iind)
	459	ENDDO
[1216]	460
[1306]	461	CALL transpose_xy( f_in, ar1(:,:,:) )
[1216]	462
[1306]	463	CALL cpu_log( log_point_s(5), 'transpo forward', 'pause' )
[1216]	464
[1306]	465	IF ( ii == 1 ) THEN
[1318]	466	CALL cpu_log( log_point_s(7), 'fft_y', 'start', cpu_log_nowait )
[1306]	467	ELSE
[1318]	468	CALL cpu_log( log_point_s(7), 'fft_y', 'continue', cpu_log_nowait )
[1216]	469	ENDIF
	470
[1306]	471	nxl_y_bound = isave(2)
	472	nxr_y_bound = isave(3)
	473	n = isave(2) + ii
	474	CALL fft_y( ar1(:,:,:), 'forward', ar_tr = f_out_y, &
	475	nxl_y_bound = nxl_y_bound, nxr_y_bound = nxr_y_bound, &
	476	nxl_y_l = n, nxr_y_l = n )
[1216]	477
[1306]	478	CALL cpu_log( log_point_s(7), 'fft_y', 'pause' )
[1216]	479
	480	ENDDO
	481	!
	482	!-- Restore original indices/counters
	483	nx = isave(1)
	484	nxl_y = isave(2)
	485	nxr_y = isave(3)
	486	sendrecvcount_xy = isave(4)
	487
	488	DEALLOCATE( ar1, f_in, f_inv )
	489
	490	!
	491	!-- Transposition y --> z + subsequent tridia + resort for z --> y
	492	ALLOCATE( f_inv(nxl_y:nxr_y,nzb_y:nzt_y,0:ny) )
	493	CALL resort_for_yz( f_out_y, f_inv )
	494	!
	495	!-- Save original indices and gridpoint counter
	496	isave(1) = ny
	497	isave(2) = nys_z
	498	isave(3) = nyn_z
	499	isave(4) = sendrecvcount_yz
	500	!
	501	!-- Set new indices for transformation
	502	nblk = ( ( ny+1 ) / pdims(1) ) - 1
	503	ny = pdims(1)
	504	nny_z = 1
	505	nys_z = myidx * nny_z
	506	nyn_z = ( myidx + 1 ) * nny_z - 1
	507	sendrecvcount_yz = ( nxr_y-nxl_y+1 ) * nny_z * ( nzt_y-nzb_y+1 )
	508
[1306]	509	ALLOCATE( ar1(nxl_z:nxr_z,nys_z:nyn_z,1:nz) )
[1216]	510	ALLOCATE( f_in(nxl_y:nxr_y,nzb_y:nzt_y,0:ny) )
	511
[1306]	512	DO jj = 0, nblk
[1216]	513	!
[1306]	514	!-- Forward Fourier Transformation
	515	!-- Transposition y --> z
[1318]	516	CALL cpu_log( log_point_s(5), 'transpo forward', 'continue', cpu_log_nowait )
[1216]	517
[1306]	518	DO jnew = 0, ny-1
	519	jind = jj + ( nblk + 1 ) * jnew
	520	f_in(:,:,jnew) = f_inv(:,:,jind)
	521	ENDDO
[1216]	522
[1306]	523	CALL transpose_yz( f_in, ar1(:,:,:) )
[1216]	524
[1306]	525	IF ( jj == nblk ) THEN
	526	CALL cpu_log( log_point_s(5), 'transpo forward', 'stop' )
	527	ELSE
	528	CALL cpu_log( log_point_s(5), 'transpo forward', 'pause' )
[1216]	529	ENDIF
	530
	531	!
[1306]	532	!-- Solve the tridiagonal equation system along z
[1318]	533	CALL cpu_log( log_point_s(6), 'tridia', 'start', cpu_log_nowait )
[1216]	534
[1306]	535	n = isave(2) + jj
	536	CALL tridia_substi_overlap( ar1(:,:,:), n )
[1216]	537
[1306]	538	CALL cpu_log( log_point_s(6), 'tridia', 'stop' )
[1216]	539
[1306]	540	!
	541	!-- Inverse Fourier Transformation
	542	!-- Transposition z --> y
	543	!-- Only one thread should call MPI routines, therefore forward and
	544	!-- backward tranpose are in the same section
	545	IF ( jj == 0 ) THEN
[1318]	546	CALL cpu_log( log_point_s(8), 'transpo invers', 'start', cpu_log_nowait )
[1306]	547	ELSE
[1318]	548	CALL cpu_log( log_point_s(8), 'transpo invers', 'continue', cpu_log_nowait )
[1216]	549	ENDIF
	550
[1306]	551	CALL transpose_zy( ar1(:,:,:), f_in )
[1216]	552
[1306]	553	DO jnew = 0, ny-1
	554	jind = jj + ( nblk + 1 ) * jnew
	555	f_inv(:,:,jind) = f_in(:,:,jnew)
	556	ENDDO
[1216]	557
[1306]	558	CALL cpu_log( log_point_s(8), 'transpo invers', 'pause' )
[1216]	559
	560	ENDDO
	561	!
	562	!-- Restore original indices/counters
	563	ny = isave(1)
	564	nys_z = isave(2)
	565	nyn_z = isave(3)
	566	sendrecvcount_yz = isave(4)
	567
	568	CALL resort_for_zy( f_inv, f_out_y )
	569
	570	DEALLOCATE( ar1, f_in, f_inv )
	571
	572	!
	573	!-- fft along y backward + subsequent transposition y --> x
	574	ALLOCATE( f_inv(nys_x:nyn_x,nzb_x:nzt_x,0:nx) )
	575	!
	576	!-- Save original indices and gridpoint counter
	577	isave(1) = nx
	578	isave(2) = nxl_y
	579	isave(3) = nxr_y
	580	isave(4) = sendrecvcount_xy
	581	!
	582	!-- Set new indices for transformation
	583	nblk = (( nx+1 ) / pdims(2) ) - 1
	584	nx = pdims(2)
	585	nnx_y = 1
	586	nxl_y = myidy * nnx_y
	587	nxr_y = ( myidy + 1 ) * nnx_y - 1
	588	sendrecvcount_xy = nnx_y * ( nyn_x-nys_x+1 ) * ( nzt_x-nzb_x+1 )
	589
[1306]	590	ALLOCATE( ar1(0:ny,nxl_y:nxr_y,nzb_y:nzt_y) )
[1216]	591	ALLOCATE( f_in(nys_x:nyn_x,nzb_x:nzt_x,0:nx) )
	592
[1306]	593	DO ii = 0, nblk
[1216]	594
[1318]	595	CALL cpu_log( log_point_s(7), 'fft_y', 'continue', cpu_log_nowait )
[1216]	596
[1306]	597	n = isave(2) + ii
	598	nxl_y_bound = isave(2)
	599	nxr_y_bound = isave(3)
[1216]	600
[1306]	601	CALL fft_y( ar1(:,:,:), 'backward', ar_tr = f_out_y, &
	602	nxl_y_bound = nxl_y_bound, nxr_y_bound = nxr_y_bound, &
	603	nxl_y_l = n, nxr_y_l = n )
[1216]	604
[1306]	605	IF ( ii == nblk ) THEN
	606	CALL cpu_log( log_point_s(7), 'fft_y', 'stop' )
	607	ELSE
	608	CALL cpu_log( log_point_s(7), 'fft_y', 'pause' )
[1216]	609	ENDIF
	610
[1318]	611	CALL cpu_log( log_point_s(8), 'transpo invers', 'continue', cpu_log_nowait )
[1216]	612
[1306]	613	CALL transpose_yx( ar1(:,:,:), f_in )
[1216]	614
[1306]	615	DO inew = 0, nx-1
	616	iind = ii + (nblk+1) * inew
	617	f_inv(:,:,iind) = f_in(:,:,inew)
	618	ENDDO
[1216]	619
[1306]	620	CALL cpu_log( log_point_s(8), 'transpo invers', 'pause' )
[1216]	621
	622	ENDDO
	623	!
	624	!-- Restore original indices/counters
	625	nx = isave(1)
	626	nxl_y = isave(2)
	627	nxr_y = isave(3)
	628	sendrecvcount_xy = isave(4)
	629
	630	CALL resort_for_yx( f_inv, f_out_z )
	631
	632	DEALLOCATE( ar1, f_in, f_inv )
	633
	634	!
	635	!-- fft along x backward + subsequent final transposition x --> z
	636	ALLOCATE( f_inv(nys:nyn,nxl:nxr,1:nz) )
	637	!
	638	!-- Save original indices and gridpoint counter
	639	isave(1) = nz
	640	isave(2) = nzb_x
	641	isave(3) = nzt_x
	642	isave(4) = sendrecvcount_zx
	643	!
	644	!-- Set new indices for transformation
	645	nblk = nz / pdims(1)
	646	nz = pdims(1)
	647	nnz_x = 1
	648	nzb_x = 1 + myidx * nnz_x
	649	nzt_x = ( myidx + 1 ) * nnz_x
	650	sendrecvcount_zx = nnx * nny * nnz_x
	651
[1306]	652	ALLOCATE( ar1(0:nx,nys_x:nyn_x,nzb_x:nzt_x) )
[1216]	653	ALLOCATE( f_in(nys:nyn,nxl:nxr,1:nz) )
	654
[1306]	655	DO kk = 1, nblk
[1216]	656
[1318]	657	CALL cpu_log( log_point_s(4), 'fft_x', 'continue', cpu_log_nowait )
[1216]	658
[1306]	659	n = isave(2) + kk - 1
	660	CALL fft_x( ar1(:,:,:), 'backward', f_out_z(:,:,n))
[1216]	661
[1306]	662	IF ( kk == nblk ) THEN
	663	CALL cpu_log( log_point_s(4), 'fft_x', 'stop' )
	664	ELSE
	665	CALL cpu_log( log_point_s(4), 'fft_x', 'pause' )
[1216]	666	ENDIF
	667
[1318]	668	CALL cpu_log( log_point_s(8), 'transpo invers', 'continue', cpu_log_nowait )
[1216]	669
[1306]	670	CALL transpose_xz( ar1(:,:,:), f_in )
[1216]	671
[1306]	672	DO knew = 1, nz
	673	ki = kk + nblk * (knew-1)
	674	f_inv(:,:,ki) = f_in(:,:,knew)
	675	ENDDO
[1216]	676
[1306]	677	IF ( kk == nblk ) THEN
	678	CALL cpu_log( log_point_s(8), 'transpo invers', 'stop' )
	679	ELSE
	680	CALL cpu_log( log_point_s(8), 'transpo invers', 'pause' )
[1216]	681	ENDIF
	682
	683	ENDDO
	684	!
	685	!-- Restore original indices/counters
	686	nz = isave(1)
	687	nzb_x = isave(2)
	688	nzt_x = isave(3)
	689	sendrecvcount_zx = isave(4)
	690
	691	CALL resort_for_xz( f_inv, ar )
	692
	693	DEALLOCATE( ar1, f_in, f_inv )
	694
[1]	695	ENDIF
	696
	697	CALL cpu_log( log_point_s(3), 'poisfft', 'stop' )
	698
	699	END SUBROUTINE poisfft
	700
	701
	702
[1216]	703	SUBROUTINE ffty_tr_yx( f_in, f_out )
[1]	704
	705	!------------------------------------------------------------------------------!
	706	! Fourier-transformation along y with subsequent transposition y --> x for
	707	! a 1d-decomposition along x
	708	!
	709	! ATTENTION: The performance of this routine is much faster on the NEC-SX6,
	710	! if the first index of work_ffty_vec is odd. Otherwise
	711	! memory bank conflicts may occur (especially if the index is a
	712	! multiple of 128). That's why work_ffty_vec is dimensioned as
	713	! 0:ny+1.
	714	! Of course, this will not work if users are using an odd number
	715	! of gridpoints along y.
	716	!------------------------------------------------------------------------------!
	717
[1320]	718	USE control_parameters, &
	719	ONLY: host
	720
	721	USE cpulog, &
	722	ONLY: cpu_log, log_point_s
	723
	724	USE kinds
	725
[1]	726	USE pegrid
	727
	728	IMPLICIT NONE
	729
[1320]	730	INTEGER(iwp) :: i !:
	731	INTEGER(iwp) :: iend !:
	732	INTEGER(iwp) :: iouter !:
	733	INTEGER(iwp) :: ir !:
	734	INTEGER(iwp) :: j !:
	735	INTEGER(iwp) :: k !:
[1]	736
[1320]	737	INTEGER(iwp), PARAMETER :: stridex = 4 !:
	738
	739	REAL(wp), DIMENSION(0:ny,stridex) :: work_ffty !:
[1]	740	#if defined( __nec )
[1320]	741	REAL(wp), DIMENSION(0:ny+1,1:nz,nxl:nxr) :: work_ffty_vec !:
[1]	742	#endif
[1320]	743	REAL(wp), DIMENSION(1:nz,0:ny,nxl:nxr) :: f_in !:
	744	REAL(wp), DIMENSION(nnx,1:nz,nys_x:nyn_x,pdims(1)) :: f_out !:
	745	REAL(wp), DIMENSION(nxl:nxr,1:nz,0:ny) :: work !:
[1]	746
	747	!
	748	!-- Carry out the FFT along y, where all data are present due to the
	749	!-- 1d-decomposition along x. Resort the data in a way that x becomes
	750	!-- the first index.
[1106]	751	CALL cpu_log( log_point_s(7), 'fft_y_1d', 'start' )
[1]	752
	753	IF ( host(1:3) == 'nec' ) THEN
	754	#if defined( __nec )
	755	!
	756	!-- Code optimized for vector processors
[85]	757	!$OMP PARALLEL PRIVATE ( i, j, k )
[1]	758	!$OMP DO
	759	DO i = nxl, nxr
	760
	761	DO j = 0, ny
	762	DO k = 1, nz
	763	work_ffty_vec(j,k,i) = f_in(k,j,i)
	764	ENDDO
	765	ENDDO
	766
	767	CALL fft_y_m( work_ffty_vec(:,:,i), ny+1, 'forward' )
	768
	769	ENDDO
	770
	771	!$OMP DO
	772	DO k = 1, nz
	773	DO j = 0, ny
	774	DO i = nxl, nxr
	775	work(i,k,j) = work_ffty_vec(j,k,i)
	776	ENDDO
	777	ENDDO
	778	ENDDO
	779	!$OMP END PARALLEL
	780	#endif
	781
	782	ELSE
	783
	784	!
	785	!-- Cache optimized code.
	786	!-- The i-(x-)direction is split into a strided outer loop and an inner
	787	!-- loop for better cache performance
	788	!$OMP PARALLEL PRIVATE (i,iend,iouter,ir,j,k,work_ffty)
	789	!$OMP DO
	790	DO iouter = nxl, nxr, stridex
	791
	792	iend = MIN( iouter+stridex-1, nxr ) ! Upper bound for inner i loop
	793
	794	DO k = 1, nz
	795
	796	DO i = iouter, iend
	797
	798	ir = i-iouter+1 ! counter within a stride
	799	DO j = 0, ny
	800	work_ffty(j,ir) = f_in(k,j,i)
	801	ENDDO
	802	!
	803	!-- FFT along y
[1106]	804	CALL fft_y_1d( work_ffty(:,ir), 'forward' )
[1]	805
	806	ENDDO
	807
	808	!
	809	!-- Resort
	810	DO j = 0, ny
	811	DO i = iouter, iend
	812	work(i,k,j) = work_ffty(j,i-iouter+1)
	813	ENDDO
	814	ENDDO
	815
	816	ENDDO
	817
	818	ENDDO
	819	!$OMP END PARALLEL
	820
	821	ENDIF
[1106]	822	CALL cpu_log( log_point_s(7), 'fft_y_1d', 'pause' )
[1]	823
	824	!
	825	!-- Transpose array
[1111]	826	#if defined( __parallel )
[1]	827	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
[622]	828	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
[1]	829	CALL MPI_ALLTOALL( work(nxl,1,0), sendrecvcount_xy, MPI_REAL, &
	830	f_out(1,1,nys_x,1), sendrecvcount_xy, MPI_REAL, &
	831	comm1dx, ierr )
	832	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
[1111]	833	#endif
[1]	834
	835	END SUBROUTINE ffty_tr_yx
	836
	837
[1216]	838	SUBROUTINE tr_xy_ffty( f_in, f_out )
[1]	839
	840	!------------------------------------------------------------------------------!
	841	! Transposition x --> y with a subsequent backward Fourier transformation for
	842	! a 1d-decomposition along x
	843	!------------------------------------------------------------------------------!
	844
[1320]	845	USE control_parameters, &
	846	ONLY: host
	847
	848	USE cpulog, &
	849	ONLY: cpu_log, log_point_s
	850
	851	USE kinds
	852
[1]	853	USE pegrid
	854
	855	IMPLICIT NONE
	856
[1320]	857	INTEGER(iwp) :: i !:
	858	INTEGER(iwp) :: iend !:
	859	INTEGER(iwp) :: iouter !:
	860	INTEGER(iwp) :: ir !:
	861	INTEGER(iwp) :: j !:
	862	INTEGER(iwp) :: k !:
[1]	863
[1320]	864	INTEGER(iwp), PARAMETER :: stridex = 4 !:
	865
	866	REAL(wp), DIMENSION(0:ny,stridex) :: work_ffty !:
[1]	867	#if defined( __nec )
[1320]	868	REAL(wp), DIMENSION(0:ny+1,1:nz,nxl:nxr) :: work_ffty_vec !:
[1]	869	#endif
[1320]	870	REAL(wp), DIMENSION(nnx,1:nz,nys_x:nyn_x,pdims(1)) :: f_in !:
	871	REAL(wp), DIMENSION(1:nz,0:ny,nxl:nxr) :: f_out !:
	872	REAL(wp), DIMENSION(nxl:nxr,1:nz,0:ny) :: work !:
[1]	873
	874	!
	875	!-- Transpose array
[1111]	876	#if defined( __parallel )
[1]	877	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
[622]	878	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
[1]	879	CALL MPI_ALLTOALL( f_in(1,1,nys_x,1), sendrecvcount_xy, MPI_REAL, &
	880	work(nxl,1,0), sendrecvcount_xy, MPI_REAL, &
	881	comm1dx, ierr )
	882	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
[1111]	883	#endif
[1]	884
	885	!
	886	!-- Resort the data in a way that y becomes the first index and carry out the
	887	!-- backward fft along y.
[1106]	888	CALL cpu_log( log_point_s(7), 'fft_y_1d', 'continue' )
[1]	889
	890	IF ( host(1:3) == 'nec' ) THEN
	891	#if defined( __nec )
	892	!
	893	!-- Code optimized for vector processors
[85]	894	!$OMP PARALLEL PRIVATE ( i, j, k )
[1]	895	!$OMP DO
	896	DO k = 1, nz
	897	DO j = 0, ny
	898	DO i = nxl, nxr
	899	work_ffty_vec(j,k,i) = work(i,k,j)
	900	ENDDO
	901	ENDDO
	902	ENDDO
	903
	904	!$OMP DO
	905	DO i = nxl, nxr
	906
	907	CALL fft_y_m( work_ffty_vec(:,:,i), ny+1, 'backward' )
	908
	909	DO j = 0, ny
	910	DO k = 1, nz
	911	f_out(k,j,i) = work_ffty_vec(j,k,i)
	912	ENDDO
	913	ENDDO
	914
	915	ENDDO
	916	!$OMP END PARALLEL
	917	#endif
	918
	919	ELSE
	920
	921	!
	922	!-- Cache optimized code.
	923	!-- The i-(x-)direction is split into a strided outer loop and an inner
	924	!-- loop for better cache performance
	925	!$OMP PARALLEL PRIVATE ( i, iend, iouter, ir, j, k, work_ffty )
	926	!$OMP DO
	927	DO iouter = nxl, nxr, stridex
	928
	929	iend = MIN( iouter+stridex-1, nxr ) ! Upper bound for inner i loop
	930
	931	DO k = 1, nz
	932	!
	933	!-- Resort
	934	DO j = 0, ny
	935	DO i = iouter, iend
	936	work_ffty(j,i-iouter+1) = work(i,k,j)
	937	ENDDO
	938	ENDDO
	939
	940	DO i = iouter, iend
	941
	942	!
	943	!-- FFT along y
	944	ir = i-iouter+1 ! counter within a stride
[1106]	945	CALL fft_y_1d( work_ffty(:,ir), 'backward' )
[1]	946
	947	DO j = 0, ny
	948	f_out(k,j,i) = work_ffty(j,ir)
	949	ENDDO
	950	ENDDO
	951
	952	ENDDO
	953
	954	ENDDO
	955	!$OMP END PARALLEL
	956
	957	ENDIF
	958
[1106]	959	CALL cpu_log( log_point_s(7), 'fft_y_1d', 'stop' )
[1]	960
	961	END SUBROUTINE tr_xy_ffty
	962
	963
	964	SUBROUTINE fftx_tri_fftx( ar )
	965
	966	!------------------------------------------------------------------------------!
	967	! FFT along x, solution of the tridiagonal system and backward FFT for
	968	! a 1d-decomposition along x
	969	!
	970	! WARNING: this subroutine may still not work for hybrid parallelization
	971	! with OpenMP (for possible necessary changes see the original
	972	! routine poisfft_hybrid, developed by Klaus Ketelsen, May 2002)
	973	!------------------------------------------------------------------------------!
	974
[1320]	975	USE control_parameters, &
	976	ONLY: host
	977
	978	USE cpulog, &
	979	ONLY: cpu_log, log_point_s
	980
	981	USE grid_variables, &
	982	ONLY: ddx2, ddy2
	983
	984	USE kinds
	985
[1]	986	USE pegrid
	987
	988	IMPLICIT NONE
	989
[1320]	990	INTEGER(iwp) :: i !:
	991	INTEGER(iwp) :: j !:
	992	INTEGER(iwp) :: k !:
	993	INTEGER(iwp) :: m !:
	994	INTEGER(iwp) :: n !:
	995	INTEGER(iwp) :: omp_get_thread_num !:
	996	INTEGER(iwp) :: tn !:
[1]	997
[1320]	998	REAL(wp), DIMENSION(0:nx) :: work_fftx !:
	999	REAL(wp), DIMENSION(0:nx,1:nz) :: work_trix !:
	1000	REAL(wp), DIMENSION(nnx,1:nz,nys_x:nyn_x,pdims(1)) :: ar !:
	1001	REAL(wp), DIMENSION(:,:,:,:), ALLOCATABLE :: tri !:
[1]	1002
	1003
[1106]	1004	CALL cpu_log( log_point_s(33), 'fft_x_1d + tridia', 'start' )
[1]	1005
	1006	ALLOCATE( tri(5,0:nx,0:nz-1,0:threads_per_task-1) )
	1007
	1008	tn = 0 ! Default thread number in case of one thread
	1009	!$OMP PARALLEL DO PRIVATE ( i, j, k, m, n, tn, work_fftx, work_trix )
	1010	DO j = nys_x, nyn_x
	1011
	1012	!$ tn = omp_get_thread_num()
	1013
	1014	IF ( host(1:3) == 'nec' ) THEN
	1015	!
	1016	!-- Code optimized for vector processors
	1017	DO k = 1, nz
	1018
	1019	m = 0
	1020	DO n = 1, pdims(1)
[1003]	1021	DO i = 1, nnx
[1]	1022	work_trix(m,k) = ar(i,k,j,n)
	1023	m = m + 1
	1024	ENDDO
	1025	ENDDO
	1026
	1027	ENDDO
	1028
	1029	CALL fft_x_m( work_trix, 'forward' )
	1030
	1031	ELSE
	1032	!
	1033	!-- Cache optimized code
	1034	DO k = 1, nz
	1035
	1036	m = 0
	1037	DO n = 1, pdims(1)
[1003]	1038	DO i = 1, nnx
[1]	1039	work_fftx(m) = ar(i,k,j,n)
	1040	m = m + 1
	1041	ENDDO
	1042	ENDDO
	1043
[1106]	1044	CALL fft_x_1d( work_fftx, 'forward' )
[1]	1045
	1046	DO i = 0, nx
	1047	work_trix(i,k) = work_fftx(i)
	1048	ENDDO
	1049
	1050	ENDDO
	1051
	1052	ENDIF
	1053
	1054	!
	1055	!-- Solve the linear equation system
	1056	CALL tridia_1dd( ddx2, ddy2, nx, ny, j, work_trix, tri(:,:,:,tn) )
	1057
	1058	IF ( host(1:3) == 'nec' ) THEN
	1059	!
	1060	!-- Code optimized for vector processors
	1061	CALL fft_x_m( work_trix, 'backward' )
	1062
	1063	DO k = 1, nz
	1064
	1065	m = 0
	1066	DO n = 1, pdims(1)
[1003]	1067	DO i = 1, nnx
[1]	1068	ar(i,k,j,n) = work_trix(m,k)
	1069	m = m + 1
	1070	ENDDO
	1071	ENDDO
	1072
	1073	ENDDO
	1074
	1075	ELSE
	1076	!
	1077	!-- Cache optimized code
	1078	DO k = 1, nz
	1079
	1080	DO i = 0, nx
	1081	work_fftx(i) = work_trix(i,k)
	1082	ENDDO
	1083
[1106]	1084	CALL fft_x_1d( work_fftx, 'backward' )
[1]	1085
	1086	m = 0
	1087	DO n = 1, pdims(1)
[1003]	1088	DO i = 1, nnx
[1]	1089	ar(i,k,j,n) = work_fftx(m)
	1090	m = m + 1
	1091	ENDDO
	1092	ENDDO
	1093
	1094	ENDDO
	1095
	1096	ENDIF
	1097
	1098	ENDDO
	1099
	1100	DEALLOCATE( tri )
	1101
[1106]	1102	CALL cpu_log( log_point_s(33), 'fft_x_1d + tridia', 'stop' )
[1]	1103
	1104	END SUBROUTINE fftx_tri_fftx
	1105
	1106
[1216]	1107	SUBROUTINE fftx_tr_xy( f_in, f_out )
[1]	1108
	1109	!------------------------------------------------------------------------------!
	1110	! Fourier-transformation along x with subsequent transposition x --> y for
	1111	! a 1d-decomposition along y
	1112	!
	1113	! ATTENTION: The NEC-branch of this routine may significantly profit from
	1114	! further optimizations. So far, performance is much worse than
	1115	! for routine ffty_tr_yx (more than three times slower).
	1116	!------------------------------------------------------------------------------!
	1117
[1320]	1118	USE control_parameters, &
	1119	ONLY: host
	1120
	1121	USE cpulog, &
	1122	ONLY: cpu_log, log_point_s
	1123
	1124	USE kinds
	1125
[1]	1126	USE pegrid
	1127
	1128	IMPLICIT NONE
	1129
[1320]	1130	INTEGER(iwp) :: i !:
	1131	INTEGER(iwp) :: j !:
	1132	INTEGER(iwp) :: k !:
[1]	1133
[1320]	1134	REAL(wp), DIMENSION(0:nx,1:nz,nys:nyn) :: work_fftx !:
	1135	REAL(wp), DIMENSION(1:nz,nys:nyn,0:nx) :: f_in !:
	1136	REAL(wp), DIMENSION(nny,1:nz,nxl_y:nxr_y,pdims(2)) :: f_out !:
	1137	REAL(wp), DIMENSION(nys:nyn,1:nz,0:nx) :: work !:
[1]	1138
	1139	!
	1140	!-- Carry out the FFT along x, where all data are present due to the
	1141	!-- 1d-decomposition along y. Resort the data in a way that y becomes
	1142	!-- the first index.
[1106]	1143	CALL cpu_log( log_point_s(4), 'fft_x_1d', 'start' )
[1]	1144
	1145	IF ( host(1:3) == 'nec' ) THEN
	1146	!
	1147	!-- Code for vector processors
[85]	1148	!$OMP PARALLEL PRIVATE ( i, j, k )
[1]	1149	!$OMP DO
	1150	DO i = 0, nx
	1151
	1152	DO j = nys, nyn
	1153	DO k = 1, nz
	1154	work_fftx(i,k,j) = f_in(k,j,i)
	1155	ENDDO
	1156	ENDDO
	1157
	1158	ENDDO
	1159
	1160	!$OMP DO
	1161	DO j = nys, nyn
	1162
	1163	CALL fft_x_m( work_fftx(:,:,j), 'forward' )
	1164
	1165	DO k = 1, nz
	1166	DO i = 0, nx
	1167	work(j,k,i) = work_fftx(i,k,j)
	1168	ENDDO
	1169	ENDDO
	1170
	1171	ENDDO
	1172	!$OMP END PARALLEL
	1173
	1174	ELSE
	1175
	1176	!
	1177	!-- Cache optimized code (there might be still a potential for better
	1178	!-- optimization).
[696]	1179	!$OMP PARALLEL PRIVATE (i,j,k)
[1]	1180	!$OMP DO
	1181	DO i = 0, nx
	1182
	1183	DO j = nys, nyn
	1184	DO k = 1, nz
	1185	work_fftx(i,k,j) = f_in(k,j,i)
	1186	ENDDO
	1187	ENDDO
	1188
	1189	ENDDO
	1190
	1191	!$OMP DO
	1192	DO j = nys, nyn
	1193	DO k = 1, nz
	1194
[1106]	1195	CALL fft_x_1d( work_fftx(0:nx,k,j), 'forward' )
[1]	1196
	1197	DO i = 0, nx
	1198	work(j,k,i) = work_fftx(i,k,j)
	1199	ENDDO
	1200	ENDDO
	1201
	1202	ENDDO
	1203	!$OMP END PARALLEL
	1204
	1205	ENDIF
[1106]	1206	CALL cpu_log( log_point_s(4), 'fft_x_1d', 'pause' )
[1]	1207
	1208	!
	1209	!-- Transpose array
[1111]	1210	#if defined( __parallel )
[1]	1211	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
[622]	1212	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
[1]	1213	CALL MPI_ALLTOALL( work(nys,1,0), sendrecvcount_xy, MPI_REAL, &
	1214	f_out(1,1,nxl_y,1), sendrecvcount_xy, MPI_REAL, &
	1215	comm1dy, ierr )
	1216	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
[1111]	1217	#endif
[1]	1218
	1219	END SUBROUTINE fftx_tr_xy
	1220
	1221
[1216]	1222	SUBROUTINE tr_yx_fftx( f_in, f_out )
[1]	1223
	1224	!------------------------------------------------------------------------------!
	1225	! Transposition y --> x with a subsequent backward Fourier transformation for
	1226	! a 1d-decomposition along x
	1227	!------------------------------------------------------------------------------!
	1228
[1320]	1229	USE control_parameters, &
	1230	ONLY: host
	1231
	1232	USE cpulog, &
	1233	ONLY: cpu_log, log_point_s
	1234
	1235	USE kinds
	1236
[1]	1237	USE pegrid
	1238
	1239	IMPLICIT NONE
	1240
[1320]	1241	INTEGER(iwp) :: i !:
	1242	INTEGER(iwp) :: j !:
	1243	INTEGER(iwp) :: k !:
[1]	1244
[1320]	1245	REAL(wp), DIMENSION(0:nx,1:nz,nys:nyn) :: work_fftx !:
	1246	REAL(wp), DIMENSION(nny,1:nz,nxl_y:nxr_y,pdims(2)) :: f_in !:
	1247	REAL(wp), DIMENSION(1:nz,nys:nyn,0:nx) :: f_out !:
	1248	REAL(wp), DIMENSION(nys:nyn,1:nz,0:nx) :: work !:
[1]	1249
	1250	!
	1251	!-- Transpose array
[1111]	1252	#if defined( __parallel )
[1]	1253	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
[622]	1254	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
[1]	1255	CALL MPI_ALLTOALL( f_in(1,1,nxl_y,1), sendrecvcount_xy, MPI_REAL, &
	1256	work(nys,1,0), sendrecvcount_xy, MPI_REAL, &
	1257	comm1dy, ierr )
	1258	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
[1111]	1259	#endif
[1]	1260
	1261	!
	1262	!-- Carry out the FFT along x, where all data are present due to the
	1263	!-- 1d-decomposition along y. Resort the data in a way that y becomes
	1264	!-- the first index.
[1106]	1265	CALL cpu_log( log_point_s(4), 'fft_x_1d', 'continue' )
[1]	1266
	1267	IF ( host(1:3) == 'nec' ) THEN
	1268	!
	1269	!-- Code optimized for vector processors
[85]	1270	!$OMP PARALLEL PRIVATE ( i, j, k )
[1]	1271	!$OMP DO
	1272	DO j = nys, nyn
	1273
	1274	DO k = 1, nz
	1275	DO i = 0, nx
	1276	work_fftx(i,k,j) = work(j,k,i)
	1277	ENDDO
	1278	ENDDO
	1279
	1280	CALL fft_x_m( work_fftx(:,:,j), 'backward' )
	1281
	1282	ENDDO
	1283
	1284	!$OMP DO
	1285	DO i = 0, nx
	1286	DO j = nys, nyn
	1287	DO k = 1, nz
	1288	f_out(k,j,i) = work_fftx(i,k,j)
	1289	ENDDO
	1290	ENDDO
	1291	ENDDO
	1292	!$OMP END PARALLEL
	1293
	1294	ELSE
	1295
	1296	!
	1297	!-- Cache optimized code (there might be still a potential for better
	1298	!-- optimization).
[696]	1299	!$OMP PARALLEL PRIVATE (i,j,k)
[1]	1300	!$OMP DO
	1301	DO j = nys, nyn
	1302	DO k = 1, nz
	1303
	1304	DO i = 0, nx
	1305	work_fftx(i,k,j) = work(j,k,i)
	1306	ENDDO
	1307
[1106]	1308	CALL fft_x_1d( work_fftx(0:nx,k,j), 'backward' )
[1]	1309
	1310	ENDDO
	1311	ENDDO
	1312
	1313	!$OMP DO
	1314	DO i = 0, nx
	1315	DO j = nys, nyn
	1316	DO k = 1, nz
	1317	f_out(k,j,i) = work_fftx(i,k,j)
	1318	ENDDO
	1319	ENDDO
	1320	ENDDO
	1321	!$OMP END PARALLEL
	1322
	1323	ENDIF
[1106]	1324	CALL cpu_log( log_point_s(4), 'fft_x_1d', 'stop' )
[1]	1325
	1326	END SUBROUTINE tr_yx_fftx
	1327
	1328
	1329	SUBROUTINE ffty_tri_ffty( ar )
	1330
	1331	!------------------------------------------------------------------------------!
	1332	! FFT along y, solution of the tridiagonal system and backward FFT for
	1333	! a 1d-decomposition along y
	1334	!
	1335	! WARNING: this subroutine may still not work for hybrid parallelization
	1336	! with OpenMP (for possible necessary changes see the original
	1337	! routine poisfft_hybrid, developed by Klaus Ketelsen, May 2002)
	1338	!------------------------------------------------------------------------------!
	1339
[1320]	1340	USE control_parameters, &
	1341	ONLY: host
	1342
	1343	USE cpulog, &
	1344	ONLY: cpu_log, log_point_s
	1345
	1346	USE grid_variables, &
	1347	ONLY: ddx2, ddy2
	1348
	1349	USE kinds
	1350
[1]	1351	USE pegrid
	1352
	1353	IMPLICIT NONE
	1354
[1320]	1355	INTEGER(iwp) :: i !:
	1356	INTEGER(iwp) :: j !:
	1357	INTEGER(iwp) :: k !:
	1358	INTEGER(iwp) :: m !:
	1359	INTEGER(iwp) :: n !:
	1360	INTEGER(iwp) :: omp_get_thread_num !:
	1361	INTEGER(iwp) :: tn !:
[1]	1362
[1320]	1363	REAL(wp), DIMENSION(0:ny) :: work_ffty !:
	1364	REAL(wp), DIMENSION(0:ny,1:nz) :: work_triy !:
	1365	REAL(wp), DIMENSION(nny,1:nz,nxl_y:nxr_y,pdims(2)) :: ar !:
	1366	REAL(wp), DIMENSION(:,:,:,:), ALLOCATABLE :: tri !:
[1]	1367
	1368
[1106]	1369	CALL cpu_log( log_point_s(39), 'fft_y_1d + tridia', 'start' )
[1]	1370
	1371	ALLOCATE( tri(5,0:ny,0:nz-1,0:threads_per_task-1) )
	1372
	1373	tn = 0 ! Default thread number in case of one thread
[696]	1374	!$OMP PARALLEL DO PRIVATE ( i, j, k, m, n, tn, work_ffty, work_triy )
[1]	1375	DO i = nxl_y, nxr_y
	1376
	1377	!$ tn = omp_get_thread_num()
	1378
	1379	IF ( host(1:3) == 'nec' ) THEN
	1380	!
	1381	!-- Code optimized for vector processors
	1382	DO k = 1, nz
	1383
	1384	m = 0
	1385	DO n = 1, pdims(2)
[1003]	1386	DO j = 1, nny
[1]	1387	work_triy(m,k) = ar(j,k,i,n)
	1388	m = m + 1
	1389	ENDDO
	1390	ENDDO
	1391
	1392	ENDDO
	1393
	1394	CALL fft_y_m( work_triy, ny, 'forward' )
	1395
	1396	ELSE
	1397	!
	1398	!-- Cache optimized code
	1399	DO k = 1, nz
	1400
	1401	m = 0
	1402	DO n = 1, pdims(2)
[1003]	1403	DO j = 1, nny
[1]	1404	work_ffty(m) = ar(j,k,i,n)
	1405	m = m + 1
	1406	ENDDO
	1407	ENDDO
	1408
[1106]	1409	CALL fft_y_1d( work_ffty, 'forward' )
[1]	1410
	1411	DO j = 0, ny
	1412	work_triy(j,k) = work_ffty(j)
	1413	ENDDO
	1414
	1415	ENDDO
	1416
	1417	ENDIF
	1418
	1419	!
	1420	!-- Solve the linear equation system
	1421	CALL tridia_1dd( ddy2, ddx2, ny, nx, i, work_triy, tri(:,:,:,tn) )
	1422
	1423	IF ( host(1:3) == 'nec' ) THEN
	1424	!
	1425	!-- Code optimized for vector processors
	1426	CALL fft_y_m( work_triy, ny, 'backward' )
	1427
	1428	DO k = 1, nz
	1429
	1430	m = 0
	1431	DO n = 1, pdims(2)
[1003]	1432	DO j = 1, nny
[1]	1433	ar(j,k,i,n) = work_triy(m,k)
	1434	m = m + 1
	1435	ENDDO
	1436	ENDDO
	1437
	1438	ENDDO
	1439
	1440	ELSE
	1441	!
	1442	!-- Cache optimized code
	1443	DO k = 1, nz
	1444
	1445	DO j = 0, ny
	1446	work_ffty(j) = work_triy(j,k)
	1447	ENDDO
	1448
[1106]	1449	CALL fft_y_1d( work_ffty, 'backward' )
[1]	1450
	1451	m = 0
	1452	DO n = 1, pdims(2)
[1003]	1453	DO j = 1, nny
[1]	1454	ar(j,k,i,n) = work_ffty(m)
	1455	m = m + 1
	1456	ENDDO
	1457	ENDDO
	1458
	1459	ENDDO
	1460
	1461	ENDIF
	1462
	1463	ENDDO
	1464
	1465	DEALLOCATE( tri )
	1466
[1106]	1467	CALL cpu_log( log_point_s(39), 'fft_y_1d + tridia', 'stop' )
[1]	1468
	1469	END SUBROUTINE ffty_tri_ffty
	1470
	1471	#endif
	1472
	1473	END MODULE poisfft_mod

Note: See TracBrowser for help on using the repository browser.

Download in other formats:

| Impressum | ©Leibniz Universität Hannover |