Home

Context Navigation

source: palm/trunk/SOURCE/poisfft_mod.f90 @ 4633

Last change on this file since 4633 was 4429, checked in by raasch, 5 years ago
serial (non-MPI) test case added, several bugfixes for the serial mode
Property svn:keywords set to `Id`
File size: 44.0 KB

Rev	Line
[1850]	1	!> @file poisfft_mod.f90
[2000]	2	!------------------------------------------------------------------------------!
[2696]	3	! This file is part of the PALM model system.
[1036]	4	!
[2000]	5	! PALM is free software: you can redistribute it and/or modify it under the
	6	! terms of the GNU General Public License as published by the Free Software
	7	! Foundation, either version 3 of the License, or (at your option) any later
	8	! version.
[1036]	9	!
	10	! PALM is distributed in the hope that it will be useful, but WITHOUT ANY
	11	! WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
	12	! A PARTICULAR PURPOSE. See the GNU General Public License for more details.
	13	!
	14	! You should have received a copy of the GNU General Public License along with
	15	! PALM. If not, see <http://www.gnu.org/licenses/>.
	16	!
[4360]	17	! Copyright 1997-2020 Leibniz Universitaet Hannover
[2000]	18	!------------------------------------------------------------------------------!
[1036]	19	!
[484]	20	! Current revisions:
[1]	21	! -----------------
[1683]	22	!
[2119]	23	!
[1321]	24	! Former revisions:
	25	! -----------------
	26	! $Id: poisfft_mod.f90 4429 2020-02-27 15:24:30Z suehring $
[4429]	27	! Statements added to avoid compile errors due to unused dummy arguments in serial mode
	28	!
	29	! 4366 2020-01-09 08:12:43Z raasch
[4366]	30	! modification concerning NEC vectorizatio
	31	!
	32	! 4360 2020-01-07 11:25:50Z suehring
[4182]	33	! Corrected "Former revisions" section
	34	!
	35	! 3690 2019-01-22 22:56:42Z knoop
[3634]	36	! OpenACC port for SPEC
[2716]	37	!
[4182]	38	! Revision 1.1 1997/07/24 11:24:14 raasch
	39	! Initial revision
	40	!
	41	!
[1]	42	! Description:
	43	! ------------
[1682]	44	!> Solves the Poisson equation with a 2D spectral method
	45	!> d^2 p / dx^2 + d^2 p / dy^2 + d^2 p / dz^2 = s
	46	!>
	47	!> Input:
	48	!> real ar contains (nnz,nny,nnx) elements of the velocity divergence,
	49	!> starting from (1,nys,nxl)
	50	!>
	51	!> Output:
	52	!> real ar contains the solution for perturbation pressure p
[1]	53	!------------------------------------------------------------------------------!
[1682]	54	MODULE poisfft_mod
	55
[1]	56
[1320]	57	USE fft_xy, &
[4366]	58	ONLY: fft_init, fft_y, fft_y_1d, fft_y_m, fft_x, fft_x_1d, fft_x_m, &
	59	temperton_fft_vec
[1]	60
[1320]	61	USE indices, &
	62	ONLY: nnx, nny, nx, nxl, nxr, ny, nys, nyn, nz
	63
	64	USE transpose_indices, &
	65	ONLY: nxl_y, nxl_z, nxr_y, nxr_z, nys_x, nys_z, nyn_x, nyn_z, nzb_x, &
	66	nzb_y, nzt_x, nzt_y
	67
	68	USE tridia_solver, &
	69	ONLY: tridia_1dd, tridia_init, tridia_substi, tridia_substi_overlap
	70
[1]	71	IMPLICIT NONE
	72
[1111]	73	LOGICAL, SAVE :: poisfft_initialized = .FALSE.
	74
[1]	75	PRIVATE
[807]	76
[1]	77	PUBLIC poisfft, poisfft_init
	78
	79	INTERFACE poisfft
	80	MODULE PROCEDURE poisfft
	81	END INTERFACE poisfft
	82
	83	INTERFACE poisfft_init
	84	MODULE PROCEDURE poisfft_init
	85	END INTERFACE poisfft_init
	86
[807]	87
[1]	88	CONTAINS
	89
[1682]	90	!------------------------------------------------------------------------------!
	91	! Description:
	92	! ------------
[3241]	93	!> Setup coefficients for FFT and the tridiagonal solver
[1682]	94	!------------------------------------------------------------------------------!
[1]	95	SUBROUTINE poisfft_init
	96
[1111]	97	IMPLICIT NONE
	98
	99
[1]	100	CALL fft_init
	101
[1212]	102	CALL tridia_init
[1111]	103
	104	poisfft_initialized = .TRUE.
	105
[1]	106	END SUBROUTINE poisfft_init
	107
[1111]	108
[1804]	109
[1682]	110	!------------------------------------------------------------------------------!
	111	! Description:
	112	! ------------
	113	!> Two-dimensional Fourier Transformation in x- and y-direction.
	114	!------------------------------------------------------------------------------!
[1306]	115	SUBROUTINE poisfft( ar )
[1]	116
[1320]	117	USE control_parameters, &
[3241]	118	ONLY: transpose_compute_overlap
[1320]	119
	120	USE cpulog, &
	121	ONLY: cpu_log, cpu_log_nowait, log_point_s
	122
	123	USE kinds
	124
[1]	125	USE pegrid
	126
	127	IMPLICIT NONE
	128
[1682]	129	INTEGER(iwp) :: ii !<
	130	INTEGER(iwp) :: iind !<
	131	INTEGER(iwp) :: inew !<
	132	INTEGER(iwp) :: jj !<
	133	INTEGER(iwp) :: jind !<
	134	INTEGER(iwp) :: jnew !<
	135	INTEGER(iwp) :: ki !<
	136	INTEGER(iwp) :: kk !<
	137	INTEGER(iwp) :: knew !<
	138	INTEGER(iwp) :: n !<
	139	INTEGER(iwp) :: nblk !<
	140	INTEGER(iwp) :: nnx_y !<
	141	INTEGER(iwp) :: nny_z !<
	142	INTEGER(iwp) :: nnz_x !<
	143	INTEGER(iwp) :: nxl_y_bound !<
	144	INTEGER(iwp) :: nxr_y_bound !<
[1]	145
[1682]	146	INTEGER(iwp), DIMENSION(4) :: isave !<
[1320]	147
[1682]	148	REAL(wp), DIMENSION(1:nz,nys:nyn,nxl:nxr) :: ar !<
	149	REAL(wp), DIMENSION(nys:nyn,nxl:nxr,1:nz) :: ar_inv !<
[3690]	150
	151	#define __acc_fft_device ( defined( _OPENACC ) && ( defined ( __cuda_fft ) ) )
	152	#if __acc_fft_device
[3634]	153	!$ACC DECLARE CREATE(ar_inv)
[3690]	154	#endif
[1]	155
[1682]	156	REAL(wp), DIMENSION(:,:,:), ALLOCATABLE :: ar1 !<
	157	REAL(wp), DIMENSION(:,:,:), ALLOCATABLE :: f_in !<
	158	REAL(wp), DIMENSION(:,:,:), ALLOCATABLE :: f_inv !<
	159	REAL(wp), DIMENSION(:,:,:), ALLOCATABLE :: f_out_y !<
	160	REAL(wp), DIMENSION(:,:,:), ALLOCATABLE :: f_out_z !<
[1216]	161
	162
[1]	163	CALL cpu_log( log_point_s(3), 'poisfft', 'start' )
	164
[1111]	165	IF ( .NOT. poisfft_initialized ) CALL poisfft_init
	166
[3690]	167	#if !__acc_fft_device
	168	!$ACC UPDATE HOST(ar)
	169	#endif
	170
[3634]	171	#ifndef _OPENACC
[1]	172	!
	173	!-- Two-dimensional Fourier Transformation in x- and y-direction.
[2118]	174	IF ( pdims(2) == 1 .AND. pdims(1) > 1 ) THEN
[1]	175
	176	!
	177	!-- 1d-domain-decomposition along x:
	178	!-- FFT along y and transposition y --> x
[1216]	179	CALL ffty_tr_yx( ar, ar )
[1]	180
	181	!
	182	!-- FFT along x, solving the tridiagonal system and backward FFT
	183	CALL fftx_tri_fftx( ar )
	184
	185	!
	186	!-- Transposition x --> y and backward FFT along y
[1216]	187	CALL tr_xy_ffty( ar, ar )
[1]	188
[2118]	189	ELSEIF ( pdims(1) == 1 .AND. pdims(2) > 1 ) THEN
[1]	190
	191	!
	192	!-- 1d-domain-decomposition along y:
	193	!-- FFT along x and transposition x --> y
[1216]	194	CALL fftx_tr_xy( ar, ar )
[1]	195
	196	!
	197	!-- FFT along y, solving the tridiagonal system and backward FFT
	198	CALL ffty_tri_ffty( ar )
	199
	200	!
	201	!-- Transposition y --> x and backward FFT along x
[1216]	202	CALL tr_yx_fftx( ar, ar )
[1]	203
[1216]	204	ELSEIF ( .NOT. transpose_compute_overlap ) THEN
[3634]	205	#endif
[1]	206
	207	!
[1111]	208	!-- 2d-domain-decomposition or no decomposition (1 PE run)
[1]	209	!-- Transposition z --> x
	210	CALL cpu_log( log_point_s(5), 'transpo forward', 'start' )
[1216]	211	CALL resort_for_zx( ar, ar_inv )
	212	CALL transpose_zx( ar_inv, ar )
[1]	213	CALL cpu_log( log_point_s(5), 'transpo forward', 'pause' )
	214
	215	CALL cpu_log( log_point_s(4), 'fft_x', 'start' )
[4366]	216	IF ( temperton_fft_vec ) THEN
	217	!
	218	!-- Vector version outputs a transformed array ar_inv that does not require resorting
	219	!-- (which is done for ar further below)
	220	CALL fft_x( ar, 'forward', ar_inv=ar_inv)
	221	ELSE
	222	CALL fft_x( ar, 'forward')
	223	ENDIF
[1]	224	CALL cpu_log( log_point_s(4), 'fft_x', 'pause' )
	225
	226	!
	227	!-- Transposition x --> y
	228	CALL cpu_log( log_point_s(5), 'transpo forward', 'continue' )
[4366]	229	IF( .NOT. temperton_fft_vec ) CALL resort_for_xy( ar, ar_inv )
[1216]	230	CALL transpose_xy( ar_inv, ar )
[1]	231	CALL cpu_log( log_point_s(5), 'transpo forward', 'pause' )
	232
	233	CALL cpu_log( log_point_s(7), 'fft_y', 'start' )
[4366]	234	IF ( temperton_fft_vec ) THEN
	235	!
	236	!-- Input array ar_inv from fft_x can be directly used here.
	237	!-- The output (also in array ar_inv) does not require resorting below.
	238	CALL fft_y( ar, 'forward', ar_inv = ar_inv, nxl_y_bound = nxl_y, nxr_y_bound = nxr_y, &
	239	nxl_y_l = nxl_y, nxr_y_l = nxr_y )
	240	ELSE
	241	CALL fft_y( ar, 'forward', ar_tr = ar, nxl_y_bound = nxl_y, nxr_y_bound = nxr_y, &
	242	nxl_y_l = nxl_y, nxr_y_l = nxr_y )
	243	ENDIF
[1]	244	CALL cpu_log( log_point_s(7), 'fft_y', 'pause' )
	245
	246	!
	247	!-- Transposition y --> z
	248	CALL cpu_log( log_point_s(5), 'transpo forward', 'continue' )
[4366]	249	IF ( .NOT. temperton_fft_vec ) CALL resort_for_yz( ar, ar_inv )
[1216]	250	CALL transpose_yz( ar_inv, ar )
[1]	251	CALL cpu_log( log_point_s(5), 'transpo forward', 'stop' )
	252
	253	!
[1106]	254	!-- Solve the tridiagonal equation system along z
[1]	255	CALL cpu_log( log_point_s(6), 'tridia', 'start' )
[1212]	256	CALL tridia_substi( ar )
[1]	257	CALL cpu_log( log_point_s(6), 'tridia', 'stop' )
	258
	259	!
	260	!-- Inverse Fourier Transformation
	261	!-- Transposition z --> y
	262	CALL cpu_log( log_point_s(8), 'transpo invers', 'start' )
[1216]	263	CALL transpose_zy( ar, ar_inv )
[4366]	264	!
	265	!-- The fft_y below (vector branch) can directly process ar_inv (i.e. does not require a
	266	!-- resorting)
	267	IF ( .NOT. temperton_fft_vec ) CALL resort_for_zy( ar_inv, ar )
[1]	268	CALL cpu_log( log_point_s(8), 'transpo invers', 'pause' )
	269
	270	CALL cpu_log( log_point_s(7), 'fft_y', 'continue' )
[4366]	271	IF ( temperton_fft_vec ) THEN
	272	!
	273	!-- Output array ar_inv can be used as input to the below fft_x routine without resorting
	274	CALL fft_y( ar, 'backward', ar_inv = ar_inv, nxl_y_bound = nxl_y, nxr_y_bound = nxr_y,&
	275	nxl_y_l = nxl_y, nxr_y_l = nxr_y )
	276	ELSE
	277	CALL fft_y( ar, 'backward', ar_tr = ar, nxl_y_bound = nxl_y, nxr_y_bound = nxr_y, &
	278	nxl_y_l = nxl_y, nxr_y_l = nxr_y )
	279	ENDIF
	280
[1]	281	CALL cpu_log( log_point_s(7), 'fft_y', 'stop' )
	282
	283	!
	284	!-- Transposition y --> x
	285	CALL cpu_log( log_point_s(8), 'transpo invers', 'continue' )
[1216]	286	CALL transpose_yx( ar, ar_inv )
[4366]	287	IF ( .NOT. temperton_fft_vec ) CALL resort_for_yx( ar_inv, ar )
[1]	288	CALL cpu_log( log_point_s(8), 'transpo invers', 'pause' )
	289
	290	CALL cpu_log( log_point_s(4), 'fft_x', 'continue' )
[4366]	291	IF ( temperton_fft_vec ) THEN
	292	CALL fft_x( ar, 'backward', ar_inv=ar_inv )
	293	ELSE
	294	CALL fft_x( ar, 'backward' )
	295	ENDIF
[1]	296	CALL cpu_log( log_point_s(4), 'fft_x', 'stop' )
	297
	298	!
	299	!-- Transposition x --> z
	300	CALL cpu_log( log_point_s(8), 'transpo invers', 'continue' )
[1216]	301	CALL transpose_xz( ar, ar_inv )
	302	CALL resort_for_xz( ar_inv, ar )
[1]	303	CALL cpu_log( log_point_s(8), 'transpo invers', 'stop' )
	304
[3634]	305	#ifndef _OPENACC
[1216]	306	ELSE
	307
	308	!
	309	!-- 2d-domain-decomposition or no decomposition (1 PE run) with
	310	!-- overlapping transposition / fft
[1318]	311	!-- cputime logging must not use barriers, which would prevent overlapping
[1216]	312	ALLOCATE( f_out_y(0:ny,nxl_y:nxr_y,nzb_y:nzt_y), &
	313	f_out_z(0:nx,nys_x:nyn_x,nzb_x:nzt_x) )
	314	!
	315	!-- Transposition z --> x + subsequent fft along x
	316	ALLOCATE( f_inv(nys:nyn,nxl:nxr,1:nz) )
	317	CALL resort_for_zx( ar, f_inv )
	318	!
	319	!-- Save original indices and gridpoint counter
	320	isave(1) = nz
	321	isave(2) = nzb_x
	322	isave(3) = nzt_x
	323	isave(4) = sendrecvcount_zx
	324	!
	325	!-- Set new indices for transformation
	326	nblk = nz / pdims(1)
	327	nz = pdims(1)
	328	nnz_x = 1
	329	nzb_x = 1 + myidx * nnz_x
	330	nzt_x = ( myidx + 1 ) * nnz_x
	331	sendrecvcount_zx = nnx * nny * nnz_x
	332
[1306]	333	ALLOCATE( ar1(0:nx,nys_x:nyn_x,nzb_x:nzt_x) )
[1216]	334	ALLOCATE( f_in(nys:nyn,nxl:nxr,1:nz) )
	335
[1306]	336	DO kk = 1, nblk
[1216]	337
[1306]	338	IF ( kk == 1 ) THEN
[1318]	339	CALL cpu_log( log_point_s(5), 'transpo forward', 'start', cpu_log_nowait )
[1306]	340	ELSE
[1318]	341	CALL cpu_log( log_point_s(5), 'transpo forward', 'continue', cpu_log_nowait )
[1306]	342	ENDIF
[1216]	343
[1306]	344	DO knew = 1, nz
	345	ki = kk + nblk * ( knew - 1 )
	346	f_in(:,:,knew) = f_inv(:,:,ki)
	347	ENDDO
[1216]	348
[1306]	349	CALL transpose_zx( f_in, ar1(:,:,:))
	350	CALL cpu_log( log_point_s(5), 'transpo forward', 'pause' )
[1216]	351
[1306]	352	IF ( kk == 1 ) THEN
[1318]	353	CALL cpu_log( log_point_s(4), 'fft_x', 'start', cpu_log_nowait )
[1306]	354	ELSE
[1318]	355	CALL cpu_log( log_point_s(4), 'fft_x', 'continue', cpu_log_nowait )
[1216]	356	ENDIF
	357
[1306]	358	n = isave(2) + kk - 1
	359	CALL fft_x( ar1(:,:,:), 'forward', ar_2d = f_out_z(:,:,n))
	360	CALL cpu_log( log_point_s(4), 'fft_x', 'pause' )
[1216]	361
	362	ENDDO
	363	!
	364	!-- Restore original indices/counters
	365	nz = isave(1)
	366	nzb_x = isave(2)
	367	nzt_x = isave(3)
	368	sendrecvcount_zx = isave(4)
	369
	370	DEALLOCATE( ar1, f_in, f_inv )
	371
	372	!
	373	!-- Transposition x --> y + subsequent fft along y
	374	ALLOCATE( f_inv(nys_x:nyn_x,nzb_x:nzt_x,0:nx) )
	375	CALL resort_for_xy( f_out_z, f_inv )
	376	!
	377	!-- Save original indices and gridpoint counter
	378	isave(1) = nx
	379	isave(2) = nxl_y
	380	isave(3) = nxr_y
	381	isave(4) = sendrecvcount_xy
	382	!
	383	!-- Set new indices for transformation
	384	nblk = ( ( nx+1 ) / pdims(2) ) - 1
	385	nx = pdims(2)
	386	nnx_y = 1
	387	nxl_y = myidy * nnx_y
	388	nxr_y = ( myidy + 1 ) * nnx_y - 1
	389	sendrecvcount_xy = nnx_y * ( nyn_x-nys_x+1 ) * ( nzt_x-nzb_x+1 )
	390
[1306]	391	ALLOCATE( ar1(0:ny,nxl_y:nxr_y,nzb_y:nzt_y) )
[1216]	392	ALLOCATE( f_in(nys_x:nyn_x,nzb_x:nzt_x,0:nx) )
	393
[1306]	394	DO ii = 0, nblk
[1216]	395
[1318]	396	CALL cpu_log( log_point_s(5), 'transpo forward', 'continue', cpu_log_nowait )
[1216]	397
[1306]	398	DO inew = 0, nx-1
	399	iind = ii + ( nblk + 1 ) * inew
	400	f_in(:,:,inew) = f_inv(:,:,iind)
	401	ENDDO
[1216]	402
[1306]	403	CALL transpose_xy( f_in, ar1(:,:,:) )
[1216]	404
[1306]	405	CALL cpu_log( log_point_s(5), 'transpo forward', 'pause' )
[1216]	406
[1306]	407	IF ( ii == 1 ) THEN
[1318]	408	CALL cpu_log( log_point_s(7), 'fft_y', 'start', cpu_log_nowait )
[1306]	409	ELSE
[1318]	410	CALL cpu_log( log_point_s(7), 'fft_y', 'continue', cpu_log_nowait )
[1216]	411	ENDIF
	412
[1306]	413	nxl_y_bound = isave(2)
	414	nxr_y_bound = isave(3)
	415	n = isave(2) + ii
	416	CALL fft_y( ar1(:,:,:), 'forward', ar_tr = f_out_y, &
	417	nxl_y_bound = nxl_y_bound, nxr_y_bound = nxr_y_bound, &
	418	nxl_y_l = n, nxr_y_l = n )
[1216]	419
[1306]	420	CALL cpu_log( log_point_s(7), 'fft_y', 'pause' )
[1216]	421
	422	ENDDO
	423	!
	424	!-- Restore original indices/counters
	425	nx = isave(1)
	426	nxl_y = isave(2)
	427	nxr_y = isave(3)
	428	sendrecvcount_xy = isave(4)
	429
	430	DEALLOCATE( ar1, f_in, f_inv )
	431
	432	!
	433	!-- Transposition y --> z + subsequent tridia + resort for z --> y
	434	ALLOCATE( f_inv(nxl_y:nxr_y,nzb_y:nzt_y,0:ny) )
	435	CALL resort_for_yz( f_out_y, f_inv )
	436	!
	437	!-- Save original indices and gridpoint counter
	438	isave(1) = ny
	439	isave(2) = nys_z
	440	isave(3) = nyn_z
	441	isave(4) = sendrecvcount_yz
	442	!
	443	!-- Set new indices for transformation
	444	nblk = ( ( ny+1 ) / pdims(1) ) - 1
	445	ny = pdims(1)
	446	nny_z = 1
	447	nys_z = myidx * nny_z
	448	nyn_z = ( myidx + 1 ) * nny_z - 1
	449	sendrecvcount_yz = ( nxr_y-nxl_y+1 ) * nny_z * ( nzt_y-nzb_y+1 )
	450
[1306]	451	ALLOCATE( ar1(nxl_z:nxr_z,nys_z:nyn_z,1:nz) )
[1216]	452	ALLOCATE( f_in(nxl_y:nxr_y,nzb_y:nzt_y,0:ny) )
	453
[1306]	454	DO jj = 0, nblk
[1216]	455	!
[1306]	456	!-- Forward Fourier Transformation
	457	!-- Transposition y --> z
[1318]	458	CALL cpu_log( log_point_s(5), 'transpo forward', 'continue', cpu_log_nowait )
[1216]	459
[1306]	460	DO jnew = 0, ny-1
	461	jind = jj + ( nblk + 1 ) * jnew
	462	f_in(:,:,jnew) = f_inv(:,:,jind)
	463	ENDDO
[1216]	464
[1306]	465	CALL transpose_yz( f_in, ar1(:,:,:) )
[1216]	466
[1306]	467	IF ( jj == nblk ) THEN
	468	CALL cpu_log( log_point_s(5), 'transpo forward', 'stop' )
	469	ELSE
	470	CALL cpu_log( log_point_s(5), 'transpo forward', 'pause' )
[1216]	471	ENDIF
	472
	473	!
[1306]	474	!-- Solve the tridiagonal equation system along z
[1318]	475	CALL cpu_log( log_point_s(6), 'tridia', 'start', cpu_log_nowait )
[1216]	476
[1306]	477	n = isave(2) + jj
	478	CALL tridia_substi_overlap( ar1(:,:,:), n )
[1216]	479
[1306]	480	CALL cpu_log( log_point_s(6), 'tridia', 'stop' )
[1216]	481
[1306]	482	!
	483	!-- Inverse Fourier Transformation
	484	!-- Transposition z --> y
	485	!-- Only one thread should call MPI routines, therefore forward and
	486	!-- backward tranpose are in the same section
	487	IF ( jj == 0 ) THEN
[1318]	488	CALL cpu_log( log_point_s(8), 'transpo invers', 'start', cpu_log_nowait )
[1306]	489	ELSE
[1318]	490	CALL cpu_log( log_point_s(8), 'transpo invers', 'continue', cpu_log_nowait )
[1216]	491	ENDIF
	492
[1306]	493	CALL transpose_zy( ar1(:,:,:), f_in )
[1216]	494
[1306]	495	DO jnew = 0, ny-1
	496	jind = jj + ( nblk + 1 ) * jnew
	497	f_inv(:,:,jind) = f_in(:,:,jnew)
	498	ENDDO
[1216]	499
[1306]	500	CALL cpu_log( log_point_s(8), 'transpo invers', 'pause' )
[1216]	501
	502	ENDDO
	503	!
	504	!-- Restore original indices/counters
	505	ny = isave(1)
	506	nys_z = isave(2)
	507	nyn_z = isave(3)
	508	sendrecvcount_yz = isave(4)
	509
	510	CALL resort_for_zy( f_inv, f_out_y )
	511
	512	DEALLOCATE( ar1, f_in, f_inv )
	513
	514	!
	515	!-- fft along y backward + subsequent transposition y --> x
	516	ALLOCATE( f_inv(nys_x:nyn_x,nzb_x:nzt_x,0:nx) )
	517	!
	518	!-- Save original indices and gridpoint counter
	519	isave(1) = nx
	520	isave(2) = nxl_y
	521	isave(3) = nxr_y
	522	isave(4) = sendrecvcount_xy
	523	!
	524	!-- Set new indices for transformation
	525	nblk = (( nx+1 ) / pdims(2) ) - 1
	526	nx = pdims(2)
	527	nnx_y = 1
	528	nxl_y = myidy * nnx_y
	529	nxr_y = ( myidy + 1 ) * nnx_y - 1
	530	sendrecvcount_xy = nnx_y * ( nyn_x-nys_x+1 ) * ( nzt_x-nzb_x+1 )
	531
[1306]	532	ALLOCATE( ar1(0:ny,nxl_y:nxr_y,nzb_y:nzt_y) )
[1216]	533	ALLOCATE( f_in(nys_x:nyn_x,nzb_x:nzt_x,0:nx) )
	534
[1306]	535	DO ii = 0, nblk
[1216]	536
[1318]	537	CALL cpu_log( log_point_s(7), 'fft_y', 'continue', cpu_log_nowait )
[1216]	538
[1306]	539	n = isave(2) + ii
	540	nxl_y_bound = isave(2)
	541	nxr_y_bound = isave(3)
[1216]	542
[1306]	543	CALL fft_y( ar1(:,:,:), 'backward', ar_tr = f_out_y, &
	544	nxl_y_bound = nxl_y_bound, nxr_y_bound = nxr_y_bound, &
	545	nxl_y_l = n, nxr_y_l = n )
[1216]	546
[1306]	547	IF ( ii == nblk ) THEN
	548	CALL cpu_log( log_point_s(7), 'fft_y', 'stop' )
	549	ELSE
	550	CALL cpu_log( log_point_s(7), 'fft_y', 'pause' )
[1216]	551	ENDIF
	552
[1318]	553	CALL cpu_log( log_point_s(8), 'transpo invers', 'continue', cpu_log_nowait )
[1216]	554
[1306]	555	CALL transpose_yx( ar1(:,:,:), f_in )
[1216]	556
[1306]	557	DO inew = 0, nx-1
	558	iind = ii + (nblk+1) * inew
	559	f_inv(:,:,iind) = f_in(:,:,inew)
	560	ENDDO
[1216]	561
[1306]	562	CALL cpu_log( log_point_s(8), 'transpo invers', 'pause' )
[1216]	563
	564	ENDDO
	565	!
	566	!-- Restore original indices/counters
	567	nx = isave(1)
	568	nxl_y = isave(2)
	569	nxr_y = isave(3)
	570	sendrecvcount_xy = isave(4)
	571
	572	CALL resort_for_yx( f_inv, f_out_z )
	573
	574	DEALLOCATE( ar1, f_in, f_inv )
	575
	576	!
	577	!-- fft along x backward + subsequent final transposition x --> z
	578	ALLOCATE( f_inv(nys:nyn,nxl:nxr,1:nz) )
	579	!
	580	!-- Save original indices and gridpoint counter
	581	isave(1) = nz
	582	isave(2) = nzb_x
	583	isave(3) = nzt_x
	584	isave(4) = sendrecvcount_zx
	585	!
	586	!-- Set new indices for transformation
	587	nblk = nz / pdims(1)
	588	nz = pdims(1)
	589	nnz_x = 1
	590	nzb_x = 1 + myidx * nnz_x
	591	nzt_x = ( myidx + 1 ) * nnz_x
	592	sendrecvcount_zx = nnx * nny * nnz_x
	593
[1306]	594	ALLOCATE( ar1(0:nx,nys_x:nyn_x,nzb_x:nzt_x) )
[1216]	595	ALLOCATE( f_in(nys:nyn,nxl:nxr,1:nz) )
	596
[1306]	597	DO kk = 1, nblk
[1216]	598
[1318]	599	CALL cpu_log( log_point_s(4), 'fft_x', 'continue', cpu_log_nowait )
[1216]	600
[1306]	601	n = isave(2) + kk - 1
	602	CALL fft_x( ar1(:,:,:), 'backward', f_out_z(:,:,n))
[1216]	603
[1306]	604	IF ( kk == nblk ) THEN
	605	CALL cpu_log( log_point_s(4), 'fft_x', 'stop' )
	606	ELSE
	607	CALL cpu_log( log_point_s(4), 'fft_x', 'pause' )
[1216]	608	ENDIF
	609
[1318]	610	CALL cpu_log( log_point_s(8), 'transpo invers', 'continue', cpu_log_nowait )
[1216]	611
[1306]	612	CALL transpose_xz( ar1(:,:,:), f_in )
[1216]	613
[1306]	614	DO knew = 1, nz
	615	ki = kk + nblk * (knew-1)
	616	f_inv(:,:,ki) = f_in(:,:,knew)
	617	ENDDO
[1216]	618
[1306]	619	IF ( kk == nblk ) THEN
	620	CALL cpu_log( log_point_s(8), 'transpo invers', 'stop' )
	621	ELSE
	622	CALL cpu_log( log_point_s(8), 'transpo invers', 'pause' )
[1216]	623	ENDIF
	624
	625	ENDDO
	626	!
	627	!-- Restore original indices/counters
	628	nz = isave(1)
	629	nzb_x = isave(2)
	630	nzt_x = isave(3)
	631	sendrecvcount_zx = isave(4)
	632
	633	CALL resort_for_xz( f_inv, ar )
	634
	635	DEALLOCATE( ar1, f_in, f_inv )
	636
[1]	637	ENDIF
[3634]	638	#endif
[1]	639
[3690]	640	#if !__acc_fft_device
	641	!$ACC UPDATE DEVICE(ar)
	642	#endif
	643
[1]	644	CALL cpu_log( log_point_s(3), 'poisfft', 'stop' )
	645
	646	END SUBROUTINE poisfft
	647
	648
	649	!------------------------------------------------------------------------------!
[1682]	650	! Description:
	651	! ------------
	652	!> Fourier-transformation along y with subsequent transposition y --> x for
	653	!> a 1d-decomposition along x.
	654	!>
	655	!> @attention The performance of this routine is much faster on the NEC-SX6,
	656	!> if the first index of work_ffty_vec is odd. Otherwise
	657	!> memory bank conflicts may occur (especially if the index is a
	658	!> multiple of 128). That's why work_ffty_vec is dimensioned as
	659	!> 0:ny+1.
	660	!> Of course, this will not work if users are using an odd number
	661	!> of gridpoints along y.
[1]	662	!------------------------------------------------------------------------------!
[1682]	663	SUBROUTINE ffty_tr_yx( f_in, f_out )
[1]	664
[1320]	665	USE control_parameters, &
[2300]	666	ONLY: loop_optimization
[1320]	667
	668	USE cpulog, &
	669	ONLY: cpu_log, log_point_s
	670
	671	USE kinds
	672
[1]	673	USE pegrid
	674
	675	IMPLICIT NONE
	676
[1682]	677	INTEGER(iwp) :: i !<
	678	INTEGER(iwp) :: iend !<
	679	INTEGER(iwp) :: iouter !<
	680	INTEGER(iwp) :: ir !<
	681	INTEGER(iwp) :: j !<
	682	INTEGER(iwp) :: k !<
[1]	683
[1682]	684	INTEGER(iwp), PARAMETER :: stridex = 4 !<
[1320]	685
[1682]	686	REAL(wp), DIMENSION(1:nz,0:ny,nxl:nxr) :: f_in !<
	687	REAL(wp), DIMENSION(nnx,1:nz,nys_x:nyn_x,pdims(1)) :: f_out !<
	688	REAL(wp), DIMENSION(nxl:nxr,1:nz,0:ny) :: work !<
[1]	689
[2300]	690	REAL(wp), DIMENSION(:,:), ALLOCATABLE :: work_ffty !<
	691	REAL(wp), DIMENSION(:,:,:), ALLOCATABLE :: work_ffty_vec !<
	692
[1]	693	!
	694	!-- Carry out the FFT along y, where all data are present due to the
	695	!-- 1d-decomposition along x. Resort the data in a way that x becomes
	696	!-- the first index.
[1106]	697	CALL cpu_log( log_point_s(7), 'fft_y_1d', 'start' )
[1]	698
[2300]	699	IF ( loop_optimization == 'vector' ) THEN
	700
	701	ALLOCATE( work_ffty_vec(0:ny+1,1:nz,nxl:nxr) )
[1]	702	!
	703	!-- Code optimized for vector processors
[2300]	704	!$OMP PARALLEL PRIVATE ( i, j, k )
	705	!$OMP DO
[1]	706	DO i = nxl, nxr
	707
	708	DO j = 0, ny
	709	DO k = 1, nz
	710	work_ffty_vec(j,k,i) = f_in(k,j,i)
	711	ENDDO
	712	ENDDO
	713
	714	CALL fft_y_m( work_ffty_vec(:,:,i), ny+1, 'forward' )
	715
	716	ENDDO
	717
[2300]	718	!$OMP DO
[1]	719	DO k = 1, nz
	720	DO j = 0, ny
	721	DO i = nxl, nxr
	722	work(i,k,j) = work_ffty_vec(j,k,i)
	723	ENDDO
	724	ENDDO
	725	ENDDO
[2300]	726	!$OMP END PARALLEL
[1]	727
[2300]	728	DEALLOCATE( work_ffty_vec )
	729
[1]	730	ELSE
	731	!
	732	!-- Cache optimized code.
[2300]	733	ALLOCATE( work_ffty(0:ny,stridex) )
	734	!
[1]	735	!-- The i-(x-)direction is split into a strided outer loop and an inner
	736	!-- loop for better cache performance
[2300]	737	!$OMP PARALLEL PRIVATE (i,iend,iouter,ir,j,k,work_ffty)
	738	!$OMP DO
[1]	739	DO iouter = nxl, nxr, stridex
	740
	741	iend = MIN( iouter+stridex-1, nxr ) ! Upper bound for inner i loop
	742
	743	DO k = 1, nz
	744
	745	DO i = iouter, iend
	746
	747	ir = i-iouter+1 ! counter within a stride
	748	DO j = 0, ny
	749	work_ffty(j,ir) = f_in(k,j,i)
	750	ENDDO
	751	!
	752	!-- FFT along y
[1106]	753	CALL fft_y_1d( work_ffty(:,ir), 'forward' )
[1]	754
	755	ENDDO
	756
	757	!
	758	!-- Resort
	759	DO j = 0, ny
	760	DO i = iouter, iend
	761	work(i,k,j) = work_ffty(j,i-iouter+1)
	762	ENDDO
	763	ENDDO
	764
	765	ENDDO
	766
	767	ENDDO
[2300]	768	!$OMP END PARALLEL
[1]	769
[2300]	770	DEALLOCATE( work_ffty )
	771
[1]	772	ENDIF
[2300]	773
[1106]	774	CALL cpu_log( log_point_s(7), 'fft_y_1d', 'pause' )
[1]	775
	776	!
	777	!-- Transpose array
[1111]	778	#if defined( __parallel )
[1]	779	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
[622]	780	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
[1]	781	CALL MPI_ALLTOALL( work(nxl,1,0), sendrecvcount_xy, MPI_REAL, &
	782	f_out(1,1,nys_x,1), sendrecvcount_xy, MPI_REAL, &
	783	comm1dx, ierr )
	784	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
[4429]	785	#else
	786	!
	787	!-- Next line required to avoid compile error about unused dummy argument in serial mode
	788	i = SIZE( f_out )
[1111]	789	#endif
[1]	790
	791	END SUBROUTINE ffty_tr_yx
	792
	793
	794	!------------------------------------------------------------------------------!
[1682]	795	! Description:
	796	! ------------
	797	!> Transposition x --> y with a subsequent backward Fourier transformation for
	798	!> a 1d-decomposition along x
[1]	799	!------------------------------------------------------------------------------!
[1682]	800	SUBROUTINE tr_xy_ffty( f_in, f_out )
[1]	801
[1320]	802	USE control_parameters, &
[2300]	803	ONLY: loop_optimization
[1320]	804
	805	USE cpulog, &
	806	ONLY: cpu_log, log_point_s
	807
	808	USE kinds
	809
[1]	810	USE pegrid
	811
	812	IMPLICIT NONE
	813
[1682]	814	INTEGER(iwp) :: i !<
	815	INTEGER(iwp) :: iend !<
	816	INTEGER(iwp) :: iouter !<
	817	INTEGER(iwp) :: ir !<
	818	INTEGER(iwp) :: j !<
	819	INTEGER(iwp) :: k !<
[1]	820
[1682]	821	INTEGER(iwp), PARAMETER :: stridex = 4 !<
[1320]	822
[1682]	823	REAL(wp), DIMENSION(nnx,1:nz,nys_x:nyn_x,pdims(1)) :: f_in !<
	824	REAL(wp), DIMENSION(1:nz,0:ny,nxl:nxr) :: f_out !<
	825	REAL(wp), DIMENSION(nxl:nxr,1:nz,0:ny) :: work !<
[1]	826
[2300]	827	REAL(wp), DIMENSION(:,:), ALLOCATABLE :: work_ffty !<
	828	REAL(wp), DIMENSION(:,:,:), ALLOCATABLE :: work_ffty_vec !<
	829
[1]	830	!
	831	!-- Transpose array
[1111]	832	#if defined( __parallel )
[1]	833	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
[622]	834	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
[1]	835	CALL MPI_ALLTOALL( f_in(1,1,nys_x,1), sendrecvcount_xy, MPI_REAL, &
	836	work(nxl,1,0), sendrecvcount_xy, MPI_REAL, &
	837	comm1dx, ierr )
	838	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
[4429]	839	#else
	840	!
	841	!-- Next line required to avoid compile error about unused dummy argument in serial mode
	842	i = SIZE( f_in )
[1111]	843	#endif
[1]	844
	845	!
	846	!-- Resort the data in a way that y becomes the first index and carry out the
	847	!-- backward fft along y.
[1106]	848	CALL cpu_log( log_point_s(7), 'fft_y_1d', 'continue' )
[1]	849
[2300]	850	IF ( loop_optimization == 'vector' ) THEN
	851
	852	ALLOCATE( work_ffty_vec(0:ny+1,1:nz,nxl:nxr) )
[1]	853	!
	854	!-- Code optimized for vector processors
[2300]	855	!$OMP PARALLEL PRIVATE ( i, j, k )
	856	!$OMP DO
[1]	857	DO k = 1, nz
	858	DO j = 0, ny
	859	DO i = nxl, nxr
	860	work_ffty_vec(j,k,i) = work(i,k,j)
	861	ENDDO
	862	ENDDO
	863	ENDDO
	864
[2300]	865	!$OMP DO
[1]	866	DO i = nxl, nxr
	867
	868	CALL fft_y_m( work_ffty_vec(:,:,i), ny+1, 'backward' )
	869
	870	DO j = 0, ny
	871	DO k = 1, nz
	872	f_out(k,j,i) = work_ffty_vec(j,k,i)
	873	ENDDO
	874	ENDDO
	875
	876	ENDDO
[2300]	877	!$OMP END PARALLEL
[1]	878
[2300]	879	DEALLOCATE( work_ffty_vec )
	880
[1]	881	ELSE
	882	!
	883	!-- Cache optimized code.
[2300]	884	ALLOCATE( work_ffty(0:ny,stridex) )
	885	!
[1]	886	!-- The i-(x-)direction is split into a strided outer loop and an inner
	887	!-- loop for better cache performance
[2300]	888	!$OMP PARALLEL PRIVATE ( i, iend, iouter, ir, j, k, work_ffty )
	889	!$OMP DO
[1]	890	DO iouter = nxl, nxr, stridex
	891
	892	iend = MIN( iouter+stridex-1, nxr ) ! Upper bound for inner i loop
	893
	894	DO k = 1, nz
	895	!
	896	!-- Resort
	897	DO j = 0, ny
	898	DO i = iouter, iend
	899	work_ffty(j,i-iouter+1) = work(i,k,j)
	900	ENDDO
	901	ENDDO
	902
	903	DO i = iouter, iend
	904
	905	!
	906	!-- FFT along y
	907	ir = i-iouter+1 ! counter within a stride
[1106]	908	CALL fft_y_1d( work_ffty(:,ir), 'backward' )
[1]	909
	910	DO j = 0, ny
	911	f_out(k,j,i) = work_ffty(j,ir)
	912	ENDDO
	913	ENDDO
	914
	915	ENDDO
	916
	917	ENDDO
[2300]	918	!$OMP END PARALLEL
[1]	919
[2300]	920	DEALLOCATE( work_ffty )
	921
[1]	922	ENDIF
	923
[1106]	924	CALL cpu_log( log_point_s(7), 'fft_y_1d', 'stop' )
[1]	925
	926	END SUBROUTINE tr_xy_ffty
	927
	928
	929	!------------------------------------------------------------------------------!
[1682]	930	! Description:
	931	! ------------
	932	!> FFT along x, solution of the tridiagonal system and backward FFT for
	933	!> a 1d-decomposition along x
	934	!>
	935	!> @warning this subroutine may still not work for hybrid parallelization
	936	!> with OpenMP (for possible necessary changes see the original
	937	!> routine poisfft_hybrid, developed by Klaus Ketelsen, May 2002)
[1]	938	!------------------------------------------------------------------------------!
[1682]	939	SUBROUTINE fftx_tri_fftx( ar )
[1]	940
[1320]	941	USE control_parameters, &
[2300]	942	ONLY: loop_optimization
[1320]	943
	944	USE cpulog, &
	945	ONLY: cpu_log, log_point_s
	946
	947	USE grid_variables, &
	948	ONLY: ddx2, ddy2
	949
	950	USE kinds
	951
[1]	952	USE pegrid
	953
	954	IMPLICIT NONE
	955
[1682]	956	INTEGER(iwp) :: i !<
	957	INTEGER(iwp) :: j !<
	958	INTEGER(iwp) :: k !<
	959	INTEGER(iwp) :: m !<
	960	INTEGER(iwp) :: n !<
[3241]	961	!$ INTEGER(iwp) :: omp_get_thread_num !<
[1682]	962	INTEGER(iwp) :: tn !<
[1]	963
[1682]	964	REAL(wp), DIMENSION(0:nx) :: work_fftx !<
	965	REAL(wp), DIMENSION(0:nx,1:nz) :: work_trix !<
	966	REAL(wp), DIMENSION(nnx,1:nz,nys_x:nyn_x,pdims(1)) :: ar !<
	967	REAL(wp), DIMENSION(:,:,:,:), ALLOCATABLE :: tri !<
[1]	968
	969
[1106]	970	CALL cpu_log( log_point_s(33), 'fft_x_1d + tridia', 'start' )
[1]	971
	972	ALLOCATE( tri(5,0:nx,0:nz-1,0:threads_per_task-1) )
	973
	974	tn = 0 ! Default thread number in case of one thread
	975	!$OMP PARALLEL DO PRIVATE ( i, j, k, m, n, tn, work_fftx, work_trix )
	976	DO j = nys_x, nyn_x
	977
	978	!$ tn = omp_get_thread_num()
	979
[2300]	980	IF ( loop_optimization == 'vector' ) THEN
[1]	981	!
	982	!-- Code optimized for vector processors
	983	DO k = 1, nz
	984
	985	m = 0
	986	DO n = 1, pdims(1)
[1003]	987	DO i = 1, nnx
[1]	988	work_trix(m,k) = ar(i,k,j,n)
	989	m = m + 1
	990	ENDDO
	991	ENDDO
	992
	993	ENDDO
	994
	995	CALL fft_x_m( work_trix, 'forward' )
	996
	997	ELSE
	998	!
	999	!-- Cache optimized code
	1000	DO k = 1, nz
	1001
	1002	m = 0
	1003	DO n = 1, pdims(1)
[1003]	1004	DO i = 1, nnx
[1]	1005	work_fftx(m) = ar(i,k,j,n)
	1006	m = m + 1
	1007	ENDDO
	1008	ENDDO
	1009
[1106]	1010	CALL fft_x_1d( work_fftx, 'forward' )
[1]	1011
	1012	DO i = 0, nx
	1013	work_trix(i,k) = work_fftx(i)
	1014	ENDDO
	1015
	1016	ENDDO
	1017
	1018	ENDIF
	1019
	1020	!
	1021	!-- Solve the linear equation system
	1022	CALL tridia_1dd( ddx2, ddy2, nx, ny, j, work_trix, tri(:,:,:,tn) )
	1023
[2300]	1024	IF ( loop_optimization == 'vector' ) THEN
[1]	1025	!
	1026	!-- Code optimized for vector processors
	1027	CALL fft_x_m( work_trix, 'backward' )
	1028
	1029	DO k = 1, nz
	1030
	1031	m = 0
	1032	DO n = 1, pdims(1)
[1003]	1033	DO i = 1, nnx
[1]	1034	ar(i,k,j,n) = work_trix(m,k)
	1035	m = m + 1
	1036	ENDDO
	1037	ENDDO
	1038
	1039	ENDDO
	1040
	1041	ELSE
	1042	!
	1043	!-- Cache optimized code
	1044	DO k = 1, nz
	1045
	1046	DO i = 0, nx
	1047	work_fftx(i) = work_trix(i,k)
	1048	ENDDO
	1049
[1106]	1050	CALL fft_x_1d( work_fftx, 'backward' )
[1]	1051
	1052	m = 0
	1053	DO n = 1, pdims(1)
[1003]	1054	DO i = 1, nnx
[1]	1055	ar(i,k,j,n) = work_fftx(m)
	1056	m = m + 1
	1057	ENDDO
	1058	ENDDO
	1059
	1060	ENDDO
	1061
	1062	ENDIF
	1063
	1064	ENDDO
	1065
	1066	DEALLOCATE( tri )
	1067
[1106]	1068	CALL cpu_log( log_point_s(33), 'fft_x_1d + tridia', 'stop' )
[1]	1069
	1070	END SUBROUTINE fftx_tri_fftx
	1071
	1072
	1073	!------------------------------------------------------------------------------!
[1682]	1074	! Description:
	1075	! ------------
	1076	!> Fourier-transformation along x with subsequent transposition x --> y for
	1077	!> a 1d-decomposition along y.
	1078	!>
	1079	!> @attention NEC-branch of this routine may significantly profit from
	1080	!> further optimizations. So far, performance is much worse than
	1081	!> for routine ffty_tr_yx (more than three times slower).
[1]	1082	!------------------------------------------------------------------------------!
[1682]	1083	SUBROUTINE fftx_tr_xy( f_in, f_out )
[1]	1084
[1682]	1085
[1320]	1086	USE control_parameters, &
[2300]	1087	ONLY: loop_optimization
[1320]	1088
	1089	USE cpulog, &
	1090	ONLY: cpu_log, log_point_s
	1091
	1092	USE kinds
	1093
[1]	1094	USE pegrid
	1095
	1096	IMPLICIT NONE
	1097
[1682]	1098	INTEGER(iwp) :: i !<
	1099	INTEGER(iwp) :: j !<
	1100	INTEGER(iwp) :: k !<
[1]	1101
[1682]	1102	REAL(wp), DIMENSION(0:nx,1:nz,nys:nyn) :: work_fftx !<
	1103	REAL(wp), DIMENSION(1:nz,nys:nyn,0:nx) :: f_in !<
	1104	REAL(wp), DIMENSION(nny,1:nz,nxl_y:nxr_y,pdims(2)) :: f_out !<
	1105	REAL(wp), DIMENSION(nys:nyn,1:nz,0:nx) :: work !<
[1]	1106
	1107	!
	1108	!-- Carry out the FFT along x, where all data are present due to the
	1109	!-- 1d-decomposition along y. Resort the data in a way that y becomes
	1110	!-- the first index.
[1106]	1111	CALL cpu_log( log_point_s(4), 'fft_x_1d', 'start' )
[1]	1112
[2300]	1113	IF ( loop_optimization == 'vector' ) THEN
[1]	1114	!
	1115	!-- Code for vector processors
[85]	1116	!$OMP PARALLEL PRIVATE ( i, j, k )
[1]	1117	!$OMP DO
	1118	DO i = 0, nx
	1119
	1120	DO j = nys, nyn
	1121	DO k = 1, nz
	1122	work_fftx(i,k,j) = f_in(k,j,i)
	1123	ENDDO
	1124	ENDDO
	1125
	1126	ENDDO
	1127
	1128	!$OMP DO
	1129	DO j = nys, nyn
	1130
	1131	CALL fft_x_m( work_fftx(:,:,j), 'forward' )
	1132
	1133	DO k = 1, nz
	1134	DO i = 0, nx
	1135	work(j,k,i) = work_fftx(i,k,j)
	1136	ENDDO
	1137	ENDDO
	1138
	1139	ENDDO
	1140	!$OMP END PARALLEL
	1141
	1142	ELSE
	1143
	1144	!
	1145	!-- Cache optimized code (there might be still a potential for better
	1146	!-- optimization).
[696]	1147	!$OMP PARALLEL PRIVATE (i,j,k)
[1]	1148	!$OMP DO
	1149	DO i = 0, nx
	1150
	1151	DO j = nys, nyn
	1152	DO k = 1, nz
	1153	work_fftx(i,k,j) = f_in(k,j,i)
	1154	ENDDO
	1155	ENDDO
	1156
	1157	ENDDO
	1158
	1159	!$OMP DO
	1160	DO j = nys, nyn
	1161	DO k = 1, nz
	1162
[1106]	1163	CALL fft_x_1d( work_fftx(0:nx,k,j), 'forward' )
[1]	1164
	1165	DO i = 0, nx
	1166	work(j,k,i) = work_fftx(i,k,j)
	1167	ENDDO
	1168	ENDDO
	1169
	1170	ENDDO
	1171	!$OMP END PARALLEL
	1172
	1173	ENDIF
[1106]	1174	CALL cpu_log( log_point_s(4), 'fft_x_1d', 'pause' )
[1]	1175
	1176	!
	1177	!-- Transpose array
[1111]	1178	#if defined( __parallel )
[1]	1179	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
[622]	1180	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
[1]	1181	CALL MPI_ALLTOALL( work(nys,1,0), sendrecvcount_xy, MPI_REAL, &
	1182	f_out(1,1,nxl_y,1), sendrecvcount_xy, MPI_REAL, &
	1183	comm1dy, ierr )
	1184	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
[4429]	1185	#else
	1186	!
	1187	!-- Next line required to avoid compile error about unused dummy argument in serial mode
	1188	i = SIZE( f_out )
[1111]	1189	#endif
[1]	1190
	1191	END SUBROUTINE fftx_tr_xy
	1192
	1193
	1194	!------------------------------------------------------------------------------!
[1682]	1195	! Description:
	1196	! ------------
	1197	!> Transposition y --> x with a subsequent backward Fourier transformation for
	1198	!> a 1d-decomposition along x.
[1]	1199	!------------------------------------------------------------------------------!
[1682]	1200	SUBROUTINE tr_yx_fftx( f_in, f_out )
[1]	1201
[1682]	1202
[1320]	1203	USE control_parameters, &
[2300]	1204	ONLY: loop_optimization
[1320]	1205
	1206	USE cpulog, &
	1207	ONLY: cpu_log, log_point_s
	1208
	1209	USE kinds
	1210
[1]	1211	USE pegrid
	1212
	1213	IMPLICIT NONE
	1214
[1682]	1215	INTEGER(iwp) :: i !<
	1216	INTEGER(iwp) :: j !<
	1217	INTEGER(iwp) :: k !<
[1]	1218
[1682]	1219	REAL(wp), DIMENSION(0:nx,1:nz,nys:nyn) :: work_fftx !<
	1220	REAL(wp), DIMENSION(nny,1:nz,nxl_y:nxr_y,pdims(2)) :: f_in !<
	1221	REAL(wp), DIMENSION(1:nz,nys:nyn,0:nx) :: f_out !<
	1222	REAL(wp), DIMENSION(nys:nyn,1:nz,0:nx) :: work !<
[1]	1223
[4429]	1224
[1]	1225	!
	1226	!-- Transpose array
[1111]	1227	#if defined( __parallel )
[1]	1228	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
[622]	1229	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
[1]	1230	CALL MPI_ALLTOALL( f_in(1,1,nxl_y,1), sendrecvcount_xy, MPI_REAL, &
	1231	work(nys,1,0), sendrecvcount_xy, MPI_REAL, &
	1232	comm1dy, ierr )
	1233	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
[4429]	1234	#else
	1235	!
	1236	!-- Next line required to avoid compile error about unused dummy argument in serial mode
	1237	i = SIZE( f_in )
[1111]	1238	#endif
[1]	1239
	1240	!
	1241	!-- Carry out the FFT along x, where all data are present due to the
	1242	!-- 1d-decomposition along y. Resort the data in a way that y becomes
	1243	!-- the first index.
[1106]	1244	CALL cpu_log( log_point_s(4), 'fft_x_1d', 'continue' )
[1]	1245
[2300]	1246	IF ( loop_optimization == 'vector' ) THEN
[1]	1247	!
	1248	!-- Code optimized for vector processors
[85]	1249	!$OMP PARALLEL PRIVATE ( i, j, k )
[1]	1250	!$OMP DO
	1251	DO j = nys, nyn
	1252
	1253	DO k = 1, nz
	1254	DO i = 0, nx
	1255	work_fftx(i,k,j) = work(j,k,i)
	1256	ENDDO
	1257	ENDDO
	1258
	1259	CALL fft_x_m( work_fftx(:,:,j), 'backward' )
	1260
	1261	ENDDO
	1262
	1263	!$OMP DO
	1264	DO i = 0, nx
	1265	DO j = nys, nyn
	1266	DO k = 1, nz
	1267	f_out(k,j,i) = work_fftx(i,k,j)
	1268	ENDDO
	1269	ENDDO
	1270	ENDDO
	1271	!$OMP END PARALLEL
	1272
	1273	ELSE
	1274
	1275	!
	1276	!-- Cache optimized code (there might be still a potential for better
	1277	!-- optimization).
[696]	1278	!$OMP PARALLEL PRIVATE (i,j,k)
[1]	1279	!$OMP DO
	1280	DO j = nys, nyn
	1281	DO k = 1, nz
	1282
	1283	DO i = 0, nx
	1284	work_fftx(i,k,j) = work(j,k,i)
	1285	ENDDO
	1286
[1106]	1287	CALL fft_x_1d( work_fftx(0:nx,k,j), 'backward' )
[1]	1288
	1289	ENDDO
	1290	ENDDO
	1291
	1292	!$OMP DO
	1293	DO i = 0, nx
	1294	DO j = nys, nyn
	1295	DO k = 1, nz
	1296	f_out(k,j,i) = work_fftx(i,k,j)
	1297	ENDDO
	1298	ENDDO
	1299	ENDDO
	1300	!$OMP END PARALLEL
	1301
	1302	ENDIF
[1106]	1303	CALL cpu_log( log_point_s(4), 'fft_x_1d', 'stop' )
[1]	1304
	1305	END SUBROUTINE tr_yx_fftx
	1306
	1307
	1308	!------------------------------------------------------------------------------!
[1682]	1309	! Description:
	1310	! ------------
	1311	!> FFT along y, solution of the tridiagonal system and backward FFT for
	1312	!> a 1d-decomposition along y.
	1313	!>
	1314	!> @warning this subroutine may still not work for hybrid parallelization
	1315	!> with OpenMP (for possible necessary changes see the original
	1316	!> routine poisfft_hybrid, developed by Klaus Ketelsen, May 2002)
[1]	1317	!------------------------------------------------------------------------------!
[1682]	1318	SUBROUTINE ffty_tri_ffty( ar )
[1]	1319
[1682]	1320
[1320]	1321	USE control_parameters, &
[2300]	1322	ONLY: loop_optimization
[1320]	1323
	1324	USE cpulog, &
	1325	ONLY: cpu_log, log_point_s
	1326
	1327	USE grid_variables, &
	1328	ONLY: ddx2, ddy2
	1329
	1330	USE kinds
	1331
[1]	1332	USE pegrid
	1333
	1334	IMPLICIT NONE
	1335
[1682]	1336	INTEGER(iwp) :: i !<
	1337	INTEGER(iwp) :: j !<
	1338	INTEGER(iwp) :: k !<
	1339	INTEGER(iwp) :: m !<
	1340	INTEGER(iwp) :: n !<
[3241]	1341	!$ INTEGER(iwp) :: omp_get_thread_num !<
[1682]	1342	INTEGER(iwp) :: tn !<
[1]	1343
[1682]	1344	REAL(wp), DIMENSION(0:ny) :: work_ffty !<
	1345	REAL(wp), DIMENSION(0:ny,1:nz) :: work_triy !<
	1346	REAL(wp), DIMENSION(nny,1:nz,nxl_y:nxr_y,pdims(2)) :: ar !<
	1347	REAL(wp), DIMENSION(:,:,:,:), ALLOCATABLE :: tri !<
[1]	1348
	1349
[1106]	1350	CALL cpu_log( log_point_s(39), 'fft_y_1d + tridia', 'start' )
[1]	1351
	1352	ALLOCATE( tri(5,0:ny,0:nz-1,0:threads_per_task-1) )
	1353
	1354	tn = 0 ! Default thread number in case of one thread
[696]	1355	!$OMP PARALLEL DO PRIVATE ( i, j, k, m, n, tn, work_ffty, work_triy )
[1]	1356	DO i = nxl_y, nxr_y
	1357
	1358	!$ tn = omp_get_thread_num()
	1359
[2300]	1360	IF ( loop_optimization == 'vector' ) THEN
[1]	1361	!
	1362	!-- Code optimized for vector processors
	1363	DO k = 1, nz
	1364
	1365	m = 0
	1366	DO n = 1, pdims(2)
[1003]	1367	DO j = 1, nny
[1]	1368	work_triy(m,k) = ar(j,k,i,n)
	1369	m = m + 1
	1370	ENDDO
	1371	ENDDO
	1372
	1373	ENDDO
	1374
	1375	CALL fft_y_m( work_triy, ny, 'forward' )
	1376
	1377	ELSE
	1378	!
	1379	!-- Cache optimized code
	1380	DO k = 1, nz
	1381
	1382	m = 0
	1383	DO n = 1, pdims(2)
[1003]	1384	DO j = 1, nny
[1]	1385	work_ffty(m) = ar(j,k,i,n)
	1386	m = m + 1
	1387	ENDDO
	1388	ENDDO
	1389
[1106]	1390	CALL fft_y_1d( work_ffty, 'forward' )
[1]	1391
	1392	DO j = 0, ny
	1393	work_triy(j,k) = work_ffty(j)
	1394	ENDDO
	1395
	1396	ENDDO
	1397
	1398	ENDIF
	1399
	1400	!
	1401	!-- Solve the linear equation system
	1402	CALL tridia_1dd( ddy2, ddx2, ny, nx, i, work_triy, tri(:,:,:,tn) )
	1403
[2300]	1404	IF ( loop_optimization == 'vector' ) THEN
[1]	1405	!
	1406	!-- Code optimized for vector processors
	1407	CALL fft_y_m( work_triy, ny, 'backward' )
	1408
	1409	DO k = 1, nz
	1410
	1411	m = 0
	1412	DO n = 1, pdims(2)
[1003]	1413	DO j = 1, nny
[1]	1414	ar(j,k,i,n) = work_triy(m,k)
	1415	m = m + 1
	1416	ENDDO
	1417	ENDDO
	1418
	1419	ENDDO
	1420
	1421	ELSE
	1422	!
	1423	!-- Cache optimized code
	1424	DO k = 1, nz
	1425
	1426	DO j = 0, ny
	1427	work_ffty(j) = work_triy(j,k)
	1428	ENDDO
	1429
[1106]	1430	CALL fft_y_1d( work_ffty, 'backward' )
[1]	1431
	1432	m = 0
	1433	DO n = 1, pdims(2)
[1003]	1434	DO j = 1, nny
[1]	1435	ar(j,k,i,n) = work_ffty(m)
	1436	m = m + 1
	1437	ENDDO
	1438	ENDDO
	1439
	1440	ENDDO
	1441
	1442	ENDIF
	1443
	1444	ENDDO
	1445
	1446	DEALLOCATE( tri )
	1447
[1106]	1448	CALL cpu_log( log_point_s(39), 'fft_y_1d + tridia', 'stop' )
[1]	1449
	1450	END SUBROUTINE ffty_tri_ffty
	1451
	1452	END MODULE poisfft_mod

Note: See TracBrowser for help on using the repository browser.

Download in other formats:

| Impressum | ©Leibniz Universität Hannover |