Home

Context Navigation

source: palm/trunk/SOURCE/poisfft_mod.f90 @ 4377

Last change on this file since 4377 was 4366, checked in by raasch, 5 years ago
code vectorization for NEC Aurora: vectorized version of Temperton FFT, vectorization of Newtor iteration for calculating the Obukhov length
Property svn:keywords set to `Id`
File size: 43.3 KB

Rev	Line
[1850]	1	!> @file poisfft_mod.f90
[2000]	2	!------------------------------------------------------------------------------!
[2696]	3	! This file is part of the PALM model system.
[1036]	4	!
[2000]	5	! PALM is free software: you can redistribute it and/or modify it under the
	6	! terms of the GNU General Public License as published by the Free Software
	7	! Foundation, either version 3 of the License, or (at your option) any later
	8	! version.
[1036]	9	!
	10	! PALM is distributed in the hope that it will be useful, but WITHOUT ANY
	11	! WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
	12	! A PARTICULAR PURPOSE. See the GNU General Public License for more details.
	13	!
	14	! You should have received a copy of the GNU General Public License along with
	15	! PALM. If not, see <http://www.gnu.org/licenses/>.
	16	!
[4360]	17	! Copyright 1997-2020 Leibniz Universitaet Hannover
[2000]	18	!------------------------------------------------------------------------------!
[1036]	19	!
[484]	20	! Current revisions:
[1]	21	! -----------------
[1683]	22	!
[2119]	23	!
[1321]	24	! Former revisions:
	25	! -----------------
	26	! $Id: poisfft_mod.f90 4366 2020-01-09 08:12:43Z gronemeier $
[4366]	27	! modification concerning NEC vectorizatio
	28	!
	29	! 4360 2020-01-07 11:25:50Z suehring
[4182]	30	! Corrected "Former revisions" section
	31	!
	32	! 3690 2019-01-22 22:56:42Z knoop
[3634]	33	! OpenACC port for SPEC
[2716]	34	!
[4182]	35	! Revision 1.1 1997/07/24 11:24:14 raasch
	36	! Initial revision
	37	!
	38	!
[1]	39	! Description:
	40	! ------------
[1682]	41	!> Solves the Poisson equation with a 2D spectral method
	42	!> d^2 p / dx^2 + d^2 p / dy^2 + d^2 p / dz^2 = s
	43	!>
	44	!> Input:
	45	!> real ar contains (nnz,nny,nnx) elements of the velocity divergence,
	46	!> starting from (1,nys,nxl)
	47	!>
	48	!> Output:
	49	!> real ar contains the solution for perturbation pressure p
[1]	50	!------------------------------------------------------------------------------!
[1682]	51	MODULE poisfft_mod
	52
[1]	53
[1320]	54	USE fft_xy, &
[4366]	55	ONLY: fft_init, fft_y, fft_y_1d, fft_y_m, fft_x, fft_x_1d, fft_x_m, &
	56	temperton_fft_vec
[1]	57
[1320]	58	USE indices, &
	59	ONLY: nnx, nny, nx, nxl, nxr, ny, nys, nyn, nz
	60
	61	USE transpose_indices, &
	62	ONLY: nxl_y, nxl_z, nxr_y, nxr_z, nys_x, nys_z, nyn_x, nyn_z, nzb_x, &
	63	nzb_y, nzt_x, nzt_y
	64
	65	USE tridia_solver, &
	66	ONLY: tridia_1dd, tridia_init, tridia_substi, tridia_substi_overlap
	67
[1]	68	IMPLICIT NONE
	69
[1111]	70	LOGICAL, SAVE :: poisfft_initialized = .FALSE.
	71
[1]	72	PRIVATE
[807]	73
[1]	74	PUBLIC poisfft, poisfft_init
	75
	76	INTERFACE poisfft
	77	MODULE PROCEDURE poisfft
	78	END INTERFACE poisfft
	79
	80	INTERFACE poisfft_init
	81	MODULE PROCEDURE poisfft_init
	82	END INTERFACE poisfft_init
	83
[807]	84
[1]	85	CONTAINS
	86
[1682]	87	!------------------------------------------------------------------------------!
	88	! Description:
	89	! ------------
[3241]	90	!> Setup coefficients for FFT and the tridiagonal solver
[1682]	91	!------------------------------------------------------------------------------!
[1]	92	SUBROUTINE poisfft_init
	93
[1111]	94	IMPLICIT NONE
	95
	96
[1]	97	CALL fft_init
	98
[1212]	99	CALL tridia_init
[1111]	100
	101	poisfft_initialized = .TRUE.
	102
[1]	103	END SUBROUTINE poisfft_init
	104
[1111]	105
[1804]	106
[1682]	107	!------------------------------------------------------------------------------!
	108	! Description:
	109	! ------------
	110	!> Two-dimensional Fourier Transformation in x- and y-direction.
	111	!------------------------------------------------------------------------------!
[1306]	112	SUBROUTINE poisfft( ar )
[1]	113
[1320]	114	USE control_parameters, &
[3241]	115	ONLY: transpose_compute_overlap
[1320]	116
	117	USE cpulog, &
	118	ONLY: cpu_log, cpu_log_nowait, log_point_s
	119
	120	USE kinds
	121
[1]	122	USE pegrid
	123
	124	IMPLICIT NONE
	125
[1682]	126	INTEGER(iwp) :: ii !<
	127	INTEGER(iwp) :: iind !<
	128	INTEGER(iwp) :: inew !<
	129	INTEGER(iwp) :: jj !<
	130	INTEGER(iwp) :: jind !<
	131	INTEGER(iwp) :: jnew !<
	132	INTEGER(iwp) :: ki !<
	133	INTEGER(iwp) :: kk !<
	134	INTEGER(iwp) :: knew !<
	135	INTEGER(iwp) :: n !<
	136	INTEGER(iwp) :: nblk !<
	137	INTEGER(iwp) :: nnx_y !<
	138	INTEGER(iwp) :: nny_z !<
	139	INTEGER(iwp) :: nnz_x !<
	140	INTEGER(iwp) :: nxl_y_bound !<
	141	INTEGER(iwp) :: nxr_y_bound !<
[1]	142
[1682]	143	INTEGER(iwp), DIMENSION(4) :: isave !<
[1320]	144
[1682]	145	REAL(wp), DIMENSION(1:nz,nys:nyn,nxl:nxr) :: ar !<
	146	REAL(wp), DIMENSION(nys:nyn,nxl:nxr,1:nz) :: ar_inv !<
[3690]	147
	148	#define __acc_fft_device ( defined( _OPENACC ) && ( defined ( __cuda_fft ) ) )
	149	#if __acc_fft_device
[3634]	150	!$ACC DECLARE CREATE(ar_inv)
[3690]	151	#endif
[1]	152
[1682]	153	REAL(wp), DIMENSION(:,:,:), ALLOCATABLE :: ar1 !<
	154	REAL(wp), DIMENSION(:,:,:), ALLOCATABLE :: f_in !<
	155	REAL(wp), DIMENSION(:,:,:), ALLOCATABLE :: f_inv !<
	156	REAL(wp), DIMENSION(:,:,:), ALLOCATABLE :: f_out_y !<
	157	REAL(wp), DIMENSION(:,:,:), ALLOCATABLE :: f_out_z !<
[1216]	158
	159
[1]	160	CALL cpu_log( log_point_s(3), 'poisfft', 'start' )
	161
[1111]	162	IF ( .NOT. poisfft_initialized ) CALL poisfft_init
	163
[3690]	164	#if !__acc_fft_device
	165	!$ACC UPDATE HOST(ar)
	166	#endif
	167
[3634]	168	#ifndef _OPENACC
[1]	169	!
	170	!-- Two-dimensional Fourier Transformation in x- and y-direction.
[2118]	171	IF ( pdims(2) == 1 .AND. pdims(1) > 1 ) THEN
[1]	172
	173	!
	174	!-- 1d-domain-decomposition along x:
	175	!-- FFT along y and transposition y --> x
[1216]	176	CALL ffty_tr_yx( ar, ar )
[1]	177
	178	!
	179	!-- FFT along x, solving the tridiagonal system and backward FFT
	180	CALL fftx_tri_fftx( ar )
	181
	182	!
	183	!-- Transposition x --> y and backward FFT along y
[1216]	184	CALL tr_xy_ffty( ar, ar )
[1]	185
[2118]	186	ELSEIF ( pdims(1) == 1 .AND. pdims(2) > 1 ) THEN
[1]	187
	188	!
	189	!-- 1d-domain-decomposition along y:
	190	!-- FFT along x and transposition x --> y
[1216]	191	CALL fftx_tr_xy( ar, ar )
[1]	192
	193	!
	194	!-- FFT along y, solving the tridiagonal system and backward FFT
	195	CALL ffty_tri_ffty( ar )
	196
	197	!
	198	!-- Transposition y --> x and backward FFT along x
[1216]	199	CALL tr_yx_fftx( ar, ar )
[1]	200
[1216]	201	ELSEIF ( .NOT. transpose_compute_overlap ) THEN
[3634]	202	#endif
[1]	203
	204	!
[1111]	205	!-- 2d-domain-decomposition or no decomposition (1 PE run)
[1]	206	!-- Transposition z --> x
	207	CALL cpu_log( log_point_s(5), 'transpo forward', 'start' )
[1216]	208	CALL resort_for_zx( ar, ar_inv )
	209	CALL transpose_zx( ar_inv, ar )
[1]	210	CALL cpu_log( log_point_s(5), 'transpo forward', 'pause' )
	211
	212	CALL cpu_log( log_point_s(4), 'fft_x', 'start' )
[4366]	213	IF ( temperton_fft_vec ) THEN
	214	!
	215	!-- Vector version outputs a transformed array ar_inv that does not require resorting
	216	!-- (which is done for ar further below)
	217	CALL fft_x( ar, 'forward', ar_inv=ar_inv)
	218	ELSE
	219	CALL fft_x( ar, 'forward')
	220	ENDIF
[1]	221	CALL cpu_log( log_point_s(4), 'fft_x', 'pause' )
	222
	223	!
	224	!-- Transposition x --> y
	225	CALL cpu_log( log_point_s(5), 'transpo forward', 'continue' )
[4366]	226	IF( .NOT. temperton_fft_vec ) CALL resort_for_xy( ar, ar_inv )
[1216]	227	CALL transpose_xy( ar_inv, ar )
[1]	228	CALL cpu_log( log_point_s(5), 'transpo forward', 'pause' )
	229
	230	CALL cpu_log( log_point_s(7), 'fft_y', 'start' )
[4366]	231	IF ( temperton_fft_vec ) THEN
	232	!
	233	!-- Input array ar_inv from fft_x can be directly used here.
	234	!-- The output (also in array ar_inv) does not require resorting below.
	235	CALL fft_y( ar, 'forward', ar_inv = ar_inv, nxl_y_bound = nxl_y, nxr_y_bound = nxr_y, &
	236	nxl_y_l = nxl_y, nxr_y_l = nxr_y )
	237	ELSE
	238	CALL fft_y( ar, 'forward', ar_tr = ar, nxl_y_bound = nxl_y, nxr_y_bound = nxr_y, &
	239	nxl_y_l = nxl_y, nxr_y_l = nxr_y )
	240	ENDIF
[1]	241	CALL cpu_log( log_point_s(7), 'fft_y', 'pause' )
	242
	243	!
	244	!-- Transposition y --> z
	245	CALL cpu_log( log_point_s(5), 'transpo forward', 'continue' )
[4366]	246	IF ( .NOT. temperton_fft_vec ) CALL resort_for_yz( ar, ar_inv )
[1216]	247	CALL transpose_yz( ar_inv, ar )
[1]	248	CALL cpu_log( log_point_s(5), 'transpo forward', 'stop' )
	249
	250	!
[1106]	251	!-- Solve the tridiagonal equation system along z
[1]	252	CALL cpu_log( log_point_s(6), 'tridia', 'start' )
[1212]	253	CALL tridia_substi( ar )
[1]	254	CALL cpu_log( log_point_s(6), 'tridia', 'stop' )
	255
	256	!
	257	!-- Inverse Fourier Transformation
	258	!-- Transposition z --> y
	259	CALL cpu_log( log_point_s(8), 'transpo invers', 'start' )
[1216]	260	CALL transpose_zy( ar, ar_inv )
[4366]	261	!
	262	!-- The fft_y below (vector branch) can directly process ar_inv (i.e. does not require a
	263	!-- resorting)
	264	IF ( .NOT. temperton_fft_vec ) CALL resort_for_zy( ar_inv, ar )
[1]	265	CALL cpu_log( log_point_s(8), 'transpo invers', 'pause' )
	266
	267	CALL cpu_log( log_point_s(7), 'fft_y', 'continue' )
[4366]	268	IF ( temperton_fft_vec ) THEN
	269	!
	270	!-- Output array ar_inv can be used as input to the below fft_x routine without resorting
	271	CALL fft_y( ar, 'backward', ar_inv = ar_inv, nxl_y_bound = nxl_y, nxr_y_bound = nxr_y,&
	272	nxl_y_l = nxl_y, nxr_y_l = nxr_y )
	273	ELSE
	274	CALL fft_y( ar, 'backward', ar_tr = ar, nxl_y_bound = nxl_y, nxr_y_bound = nxr_y, &
	275	nxl_y_l = nxl_y, nxr_y_l = nxr_y )
	276	ENDIF
	277
[1]	278	CALL cpu_log( log_point_s(7), 'fft_y', 'stop' )
	279
	280	!
	281	!-- Transposition y --> x
	282	CALL cpu_log( log_point_s(8), 'transpo invers', 'continue' )
[1216]	283	CALL transpose_yx( ar, ar_inv )
[4366]	284	IF ( .NOT. temperton_fft_vec ) CALL resort_for_yx( ar_inv, ar )
[1]	285	CALL cpu_log( log_point_s(8), 'transpo invers', 'pause' )
	286
	287	CALL cpu_log( log_point_s(4), 'fft_x', 'continue' )
[4366]	288	IF ( temperton_fft_vec ) THEN
	289	CALL fft_x( ar, 'backward', ar_inv=ar_inv )
	290	ELSE
	291	CALL fft_x( ar, 'backward' )
	292	ENDIF
[1]	293	CALL cpu_log( log_point_s(4), 'fft_x', 'stop' )
	294
	295	!
	296	!-- Transposition x --> z
	297	CALL cpu_log( log_point_s(8), 'transpo invers', 'continue' )
[1216]	298	CALL transpose_xz( ar, ar_inv )
	299	CALL resort_for_xz( ar_inv, ar )
[1]	300	CALL cpu_log( log_point_s(8), 'transpo invers', 'stop' )
	301
[3634]	302	#ifndef _OPENACC
[1216]	303	ELSE
	304
	305	!
	306	!-- 2d-domain-decomposition or no decomposition (1 PE run) with
	307	!-- overlapping transposition / fft
[1318]	308	!-- cputime logging must not use barriers, which would prevent overlapping
[1216]	309	ALLOCATE( f_out_y(0:ny,nxl_y:nxr_y,nzb_y:nzt_y), &
	310	f_out_z(0:nx,nys_x:nyn_x,nzb_x:nzt_x) )
	311	!
	312	!-- Transposition z --> x + subsequent fft along x
	313	ALLOCATE( f_inv(nys:nyn,nxl:nxr,1:nz) )
	314	CALL resort_for_zx( ar, f_inv )
	315	!
	316	!-- Save original indices and gridpoint counter
	317	isave(1) = nz
	318	isave(2) = nzb_x
	319	isave(3) = nzt_x
	320	isave(4) = sendrecvcount_zx
	321	!
	322	!-- Set new indices for transformation
	323	nblk = nz / pdims(1)
	324	nz = pdims(1)
	325	nnz_x = 1
	326	nzb_x = 1 + myidx * nnz_x
	327	nzt_x = ( myidx + 1 ) * nnz_x
	328	sendrecvcount_zx = nnx * nny * nnz_x
	329
[1306]	330	ALLOCATE( ar1(0:nx,nys_x:nyn_x,nzb_x:nzt_x) )
[1216]	331	ALLOCATE( f_in(nys:nyn,nxl:nxr,1:nz) )
	332
[1306]	333	DO kk = 1, nblk
[1216]	334
[1306]	335	IF ( kk == 1 ) THEN
[1318]	336	CALL cpu_log( log_point_s(5), 'transpo forward', 'start', cpu_log_nowait )
[1306]	337	ELSE
[1318]	338	CALL cpu_log( log_point_s(5), 'transpo forward', 'continue', cpu_log_nowait )
[1306]	339	ENDIF
[1216]	340
[1306]	341	DO knew = 1, nz
	342	ki = kk + nblk * ( knew - 1 )
	343	f_in(:,:,knew) = f_inv(:,:,ki)
	344	ENDDO
[1216]	345
[1306]	346	CALL transpose_zx( f_in, ar1(:,:,:))
	347	CALL cpu_log( log_point_s(5), 'transpo forward', 'pause' )
[1216]	348
[1306]	349	IF ( kk == 1 ) THEN
[1318]	350	CALL cpu_log( log_point_s(4), 'fft_x', 'start', cpu_log_nowait )
[1306]	351	ELSE
[1318]	352	CALL cpu_log( log_point_s(4), 'fft_x', 'continue', cpu_log_nowait )
[1216]	353	ENDIF
	354
[1306]	355	n = isave(2) + kk - 1
	356	CALL fft_x( ar1(:,:,:), 'forward', ar_2d = f_out_z(:,:,n))
	357	CALL cpu_log( log_point_s(4), 'fft_x', 'pause' )
[1216]	358
	359	ENDDO
	360	!
	361	!-- Restore original indices/counters
	362	nz = isave(1)
	363	nzb_x = isave(2)
	364	nzt_x = isave(3)
	365	sendrecvcount_zx = isave(4)
	366
	367	DEALLOCATE( ar1, f_in, f_inv )
	368
	369	!
	370	!-- Transposition x --> y + subsequent fft along y
	371	ALLOCATE( f_inv(nys_x:nyn_x,nzb_x:nzt_x,0:nx) )
	372	CALL resort_for_xy( f_out_z, f_inv )
	373	!
	374	!-- Save original indices and gridpoint counter
	375	isave(1) = nx
	376	isave(2) = nxl_y
	377	isave(3) = nxr_y
	378	isave(4) = sendrecvcount_xy
	379	!
	380	!-- Set new indices for transformation
	381	nblk = ( ( nx+1 ) / pdims(2) ) - 1
	382	nx = pdims(2)
	383	nnx_y = 1
	384	nxl_y = myidy * nnx_y
	385	nxr_y = ( myidy + 1 ) * nnx_y - 1
	386	sendrecvcount_xy = nnx_y * ( nyn_x-nys_x+1 ) * ( nzt_x-nzb_x+1 )
	387
[1306]	388	ALLOCATE( ar1(0:ny,nxl_y:nxr_y,nzb_y:nzt_y) )
[1216]	389	ALLOCATE( f_in(nys_x:nyn_x,nzb_x:nzt_x,0:nx) )
	390
[1306]	391	DO ii = 0, nblk
[1216]	392
[1318]	393	CALL cpu_log( log_point_s(5), 'transpo forward', 'continue', cpu_log_nowait )
[1216]	394
[1306]	395	DO inew = 0, nx-1
	396	iind = ii + ( nblk + 1 ) * inew
	397	f_in(:,:,inew) = f_inv(:,:,iind)
	398	ENDDO
[1216]	399
[1306]	400	CALL transpose_xy( f_in, ar1(:,:,:) )
[1216]	401
[1306]	402	CALL cpu_log( log_point_s(5), 'transpo forward', 'pause' )
[1216]	403
[1306]	404	IF ( ii == 1 ) THEN
[1318]	405	CALL cpu_log( log_point_s(7), 'fft_y', 'start', cpu_log_nowait )
[1306]	406	ELSE
[1318]	407	CALL cpu_log( log_point_s(7), 'fft_y', 'continue', cpu_log_nowait )
[1216]	408	ENDIF
	409
[1306]	410	nxl_y_bound = isave(2)
	411	nxr_y_bound = isave(3)
	412	n = isave(2) + ii
	413	CALL fft_y( ar1(:,:,:), 'forward', ar_tr = f_out_y, &
	414	nxl_y_bound = nxl_y_bound, nxr_y_bound = nxr_y_bound, &
	415	nxl_y_l = n, nxr_y_l = n )
[1216]	416
[1306]	417	CALL cpu_log( log_point_s(7), 'fft_y', 'pause' )
[1216]	418
	419	ENDDO
	420	!
	421	!-- Restore original indices/counters
	422	nx = isave(1)
	423	nxl_y = isave(2)
	424	nxr_y = isave(3)
	425	sendrecvcount_xy = isave(4)
	426
	427	DEALLOCATE( ar1, f_in, f_inv )
	428
	429	!
	430	!-- Transposition y --> z + subsequent tridia + resort for z --> y
	431	ALLOCATE( f_inv(nxl_y:nxr_y,nzb_y:nzt_y,0:ny) )
	432	CALL resort_for_yz( f_out_y, f_inv )
	433	!
	434	!-- Save original indices and gridpoint counter
	435	isave(1) = ny
	436	isave(2) = nys_z
	437	isave(3) = nyn_z
	438	isave(4) = sendrecvcount_yz
	439	!
	440	!-- Set new indices for transformation
	441	nblk = ( ( ny+1 ) / pdims(1) ) - 1
	442	ny = pdims(1)
	443	nny_z = 1
	444	nys_z = myidx * nny_z
	445	nyn_z = ( myidx + 1 ) * nny_z - 1
	446	sendrecvcount_yz = ( nxr_y-nxl_y+1 ) * nny_z * ( nzt_y-nzb_y+1 )
	447
[1306]	448	ALLOCATE( ar1(nxl_z:nxr_z,nys_z:nyn_z,1:nz) )
[1216]	449	ALLOCATE( f_in(nxl_y:nxr_y,nzb_y:nzt_y,0:ny) )
	450
[1306]	451	DO jj = 0, nblk
[1216]	452	!
[1306]	453	!-- Forward Fourier Transformation
	454	!-- Transposition y --> z
[1318]	455	CALL cpu_log( log_point_s(5), 'transpo forward', 'continue', cpu_log_nowait )
[1216]	456
[1306]	457	DO jnew = 0, ny-1
	458	jind = jj + ( nblk + 1 ) * jnew
	459	f_in(:,:,jnew) = f_inv(:,:,jind)
	460	ENDDO
[1216]	461
[1306]	462	CALL transpose_yz( f_in, ar1(:,:,:) )
[1216]	463
[1306]	464	IF ( jj == nblk ) THEN
	465	CALL cpu_log( log_point_s(5), 'transpo forward', 'stop' )
	466	ELSE
	467	CALL cpu_log( log_point_s(5), 'transpo forward', 'pause' )
[1216]	468	ENDIF
	469
	470	!
[1306]	471	!-- Solve the tridiagonal equation system along z
[1318]	472	CALL cpu_log( log_point_s(6), 'tridia', 'start', cpu_log_nowait )
[1216]	473
[1306]	474	n = isave(2) + jj
	475	CALL tridia_substi_overlap( ar1(:,:,:), n )
[1216]	476
[1306]	477	CALL cpu_log( log_point_s(6), 'tridia', 'stop' )
[1216]	478
[1306]	479	!
	480	!-- Inverse Fourier Transformation
	481	!-- Transposition z --> y
	482	!-- Only one thread should call MPI routines, therefore forward and
	483	!-- backward tranpose are in the same section
	484	IF ( jj == 0 ) THEN
[1318]	485	CALL cpu_log( log_point_s(8), 'transpo invers', 'start', cpu_log_nowait )
[1306]	486	ELSE
[1318]	487	CALL cpu_log( log_point_s(8), 'transpo invers', 'continue', cpu_log_nowait )
[1216]	488	ENDIF
	489
[1306]	490	CALL transpose_zy( ar1(:,:,:), f_in )
[1216]	491
[1306]	492	DO jnew = 0, ny-1
	493	jind = jj + ( nblk + 1 ) * jnew
	494	f_inv(:,:,jind) = f_in(:,:,jnew)
	495	ENDDO
[1216]	496
[1306]	497	CALL cpu_log( log_point_s(8), 'transpo invers', 'pause' )
[1216]	498
	499	ENDDO
	500	!
	501	!-- Restore original indices/counters
	502	ny = isave(1)
	503	nys_z = isave(2)
	504	nyn_z = isave(3)
	505	sendrecvcount_yz = isave(4)
	506
	507	CALL resort_for_zy( f_inv, f_out_y )
	508
	509	DEALLOCATE( ar1, f_in, f_inv )
	510
	511	!
	512	!-- fft along y backward + subsequent transposition y --> x
	513	ALLOCATE( f_inv(nys_x:nyn_x,nzb_x:nzt_x,0:nx) )
	514	!
	515	!-- Save original indices and gridpoint counter
	516	isave(1) = nx
	517	isave(2) = nxl_y
	518	isave(3) = nxr_y
	519	isave(4) = sendrecvcount_xy
	520	!
	521	!-- Set new indices for transformation
	522	nblk = (( nx+1 ) / pdims(2) ) - 1
	523	nx = pdims(2)
	524	nnx_y = 1
	525	nxl_y = myidy * nnx_y
	526	nxr_y = ( myidy + 1 ) * nnx_y - 1
	527	sendrecvcount_xy = nnx_y * ( nyn_x-nys_x+1 ) * ( nzt_x-nzb_x+1 )
	528
[1306]	529	ALLOCATE( ar1(0:ny,nxl_y:nxr_y,nzb_y:nzt_y) )
[1216]	530	ALLOCATE( f_in(nys_x:nyn_x,nzb_x:nzt_x,0:nx) )
	531
[1306]	532	DO ii = 0, nblk
[1216]	533
[1318]	534	CALL cpu_log( log_point_s(7), 'fft_y', 'continue', cpu_log_nowait )
[1216]	535
[1306]	536	n = isave(2) + ii
	537	nxl_y_bound = isave(2)
	538	nxr_y_bound = isave(3)
[1216]	539
[1306]	540	CALL fft_y( ar1(:,:,:), 'backward', ar_tr = f_out_y, &
	541	nxl_y_bound = nxl_y_bound, nxr_y_bound = nxr_y_bound, &
	542	nxl_y_l = n, nxr_y_l = n )
[1216]	543
[1306]	544	IF ( ii == nblk ) THEN
	545	CALL cpu_log( log_point_s(7), 'fft_y', 'stop' )
	546	ELSE
	547	CALL cpu_log( log_point_s(7), 'fft_y', 'pause' )
[1216]	548	ENDIF
	549
[1318]	550	CALL cpu_log( log_point_s(8), 'transpo invers', 'continue', cpu_log_nowait )
[1216]	551
[1306]	552	CALL transpose_yx( ar1(:,:,:), f_in )
[1216]	553
[1306]	554	DO inew = 0, nx-1
	555	iind = ii + (nblk+1) * inew
	556	f_inv(:,:,iind) = f_in(:,:,inew)
	557	ENDDO
[1216]	558
[1306]	559	CALL cpu_log( log_point_s(8), 'transpo invers', 'pause' )
[1216]	560
	561	ENDDO
	562	!
	563	!-- Restore original indices/counters
	564	nx = isave(1)
	565	nxl_y = isave(2)
	566	nxr_y = isave(3)
	567	sendrecvcount_xy = isave(4)
	568
	569	CALL resort_for_yx( f_inv, f_out_z )
	570
	571	DEALLOCATE( ar1, f_in, f_inv )
	572
	573	!
	574	!-- fft along x backward + subsequent final transposition x --> z
	575	ALLOCATE( f_inv(nys:nyn,nxl:nxr,1:nz) )
	576	!
	577	!-- Save original indices and gridpoint counter
	578	isave(1) = nz
	579	isave(2) = nzb_x
	580	isave(3) = nzt_x
	581	isave(4) = sendrecvcount_zx
	582	!
	583	!-- Set new indices for transformation
	584	nblk = nz / pdims(1)
	585	nz = pdims(1)
	586	nnz_x = 1
	587	nzb_x = 1 + myidx * nnz_x
	588	nzt_x = ( myidx + 1 ) * nnz_x
	589	sendrecvcount_zx = nnx * nny * nnz_x
	590
[1306]	591	ALLOCATE( ar1(0:nx,nys_x:nyn_x,nzb_x:nzt_x) )
[1216]	592	ALLOCATE( f_in(nys:nyn,nxl:nxr,1:nz) )
	593
[1306]	594	DO kk = 1, nblk
[1216]	595
[1318]	596	CALL cpu_log( log_point_s(4), 'fft_x', 'continue', cpu_log_nowait )
[1216]	597
[1306]	598	n = isave(2) + kk - 1
	599	CALL fft_x( ar1(:,:,:), 'backward', f_out_z(:,:,n))
[1216]	600
[1306]	601	IF ( kk == nblk ) THEN
	602	CALL cpu_log( log_point_s(4), 'fft_x', 'stop' )
	603	ELSE
	604	CALL cpu_log( log_point_s(4), 'fft_x', 'pause' )
[1216]	605	ENDIF
	606
[1318]	607	CALL cpu_log( log_point_s(8), 'transpo invers', 'continue', cpu_log_nowait )
[1216]	608
[1306]	609	CALL transpose_xz( ar1(:,:,:), f_in )
[1216]	610
[1306]	611	DO knew = 1, nz
	612	ki = kk + nblk * (knew-1)
	613	f_inv(:,:,ki) = f_in(:,:,knew)
	614	ENDDO
[1216]	615
[1306]	616	IF ( kk == nblk ) THEN
	617	CALL cpu_log( log_point_s(8), 'transpo invers', 'stop' )
	618	ELSE
	619	CALL cpu_log( log_point_s(8), 'transpo invers', 'pause' )
[1216]	620	ENDIF
	621
	622	ENDDO
	623	!
	624	!-- Restore original indices/counters
	625	nz = isave(1)
	626	nzb_x = isave(2)
	627	nzt_x = isave(3)
	628	sendrecvcount_zx = isave(4)
	629
	630	CALL resort_for_xz( f_inv, ar )
	631
	632	DEALLOCATE( ar1, f_in, f_inv )
	633
[1]	634	ENDIF
[3634]	635	#endif
[1]	636
[3690]	637	#if !__acc_fft_device
	638	!$ACC UPDATE DEVICE(ar)
	639	#endif
	640
[1]	641	CALL cpu_log( log_point_s(3), 'poisfft', 'stop' )
	642
	643	END SUBROUTINE poisfft
	644
	645
	646	!------------------------------------------------------------------------------!
[1682]	647	! Description:
	648	! ------------
	649	!> Fourier-transformation along y with subsequent transposition y --> x for
	650	!> a 1d-decomposition along x.
	651	!>
	652	!> @attention The performance of this routine is much faster on the NEC-SX6,
	653	!> if the first index of work_ffty_vec is odd. Otherwise
	654	!> memory bank conflicts may occur (especially if the index is a
	655	!> multiple of 128). That's why work_ffty_vec is dimensioned as
	656	!> 0:ny+1.
	657	!> Of course, this will not work if users are using an odd number
	658	!> of gridpoints along y.
[1]	659	!------------------------------------------------------------------------------!
[1682]	660	SUBROUTINE ffty_tr_yx( f_in, f_out )
[1]	661
[1320]	662	USE control_parameters, &
[2300]	663	ONLY: loop_optimization
[1320]	664
	665	USE cpulog, &
	666	ONLY: cpu_log, log_point_s
	667
	668	USE kinds
	669
[1]	670	USE pegrid
	671
	672	IMPLICIT NONE
	673
[1682]	674	INTEGER(iwp) :: i !<
	675	INTEGER(iwp) :: iend !<
	676	INTEGER(iwp) :: iouter !<
	677	INTEGER(iwp) :: ir !<
	678	INTEGER(iwp) :: j !<
	679	INTEGER(iwp) :: k !<
[1]	680
[1682]	681	INTEGER(iwp), PARAMETER :: stridex = 4 !<
[1320]	682
[1682]	683	REAL(wp), DIMENSION(1:nz,0:ny,nxl:nxr) :: f_in !<
	684	REAL(wp), DIMENSION(nnx,1:nz,nys_x:nyn_x,pdims(1)) :: f_out !<
	685	REAL(wp), DIMENSION(nxl:nxr,1:nz,0:ny) :: work !<
[1]	686
[2300]	687	REAL(wp), DIMENSION(:,:), ALLOCATABLE :: work_ffty !<
	688	REAL(wp), DIMENSION(:,:,:), ALLOCATABLE :: work_ffty_vec !<
	689
[1]	690	!
	691	!-- Carry out the FFT along y, where all data are present due to the
	692	!-- 1d-decomposition along x. Resort the data in a way that x becomes
	693	!-- the first index.
[1106]	694	CALL cpu_log( log_point_s(7), 'fft_y_1d', 'start' )
[1]	695
[2300]	696	IF ( loop_optimization == 'vector' ) THEN
	697
	698	ALLOCATE( work_ffty_vec(0:ny+1,1:nz,nxl:nxr) )
[1]	699	!
	700	!-- Code optimized for vector processors
[2300]	701	!$OMP PARALLEL PRIVATE ( i, j, k )
	702	!$OMP DO
[1]	703	DO i = nxl, nxr
	704
	705	DO j = 0, ny
	706	DO k = 1, nz
	707	work_ffty_vec(j,k,i) = f_in(k,j,i)
	708	ENDDO
	709	ENDDO
	710
	711	CALL fft_y_m( work_ffty_vec(:,:,i), ny+1, 'forward' )
	712
	713	ENDDO
	714
[2300]	715	!$OMP DO
[1]	716	DO k = 1, nz
	717	DO j = 0, ny
	718	DO i = nxl, nxr
	719	work(i,k,j) = work_ffty_vec(j,k,i)
	720	ENDDO
	721	ENDDO
	722	ENDDO
[2300]	723	!$OMP END PARALLEL
[1]	724
[2300]	725	DEALLOCATE( work_ffty_vec )
	726
[1]	727	ELSE
	728	!
	729	!-- Cache optimized code.
[2300]	730	ALLOCATE( work_ffty(0:ny,stridex) )
	731	!
[1]	732	!-- The i-(x-)direction is split into a strided outer loop and an inner
	733	!-- loop for better cache performance
[2300]	734	!$OMP PARALLEL PRIVATE (i,iend,iouter,ir,j,k,work_ffty)
	735	!$OMP DO
[1]	736	DO iouter = nxl, nxr, stridex
	737
	738	iend = MIN( iouter+stridex-1, nxr ) ! Upper bound for inner i loop
	739
	740	DO k = 1, nz
	741
	742	DO i = iouter, iend
	743
	744	ir = i-iouter+1 ! counter within a stride
	745	DO j = 0, ny
	746	work_ffty(j,ir) = f_in(k,j,i)
	747	ENDDO
	748	!
	749	!-- FFT along y
[1106]	750	CALL fft_y_1d( work_ffty(:,ir), 'forward' )
[1]	751
	752	ENDDO
	753
	754	!
	755	!-- Resort
	756	DO j = 0, ny
	757	DO i = iouter, iend
	758	work(i,k,j) = work_ffty(j,i-iouter+1)
	759	ENDDO
	760	ENDDO
	761
	762	ENDDO
	763
	764	ENDDO
[2300]	765	!$OMP END PARALLEL
[1]	766
[2300]	767	DEALLOCATE( work_ffty )
	768
[1]	769	ENDIF
[2300]	770
[1106]	771	CALL cpu_log( log_point_s(7), 'fft_y_1d', 'pause' )
[1]	772
	773	!
	774	!-- Transpose array
[1111]	775	#if defined( __parallel )
[1]	776	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
[622]	777	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
[1]	778	CALL MPI_ALLTOALL( work(nxl,1,0), sendrecvcount_xy, MPI_REAL, &
	779	f_out(1,1,nys_x,1), sendrecvcount_xy, MPI_REAL, &
	780	comm1dx, ierr )
	781	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
[1111]	782	#endif
[1]	783
	784	END SUBROUTINE ffty_tr_yx
	785
	786
	787	!------------------------------------------------------------------------------!
[1682]	788	! Description:
	789	! ------------
	790	!> Transposition x --> y with a subsequent backward Fourier transformation for
	791	!> a 1d-decomposition along x
[1]	792	!------------------------------------------------------------------------------!
[1682]	793	SUBROUTINE tr_xy_ffty( f_in, f_out )
[1]	794
[1320]	795	USE control_parameters, &
[2300]	796	ONLY: loop_optimization
[1320]	797
	798	USE cpulog, &
	799	ONLY: cpu_log, log_point_s
	800
	801	USE kinds
	802
[1]	803	USE pegrid
	804
	805	IMPLICIT NONE
	806
[1682]	807	INTEGER(iwp) :: i !<
	808	INTEGER(iwp) :: iend !<
	809	INTEGER(iwp) :: iouter !<
	810	INTEGER(iwp) :: ir !<
	811	INTEGER(iwp) :: j !<
	812	INTEGER(iwp) :: k !<
[1]	813
[1682]	814	INTEGER(iwp), PARAMETER :: stridex = 4 !<
[1320]	815
[1682]	816	REAL(wp), DIMENSION(nnx,1:nz,nys_x:nyn_x,pdims(1)) :: f_in !<
	817	REAL(wp), DIMENSION(1:nz,0:ny,nxl:nxr) :: f_out !<
	818	REAL(wp), DIMENSION(nxl:nxr,1:nz,0:ny) :: work !<
[1]	819
[2300]	820	REAL(wp), DIMENSION(:,:), ALLOCATABLE :: work_ffty !<
	821	REAL(wp), DIMENSION(:,:,:), ALLOCATABLE :: work_ffty_vec !<
	822
[1]	823	!
	824	!-- Transpose array
[1111]	825	#if defined( __parallel )
[1]	826	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
[622]	827	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
[1]	828	CALL MPI_ALLTOALL( f_in(1,1,nys_x,1), sendrecvcount_xy, MPI_REAL, &
	829	work(nxl,1,0), sendrecvcount_xy, MPI_REAL, &
	830	comm1dx, ierr )
	831	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
[1111]	832	#endif
[1]	833
	834	!
	835	!-- Resort the data in a way that y becomes the first index and carry out the
	836	!-- backward fft along y.
[1106]	837	CALL cpu_log( log_point_s(7), 'fft_y_1d', 'continue' )
[1]	838
[2300]	839	IF ( loop_optimization == 'vector' ) THEN
	840
	841	ALLOCATE( work_ffty_vec(0:ny+1,1:nz,nxl:nxr) )
[1]	842	!
	843	!-- Code optimized for vector processors
[2300]	844	!$OMP PARALLEL PRIVATE ( i, j, k )
	845	!$OMP DO
[1]	846	DO k = 1, nz
	847	DO j = 0, ny
	848	DO i = nxl, nxr
	849	work_ffty_vec(j,k,i) = work(i,k,j)
	850	ENDDO
	851	ENDDO
	852	ENDDO
	853
[2300]	854	!$OMP DO
[1]	855	DO i = nxl, nxr
	856
	857	CALL fft_y_m( work_ffty_vec(:,:,i), ny+1, 'backward' )
	858
	859	DO j = 0, ny
	860	DO k = 1, nz
	861	f_out(k,j,i) = work_ffty_vec(j,k,i)
	862	ENDDO
	863	ENDDO
	864
	865	ENDDO
[2300]	866	!$OMP END PARALLEL
[1]	867
[2300]	868	DEALLOCATE( work_ffty_vec )
	869
[1]	870	ELSE
	871	!
	872	!-- Cache optimized code.
[2300]	873	ALLOCATE( work_ffty(0:ny,stridex) )
	874	!
[1]	875	!-- The i-(x-)direction is split into a strided outer loop and an inner
	876	!-- loop for better cache performance
[2300]	877	!$OMP PARALLEL PRIVATE ( i, iend, iouter, ir, j, k, work_ffty )
	878	!$OMP DO
[1]	879	DO iouter = nxl, nxr, stridex
	880
	881	iend = MIN( iouter+stridex-1, nxr ) ! Upper bound for inner i loop
	882
	883	DO k = 1, nz
	884	!
	885	!-- Resort
	886	DO j = 0, ny
	887	DO i = iouter, iend
	888	work_ffty(j,i-iouter+1) = work(i,k,j)
	889	ENDDO
	890	ENDDO
	891
	892	DO i = iouter, iend
	893
	894	!
	895	!-- FFT along y
	896	ir = i-iouter+1 ! counter within a stride
[1106]	897	CALL fft_y_1d( work_ffty(:,ir), 'backward' )
[1]	898
	899	DO j = 0, ny
	900	f_out(k,j,i) = work_ffty(j,ir)
	901	ENDDO
	902	ENDDO
	903
	904	ENDDO
	905
	906	ENDDO
[2300]	907	!$OMP END PARALLEL
[1]	908
[2300]	909	DEALLOCATE( work_ffty )
	910
[1]	911	ENDIF
	912
[1106]	913	CALL cpu_log( log_point_s(7), 'fft_y_1d', 'stop' )
[1]	914
	915	END SUBROUTINE tr_xy_ffty
	916
	917
	918	!------------------------------------------------------------------------------!
[1682]	919	! Description:
	920	! ------------
	921	!> FFT along x, solution of the tridiagonal system and backward FFT for
	922	!> a 1d-decomposition along x
	923	!>
	924	!> @warning this subroutine may still not work for hybrid parallelization
	925	!> with OpenMP (for possible necessary changes see the original
	926	!> routine poisfft_hybrid, developed by Klaus Ketelsen, May 2002)
[1]	927	!------------------------------------------------------------------------------!
[1682]	928	SUBROUTINE fftx_tri_fftx( ar )
[1]	929
[1320]	930	USE control_parameters, &
[2300]	931	ONLY: loop_optimization
[1320]	932
	933	USE cpulog, &
	934	ONLY: cpu_log, log_point_s
	935
	936	USE grid_variables, &
	937	ONLY: ddx2, ddy2
	938
	939	USE kinds
	940
[1]	941	USE pegrid
	942
	943	IMPLICIT NONE
	944
[1682]	945	INTEGER(iwp) :: i !<
	946	INTEGER(iwp) :: j !<
	947	INTEGER(iwp) :: k !<
	948	INTEGER(iwp) :: m !<
	949	INTEGER(iwp) :: n !<
[3241]	950	!$ INTEGER(iwp) :: omp_get_thread_num !<
[1682]	951	INTEGER(iwp) :: tn !<
[1]	952
[1682]	953	REAL(wp), DIMENSION(0:nx) :: work_fftx !<
	954	REAL(wp), DIMENSION(0:nx,1:nz) :: work_trix !<
	955	REAL(wp), DIMENSION(nnx,1:nz,nys_x:nyn_x,pdims(1)) :: ar !<
	956	REAL(wp), DIMENSION(:,:,:,:), ALLOCATABLE :: tri !<
[1]	957
	958
[1106]	959	CALL cpu_log( log_point_s(33), 'fft_x_1d + tridia', 'start' )
[1]	960
	961	ALLOCATE( tri(5,0:nx,0:nz-1,0:threads_per_task-1) )
	962
	963	tn = 0 ! Default thread number in case of one thread
	964	!$OMP PARALLEL DO PRIVATE ( i, j, k, m, n, tn, work_fftx, work_trix )
	965	DO j = nys_x, nyn_x
	966
	967	!$ tn = omp_get_thread_num()
	968
[2300]	969	IF ( loop_optimization == 'vector' ) THEN
[1]	970	!
	971	!-- Code optimized for vector processors
	972	DO k = 1, nz
	973
	974	m = 0
	975	DO n = 1, pdims(1)
[1003]	976	DO i = 1, nnx
[1]	977	work_trix(m,k) = ar(i,k,j,n)
	978	m = m + 1
	979	ENDDO
	980	ENDDO
	981
	982	ENDDO
	983
	984	CALL fft_x_m( work_trix, 'forward' )
	985
	986	ELSE
	987	!
	988	!-- Cache optimized code
	989	DO k = 1, nz
	990
	991	m = 0
	992	DO n = 1, pdims(1)
[1003]	993	DO i = 1, nnx
[1]	994	work_fftx(m) = ar(i,k,j,n)
	995	m = m + 1
	996	ENDDO
	997	ENDDO
	998
[1106]	999	CALL fft_x_1d( work_fftx, 'forward' )
[1]	1000
	1001	DO i = 0, nx
	1002	work_trix(i,k) = work_fftx(i)
	1003	ENDDO
	1004
	1005	ENDDO
	1006
	1007	ENDIF
	1008
	1009	!
	1010	!-- Solve the linear equation system
	1011	CALL tridia_1dd( ddx2, ddy2, nx, ny, j, work_trix, tri(:,:,:,tn) )
	1012
[2300]	1013	IF ( loop_optimization == 'vector' ) THEN
[1]	1014	!
	1015	!-- Code optimized for vector processors
	1016	CALL fft_x_m( work_trix, 'backward' )
	1017
	1018	DO k = 1, nz
	1019
	1020	m = 0
	1021	DO n = 1, pdims(1)
[1003]	1022	DO i = 1, nnx
[1]	1023	ar(i,k,j,n) = work_trix(m,k)
	1024	m = m + 1
	1025	ENDDO
	1026	ENDDO
	1027
	1028	ENDDO
	1029
	1030	ELSE
	1031	!
	1032	!-- Cache optimized code
	1033	DO k = 1, nz
	1034
	1035	DO i = 0, nx
	1036	work_fftx(i) = work_trix(i,k)
	1037	ENDDO
	1038
[1106]	1039	CALL fft_x_1d( work_fftx, 'backward' )
[1]	1040
	1041	m = 0
	1042	DO n = 1, pdims(1)
[1003]	1043	DO i = 1, nnx
[1]	1044	ar(i,k,j,n) = work_fftx(m)
	1045	m = m + 1
	1046	ENDDO
	1047	ENDDO
	1048
	1049	ENDDO
	1050
	1051	ENDIF
	1052
	1053	ENDDO
	1054
	1055	DEALLOCATE( tri )
	1056
[1106]	1057	CALL cpu_log( log_point_s(33), 'fft_x_1d + tridia', 'stop' )
[1]	1058
	1059	END SUBROUTINE fftx_tri_fftx
	1060
	1061
	1062	!------------------------------------------------------------------------------!
[1682]	1063	! Description:
	1064	! ------------
	1065	!> Fourier-transformation along x with subsequent transposition x --> y for
	1066	!> a 1d-decomposition along y.
	1067	!>
	1068	!> @attention NEC-branch of this routine may significantly profit from
	1069	!> further optimizations. So far, performance is much worse than
	1070	!> for routine ffty_tr_yx (more than three times slower).
[1]	1071	!------------------------------------------------------------------------------!
[1682]	1072	SUBROUTINE fftx_tr_xy( f_in, f_out )
[1]	1073
[1682]	1074
[1320]	1075	USE control_parameters, &
[2300]	1076	ONLY: loop_optimization
[1320]	1077
	1078	USE cpulog, &
	1079	ONLY: cpu_log, log_point_s
	1080
	1081	USE kinds
	1082
[1]	1083	USE pegrid
	1084
	1085	IMPLICIT NONE
	1086
[1682]	1087	INTEGER(iwp) :: i !<
	1088	INTEGER(iwp) :: j !<
	1089	INTEGER(iwp) :: k !<
[1]	1090
[1682]	1091	REAL(wp), DIMENSION(0:nx,1:nz,nys:nyn) :: work_fftx !<
	1092	REAL(wp), DIMENSION(1:nz,nys:nyn,0:nx) :: f_in !<
	1093	REAL(wp), DIMENSION(nny,1:nz,nxl_y:nxr_y,pdims(2)) :: f_out !<
	1094	REAL(wp), DIMENSION(nys:nyn,1:nz,0:nx) :: work !<
[1]	1095
	1096	!
	1097	!-- Carry out the FFT along x, where all data are present due to the
	1098	!-- 1d-decomposition along y. Resort the data in a way that y becomes
	1099	!-- the first index.
[1106]	1100	CALL cpu_log( log_point_s(4), 'fft_x_1d', 'start' )
[1]	1101
[2300]	1102	IF ( loop_optimization == 'vector' ) THEN
[1]	1103	!
	1104	!-- Code for vector processors
[85]	1105	!$OMP PARALLEL PRIVATE ( i, j, k )
[1]	1106	!$OMP DO
	1107	DO i = 0, nx
	1108
	1109	DO j = nys, nyn
	1110	DO k = 1, nz
	1111	work_fftx(i,k,j) = f_in(k,j,i)
	1112	ENDDO
	1113	ENDDO
	1114
	1115	ENDDO
	1116
	1117	!$OMP DO
	1118	DO j = nys, nyn
	1119
	1120	CALL fft_x_m( work_fftx(:,:,j), 'forward' )
	1121
	1122	DO k = 1, nz
	1123	DO i = 0, nx
	1124	work(j,k,i) = work_fftx(i,k,j)
	1125	ENDDO
	1126	ENDDO
	1127
	1128	ENDDO
	1129	!$OMP END PARALLEL
	1130
	1131	ELSE
	1132
	1133	!
	1134	!-- Cache optimized code (there might be still a potential for better
	1135	!-- optimization).
[696]	1136	!$OMP PARALLEL PRIVATE (i,j,k)
[1]	1137	!$OMP DO
	1138	DO i = 0, nx
	1139
	1140	DO j = nys, nyn
	1141	DO k = 1, nz
	1142	work_fftx(i,k,j) = f_in(k,j,i)
	1143	ENDDO
	1144	ENDDO
	1145
	1146	ENDDO
	1147
	1148	!$OMP DO
	1149	DO j = nys, nyn
	1150	DO k = 1, nz
	1151
[1106]	1152	CALL fft_x_1d( work_fftx(0:nx,k,j), 'forward' )
[1]	1153
	1154	DO i = 0, nx
	1155	work(j,k,i) = work_fftx(i,k,j)
	1156	ENDDO
	1157	ENDDO
	1158
	1159	ENDDO
	1160	!$OMP END PARALLEL
	1161
	1162	ENDIF
[1106]	1163	CALL cpu_log( log_point_s(4), 'fft_x_1d', 'pause' )
[1]	1164
	1165	!
	1166	!-- Transpose array
[1111]	1167	#if defined( __parallel )
[1]	1168	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
[622]	1169	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
[1]	1170	CALL MPI_ALLTOALL( work(nys,1,0), sendrecvcount_xy, MPI_REAL, &
	1171	f_out(1,1,nxl_y,1), sendrecvcount_xy, MPI_REAL, &
	1172	comm1dy, ierr )
	1173	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
[1111]	1174	#endif
[1]	1175
	1176	END SUBROUTINE fftx_tr_xy
	1177
	1178
	1179	!------------------------------------------------------------------------------!
[1682]	1180	! Description:
	1181	! ------------
	1182	!> Transposition y --> x with a subsequent backward Fourier transformation for
	1183	!> a 1d-decomposition along x.
[1]	1184	!------------------------------------------------------------------------------!
[1682]	1185	SUBROUTINE tr_yx_fftx( f_in, f_out )
[1]	1186
[1682]	1187
[1320]	1188	USE control_parameters, &
[2300]	1189	ONLY: loop_optimization
[1320]	1190
	1191	USE cpulog, &
	1192	ONLY: cpu_log, log_point_s
	1193
	1194	USE kinds
	1195
[1]	1196	USE pegrid
	1197
	1198	IMPLICIT NONE
	1199
[1682]	1200	INTEGER(iwp) :: i !<
	1201	INTEGER(iwp) :: j !<
	1202	INTEGER(iwp) :: k !<
[1]	1203
[1682]	1204	REAL(wp), DIMENSION(0:nx,1:nz,nys:nyn) :: work_fftx !<
	1205	REAL(wp), DIMENSION(nny,1:nz,nxl_y:nxr_y,pdims(2)) :: f_in !<
	1206	REAL(wp), DIMENSION(1:nz,nys:nyn,0:nx) :: f_out !<
	1207	REAL(wp), DIMENSION(nys:nyn,1:nz,0:nx) :: work !<
[1]	1208
	1209	!
	1210	!-- Transpose array
[1111]	1211	#if defined( __parallel )
[1]	1212	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
[622]	1213	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
[1]	1214	CALL MPI_ALLTOALL( f_in(1,1,nxl_y,1), sendrecvcount_xy, MPI_REAL, &
	1215	work(nys,1,0), sendrecvcount_xy, MPI_REAL, &
	1216	comm1dy, ierr )
	1217	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
[1111]	1218	#endif
[1]	1219
	1220	!
	1221	!-- Carry out the FFT along x, where all data are present due to the
	1222	!-- 1d-decomposition along y. Resort the data in a way that y becomes
	1223	!-- the first index.
[1106]	1224	CALL cpu_log( log_point_s(4), 'fft_x_1d', 'continue' )
[1]	1225
[2300]	1226	IF ( loop_optimization == 'vector' ) THEN
[1]	1227	!
	1228	!-- Code optimized for vector processors
[85]	1229	!$OMP PARALLEL PRIVATE ( i, j, k )
[1]	1230	!$OMP DO
	1231	DO j = nys, nyn
	1232
	1233	DO k = 1, nz
	1234	DO i = 0, nx
	1235	work_fftx(i,k,j) = work(j,k,i)
	1236	ENDDO
	1237	ENDDO
	1238
	1239	CALL fft_x_m( work_fftx(:,:,j), 'backward' )
	1240
	1241	ENDDO
	1242
	1243	!$OMP DO
	1244	DO i = 0, nx
	1245	DO j = nys, nyn
	1246	DO k = 1, nz
	1247	f_out(k,j,i) = work_fftx(i,k,j)
	1248	ENDDO
	1249	ENDDO
	1250	ENDDO
	1251	!$OMP END PARALLEL
	1252
	1253	ELSE
	1254
	1255	!
	1256	!-- Cache optimized code (there might be still a potential for better
	1257	!-- optimization).
[696]	1258	!$OMP PARALLEL PRIVATE (i,j,k)
[1]	1259	!$OMP DO
	1260	DO j = nys, nyn
	1261	DO k = 1, nz
	1262
	1263	DO i = 0, nx
	1264	work_fftx(i,k,j) = work(j,k,i)
	1265	ENDDO
	1266
[1106]	1267	CALL fft_x_1d( work_fftx(0:nx,k,j), 'backward' )
[1]	1268
	1269	ENDDO
	1270	ENDDO
	1271
	1272	!$OMP DO
	1273	DO i = 0, nx
	1274	DO j = nys, nyn
	1275	DO k = 1, nz
	1276	f_out(k,j,i) = work_fftx(i,k,j)
	1277	ENDDO
	1278	ENDDO
	1279	ENDDO
	1280	!$OMP END PARALLEL
	1281
	1282	ENDIF
[1106]	1283	CALL cpu_log( log_point_s(4), 'fft_x_1d', 'stop' )
[1]	1284
	1285	END SUBROUTINE tr_yx_fftx
	1286
	1287
	1288	!------------------------------------------------------------------------------!
[1682]	1289	! Description:
	1290	! ------------
	1291	!> FFT along y, solution of the tridiagonal system and backward FFT for
	1292	!> a 1d-decomposition along y.
	1293	!>
	1294	!> @warning this subroutine may still not work for hybrid parallelization
	1295	!> with OpenMP (for possible necessary changes see the original
	1296	!> routine poisfft_hybrid, developed by Klaus Ketelsen, May 2002)
[1]	1297	!------------------------------------------------------------------------------!
[1682]	1298	SUBROUTINE ffty_tri_ffty( ar )
[1]	1299
[1682]	1300
[1320]	1301	USE control_parameters, &
[2300]	1302	ONLY: loop_optimization
[1320]	1303
	1304	USE cpulog, &
	1305	ONLY: cpu_log, log_point_s
	1306
	1307	USE grid_variables, &
	1308	ONLY: ddx2, ddy2
	1309
	1310	USE kinds
	1311
[1]	1312	USE pegrid
	1313
	1314	IMPLICIT NONE
	1315
[1682]	1316	INTEGER(iwp) :: i !<
	1317	INTEGER(iwp) :: j !<
	1318	INTEGER(iwp) :: k !<
	1319	INTEGER(iwp) :: m !<
	1320	INTEGER(iwp) :: n !<
[3241]	1321	!$ INTEGER(iwp) :: omp_get_thread_num !<
[1682]	1322	INTEGER(iwp) :: tn !<
[1]	1323
[1682]	1324	REAL(wp), DIMENSION(0:ny) :: work_ffty !<
	1325	REAL(wp), DIMENSION(0:ny,1:nz) :: work_triy !<
	1326	REAL(wp), DIMENSION(nny,1:nz,nxl_y:nxr_y,pdims(2)) :: ar !<
	1327	REAL(wp), DIMENSION(:,:,:,:), ALLOCATABLE :: tri !<
[1]	1328
	1329
[1106]	1330	CALL cpu_log( log_point_s(39), 'fft_y_1d + tridia', 'start' )
[1]	1331
	1332	ALLOCATE( tri(5,0:ny,0:nz-1,0:threads_per_task-1) )
	1333
	1334	tn = 0 ! Default thread number in case of one thread
[696]	1335	!$OMP PARALLEL DO PRIVATE ( i, j, k, m, n, tn, work_ffty, work_triy )
[1]	1336	DO i = nxl_y, nxr_y
	1337
	1338	!$ tn = omp_get_thread_num()
	1339
[2300]	1340	IF ( loop_optimization == 'vector' ) THEN
[1]	1341	!
	1342	!-- Code optimized for vector processors
	1343	DO k = 1, nz
	1344
	1345	m = 0
	1346	DO n = 1, pdims(2)
[1003]	1347	DO j = 1, nny
[1]	1348	work_triy(m,k) = ar(j,k,i,n)
	1349	m = m + 1
	1350	ENDDO
	1351	ENDDO
	1352
	1353	ENDDO
	1354
	1355	CALL fft_y_m( work_triy, ny, 'forward' )
	1356
	1357	ELSE
	1358	!
	1359	!-- Cache optimized code
	1360	DO k = 1, nz
	1361
	1362	m = 0
	1363	DO n = 1, pdims(2)
[1003]	1364	DO j = 1, nny
[1]	1365	work_ffty(m) = ar(j,k,i,n)
	1366	m = m + 1
	1367	ENDDO
	1368	ENDDO
	1369
[1106]	1370	CALL fft_y_1d( work_ffty, 'forward' )
[1]	1371
	1372	DO j = 0, ny
	1373	work_triy(j,k) = work_ffty(j)
	1374	ENDDO
	1375
	1376	ENDDO
	1377
	1378	ENDIF
	1379
	1380	!
	1381	!-- Solve the linear equation system
	1382	CALL tridia_1dd( ddy2, ddx2, ny, nx, i, work_triy, tri(:,:,:,tn) )
	1383
[2300]	1384	IF ( loop_optimization == 'vector' ) THEN
[1]	1385	!
	1386	!-- Code optimized for vector processors
	1387	CALL fft_y_m( work_triy, ny, 'backward' )
	1388
	1389	DO k = 1, nz
	1390
	1391	m = 0
	1392	DO n = 1, pdims(2)
[1003]	1393	DO j = 1, nny
[1]	1394	ar(j,k,i,n) = work_triy(m,k)
	1395	m = m + 1
	1396	ENDDO
	1397	ENDDO
	1398
	1399	ENDDO
	1400
	1401	ELSE
	1402	!
	1403	!-- Cache optimized code
	1404	DO k = 1, nz
	1405
	1406	DO j = 0, ny
	1407	work_ffty(j) = work_triy(j,k)
	1408	ENDDO
	1409
[1106]	1410	CALL fft_y_1d( work_ffty, 'backward' )
[1]	1411
	1412	m = 0
	1413	DO n = 1, pdims(2)
[1003]	1414	DO j = 1, nny
[1]	1415	ar(j,k,i,n) = work_ffty(m)
	1416	m = m + 1
	1417	ENDDO
	1418	ENDDO
	1419
	1420	ENDDO
	1421
	1422	ENDIF
	1423
	1424	ENDDO
	1425
	1426	DEALLOCATE( tri )
	1427
[1106]	1428	CALL cpu_log( log_point_s(39), 'fft_y_1d + tridia', 'stop' )
[1]	1429
	1430	END SUBROUTINE ffty_tri_ffty
	1431
	1432	END MODULE poisfft_mod

Note: See TracBrowser for help on using the repository browser.

Download in other formats:

| Impressum | ©Leibniz Universität Hannover |