Home

Context Navigation

source: palm/trunk/SOURCE/poisfft_mod.f90 @ 4784

Last change on this file since 4784 was 4717, checked in by pavelkrc, 4 years ago
Fixes and optimizations of OpenMP parallelization, formatting of OpenMP directives
Property svn:keywords set to `Id`
File size: 45.7 KB

Rev	Line
[1850]	1	!> @file poisfft_mod.f90
[4649]	2	!--------------------------------------------------------------------------------------------------!
[2696]	3	! This file is part of the PALM model system.
[1036]	4	!
[4649]	5	! PALM is free software: you can redistribute it and/or modify it under the terms of the GNU General
	6	! Public License as published by the Free Software Foundation, either version 3 of the License, or
	7	! (at your option) any later version.
[1036]	8	!
[4649]	9	! PALM is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the
	10	! implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General
	11	! Public License for more details.
[1036]	12	!
[4649]	13	! You should have received a copy of the GNU General Public License along with PALM. If not, see
	14	! <http://www.gnu.org/licenses/>.
[1036]	15	!
[4360]	16	! Copyright 1997-2020 Leibniz Universitaet Hannover
[4649]	17	!--------------------------------------------------------------------------------------------------!
[1036]	18	!
[4649]	19	!
[484]	20	! Current revisions:
[1]	21	! -----------------
[4649]	22	!
	23	!
[1321]	24	! Former revisions:
	25	! -----------------
	26	! $Id: poisfft_mod.f90 4717 2020-09-30 22:27:40Z suehring $
[4717]	27	! Formatting of OpenMP directives (J. Resler)
	28	!
	29	! 4671 2020-09-09 20:27:58Z pavelkrc
[4671]	30	! OMP bugfix
	31	!
	32	! 4649 2020-08-25 12:11:17Z raasch
[4649]	33	! File re-formatted to follow the PALM coding standard
	34	!
	35	! 4429 2020-02-27 15:24:30Z raasch
[4429]	36	! Statements added to avoid compile errors due to unused dummy arguments in serial mode
[4649]	37	!
[4429]	38	! 4366 2020-01-09 08:12:43Z raasch
[4649]	39	! Modification concerning NEC vectorizatio
	40	!
[4366]	41	! 4360 2020-01-07 11:25:50Z suehring
[4182]	42	! Corrected "Former revisions" section
[4649]	43	!
[4182]	44	! 3690 2019-01-22 22:56:42Z knoop
[3634]	45	! OpenACC port for SPEC
[2716]	46	!
[4182]	47	! Revision 1.1 1997/07/24 11:24:14 raasch
	48	! Initial revision
	49	!
	50	!
[4649]	51	!--------------------------------------------------------------------------------------------------!
[1]	52	! Description:
	53	! ------------
[1682]	54	!> Solves the Poisson equation with a 2D spectral method
	55	!> d^2 p / dx^2 + d^2 p / dy^2 + d^2 p / dz^2 = s
	56	!>
	57	!> Input:
[4649]	58	!> real ar contains (nnz,nny,nnx) elements of the velocity divergence, starting from (1,nys,nxl)
[1682]	59	!>
	60	!> Output:
[4649]	61	!> real ar contains the solution for perturbation pressure p
	62	!--------------------------------------------------------------------------------------------------!
[1682]	63	MODULE poisfft_mod
[1]	64
[4649]	65
	66	USE fft_xy, &
	67	ONLY: fft_init, &
	68	fft_y, &
	69	fft_y_1d, &
	70	fft_y_m, &
	71	fft_x, &
	72	fft_x_1d, &
	73	fft_x_m, &
[4366]	74	temperton_fft_vec
[1]	75
[4649]	76	USE indices, &
	77	ONLY: nnx, &
	78	nny, &
	79	nx, &
	80	nxl, &
	81	nxr, &
	82	ny, &
	83	nys, &
	84	nyn, &
	85	nz
[1320]	86
[4649]	87	USE transpose_indices, &
	88	ONLY: nxl_y, &
	89	nxl_z, &
	90	nxr_y, &
	91	nxr_z, &
	92	nys_x, &
	93	nys_z, &
	94	nyn_x, &
	95	nyn_z, &
	96	nzb_x, &
	97	nzb_y, &
	98	nzt_x, &
	99	nzt_y
[1320]	100
[4649]	101	USE tridia_solver, &
	102	ONLY: tridia_1dd, &
	103	tridia_init, &
	104	tridia_substi, &
	105	tridia_substi_overlap
[1320]	106
[1]	107	IMPLICIT NONE
	108
[4649]	109	LOGICAL, SAVE :: poisfft_initialized = .FALSE. !<
[1111]	110
[1]	111	PRIVATE
[807]	112
[1]	113	PUBLIC poisfft, poisfft_init
	114
	115	INTERFACE poisfft
	116	MODULE PROCEDURE poisfft
	117	END INTERFACE poisfft
	118
	119	INTERFACE poisfft_init
	120	MODULE PROCEDURE poisfft_init
	121	END INTERFACE poisfft_init
	122
[807]	123
[1]	124	CONTAINS
	125
[4649]	126	!--------------------------------------------------------------------------------------------------!
[1682]	127	! Description:
	128	! ------------
[3241]	129	!> Setup coefficients for FFT and the tridiagonal solver
[4649]	130	!--------------------------------------------------------------------------------------------------!
	131	SUBROUTINE poisfft_init
[1]	132
[4649]	133	IMPLICIT NONE
[1111]	134
	135
[4649]	136	CALL fft_init
[1]	137
[4649]	138	CALL tridia_init
[1111]	139
[4649]	140	poisfft_initialized = .TRUE.
[1111]	141
[4649]	142	END SUBROUTINE poisfft_init
[1]	143
[1111]	144
[1804]	145
[4649]	146	!--------------------------------------------------------------------------------------------------!
[1682]	147	! Description:
	148	! ------------
	149	!> Two-dimensional Fourier Transformation in x- and y-direction.
[4649]	150	!--------------------------------------------------------------------------------------------------!
	151	SUBROUTINE poisfft( ar )
[1]	152
[4649]	153	USE control_parameters, &
	154	ONLY: transpose_compute_overlap
[1320]	155
[4649]	156	USE cpulog, &
	157	ONLY: cpu_log, &
	158	cpu_log_nowait, &
	159	log_point_s
[1320]	160
[4649]	161	USE kinds
[1320]	162
[4649]	163	USE pegrid
[1]	164
[4649]	165	IMPLICIT NONE
[1]	166
[4649]	167	INTEGER(iwp) :: ii !<
	168	INTEGER(iwp) :: iind !<
	169	INTEGER(iwp) :: inew !<
	170	INTEGER(iwp) :: jj !<
	171	INTEGER(iwp) :: jind !<
	172	INTEGER(iwp) :: jnew !<
	173	INTEGER(iwp) :: ki !<
	174	INTEGER(iwp) :: kk !<
	175	INTEGER(iwp) :: knew !<
	176	INTEGER(iwp) :: n !<
	177	INTEGER(iwp) :: nblk !<
	178	INTEGER(iwp) :: nnx_y !<
	179	INTEGER(iwp) :: nny_z !<
	180	INTEGER(iwp) :: nnz_x !<
	181	INTEGER(iwp) :: nxl_y_bound !<
	182	INTEGER(iwp) :: nxr_y_bound !<
[1]	183
[4649]	184	INTEGER(iwp), DIMENSION(4) :: isave !<
[1320]	185
[4649]	186	REAL(wp), DIMENSION(1:nz,nys:nyn,nxl:nxr) :: ar !<
	187	REAL(wp), DIMENSION(nys:nyn,nxl:nxr,1:nz) :: ar_inv !<
[3690]	188
	189	#define __acc_fft_device ( defined( _OPENACC ) && ( defined ( __cuda_fft ) ) )
	190	#if __acc_fft_device
[4649]	191	!$ACC DECLARE CREATE(ar_inv)
[3690]	192	#endif
[1]	193
[4649]	194	REAL(wp), DIMENSION(:,:,:), ALLOCATABLE :: ar1 !<
	195	REAL(wp), DIMENSION(:,:,:), ALLOCATABLE :: f_in !<
	196	REAL(wp), DIMENSION(:,:,:), ALLOCATABLE :: f_inv !<
	197	REAL(wp), DIMENSION(:,:,:), ALLOCATABLE :: f_out_y !<
	198	REAL(wp), DIMENSION(:,:,:), ALLOCATABLE :: f_out_z !<
[1216]	199
	200
[4649]	201	CALL cpu_log( log_point_s(3), 'poisfft', 'start' )
[1]	202
[4649]	203	IF ( .NOT. poisfft_initialized ) CALL poisfft_init
[1111]	204
[3690]	205	#if !__acc_fft_device
[4649]	206	!$ACC UPDATE HOST(ar)
[3690]	207	#endif
	208
[3634]	209	#ifndef _OPENACC
[1]	210	!
[4649]	211	!-- Two-dimensional Fourier Transformation in x- and y-direction.
	212	IF ( pdims(2) == 1 .AND. pdims(1) > 1 ) THEN
[1]	213
	214	!
[4649]	215	!-- 1d-domain-decomposition along x:
	216	!-- FFT along y and transposition y --> x
	217	CALL ffty_tr_yx( ar, ar )
[1]	218
	219	!
[4649]	220	!-- FFT along x, solving the tridiagonal system and backward FFT
	221	CALL fftx_tri_fftx( ar )
[1]	222
	223	!
[4649]	224	!-- Transposition x --> y and backward FFT along y
	225	CALL tr_xy_ffty( ar, ar )
[1]	226
[4649]	227	ELSEIF ( pdims(1) == 1 .AND. pdims(2) > 1 ) THEN
[1]	228
	229	!
[4649]	230	!-- 1d-domain-decomposition along y:
	231	!-- FFT along x and transposition x --> y
	232	CALL fftx_tr_xy( ar, ar )
[1]	233
	234	!
[4649]	235	!-- FFT along y, solving the tridiagonal system and backward FFT
	236	CALL ffty_tri_ffty( ar )
[1]	237
	238	!
[4649]	239	!-- Transposition y --> x and backward FFT along x
	240	CALL tr_yx_fftx( ar, ar )
[1]	241
[4649]	242	ELSEIF ( .NOT. transpose_compute_overlap ) THEN
[3634]	243	#endif
[1]	244
	245	!
[4649]	246	!-- 2d-domain-decomposition or no decomposition (1 PE run)
	247	!-- Transposition z --> x
	248	CALL cpu_log( log_point_s(5), 'transpo forward', 'start' )
	249	CALL resort_for_zx( ar, ar_inv )
	250	CALL transpose_zx( ar_inv, ar )
	251	CALL cpu_log( log_point_s(5), 'transpo forward', 'pause' )
[1]	252
[4649]	253	CALL cpu_log( log_point_s(4), 'fft_x', 'start' )
	254	IF ( temperton_fft_vec ) THEN
[4366]	255	!
[4649]	256	!-- Vector version outputs a transformed array ar_inv that does not require resorting
	257	!-- (which is done for ar further below)
	258	CALL fft_x( ar, 'forward', ar_inv=ar_inv)
	259	ELSE
	260	CALL fft_x( ar, 'forward')
	261	ENDIF
	262	CALL cpu_log( log_point_s(4), 'fft_x', 'pause' )
[1]	263
	264	!
[4649]	265	!-- Transposition x --> y
	266	CALL cpu_log( log_point_s(5), 'transpo forward', 'continue' )
	267	IF( .NOT. temperton_fft_vec ) CALL resort_for_xy( ar, ar_inv )
	268	CALL transpose_xy( ar_inv, ar )
	269	CALL cpu_log( log_point_s(5), 'transpo forward', 'pause' )
[1]	270
[4649]	271	CALL cpu_log( log_point_s(7), 'fft_y', 'start' )
	272	IF ( temperton_fft_vec ) THEN
[4366]	273	!
[4649]	274	!-- Input array ar_inv from fft_x can be directly used here.
	275	!-- The output (also in array ar_inv) does not require resorting below.
	276	CALL fft_y( ar, 'forward', ar_inv = ar_inv, nxl_y_bound = nxl_y, nxr_y_bound = nxr_y, &
	277	nxl_y_l = nxl_y, nxr_y_l = nxr_y )
	278	ELSE
	279	CALL fft_y( ar, 'forward', ar_tr = ar, nxl_y_bound = nxl_y, nxr_y_bound = nxr_y, &
	280	nxl_y_l = nxl_y, nxr_y_l = nxr_y )
	281	ENDIF
	282	CALL cpu_log( log_point_s(7), 'fft_y', 'pause' )
[1]	283
	284	!
[4649]	285	!-- Transposition y --> z
	286	CALL cpu_log( log_point_s(5), 'transpo forward', 'continue' )
	287	IF ( .NOT. temperton_fft_vec ) CALL resort_for_yz( ar, ar_inv )
	288	CALL transpose_yz( ar_inv, ar )
	289	CALL cpu_log( log_point_s(5), 'transpo forward', 'stop' )
[1]	290
	291	!
[4649]	292	!-- Solve the tridiagonal equation system along z
	293	CALL cpu_log( log_point_s(6), 'tridia', 'start' )
	294	CALL tridia_substi( ar )
	295	CALL cpu_log( log_point_s(6), 'tridia', 'stop' )
[1]	296
	297	!
[4649]	298	!-- Inverse Fourier Transformation
	299	!-- Transposition z --> y
	300	CALL cpu_log( log_point_s(8), 'transpo invers', 'start' )
	301	CALL transpose_zy( ar, ar_inv )
[4366]	302	!
[4649]	303	!-- The fft_y below (vector branch) can directly process ar_inv (i.e. does not require a
	304	!-- resorting)
	305	IF ( .NOT. temperton_fft_vec ) CALL resort_for_zy( ar_inv, ar )
	306	CALL cpu_log( log_point_s(8), 'transpo invers', 'pause' )
[1]	307
[4649]	308	CALL cpu_log( log_point_s(7), 'fft_y', 'continue' )
	309	IF ( temperton_fft_vec ) THEN
[4366]	310	!
[4649]	311	!-- Output array ar_inv can be used as input to the below fft_x routine without resorting
	312	CALL fft_y( ar, 'backward', ar_inv = ar_inv, nxl_y_bound = nxl_y, nxr_y_bound = nxr_y, &
	313	nxl_y_l = nxl_y, nxr_y_l = nxr_y )
	314	ELSE
	315	CALL fft_y( ar, 'backward', ar_tr = ar, nxl_y_bound = nxl_y, nxr_y_bound = nxr_y, &
	316	nxl_y_l = nxl_y, nxr_y_l = nxr_y )
	317	ENDIF
[4366]	318
[4649]	319	CALL cpu_log( log_point_s(7), 'fft_y', 'stop' )
[1]	320
	321	!
[4649]	322	!-- Transposition y --> x
	323	CALL cpu_log( log_point_s(8), 'transpo invers', 'continue' )
	324	CALL transpose_yx( ar, ar_inv )
	325	IF ( .NOT. temperton_fft_vec ) CALL resort_for_yx( ar_inv, ar )
	326	CALL cpu_log( log_point_s(8), 'transpo invers', 'pause' )
[1]	327
[4649]	328	CALL cpu_log( log_point_s(4), 'fft_x', 'continue' )
	329	IF ( temperton_fft_vec ) THEN
	330	CALL fft_x( ar, 'backward', ar_inv=ar_inv )
	331	ELSE
	332	CALL fft_x( ar, 'backward' )
	333	ENDIF
	334	CALL cpu_log( log_point_s(4), 'fft_x', 'stop' )
[1]	335
	336	!
[4649]	337	!-- Transposition x --> z
	338	CALL cpu_log( log_point_s(8), 'transpo invers', 'continue' )
	339	CALL transpose_xz( ar, ar_inv )
	340	CALL resort_for_xz( ar_inv, ar )
	341	CALL cpu_log( log_point_s(8), 'transpo invers', 'stop' )
[1]	342
[3634]	343	#ifndef _OPENACC
[4649]	344	ELSE
[1216]	345
	346	!
[4649]	347	!-- 2d-domain-decomposition or no decomposition (1 PE run) with overlapping transposition / fft
	348	!-- cputime logging must not use barriers, which would prevent overlapping
	349	ALLOCATE( f_out_y(0:ny,nxl_y:nxr_y,nzb_y:nzt_y), &
	350	f_out_z(0:nx,nys_x:nyn_x,nzb_x:nzt_x) )
[1216]	351	!
[4649]	352	!-- Transposition z --> x + subsequent fft along x
	353	ALLOCATE( f_inv(nys:nyn,nxl:nxr,1:nz) )
	354	CALL resort_for_zx( ar, f_inv )
[1216]	355	!
[4649]	356	!-- Save original indices and gridpoint counter
	357	isave(1) = nz
	358	isave(2) = nzb_x
	359	isave(3) = nzt_x
	360	isave(4) = sendrecvcount_zx
[1216]	361	!
[4649]	362	!-- Set new indices for transformation
	363	nblk = nz / pdims(1)
	364	nz = pdims(1)
	365	nnz_x = 1
	366	nzb_x = 1 + myidx * nnz_x
	367	nzt_x = ( myidx + 1 ) * nnz_x
	368	sendrecvcount_zx = nnx * nny * nnz_x
[1216]	369
[4649]	370	ALLOCATE( ar1(0:nx,nys_x:nyn_x,nzb_x:nzt_x) )
	371	ALLOCATE( f_in(nys:nyn,nxl:nxr,1:nz) )
[1216]	372
[4649]	373	DO kk = 1, nblk
[1216]	374
[4649]	375	IF ( kk == 1 ) THEN
	376	CALL cpu_log( log_point_s(5), 'transpo forward', 'start', cpu_log_nowait )
	377	ELSE
	378	CALL cpu_log( log_point_s(5), 'transpo forward', 'continue', cpu_log_nowait )
	379	ENDIF
[1216]	380
[4649]	381	DO knew = 1, nz
	382	ki = kk + nblk * ( knew - 1 )
	383	f_in(:,:,knew) = f_inv(:,:,ki)
	384	ENDDO
[1216]	385
[4649]	386	CALL transpose_zx( f_in, ar1(:,:,:))
	387	CALL cpu_log( log_point_s(5), 'transpo forward', 'pause' )
[1216]	388
[4649]	389	IF ( kk == 1 ) THEN
	390	CALL cpu_log( log_point_s(4), 'fft_x', 'start', cpu_log_nowait )
	391	ELSE
	392	CALL cpu_log( log_point_s(4), 'fft_x', 'continue', cpu_log_nowait )
	393	ENDIF
[1216]	394
[4649]	395	n = isave(2) + kk - 1
	396	CALL fft_x( ar1(:,:,:), 'forward', ar_2d = f_out_z(:,:,n) )
	397	CALL cpu_log( log_point_s(4), 'fft_x', 'pause' )
[1216]	398
[4649]	399	ENDDO
[1216]	400	!
[4649]	401	!-- Restore original indices/counters
	402	nz = isave(1)
	403	nzb_x = isave(2)
	404	nzt_x = isave(3)
	405	sendrecvcount_zx = isave(4)
[1216]	406
[4649]	407	DEALLOCATE( ar1, f_in, f_inv )
[1216]	408
	409	!
[4649]	410	!-- Transposition x --> y + subsequent fft along y
	411	ALLOCATE( f_inv(nys_x:nyn_x,nzb_x:nzt_x,0:nx) )
	412	CALL resort_for_xy( f_out_z, f_inv )
[1216]	413	!
[4649]	414	!-- Save original indices and gridpoint counter
	415	isave(1) = nx
	416	isave(2) = nxl_y
	417	isave(3) = nxr_y
	418	isave(4) = sendrecvcount_xy
[1216]	419	!
[4649]	420	!-- Set new indices for transformation
	421	nblk = ( ( nx+1 ) / pdims(2) ) - 1
	422	nx = pdims(2)
	423	nnx_y = 1
	424	nxl_y = myidy * nnx_y
	425	nxr_y = ( myidy + 1 ) * nnx_y - 1
	426	sendrecvcount_xy = nnx_y * ( nyn_x-nys_x+1 ) * ( nzt_x-nzb_x+1 )
[1216]	427
[4649]	428	ALLOCATE( ar1(0:ny,nxl_y:nxr_y,nzb_y:nzt_y) )
	429	ALLOCATE( f_in(nys_x:nyn_x,nzb_x:nzt_x,0:nx) )
[1216]	430
[4649]	431	DO ii = 0, nblk
[1216]	432
[4649]	433	CALL cpu_log( log_point_s(5), 'transpo forward', 'continue', cpu_log_nowait )
[1216]	434
[4649]	435	DO inew = 0, nx-1
	436	iind = ii + ( nblk + 1 ) * inew
	437	f_in(:,:,inew) = f_inv(:,:,iind)
	438	ENDDO
[1216]	439
[4649]	440	CALL transpose_xy( f_in, ar1(:,:,:) )
[1216]	441
[4649]	442	CALL cpu_log( log_point_s(5), 'transpo forward', 'pause' )
[1216]	443
[4649]	444	IF ( ii == 1 ) THEN
	445	CALL cpu_log( log_point_s(7), 'fft_y', 'start', cpu_log_nowait )
	446	ELSE
	447	CALL cpu_log( log_point_s(7), 'fft_y', 'continue', cpu_log_nowait )
	448	ENDIF
[1216]	449
[4649]	450	nxl_y_bound = isave(2)
	451	nxr_y_bound = isave(3)
	452	n = isave(2) + ii
	453	CALL fft_y( ar1(:,:,:), 'forward', ar_tr = f_out_y, nxl_y_bound = nxl_y_bound, &
	454	nxr_y_bound = nxr_y_bound, nxl_y_l = n, nxr_y_l = n )
[1216]	455
[4649]	456	CALL cpu_log( log_point_s(7), 'fft_y', 'pause' )
[1216]	457
[4649]	458	ENDDO
[1216]	459	!
[4649]	460	!-- Restore original indices/counters
	461	nx = isave(1)
	462	nxl_y = isave(2)
	463	nxr_y = isave(3)
	464	sendrecvcount_xy = isave(4)
[1216]	465
[4649]	466	DEALLOCATE( ar1, f_in, f_inv )
[1216]	467
	468	!
[4649]	469	!-- Transposition y --> z + subsequent tridia + resort for z --> y
	470	ALLOCATE( f_inv(nxl_y:nxr_y,nzb_y:nzt_y,0:ny) )
	471	CALL resort_for_yz( f_out_y, f_inv )
[1216]	472	!
[4649]	473	!-- Save original indices and gridpoint counter
	474	isave(1) = ny
	475	isave(2) = nys_z
	476	isave(3) = nyn_z
	477	isave(4) = sendrecvcount_yz
[1216]	478	!
[4649]	479	!-- Set new indices for transformation
	480	nblk = ( ( ny+1 ) / pdims(1) ) - 1
	481	ny = pdims(1)
	482	nny_z = 1
	483	nys_z = myidx * nny_z
	484	nyn_z = ( myidx + 1 ) * nny_z - 1
	485	sendrecvcount_yz = ( nxr_y-nxl_y+1 ) * nny_z * ( nzt_y-nzb_y+1 )
[1216]	486
[4649]	487	ALLOCATE( ar1(nxl_z:nxr_z,nys_z:nyn_z,1:nz) )
	488	ALLOCATE( f_in(nxl_y:nxr_y,nzb_y:nzt_y,0:ny) )
[1216]	489
[4649]	490	DO jj = 0, nblk
[1216]	491	!
[4649]	492	!-- Forward Fourier Transformation
	493	!-- Transposition y --> z
	494	CALL cpu_log( log_point_s(5), 'transpo forward', 'continue', cpu_log_nowait )
[1216]	495
[4649]	496	DO jnew = 0, ny-1
	497	jind = jj + ( nblk + 1 ) * jnew
	498	f_in(:,:,jnew) = f_inv(:,:,jind)
	499	ENDDO
[1216]	500
[4649]	501	CALL transpose_yz( f_in, ar1(:,:,:) )
[1216]	502
[4649]	503	IF ( jj == nblk ) THEN
	504	CALL cpu_log( log_point_s(5), 'transpo forward', 'stop' )
	505	ELSE
	506	CALL cpu_log( log_point_s(5), 'transpo forward', 'pause' )
	507	ENDIF
[1216]	508
	509	!
[4649]	510	!-- Solve the tridiagonal equation system along z
	511	CALL cpu_log( log_point_s(6), 'tridia', 'start', cpu_log_nowait )
[1216]	512
[4649]	513	n = isave(2) + jj
	514	CALL tridia_substi_overlap( ar1(:,:,:), n )
[1216]	515
[4649]	516	CALL cpu_log( log_point_s(6), 'tridia', 'stop' )
[1216]	517
[1306]	518	!
[4649]	519	!-- Inverse Fourier Transformation
	520	!-- Transposition z --> y
	521	!-- Only one thread should call MPI routines, therefore forward and backward tranpose are in
	522	!-- the same section
	523	IF ( jj == 0 ) THEN
	524	CALL cpu_log( log_point_s(8), 'transpo invers', 'start', cpu_log_nowait )
	525	ELSE
	526	CALL cpu_log( log_point_s(8), 'transpo invers', 'continue', cpu_log_nowait )
	527	ENDIF
[1216]	528
[4649]	529	CALL transpose_zy( ar1(:,:,:), f_in )
[1216]	530
[4649]	531	DO jnew = 0, ny-1
	532	jind = jj + ( nblk + 1 ) * jnew
	533	f_inv(:,:,jind) = f_in(:,:,jnew)
	534	ENDDO
[1216]	535
[4649]	536	CALL cpu_log( log_point_s(8), 'transpo invers', 'pause' )
[1216]	537
[4649]	538	ENDDO
[1216]	539	!
[4649]	540	!-- Restore original indices/counters
	541	ny = isave(1)
	542	nys_z = isave(2)
	543	nyn_z = isave(3)
	544	sendrecvcount_yz = isave(4)
[1216]	545
[4649]	546	CALL resort_for_zy( f_inv, f_out_y )
[1216]	547
[4649]	548	DEALLOCATE( ar1, f_in, f_inv )
[1216]	549
	550	!
[4649]	551	!-- fft along y backward + subsequent transposition y --> x
	552	ALLOCATE( f_inv(nys_x:nyn_x,nzb_x:nzt_x,0:nx) )
[1216]	553	!
[4649]	554	!-- Save original indices and gridpoint counter
	555	isave(1) = nx
	556	isave(2) = nxl_y
	557	isave(3) = nxr_y
	558	isave(4) = sendrecvcount_xy
[1216]	559	!
[4649]	560	!-- Set new indices for transformation
	561	nblk = ( ( nx+1 ) / pdims(2) ) - 1
	562	nx = pdims(2)
	563	nnx_y = 1
	564	nxl_y = myidy * nnx_y
	565	nxr_y = ( myidy + 1 ) * nnx_y - 1
	566	sendrecvcount_xy = nnx_y * ( nyn_x-nys_x+1 ) * ( nzt_x-nzb_x+1 )
[1216]	567
[4649]	568	ALLOCATE( ar1(0:ny,nxl_y:nxr_y,nzb_y:nzt_y) )
	569	ALLOCATE( f_in(nys_x:nyn_x,nzb_x:nzt_x,0:nx) )
[1216]	570
[4649]	571	DO ii = 0, nblk
[1216]	572
[4649]	573	CALL cpu_log( log_point_s(7), 'fft_y', 'continue', cpu_log_nowait )
[1216]	574
[4649]	575	n = isave(2) + ii
	576	nxl_y_bound = isave(2)
	577	nxr_y_bound = isave(3)
[1216]	578
[4649]	579	CALL fft_y( ar1(:,:,:), 'backward', ar_tr = f_out_y, nxl_y_bound = nxl_y_bound, &
	580	nxr_y_bound = nxr_y_bound, nxl_y_l = n, nxr_y_l = n )
[1216]	581
[4649]	582	IF ( ii == nblk ) THEN
	583	CALL cpu_log( log_point_s(7), 'fft_y', 'stop' )
	584	ELSE
	585	CALL cpu_log( log_point_s(7), 'fft_y', 'pause' )
	586	ENDIF
[1216]	587
[4649]	588	CALL cpu_log( log_point_s(8), 'transpo invers', 'continue', cpu_log_nowait )
[1216]	589
[4649]	590	CALL transpose_yx( ar1(:,:,:), f_in )
[1216]	591
[4649]	592	DO inew = 0, nx-1
	593	iind = ii + (nblk+1) * inew
	594	f_inv(:,:,iind) = f_in(:,:,inew)
	595	ENDDO
[1216]	596
[4649]	597	CALL cpu_log( log_point_s(8), 'transpo invers', 'pause' )
[1216]	598
[4649]	599	ENDDO
[1216]	600	!
[4649]	601	!-- Restore original indices/counters
	602	nx = isave(1)
	603	nxl_y = isave(2)
	604	nxr_y = isave(3)
	605	sendrecvcount_xy = isave(4)
[1216]	606
[4649]	607	CALL resort_for_yx( f_inv, f_out_z )
[1216]	608
[4649]	609	DEALLOCATE( ar1, f_in, f_inv )
[1216]	610
	611	!
[4649]	612	!-- fft along x backward + subsequent final transposition x --> z
	613	ALLOCATE( f_inv(nys:nyn,nxl:nxr,1:nz) )
[1216]	614	!
[4649]	615	!-- Save original indices and gridpoint counter
	616	isave(1) = nz
	617	isave(2) = nzb_x
	618	isave(3) = nzt_x
	619	isave(4) = sendrecvcount_zx
[1216]	620	!
[4649]	621	!-- Set new indices for transformation
	622	nblk = nz / pdims(1)
	623	nz = pdims(1)
	624	nnz_x = 1
	625	nzb_x = 1 + myidx * nnz_x
	626	nzt_x = ( myidx + 1 ) * nnz_x
	627	sendrecvcount_zx = nnx * nny * nnz_x
[1216]	628
[4649]	629	ALLOCATE( ar1(0:nx,nys_x:nyn_x,nzb_x:nzt_x) )
	630	ALLOCATE( f_in(nys:nyn,nxl:nxr,1:nz) )
[1216]	631
[4649]	632	DO kk = 1, nblk
[1216]	633
[4649]	634	CALL cpu_log( log_point_s(4), 'fft_x', 'continue', cpu_log_nowait )
[1216]	635
[4649]	636	n = isave(2) + kk - 1
	637	CALL fft_x( ar1(:,:,:), 'backward', f_out_z(:,:,n) )
[1216]	638
[4649]	639	IF ( kk == nblk ) THEN
	640	CALL cpu_log( log_point_s(4), 'fft_x', 'stop' )
	641	ELSE
	642	CALL cpu_log( log_point_s(4), 'fft_x', 'pause' )
	643	ENDIF
[1216]	644
[4649]	645	CALL cpu_log( log_point_s(8), 'transpo invers', 'continue', cpu_log_nowait )
[1216]	646
[4649]	647	CALL transpose_xz( ar1(:,:,:), f_in )
[1216]	648
[4649]	649	DO knew = 1, nz
	650	ki = kk + nblk * ( knew - 1 )
	651	f_inv(:,:,ki) = f_in(:,:,knew)
	652	ENDDO
[1216]	653
[4649]	654	IF ( kk == nblk ) THEN
	655	CALL cpu_log( log_point_s(8), 'transpo invers', 'stop' )
	656	ELSE
	657	CALL cpu_log( log_point_s(8), 'transpo invers', 'pause' )
	658	ENDIF
[1216]	659
[4649]	660	ENDDO
[1216]	661	!
[4649]	662	!-- Restore original indices/counters
	663	nz = isave(1)
	664	nzb_x = isave(2)
	665	nzt_x = isave(3)
	666	sendrecvcount_zx = isave(4)
[1216]	667
[4649]	668	CALL resort_for_xz( f_inv, ar )
[1216]	669
[4649]	670	DEALLOCATE( ar1, f_in, f_inv )
[1216]	671
[4649]	672	ENDIF
[3634]	673	#endif
[1]	674
[3690]	675	#if !__acc_fft_device
[4649]	676	!$ACC UPDATE DEVICE(ar)
[3690]	677	#endif
	678
[4649]	679	CALL cpu_log( log_point_s(3), 'poisfft', 'stop' )
[1]	680
[4649]	681	END SUBROUTINE poisfft
[1]	682
	683
[4649]	684	!--------------------------------------------------------------------------------------------------!
[1682]	685	! Description:
	686	! ------------
[4649]	687	!> Fourier-transformation along y with subsequent transposition y --> x for a 1d-decomposition
	688	!> along x.
[1682]	689	!>
[4649]	690	!> @attention The performance of this routine is much faster on the NEC-SX6, if the first index of
	691	!> work_ffty_vec is odd. Otherwise memory bank conflicts may occur (especially if the
	692	!> index is a multiple of 128). That's why work_ffty_vec is dimensioned as 0:ny+1.
	693	!> Of course, this will not work if users are using an odd number of gridpoints along y.
	694	!--------------------------------------------------------------------------------------------------!
	695	SUBROUTINE ffty_tr_yx( f_in, f_out )
[1]	696
[4649]	697	USE control_parameters, &
	698	ONLY: loop_optimization
[1320]	699
[4649]	700	USE cpulog, &
	701	ONLY: cpu_log, &
	702	log_point_s
[1320]	703
[4649]	704	USE kinds
[1320]	705
[4649]	706	USE pegrid
[1]	707
[4649]	708	IMPLICIT NONE
[1]	709
[4649]	710	INTEGER(iwp) :: i !<
	711	INTEGER(iwp) :: iend !<
	712	INTEGER(iwp) :: iouter !<
	713	INTEGER(iwp) :: ir !<
	714	INTEGER(iwp) :: j !<
	715	INTEGER(iwp) :: k !<
[1]	716
[4649]	717	INTEGER(iwp), PARAMETER :: stridex = 4 !<
[1320]	718
[4649]	719	REAL(wp), DIMENSION(1:nz,0:ny,nxl:nxr) :: f_in !<
	720	REAL(wp), DIMENSION(nnx,1:nz,nys_x:nyn_x,pdims(1)) :: f_out !<
	721	REAL(wp), DIMENSION(nxl:nxr,1:nz,0:ny) :: work !<
[1]	722
[4649]	723	REAL(wp), DIMENSION(:,:), ALLOCATABLE :: work_ffty !<
	724	REAL(wp), DIMENSION(:,:,:), ALLOCATABLE :: work_ffty_vec !<
[2300]	725
[1]	726	!
[4649]	727	!-- Carry out the FFT along y, where all data are present due to the 1d-decomposition along x.
	728	!-- Resort the data in a way that x becomes the first index.
	729	CALL cpu_log( log_point_s(7), 'fft_y_1d', 'start' )
[1]	730
[4649]	731	IF ( loop_optimization == 'vector' ) THEN
[2300]	732
[4649]	733	ALLOCATE( work_ffty_vec(0:ny+1,1:nz,nxl:nxr) )
[1]	734	!
[4649]	735	!-- Code optimized for vector processors
	736	!$OMP PARALLEL PRIVATE ( i, j, k )
	737	!$OMP DO
	738	DO i = nxl, nxr
[1]	739
[4649]	740	DO j = 0, ny
	741	DO k = 1, nz
	742	work_ffty_vec(j,k,i) = f_in(k,j,i)
[1]	743	ENDDO
[4649]	744	ENDDO
[1]	745
[4649]	746	CALL fft_y_m( work_ffty_vec(:,:,i), ny+1, 'forward' )
[1]	747
[4649]	748	ENDDO
[1]	749
[4649]	750	!$OMP DO
	751	DO k = 1, nz
	752	DO j = 0, ny
	753	DO i = nxl, nxr
	754	work(i,k,j) = work_ffty_vec(j,k,i)
[1]	755	ENDDO
	756	ENDDO
[4649]	757	ENDDO
	758	!$OMP END PARALLEL
[1]	759
[4649]	760	DEALLOCATE( work_ffty_vec )
[2300]	761
[4649]	762	ELSE
[1]	763	!
[4649]	764	!-- Cache optimized code.
	765	ALLOCATE( work_ffty(0:ny,stridex) )
[2300]	766	!
[4649]	767	!-- The i-(x-)direction is split into a strided outer loop and an inner loop for better cache
	768	!-- performance
	769	!$OMP PARALLEL PRIVATE (i,iend,iouter,ir,j,k,work_ffty)
	770	!$OMP DO
	771	DO iouter = nxl, nxr, stridex
[1]	772
[4649]	773	iend = MIN( iouter+stridex-1, nxr ) ! Upper bound for inner i loop
[1]	774
[4649]	775	DO k = 1, nz
[1]	776
[4649]	777	DO i = iouter, iend
[1]	778
[4649]	779	ir = i-iouter+1 ! Counter within a stride
	780	DO j = 0, ny
	781	work_ffty(j,ir) = f_in(k,j,i)
	782	ENDDO
[1]	783	!
[4649]	784	!-- FFT along y
	785	CALL fft_y_1d( work_ffty(:,ir), 'forward' )
[1]	786
[4649]	787	ENDDO
[1]	788
	789	!
[4649]	790	!-- Resort
	791	DO j = 0, ny
	792	DO i = iouter, iend
	793	work(i,k,j) = work_ffty(j,i-iouter+1)
[1]	794	ENDDO
	795	ENDDO
	796
	797	ENDDO
	798
[4649]	799	ENDDO
	800	!$OMP END PARALLEL
[2300]	801
[4649]	802	DEALLOCATE( work_ffty )
[2300]	803
[4649]	804	ENDIF
[1]	805
[4649]	806	CALL cpu_log( log_point_s(7), 'fft_y_1d', 'pause' )
	807
[1]	808	!
[4649]	809	!-- Transpose array
[1111]	810	#if defined( __parallel )
[4649]	811	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
	812	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
	813	CALL MPI_ALLTOALL( work(nxl,1,0), sendrecvcount_xy, MPI_REAL, f_out(1,1,nys_x,1), &
	814	sendrecvcount_xy, MPI_REAL, comm1dx, ierr )
	815	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
[4429]	816	#else
	817	!
[4649]	818	!-- Next line required to avoid compile error about unused dummy argument in serial mode
	819	i = SIZE( f_out )
[1111]	820	#endif
[1]	821
[4649]	822	END SUBROUTINE ffty_tr_yx
[1]	823
	824
[4649]	825	!--------------------------------------------------------------------------------------------------!
[1682]	826	! Description:
	827	! ------------
[4649]	828	!> Transposition x --> y with a subsequent backward Fourier transformation for a 1d-decomposition
	829	!> along x
	830	!--------------------------------------------------------------------------------------------------!
	831	SUBROUTINE tr_xy_ffty( f_in, f_out )
[1]	832
[4649]	833	USE control_parameters, &
	834	ONLY: loop_optimization
[1320]	835
[4649]	836	USE cpulog, &
	837	ONLY: cpu_log, &
	838	log_point_s
[1320]	839
[4649]	840	USE kinds
[1320]	841
[4649]	842	USE pegrid
[1]	843
[4649]	844	IMPLICIT NONE
[1]	845
[4649]	846	INTEGER(iwp) :: i !<
	847	INTEGER(iwp) :: iend !<
	848	INTEGER(iwp) :: iouter !<
	849	INTEGER(iwp) :: ir !<
	850	INTEGER(iwp) :: j !<
	851	INTEGER(iwp) :: k !<
[1]	852
[4649]	853	INTEGER(iwp), PARAMETER :: stridex = 4 !<
[1320]	854
[4649]	855	REAL(wp), DIMENSION(nnx,1:nz,nys_x:nyn_x,pdims(1)) :: f_in !<
	856	REAL(wp), DIMENSION(1:nz,0:ny,nxl:nxr) :: f_out !<
	857	REAL(wp), DIMENSION(nxl:nxr,1:nz,0:ny) :: work !<
[1]	858
[4649]	859	REAL(wp), DIMENSION(:,:), ALLOCATABLE :: work_ffty !<
	860	REAL(wp), DIMENSION(:,:,:), ALLOCATABLE :: work_ffty_vec !<
[2300]	861
[1]	862	!
[4649]	863	!-- Transpose array
[1111]	864	#if defined( __parallel )
[4649]	865	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
	866	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
	867	CALL MPI_ALLTOALL( f_in(1,1,nys_x,1), sendrecvcount_xy, MPI_REAL, work(nxl,1,0), &
	868	sendrecvcount_xy, MPI_REAL, comm1dx, ierr )
	869	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
[4429]	870	#else
	871	!
[4649]	872	!-- Next line required to avoid compile error about unused dummy argument in serial mode
	873	i = SIZE( f_in )
[1111]	874	#endif
[1]	875
	876	!
[4649]	877	!-- Resort the data in a way that y becomes the first index and carry out the backward fft along y.
	878	CALL cpu_log( log_point_s(7), 'fft_y_1d', 'continue' )
[1]	879
[4649]	880	IF ( loop_optimization == 'vector' ) THEN
[2300]	881
[4649]	882	ALLOCATE( work_ffty_vec(0:ny+1,1:nz,nxl:nxr) )
[1]	883	!
[4649]	884	!-- Code optimized for vector processors
	885	!$OMP PARALLEL PRIVATE ( i, j, k )
	886	!$OMP DO
	887	DO k = 1, nz
	888	DO j = 0, ny
	889	DO i = nxl, nxr
	890	work_ffty_vec(j,k,i) = work(i,k,j)
[1]	891	ENDDO
	892	ENDDO
[4649]	893	ENDDO
[1]	894
[4649]	895	!$OMP DO
	896	DO i = nxl, nxr
[1]	897
[4649]	898	CALL fft_y_m( work_ffty_vec(:,:,i), ny+1, 'backward' )
[1]	899
[4649]	900	DO j = 0, ny
	901	DO k = 1, nz
	902	f_out(k,j,i) = work_ffty_vec(j,k,i)
[1]	903	ENDDO
	904	ENDDO
	905
[4649]	906	ENDDO
	907	!$OMP END PARALLEL
[2300]	908
[4649]	909	DEALLOCATE( work_ffty_vec )
	910
	911	ELSE
[1]	912	!
[4649]	913	!-- Cache optimized code.
	914	ALLOCATE( work_ffty(0:ny,stridex) )
[2300]	915	!
[4649]	916	!-- The i-(x-)direction is split into a strided outer loop and an inner loop for better cache
	917	!-- performance
	918	!$OMP PARALLEL PRIVATE ( i, iend, iouter, ir, j, k, work_ffty )
	919	!$OMP DO
	920	DO iouter = nxl, nxr, stridex
[1]	921
[4649]	922	iend = MIN( iouter+stridex-1, nxr ) ! Upper bound for inner i loop
[1]	923
[4649]	924	DO k = 1, nz
[1]	925	!
[4649]	926	!-- Resort
	927	DO j = 0, ny
	928	DO i = iouter, iend
	929	work_ffty(j,i-iouter+1) = work(i,k,j)
[1]	930	ENDDO
[4649]	931	ENDDO
[1]	932
[4649]	933	DO i = iouter, iend
[1]	934
	935	!
[4649]	936	!-- FFT along y
	937	ir = i-iouter+1 ! Counter within a stride
	938	CALL fft_y_1d( work_ffty(:,ir), 'backward' )
[1]	939
[4649]	940	DO j = 0, ny
	941	f_out(k,j,i) = work_ffty(j,ir)
[1]	942	ENDDO
	943	ENDDO
	944
	945	ENDDO
	946
[4649]	947	ENDDO
	948	!$OMP END PARALLEL
[2300]	949
[4649]	950	DEALLOCATE( work_ffty )
[1]	951
[4649]	952	ENDIF
[1]	953
[4649]	954	CALL cpu_log( log_point_s(7), 'fft_y_1d', 'stop' )
[1]	955
[4649]	956	END SUBROUTINE tr_xy_ffty
[1]	957
[4649]	958
	959	!--------------------------------------------------------------------------------------------------!
[1682]	960	! Description:
	961	! ------------
[4649]	962	!> FFT along x, solution of the tridiagonal system and backward FFT for a 1d-decomposition along x
[1682]	963	!>
[4649]	964	!> @warning This subroutine may still not work for hybrid parallelization with OpenMP (for possible
	965	!> necessary changes see the original routine poisfft_hybrid, developed by Klaus Ketelsen,
	966	!> May 2002)
	967	!--------------------------------------------------------------------------------------------------!
	968	SUBROUTINE fftx_tri_fftx( ar )
[1]	969
[4649]	970	USE control_parameters, &
	971	ONLY: loop_optimization
[1320]	972
[4649]	973	USE cpulog, &
	974	ONLY: cpu_log, &
	975	log_point_s
[1320]	976
[4649]	977	USE grid_variables, &
	978	ONLY: ddx2, &
	979	ddy2
[1320]	980
[4649]	981	USE kinds
[1320]	982
[4649]	983	USE pegrid
[1]	984
[4649]	985	IMPLICIT NONE
[1]	986
[4649]	987	INTEGER(iwp) :: i !<
	988	INTEGER(iwp) :: j !<
	989	INTEGER(iwp) :: k !<
	990	INTEGER(iwp) :: m !<
	991	INTEGER(iwp) :: n !<
[4671]	992	!$ INTEGER(iwp) :: omp_get_thread_num !<
[4649]	993	INTEGER(iwp) :: tn !<
[1]	994
[4649]	995	REAL(wp), DIMENSION(0:nx) :: work_fftx !<
	996	REAL(wp), DIMENSION(0:nx,1:nz) :: work_trix !<
	997	REAL(wp), DIMENSION(nnx,1:nz,nys_x:nyn_x,pdims(1)) :: ar !<
	998	REAL(wp), DIMENSION(:,:,:,:), ALLOCATABLE :: tri !<
[1]	999
	1000
[4649]	1001	CALL cpu_log( log_point_s(33), 'fft_x_1d + tridia', 'start' )
[1]	1002
[4649]	1003	ALLOCATE( tri(5,0:nx,0:nz-1,0:threads_per_task-1) )
[1]	1004
[4649]	1005	tn = 0 ! Default thread number in case of one thread
[4717]	1006	!$OMP PARALLEL DO PRIVATE ( i, j, k, m, n, tn, work_fftx, work_trix )
[4649]	1007	DO j = nys_x, nyn_x
[1]	1008
[4649]	1009	!$ tn = omp_get_thread_num()
[1]	1010
[4649]	1011	IF ( loop_optimization == 'vector' ) THEN
[1]	1012	!
[4649]	1013	!-- Code optimized for vector processors
	1014	DO k = 1, nz
[1]	1015
[4649]	1016	m = 0
	1017	DO n = 1, pdims(1)
	1018	DO i = 1, nnx
	1019	work_trix(m,k) = ar(i,k,j,n)
	1020	m = m + 1
[1]	1021	ENDDO
	1022	ENDDO
	1023
[4649]	1024	ENDDO
[1]	1025
[4649]	1026	CALL fft_x_m( work_trix, 'forward' )
	1027
	1028	ELSE
[1]	1029	!
[4649]	1030	!-- Cache optimized code
	1031	DO k = 1, nz
[1]	1032
[4649]	1033	m = 0
	1034	DO n = 1, pdims(1)
	1035	DO i = 1, nnx
	1036	work_fftx(m) = ar(i,k,j,n)
	1037	m = m + 1
[1]	1038	ENDDO
[4649]	1039	ENDDO
[1]	1040
[4649]	1041	CALL fft_x_1d( work_fftx, 'forward' )
[1]	1042
[4649]	1043	DO i = 0, nx
	1044	work_trix(i,k) = work_fftx(i)
[1]	1045	ENDDO
	1046
[4649]	1047	ENDDO
[1]	1048
[4649]	1049	ENDIF
	1050
[1]	1051	!
[4649]	1052	!-- Solve the linear equation system
	1053	CALL tridia_1dd( ddx2, ddy2, nx, ny, j, work_trix, tri(:,:,:,tn) )
[1]	1054
[4649]	1055	IF ( loop_optimization == 'vector' ) THEN
[1]	1056	!
[4649]	1057	!-- Code optimized for vector processors
	1058	CALL fft_x_m( work_trix, 'backward' )
[1]	1059
[4649]	1060	DO k = 1, nz
[1]	1061
[4649]	1062	m = 0
	1063	DO n = 1, pdims(1)
	1064	DO i = 1, nnx
	1065	ar(i,k,j,n) = work_trix(m,k)
	1066	m = m + 1
[1]	1067	ENDDO
	1068	ENDDO
	1069
[4649]	1070	ENDDO
	1071
	1072	ELSE
[1]	1073	!
[4649]	1074	!-- Cache optimized code
	1075	DO k = 1, nz
[1]	1076
[4649]	1077	DO i = 0, nx
	1078	work_fftx(i) = work_trix(i,k)
	1079	ENDDO
[1]	1080
[4649]	1081	CALL fft_x_1d( work_fftx, 'backward' )
[1]	1082
[4649]	1083	m = 0
	1084	DO n = 1, pdims(1)
	1085	DO i = 1, nnx
	1086	ar(i,k,j,n) = work_fftx(m)
	1087	m = m + 1
[1]	1088	ENDDO
	1089	ENDDO
	1090
[4649]	1091	ENDDO
[1]	1092
[4649]	1093	ENDIF
[1]	1094
[4649]	1095	ENDDO
[1]	1096
[4649]	1097	DEALLOCATE( tri )
[1]	1098
[4649]	1099	CALL cpu_log( log_point_s(33), 'fft_x_1d + tridia', 'stop' )
[1]	1100
[4649]	1101	END SUBROUTINE fftx_tri_fftx
[1]	1102
[4649]	1103
	1104	!--------------------------------------------------------------------------------------------------!
[1682]	1105	! Description:
	1106	! ------------
[4649]	1107	!> Fourier-transformation along x with subsequent transposition x --> y for a 1d-decomposition
	1108	!> along y.
[1682]	1109	!>
[4649]	1110	!> @attention NEC-branch of this routine may significantly profit from further optimizations. So
	1111	!> far, performance is much worse than for routine ffty_tr_yx (more than three times
	1112	!> slower).
	1113	!--------------------------------------------------------------------------------------------------!
	1114	SUBROUTINE fftx_tr_xy( f_in, f_out )
[1]	1115
[1682]	1116
[4649]	1117	USE control_parameters, &
	1118	ONLY: loop_optimization
[1320]	1119
[4649]	1120	USE cpulog, &
	1121	ONLY: cpu_log, &
	1122	log_point_s
[1320]	1123
[4649]	1124	USE kinds
[1320]	1125
[4649]	1126	USE pegrid
[1]	1127
[4649]	1128	IMPLICIT NONE
[1]	1129
[4649]	1130	INTEGER(iwp) :: i !<
	1131	INTEGER(iwp) :: j !<
	1132	INTEGER(iwp) :: k !<
[1]	1133
[4649]	1134	REAL(wp), DIMENSION(0:nx,1:nz,nys:nyn) :: work_fftx !<
	1135	REAL(wp), DIMENSION(1:nz,nys:nyn,0:nx) :: f_in !<
	1136	REAL(wp), DIMENSION(nny,1:nz,nxl_y:nxr_y,pdims(2)) :: f_out !<
	1137	REAL(wp), DIMENSION(nys:nyn,1:nz,0:nx) :: work !<
[1]	1138
	1139	!
[4649]	1140	!-- Carry out the FFT along x, where all data are present due to the 1d-decomposition along y.
	1141	!-- Resort the data in a way that y becomes the first index.
	1142	CALL cpu_log( log_point_s(4), 'fft_x_1d', 'start' )
[1]	1143
[4649]	1144	IF ( loop_optimization == 'vector' ) THEN
[1]	1145	!
[4649]	1146	!-- Code for vector processors
[4717]	1147	!$OMP PARALLEL PRIVATE ( i, j, k )
	1148	!$OMP DO
[4649]	1149	DO i = 0, nx
[1]	1150
[4649]	1151	DO j = nys, nyn
	1152	DO k = 1, nz
	1153	work_fftx(i,k,j) = f_in(k,j,i)
[1]	1154	ENDDO
	1155	ENDDO
	1156
[4649]	1157	ENDDO
[1]	1158
[4717]	1159	!$OMP DO
[4649]	1160	DO j = nys, nyn
[1]	1161
[4649]	1162	CALL fft_x_m( work_fftx(:,:,j), 'forward' )
	1163
	1164	DO k = 1, nz
	1165	DO i = 0, nx
	1166	work(j,k,i) = work_fftx(i,k,j)
[1]	1167	ENDDO
	1168	ENDDO
	1169
[4649]	1170	ENDDO
[4717]	1171	!$OMP END PARALLEL
[1]	1172
[4649]	1173	ELSE
	1174
[1]	1175	!
[4649]	1176	!-- Cache optimized code (there might still be a potential for better optimization).
[4717]	1177	!$OMP PARALLEL PRIVATE (i,j,k)
	1178	!$OMP DO
[4649]	1179	DO i = 0, nx
[1]	1180
[4649]	1181	DO j = nys, nyn
	1182	DO k = 1, nz
	1183	work_fftx(i,k,j) = f_in(k,j,i)
[1]	1184	ENDDO
	1185	ENDDO
	1186
[4649]	1187	ENDDO
[1]	1188
[4717]	1189	!$OMP DO
[4649]	1190	DO j = nys, nyn
	1191	DO k = 1, nz
[1]	1192
[4649]	1193	CALL fft_x_1d( work_fftx(0:nx,k,j), 'forward' )
	1194
	1195	DO i = 0, nx
	1196	work(j,k,i) = work_fftx(i,k,j)
[1]	1197	ENDDO
	1198	ENDDO
	1199
[4649]	1200	ENDDO
[4717]	1201	!$OMP END PARALLEL
[1]	1202
[4649]	1203	ENDIF
	1204	CALL cpu_log( log_point_s(4), 'fft_x_1d', 'pause' )
	1205
[1]	1206	!
[4649]	1207	!-- Transpose array
[1111]	1208	#if defined( __parallel )
[4649]	1209	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
	1210	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
	1211	CALL MPI_ALLTOALL( work(nys,1,0), sendrecvcount_xy, MPI_REAL, f_out(1,1,nxl_y,1), &
	1212	sendrecvcount_xy, MPI_REAL, comm1dy, ierr )
	1213	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
[4429]	1214	#else
	1215	!
[4649]	1216	!-- Next line required to avoid compile error about unused dummy argument in serial mode
	1217	i = SIZE( f_out )
[1111]	1218	#endif
[1]	1219
[4649]	1220	END SUBROUTINE fftx_tr_xy
[1]	1221
	1222
[4649]	1223	!--------------------------------------------------------------------------------------------------!
[1682]	1224	! Description:
	1225	! ------------
[4649]	1226	!> Transposition y --> x with a subsequent backward Fourier transformation for a 1d-decomposition
	1227	!> along x.
	1228	!--------------------------------------------------------------------------------------------------!
	1229	SUBROUTINE tr_yx_fftx( f_in, f_out )
[1]	1230
[1682]	1231
[4649]	1232	USE control_parameters, &
	1233	ONLY: loop_optimization
[1320]	1234
[4649]	1235	USE cpulog, &
	1236	ONLY: cpu_log, &
	1237	log_point_s
[1320]	1238
[4649]	1239	USE kinds
[1320]	1240
[4649]	1241	USE pegrid
[1]	1242
[4649]	1243	IMPLICIT NONE
[1]	1244
[4649]	1245	INTEGER(iwp) :: i !<
	1246	INTEGER(iwp) :: j !<
	1247	INTEGER(iwp) :: k !<
[1]	1248
[4649]	1249	REAL(wp), DIMENSION(nny,1:nz,nxl_y:nxr_y,pdims(2)) :: f_in !<
	1250	REAL(wp), DIMENSION(1:nz,nys:nyn,0:nx) :: f_out !<
	1251	REAL(wp), DIMENSION(nys:nyn,1:nz,0:nx) :: work !<
	1252	REAL(wp), DIMENSION(0:nx,1:nz,nys:nyn) :: work_fftx !<
[1]	1253
[4429]	1254
[1]	1255	!
[4649]	1256	!-- Transpose array
[1111]	1257	#if defined( __parallel )
[4649]	1258	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
	1259	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
	1260	CALL MPI_ALLTOALL( f_in(1,1,nxl_y,1), sendrecvcount_xy, MPI_REAL, work(nys,1,0), &
	1261	sendrecvcount_xy, MPI_REAL, comm1dy, ierr )
	1262	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
[4429]	1263	#else
	1264	!
[4649]	1265	!-- Next line required to avoid compile error about unused dummy argument in serial mode
	1266	i = SIZE( f_in )
[1111]	1267	#endif
[1]	1268
	1269	!
[4649]	1270	!-- Carry out the FFT along x, where all data are present due to the 1d-decomposition along y.
	1271	!-- Resort the data in a way that y becomes the first index.
	1272	CALL cpu_log( log_point_s(4), 'fft_x_1d', 'continue' )
[1]	1273
[4649]	1274	IF ( loop_optimization == 'vector' ) THEN
[1]	1275	!
[4649]	1276	!-- Code optimized for vector processors
	1277	!$OMP PARALLEL PRIVATE ( i, j, k )
	1278	!$OMP DO
	1279	DO j = nys, nyn
[1]	1280
[4649]	1281	DO k = 1, nz
	1282	DO i = 0, nx
	1283	work_fftx(i,k,j) = work(j,k,i)
[1]	1284	ENDDO
[4649]	1285	ENDDO
[1]	1286
[4649]	1287	CALL fft_x_m( work_fftx(:,:,j), 'backward' )
[1]	1288
[4649]	1289	ENDDO
[1]	1290
[4649]	1291	!$OMP DO
	1292	DO i = 0, nx
	1293	DO j = nys, nyn
	1294	DO k = 1, nz
	1295	f_out(k,j,i) = work_fftx(i,k,j)
[1]	1296	ENDDO
	1297	ENDDO
[4649]	1298	ENDDO
	1299	!$OMP END PARALLEL
[1]	1300
[4649]	1301	ELSE
[1]	1302
	1303	!
[4649]	1304	!-- Cache optimized code (there might be still a potential for better optimization).
	1305	!$OMP PARALLEL PRIVATE (i,j,k)
	1306	!$OMP DO
	1307	DO j = nys, nyn
	1308	DO k = 1, nz
[1]	1309
[4649]	1310	DO i = 0, nx
	1311	work_fftx(i,k,j) = work(j,k,i)
	1312	ENDDO
[1]	1313
[4649]	1314	CALL fft_x_1d( work_fftx(0:nx,k,j), 'backward' )
[1]	1315
	1316	ENDDO
[4649]	1317	ENDDO
[1]	1318
[4649]	1319	!$OMP DO
	1320	DO i = 0, nx
	1321	DO j = nys, nyn
	1322	DO k = 1, nz
	1323	f_out(k,j,i) = work_fftx(i,k,j)
[1]	1324	ENDDO
	1325	ENDDO
[4649]	1326	ENDDO
	1327	!$OMP END PARALLEL
[1]	1328
[4649]	1329	ENDIF
	1330	CALL cpu_log( log_point_s(4), 'fft_x_1d', 'stop' )
[1]	1331
[4649]	1332	END SUBROUTINE tr_yx_fftx
[1]	1333
	1334
[4649]	1335	!--------------------------------------------------------------------------------------------------!
[1682]	1336	! Description:
	1337	! ------------
[4649]	1338	!> FFT along y, solution of the tridiagonal system and backward FFT for a 1d-decomposition along y.
[1682]	1339	!>
[4649]	1340	!> @warning This subroutine may still not work for hybrid parallelization with OpenMP (for possible
	1341	!> necessary changes see the original routine poisfft_hybrid, developed by Klaus Ketelsen,
	1342	!> May 2002)
	1343	!--------------------------------------------------------------------------------------------------!
	1344	SUBROUTINE ffty_tri_ffty( ar )
[1]	1345
[1682]	1346
[4649]	1347	USE control_parameters, &
	1348	ONLY: loop_optimization
[1320]	1349
[4649]	1350	USE cpulog, &
	1351	ONLY: cpu_log, &
	1352	log_point_s
[1320]	1353
[4649]	1354	USE grid_variables, &
	1355	ONLY: ddx2, &
	1356	ddy2
[1320]	1357
[4649]	1358	USE kinds
[1320]	1359
[4649]	1360	USE pegrid
[1]	1361
[4649]	1362	IMPLICIT NONE
[1]	1363
[4649]	1364	INTEGER(iwp) :: i !<
	1365	INTEGER(iwp) :: j !<
	1366	INTEGER(iwp) :: k !<
	1367	INTEGER(iwp) :: m !<
	1368	INTEGER(iwp) :: n !<
	1369	!$ INTEGER(iwp) :: omp_get_thread_num !<
	1370	INTEGER(iwp) :: tn !<
[1]	1371
[4649]	1372	REAL(wp), DIMENSION(0:ny) :: work_ffty !<
	1373	REAL(wp), DIMENSION(0:ny,1:nz) :: work_triy !<
	1374	REAL(wp), DIMENSION(nny,1:nz,nxl_y:nxr_y,pdims(2)) :: ar !<
	1375	REAL(wp), DIMENSION(:,:,:,:), ALLOCATABLE :: tri !<
[1]	1376
	1377
[4649]	1378	CALL cpu_log( log_point_s(39), 'fft_y_1d + tridia', 'start' )
[1]	1379
[4649]	1380	ALLOCATE( tri(5,0:ny,0:nz-1,0:threads_per_task-1) )
[1]	1381
[4649]	1382	tn = 0 ! Default thread number in case of one thread
	1383	!$OMP PARALLEL DO PRIVATE ( i, j, k, m, n, tn, work_ffty, work_triy )
	1384	DO i = nxl_y, nxr_y
[1]	1385
[4649]	1386	!$ tn = omp_get_thread_num()
[1]	1387
[4649]	1388	IF ( loop_optimization == 'vector' ) THEN
[1]	1389	!
[4649]	1390	!-- Code optimized for vector processors
	1391	DO k = 1, nz
[1]	1392
[4649]	1393	m = 0
	1394	DO n = 1, pdims(2)
	1395	DO j = 1, nny
	1396	work_triy(m,k) = ar(j,k,i,n)
	1397	m = m + 1
[1]	1398	ENDDO
	1399	ENDDO
	1400
[4649]	1401	ENDDO
[1]	1402
[4649]	1403	CALL fft_y_m( work_triy, ny, 'forward' )
	1404
	1405	ELSE
[1]	1406	!
[4649]	1407	!-- Cache optimized code
	1408	DO k = 1, nz
[1]	1409
[4649]	1410	m = 0
	1411	DO n = 1, pdims(2)
	1412	DO j = 1, nny
	1413	work_ffty(m) = ar(j,k,i,n)
	1414	m = m + 1
[1]	1415	ENDDO
[4649]	1416	ENDDO
[1]	1417
[4649]	1418	CALL fft_y_1d( work_ffty, 'forward' )
[1]	1419
[4649]	1420	DO j = 0, ny
	1421	work_triy(j,k) = work_ffty(j)
[1]	1422	ENDDO
	1423
[4649]	1424	ENDDO
[1]	1425
[4649]	1426	ENDIF
	1427
[1]	1428	!
[4649]	1429	!-- Solve the linear equation system
	1430	CALL tridia_1dd( ddy2, ddx2, ny, nx, i, work_triy, tri(:,:,:,tn) )
[1]	1431
[4649]	1432	IF ( loop_optimization == 'vector' ) THEN
[1]	1433	!
[4649]	1434	!-- Code optimized for vector processors
	1435	CALL fft_y_m( work_triy, ny, 'backward' )
[1]	1436
[4649]	1437	DO k = 1, nz
[1]	1438
[4649]	1439	m = 0
	1440	DO n = 1, pdims(2)
	1441	DO j = 1, nny
	1442	ar(j,k,i,n) = work_triy(m,k)
	1443	m = m + 1
[1]	1444	ENDDO
	1445	ENDDO
	1446
[4649]	1447	ENDDO
	1448
	1449	ELSE
[1]	1450	!
[4649]	1451	!-- Cache optimized code
	1452	DO k = 1, nz
[1]	1453
[4649]	1454	DO j = 0, ny
	1455	work_ffty(j) = work_triy(j,k)
	1456	ENDDO
[1]	1457
[4649]	1458	CALL fft_y_1d( work_ffty, 'backward' )
[1]	1459
[4649]	1460	m = 0
	1461	DO n = 1, pdims(2)
	1462	DO j = 1, nny
	1463	ar(j,k,i,n) = work_ffty(m)
	1464	m = m + 1
[1]	1465	ENDDO
	1466	ENDDO
	1467
[4649]	1468	ENDDO
[1]	1469
[4649]	1470	ENDIF
[1]	1471
[4649]	1472	ENDDO
[1]	1473
[4649]	1474	DEALLOCATE( tri )
[1]	1475
[4649]	1476	CALL cpu_log( log_point_s(39), 'fft_y_1d + tridia', 'stop' )
[1]	1477
[4649]	1478	END SUBROUTINE ffty_tri_ffty
	1479
[4671]	1480	END MODULE poisfft_mod

Note: See TracBrowser for help on using the repository browser.

Download in other formats:

| Impressum | ©Leibniz Universität Hannover |