Home

Context Navigation

source: palm/trunk/SOURCE/poisfft_mod.f90 @ 4677

Last change on this file since 4677 was 4671, checked in by pavelkrc, 4 years ago
Radiative transfer model RTM version 4.1
Property svn:keywords set to `Id`
File size: 45.5 KB

Rev	Line
[1850]	1	!> @file poisfft_mod.f90
[4649]	2	!--------------------------------------------------------------------------------------------------!
[2696]	3	! This file is part of the PALM model system.
[1036]	4	!
[4649]	5	! PALM is free software: you can redistribute it and/or modify it under the terms of the GNU General
	6	! Public License as published by the Free Software Foundation, either version 3 of the License, or
	7	! (at your option) any later version.
[1036]	8	!
[4649]	9	! PALM is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the
	10	! implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General
	11	! Public License for more details.
[1036]	12	!
[4649]	13	! You should have received a copy of the GNU General Public License along with PALM. If not, see
	14	! <http://www.gnu.org/licenses/>.
[1036]	15	!
[4360]	16	! Copyright 1997-2020 Leibniz Universitaet Hannover
[4649]	17	!--------------------------------------------------------------------------------------------------!
[1036]	18	!
[4649]	19	!
[484]	20	! Current revisions:
[1]	21	! -----------------
[4649]	22	!
	23	!
[1321]	24	! Former revisions:
	25	! -----------------
	26	! $Id: poisfft_mod.f90 4671 2020-09-09 20:27:58Z raasch $
[4671]	27	! OMP bugfix
	28	!
	29	! 4649 2020-08-25 12:11:17Z raasch
[4649]	30	! File re-formatted to follow the PALM coding standard
	31	!
	32	! 4429 2020-02-27 15:24:30Z raasch
[4429]	33	! Statements added to avoid compile errors due to unused dummy arguments in serial mode
[4649]	34	!
[4429]	35	! 4366 2020-01-09 08:12:43Z raasch
[4649]	36	! Modification concerning NEC vectorizatio
	37	!
[4366]	38	! 4360 2020-01-07 11:25:50Z suehring
[4182]	39	! Corrected "Former revisions" section
[4649]	40	!
[4182]	41	! 3690 2019-01-22 22:56:42Z knoop
[3634]	42	! OpenACC port for SPEC
[2716]	43	!
[4182]	44	! Revision 1.1 1997/07/24 11:24:14 raasch
	45	! Initial revision
	46	!
	47	!
[4649]	48	!--------------------------------------------------------------------------------------------------!
[1]	49	! Description:
	50	! ------------
[1682]	51	!> Solves the Poisson equation with a 2D spectral method
	52	!> d^2 p / dx^2 + d^2 p / dy^2 + d^2 p / dz^2 = s
	53	!>
	54	!> Input:
[4649]	55	!> real ar contains (nnz,nny,nnx) elements of the velocity divergence, starting from (1,nys,nxl)
[1682]	56	!>
	57	!> Output:
[4649]	58	!> real ar contains the solution for perturbation pressure p
	59	!--------------------------------------------------------------------------------------------------!
[1682]	60	MODULE poisfft_mod
[1]	61
[4649]	62
	63	USE fft_xy, &
	64	ONLY: fft_init, &
	65	fft_y, &
	66	fft_y_1d, &
	67	fft_y_m, &
	68	fft_x, &
	69	fft_x_1d, &
	70	fft_x_m, &
[4366]	71	temperton_fft_vec
[1]	72
[4649]	73	USE indices, &
	74	ONLY: nnx, &
	75	nny, &
	76	nx, &
	77	nxl, &
	78	nxr, &
	79	ny, &
	80	nys, &
	81	nyn, &
	82	nz
[1320]	83
[4649]	84	USE transpose_indices, &
	85	ONLY: nxl_y, &
	86	nxl_z, &
	87	nxr_y, &
	88	nxr_z, &
	89	nys_x, &
	90	nys_z, &
	91	nyn_x, &
	92	nyn_z, &
	93	nzb_x, &
	94	nzb_y, &
	95	nzt_x, &
	96	nzt_y
[1320]	97
[4649]	98	USE tridia_solver, &
	99	ONLY: tridia_1dd, &
	100	tridia_init, &
	101	tridia_substi, &
	102	tridia_substi_overlap
[1320]	103
[1]	104	IMPLICIT NONE
	105
[4649]	106	LOGICAL, SAVE :: poisfft_initialized = .FALSE. !<
[1111]	107
[1]	108	PRIVATE
[807]	109
[1]	110	PUBLIC poisfft, poisfft_init
	111
	112	INTERFACE poisfft
	113	MODULE PROCEDURE poisfft
	114	END INTERFACE poisfft
	115
	116	INTERFACE poisfft_init
	117	MODULE PROCEDURE poisfft_init
	118	END INTERFACE poisfft_init
	119
[807]	120
[1]	121	CONTAINS
	122
[4649]	123	!--------------------------------------------------------------------------------------------------!
[1682]	124	! Description:
	125	! ------------
[3241]	126	!> Setup coefficients for FFT and the tridiagonal solver
[4649]	127	!--------------------------------------------------------------------------------------------------!
	128	SUBROUTINE poisfft_init
[1]	129
[4649]	130	IMPLICIT NONE
[1111]	131
	132
[4649]	133	CALL fft_init
[1]	134
[4649]	135	CALL tridia_init
[1111]	136
[4649]	137	poisfft_initialized = .TRUE.
[1111]	138
[4649]	139	END SUBROUTINE poisfft_init
[1]	140
[1111]	141
[1804]	142
[4649]	143	!--------------------------------------------------------------------------------------------------!
[1682]	144	! Description:
	145	! ------------
	146	!> Two-dimensional Fourier Transformation in x- and y-direction.
[4649]	147	!--------------------------------------------------------------------------------------------------!
	148	SUBROUTINE poisfft( ar )
[1]	149
[4649]	150	USE control_parameters, &
	151	ONLY: transpose_compute_overlap
[1320]	152
[4649]	153	USE cpulog, &
	154	ONLY: cpu_log, &
	155	cpu_log_nowait, &
	156	log_point_s
[1320]	157
[4649]	158	USE kinds
[1320]	159
[4649]	160	USE pegrid
[1]	161
[4649]	162	IMPLICIT NONE
[1]	163
[4649]	164	INTEGER(iwp) :: ii !<
	165	INTEGER(iwp) :: iind !<
	166	INTEGER(iwp) :: inew !<
	167	INTEGER(iwp) :: jj !<
	168	INTEGER(iwp) :: jind !<
	169	INTEGER(iwp) :: jnew !<
	170	INTEGER(iwp) :: ki !<
	171	INTEGER(iwp) :: kk !<
	172	INTEGER(iwp) :: knew !<
	173	INTEGER(iwp) :: n !<
	174	INTEGER(iwp) :: nblk !<
	175	INTEGER(iwp) :: nnx_y !<
	176	INTEGER(iwp) :: nny_z !<
	177	INTEGER(iwp) :: nnz_x !<
	178	INTEGER(iwp) :: nxl_y_bound !<
	179	INTEGER(iwp) :: nxr_y_bound !<
[1]	180
[4649]	181	INTEGER(iwp), DIMENSION(4) :: isave !<
[1320]	182
[4649]	183	REAL(wp), DIMENSION(1:nz,nys:nyn,nxl:nxr) :: ar !<
	184	REAL(wp), DIMENSION(nys:nyn,nxl:nxr,1:nz) :: ar_inv !<
[3690]	185
	186	#define __acc_fft_device ( defined( _OPENACC ) && ( defined ( __cuda_fft ) ) )
	187	#if __acc_fft_device
[4649]	188	!$ACC DECLARE CREATE(ar_inv)
[3690]	189	#endif
[1]	190
[4649]	191	REAL(wp), DIMENSION(:,:,:), ALLOCATABLE :: ar1 !<
	192	REAL(wp), DIMENSION(:,:,:), ALLOCATABLE :: f_in !<
	193	REAL(wp), DIMENSION(:,:,:), ALLOCATABLE :: f_inv !<
	194	REAL(wp), DIMENSION(:,:,:), ALLOCATABLE :: f_out_y !<
	195	REAL(wp), DIMENSION(:,:,:), ALLOCATABLE :: f_out_z !<
[1216]	196
	197
[4649]	198	CALL cpu_log( log_point_s(3), 'poisfft', 'start' )
[1]	199
[4649]	200	IF ( .NOT. poisfft_initialized ) CALL poisfft_init
[1111]	201
[3690]	202	#if !__acc_fft_device
[4649]	203	!$ACC UPDATE HOST(ar)
[3690]	204	#endif
	205
[3634]	206	#ifndef _OPENACC
[1]	207	!
[4649]	208	!-- Two-dimensional Fourier Transformation in x- and y-direction.
	209	IF ( pdims(2) == 1 .AND. pdims(1) > 1 ) THEN
[1]	210
	211	!
[4649]	212	!-- 1d-domain-decomposition along x:
	213	!-- FFT along y and transposition y --> x
	214	CALL ffty_tr_yx( ar, ar )
[1]	215
	216	!
[4649]	217	!-- FFT along x, solving the tridiagonal system and backward FFT
	218	CALL fftx_tri_fftx( ar )
[1]	219
	220	!
[4649]	221	!-- Transposition x --> y and backward FFT along y
	222	CALL tr_xy_ffty( ar, ar )
[1]	223
[4649]	224	ELSEIF ( pdims(1) == 1 .AND. pdims(2) > 1 ) THEN
[1]	225
	226	!
[4649]	227	!-- 1d-domain-decomposition along y:
	228	!-- FFT along x and transposition x --> y
	229	CALL fftx_tr_xy( ar, ar )
[1]	230
	231	!
[4649]	232	!-- FFT along y, solving the tridiagonal system and backward FFT
	233	CALL ffty_tri_ffty( ar )
[1]	234
	235	!
[4649]	236	!-- Transposition y --> x and backward FFT along x
	237	CALL tr_yx_fftx( ar, ar )
[1]	238
[4649]	239	ELSEIF ( .NOT. transpose_compute_overlap ) THEN
[3634]	240	#endif
[1]	241
	242	!
[4649]	243	!-- 2d-domain-decomposition or no decomposition (1 PE run)
	244	!-- Transposition z --> x
	245	CALL cpu_log( log_point_s(5), 'transpo forward', 'start' )
	246	CALL resort_for_zx( ar, ar_inv )
	247	CALL transpose_zx( ar_inv, ar )
	248	CALL cpu_log( log_point_s(5), 'transpo forward', 'pause' )
[1]	249
[4649]	250	CALL cpu_log( log_point_s(4), 'fft_x', 'start' )
	251	IF ( temperton_fft_vec ) THEN
[4366]	252	!
[4649]	253	!-- Vector version outputs a transformed array ar_inv that does not require resorting
	254	!-- (which is done for ar further below)
	255	CALL fft_x( ar, 'forward', ar_inv=ar_inv)
	256	ELSE
	257	CALL fft_x( ar, 'forward')
	258	ENDIF
	259	CALL cpu_log( log_point_s(4), 'fft_x', 'pause' )
[1]	260
	261	!
[4649]	262	!-- Transposition x --> y
	263	CALL cpu_log( log_point_s(5), 'transpo forward', 'continue' )
	264	IF( .NOT. temperton_fft_vec ) CALL resort_for_xy( ar, ar_inv )
	265	CALL transpose_xy( ar_inv, ar )
	266	CALL cpu_log( log_point_s(5), 'transpo forward', 'pause' )
[1]	267
[4649]	268	CALL cpu_log( log_point_s(7), 'fft_y', 'start' )
	269	IF ( temperton_fft_vec ) THEN
[4366]	270	!
[4649]	271	!-- Input array ar_inv from fft_x can be directly used here.
	272	!-- The output (also in array ar_inv) does not require resorting below.
	273	CALL fft_y( ar, 'forward', ar_inv = ar_inv, nxl_y_bound = nxl_y, nxr_y_bound = nxr_y, &
	274	nxl_y_l = nxl_y, nxr_y_l = nxr_y )
	275	ELSE
	276	CALL fft_y( ar, 'forward', ar_tr = ar, nxl_y_bound = nxl_y, nxr_y_bound = nxr_y, &
	277	nxl_y_l = nxl_y, nxr_y_l = nxr_y )
	278	ENDIF
	279	CALL cpu_log( log_point_s(7), 'fft_y', 'pause' )
[1]	280
	281	!
[4649]	282	!-- Transposition y --> z
	283	CALL cpu_log( log_point_s(5), 'transpo forward', 'continue' )
	284	IF ( .NOT. temperton_fft_vec ) CALL resort_for_yz( ar, ar_inv )
	285	CALL transpose_yz( ar_inv, ar )
	286	CALL cpu_log( log_point_s(5), 'transpo forward', 'stop' )
[1]	287
	288	!
[4649]	289	!-- Solve the tridiagonal equation system along z
	290	CALL cpu_log( log_point_s(6), 'tridia', 'start' )
	291	CALL tridia_substi( ar )
	292	CALL cpu_log( log_point_s(6), 'tridia', 'stop' )
[1]	293
	294	!
[4649]	295	!-- Inverse Fourier Transformation
	296	!-- Transposition z --> y
	297	CALL cpu_log( log_point_s(8), 'transpo invers', 'start' )
	298	CALL transpose_zy( ar, ar_inv )
[4366]	299	!
[4649]	300	!-- The fft_y below (vector branch) can directly process ar_inv (i.e. does not require a
	301	!-- resorting)
	302	IF ( .NOT. temperton_fft_vec ) CALL resort_for_zy( ar_inv, ar )
	303	CALL cpu_log( log_point_s(8), 'transpo invers', 'pause' )
[1]	304
[4649]	305	CALL cpu_log( log_point_s(7), 'fft_y', 'continue' )
	306	IF ( temperton_fft_vec ) THEN
[4366]	307	!
[4649]	308	!-- Output array ar_inv can be used as input to the below fft_x routine without resorting
	309	CALL fft_y( ar, 'backward', ar_inv = ar_inv, nxl_y_bound = nxl_y, nxr_y_bound = nxr_y, &
	310	nxl_y_l = nxl_y, nxr_y_l = nxr_y )
	311	ELSE
	312	CALL fft_y( ar, 'backward', ar_tr = ar, nxl_y_bound = nxl_y, nxr_y_bound = nxr_y, &
	313	nxl_y_l = nxl_y, nxr_y_l = nxr_y )
	314	ENDIF
[4366]	315
[4649]	316	CALL cpu_log( log_point_s(7), 'fft_y', 'stop' )
[1]	317
	318	!
[4649]	319	!-- Transposition y --> x
	320	CALL cpu_log( log_point_s(8), 'transpo invers', 'continue' )
	321	CALL transpose_yx( ar, ar_inv )
	322	IF ( .NOT. temperton_fft_vec ) CALL resort_for_yx( ar_inv, ar )
	323	CALL cpu_log( log_point_s(8), 'transpo invers', 'pause' )
[1]	324
[4649]	325	CALL cpu_log( log_point_s(4), 'fft_x', 'continue' )
	326	IF ( temperton_fft_vec ) THEN
	327	CALL fft_x( ar, 'backward', ar_inv=ar_inv )
	328	ELSE
	329	CALL fft_x( ar, 'backward' )
	330	ENDIF
	331	CALL cpu_log( log_point_s(4), 'fft_x', 'stop' )
[1]	332
	333	!
[4649]	334	!-- Transposition x --> z
	335	CALL cpu_log( log_point_s(8), 'transpo invers', 'continue' )
	336	CALL transpose_xz( ar, ar_inv )
	337	CALL resort_for_xz( ar_inv, ar )
	338	CALL cpu_log( log_point_s(8), 'transpo invers', 'stop' )
[1]	339
[3634]	340	#ifndef _OPENACC
[4649]	341	ELSE
[1216]	342
	343	!
[4649]	344	!-- 2d-domain-decomposition or no decomposition (1 PE run) with overlapping transposition / fft
	345	!-- cputime logging must not use barriers, which would prevent overlapping
	346	ALLOCATE( f_out_y(0:ny,nxl_y:nxr_y,nzb_y:nzt_y), &
	347	f_out_z(0:nx,nys_x:nyn_x,nzb_x:nzt_x) )
[1216]	348	!
[4649]	349	!-- Transposition z --> x + subsequent fft along x
	350	ALLOCATE( f_inv(nys:nyn,nxl:nxr,1:nz) )
	351	CALL resort_for_zx( ar, f_inv )
[1216]	352	!
[4649]	353	!-- Save original indices and gridpoint counter
	354	isave(1) = nz
	355	isave(2) = nzb_x
	356	isave(3) = nzt_x
	357	isave(4) = sendrecvcount_zx
[1216]	358	!
[4649]	359	!-- Set new indices for transformation
	360	nblk = nz / pdims(1)
	361	nz = pdims(1)
	362	nnz_x = 1
	363	nzb_x = 1 + myidx * nnz_x
	364	nzt_x = ( myidx + 1 ) * nnz_x
	365	sendrecvcount_zx = nnx * nny * nnz_x
[1216]	366
[4649]	367	ALLOCATE( ar1(0:nx,nys_x:nyn_x,nzb_x:nzt_x) )
	368	ALLOCATE( f_in(nys:nyn,nxl:nxr,1:nz) )
[1216]	369
[4649]	370	DO kk = 1, nblk
[1216]	371
[4649]	372	IF ( kk == 1 ) THEN
	373	CALL cpu_log( log_point_s(5), 'transpo forward', 'start', cpu_log_nowait )
	374	ELSE
	375	CALL cpu_log( log_point_s(5), 'transpo forward', 'continue', cpu_log_nowait )
	376	ENDIF
[1216]	377
[4649]	378	DO knew = 1, nz
	379	ki = kk + nblk * ( knew - 1 )
	380	f_in(:,:,knew) = f_inv(:,:,ki)
	381	ENDDO
[1216]	382
[4649]	383	CALL transpose_zx( f_in, ar1(:,:,:))
	384	CALL cpu_log( log_point_s(5), 'transpo forward', 'pause' )
[1216]	385
[4649]	386	IF ( kk == 1 ) THEN
	387	CALL cpu_log( log_point_s(4), 'fft_x', 'start', cpu_log_nowait )
	388	ELSE
	389	CALL cpu_log( log_point_s(4), 'fft_x', 'continue', cpu_log_nowait )
	390	ENDIF
[1216]	391
[4649]	392	n = isave(2) + kk - 1
	393	CALL fft_x( ar1(:,:,:), 'forward', ar_2d = f_out_z(:,:,n) )
	394	CALL cpu_log( log_point_s(4), 'fft_x', 'pause' )
[1216]	395
[4649]	396	ENDDO
[1216]	397	!
[4649]	398	!-- Restore original indices/counters
	399	nz = isave(1)
	400	nzb_x = isave(2)
	401	nzt_x = isave(3)
	402	sendrecvcount_zx = isave(4)
[1216]	403
[4649]	404	DEALLOCATE( ar1, f_in, f_inv )
[1216]	405
	406	!
[4649]	407	!-- Transposition x --> y + subsequent fft along y
	408	ALLOCATE( f_inv(nys_x:nyn_x,nzb_x:nzt_x,0:nx) )
	409	CALL resort_for_xy( f_out_z, f_inv )
[1216]	410	!
[4649]	411	!-- Save original indices and gridpoint counter
	412	isave(1) = nx
	413	isave(2) = nxl_y
	414	isave(3) = nxr_y
	415	isave(4) = sendrecvcount_xy
[1216]	416	!
[4649]	417	!-- Set new indices for transformation
	418	nblk = ( ( nx+1 ) / pdims(2) ) - 1
	419	nx = pdims(2)
	420	nnx_y = 1
	421	nxl_y = myidy * nnx_y
	422	nxr_y = ( myidy + 1 ) * nnx_y - 1
	423	sendrecvcount_xy = nnx_y * ( nyn_x-nys_x+1 ) * ( nzt_x-nzb_x+1 )
[1216]	424
[4649]	425	ALLOCATE( ar1(0:ny,nxl_y:nxr_y,nzb_y:nzt_y) )
	426	ALLOCATE( f_in(nys_x:nyn_x,nzb_x:nzt_x,0:nx) )
[1216]	427
[4649]	428	DO ii = 0, nblk
[1216]	429
[4649]	430	CALL cpu_log( log_point_s(5), 'transpo forward', 'continue', cpu_log_nowait )
[1216]	431
[4649]	432	DO inew = 0, nx-1
	433	iind = ii + ( nblk + 1 ) * inew
	434	f_in(:,:,inew) = f_inv(:,:,iind)
	435	ENDDO
[1216]	436
[4649]	437	CALL transpose_xy( f_in, ar1(:,:,:) )
[1216]	438
[4649]	439	CALL cpu_log( log_point_s(5), 'transpo forward', 'pause' )
[1216]	440
[4649]	441	IF ( ii == 1 ) THEN
	442	CALL cpu_log( log_point_s(7), 'fft_y', 'start', cpu_log_nowait )
	443	ELSE
	444	CALL cpu_log( log_point_s(7), 'fft_y', 'continue', cpu_log_nowait )
	445	ENDIF
[1216]	446
[4649]	447	nxl_y_bound = isave(2)
	448	nxr_y_bound = isave(3)
	449	n = isave(2) + ii
	450	CALL fft_y( ar1(:,:,:), 'forward', ar_tr = f_out_y, nxl_y_bound = nxl_y_bound, &
	451	nxr_y_bound = nxr_y_bound, nxl_y_l = n, nxr_y_l = n )
[1216]	452
[4649]	453	CALL cpu_log( log_point_s(7), 'fft_y', 'pause' )
[1216]	454
[4649]	455	ENDDO
[1216]	456	!
[4649]	457	!-- Restore original indices/counters
	458	nx = isave(1)
	459	nxl_y = isave(2)
	460	nxr_y = isave(3)
	461	sendrecvcount_xy = isave(4)
[1216]	462
[4649]	463	DEALLOCATE( ar1, f_in, f_inv )
[1216]	464
	465	!
[4649]	466	!-- Transposition y --> z + subsequent tridia + resort for z --> y
	467	ALLOCATE( f_inv(nxl_y:nxr_y,nzb_y:nzt_y,0:ny) )
	468	CALL resort_for_yz( f_out_y, f_inv )
[1216]	469	!
[4649]	470	!-- Save original indices and gridpoint counter
	471	isave(1) = ny
	472	isave(2) = nys_z
	473	isave(3) = nyn_z
	474	isave(4) = sendrecvcount_yz
[1216]	475	!
[4649]	476	!-- Set new indices for transformation
	477	nblk = ( ( ny+1 ) / pdims(1) ) - 1
	478	ny = pdims(1)
	479	nny_z = 1
	480	nys_z = myidx * nny_z
	481	nyn_z = ( myidx + 1 ) * nny_z - 1
	482	sendrecvcount_yz = ( nxr_y-nxl_y+1 ) * nny_z * ( nzt_y-nzb_y+1 )
[1216]	483
[4649]	484	ALLOCATE( ar1(nxl_z:nxr_z,nys_z:nyn_z,1:nz) )
	485	ALLOCATE( f_in(nxl_y:nxr_y,nzb_y:nzt_y,0:ny) )
[1216]	486
[4649]	487	DO jj = 0, nblk
[1216]	488	!
[4649]	489	!-- Forward Fourier Transformation
	490	!-- Transposition y --> z
	491	CALL cpu_log( log_point_s(5), 'transpo forward', 'continue', cpu_log_nowait )
[1216]	492
[4649]	493	DO jnew = 0, ny-1
	494	jind = jj + ( nblk + 1 ) * jnew
	495	f_in(:,:,jnew) = f_inv(:,:,jind)
	496	ENDDO
[1216]	497
[4649]	498	CALL transpose_yz( f_in, ar1(:,:,:) )
[1216]	499
[4649]	500	IF ( jj == nblk ) THEN
	501	CALL cpu_log( log_point_s(5), 'transpo forward', 'stop' )
	502	ELSE
	503	CALL cpu_log( log_point_s(5), 'transpo forward', 'pause' )
	504	ENDIF
[1216]	505
	506	!
[4649]	507	!-- Solve the tridiagonal equation system along z
	508	CALL cpu_log( log_point_s(6), 'tridia', 'start', cpu_log_nowait )
[1216]	509
[4649]	510	n = isave(2) + jj
	511	CALL tridia_substi_overlap( ar1(:,:,:), n )
[1216]	512
[4649]	513	CALL cpu_log( log_point_s(6), 'tridia', 'stop' )
[1216]	514
[1306]	515	!
[4649]	516	!-- Inverse Fourier Transformation
	517	!-- Transposition z --> y
	518	!-- Only one thread should call MPI routines, therefore forward and backward tranpose are in
	519	!-- the same section
	520	IF ( jj == 0 ) THEN
	521	CALL cpu_log( log_point_s(8), 'transpo invers', 'start', cpu_log_nowait )
	522	ELSE
	523	CALL cpu_log( log_point_s(8), 'transpo invers', 'continue', cpu_log_nowait )
	524	ENDIF
[1216]	525
[4649]	526	CALL transpose_zy( ar1(:,:,:), f_in )
[1216]	527
[4649]	528	DO jnew = 0, ny-1
	529	jind = jj + ( nblk + 1 ) * jnew
	530	f_inv(:,:,jind) = f_in(:,:,jnew)
	531	ENDDO
[1216]	532
[4649]	533	CALL cpu_log( log_point_s(8), 'transpo invers', 'pause' )
[1216]	534
[4649]	535	ENDDO
[1216]	536	!
[4649]	537	!-- Restore original indices/counters
	538	ny = isave(1)
	539	nys_z = isave(2)
	540	nyn_z = isave(3)
	541	sendrecvcount_yz = isave(4)
[1216]	542
[4649]	543	CALL resort_for_zy( f_inv, f_out_y )
[1216]	544
[4649]	545	DEALLOCATE( ar1, f_in, f_inv )
[1216]	546
	547	!
[4649]	548	!-- fft along y backward + subsequent transposition y --> x
	549	ALLOCATE( f_inv(nys_x:nyn_x,nzb_x:nzt_x,0:nx) )
[1216]	550	!
[4649]	551	!-- Save original indices and gridpoint counter
	552	isave(1) = nx
	553	isave(2) = nxl_y
	554	isave(3) = nxr_y
	555	isave(4) = sendrecvcount_xy
[1216]	556	!
[4649]	557	!-- Set new indices for transformation
	558	nblk = ( ( nx+1 ) / pdims(2) ) - 1
	559	nx = pdims(2)
	560	nnx_y = 1
	561	nxl_y = myidy * nnx_y
	562	nxr_y = ( myidy + 1 ) * nnx_y - 1
	563	sendrecvcount_xy = nnx_y * ( nyn_x-nys_x+1 ) * ( nzt_x-nzb_x+1 )
[1216]	564
[4649]	565	ALLOCATE( ar1(0:ny,nxl_y:nxr_y,nzb_y:nzt_y) )
	566	ALLOCATE( f_in(nys_x:nyn_x,nzb_x:nzt_x,0:nx) )
[1216]	567
[4649]	568	DO ii = 0, nblk
[1216]	569
[4649]	570	CALL cpu_log( log_point_s(7), 'fft_y', 'continue', cpu_log_nowait )
[1216]	571
[4649]	572	n = isave(2) + ii
	573	nxl_y_bound = isave(2)
	574	nxr_y_bound = isave(3)
[1216]	575
[4649]	576	CALL fft_y( ar1(:,:,:), 'backward', ar_tr = f_out_y, nxl_y_bound = nxl_y_bound, &
	577	nxr_y_bound = nxr_y_bound, nxl_y_l = n, nxr_y_l = n )
[1216]	578
[4649]	579	IF ( ii == nblk ) THEN
	580	CALL cpu_log( log_point_s(7), 'fft_y', 'stop' )
	581	ELSE
	582	CALL cpu_log( log_point_s(7), 'fft_y', 'pause' )
	583	ENDIF
[1216]	584
[4649]	585	CALL cpu_log( log_point_s(8), 'transpo invers', 'continue', cpu_log_nowait )
[1216]	586
[4649]	587	CALL transpose_yx( ar1(:,:,:), f_in )
[1216]	588
[4649]	589	DO inew = 0, nx-1
	590	iind = ii + (nblk+1) * inew
	591	f_inv(:,:,iind) = f_in(:,:,inew)
	592	ENDDO
[1216]	593
[4649]	594	CALL cpu_log( log_point_s(8), 'transpo invers', 'pause' )
[1216]	595
[4649]	596	ENDDO
[1216]	597	!
[4649]	598	!-- Restore original indices/counters
	599	nx = isave(1)
	600	nxl_y = isave(2)
	601	nxr_y = isave(3)
	602	sendrecvcount_xy = isave(4)
[1216]	603
[4649]	604	CALL resort_for_yx( f_inv, f_out_z )
[1216]	605
[4649]	606	DEALLOCATE( ar1, f_in, f_inv )
[1216]	607
	608	!
[4649]	609	!-- fft along x backward + subsequent final transposition x --> z
	610	ALLOCATE( f_inv(nys:nyn,nxl:nxr,1:nz) )
[1216]	611	!
[4649]	612	!-- Save original indices and gridpoint counter
	613	isave(1) = nz
	614	isave(2) = nzb_x
	615	isave(3) = nzt_x
	616	isave(4) = sendrecvcount_zx
[1216]	617	!
[4649]	618	!-- Set new indices for transformation
	619	nblk = nz / pdims(1)
	620	nz = pdims(1)
	621	nnz_x = 1
	622	nzb_x = 1 + myidx * nnz_x
	623	nzt_x = ( myidx + 1 ) * nnz_x
	624	sendrecvcount_zx = nnx * nny * nnz_x
[1216]	625
[4649]	626	ALLOCATE( ar1(0:nx,nys_x:nyn_x,nzb_x:nzt_x) )
	627	ALLOCATE( f_in(nys:nyn,nxl:nxr,1:nz) )
[1216]	628
[4649]	629	DO kk = 1, nblk
[1216]	630
[4649]	631	CALL cpu_log( log_point_s(4), 'fft_x', 'continue', cpu_log_nowait )
[1216]	632
[4649]	633	n = isave(2) + kk - 1
	634	CALL fft_x( ar1(:,:,:), 'backward', f_out_z(:,:,n) )
[1216]	635
[4649]	636	IF ( kk == nblk ) THEN
	637	CALL cpu_log( log_point_s(4), 'fft_x', 'stop' )
	638	ELSE
	639	CALL cpu_log( log_point_s(4), 'fft_x', 'pause' )
	640	ENDIF
[1216]	641
[4649]	642	CALL cpu_log( log_point_s(8), 'transpo invers', 'continue', cpu_log_nowait )
[1216]	643
[4649]	644	CALL transpose_xz( ar1(:,:,:), f_in )
[1216]	645
[4649]	646	DO knew = 1, nz
	647	ki = kk + nblk * ( knew - 1 )
	648	f_inv(:,:,ki) = f_in(:,:,knew)
	649	ENDDO
[1216]	650
[4649]	651	IF ( kk == nblk ) THEN
	652	CALL cpu_log( log_point_s(8), 'transpo invers', 'stop' )
	653	ELSE
	654	CALL cpu_log( log_point_s(8), 'transpo invers', 'pause' )
	655	ENDIF
[1216]	656
[4649]	657	ENDDO
[1216]	658	!
[4649]	659	!-- Restore original indices/counters
	660	nz = isave(1)
	661	nzb_x = isave(2)
	662	nzt_x = isave(3)
	663	sendrecvcount_zx = isave(4)
[1216]	664
[4649]	665	CALL resort_for_xz( f_inv, ar )
[1216]	666
[4649]	667	DEALLOCATE( ar1, f_in, f_inv )
[1216]	668
[4649]	669	ENDIF
[3634]	670	#endif
[1]	671
[3690]	672	#if !__acc_fft_device
[4649]	673	!$ACC UPDATE DEVICE(ar)
[3690]	674	#endif
	675
[4649]	676	CALL cpu_log( log_point_s(3), 'poisfft', 'stop' )
[1]	677
[4649]	678	END SUBROUTINE poisfft
[1]	679
	680
[4649]	681	!--------------------------------------------------------------------------------------------------!
[1682]	682	! Description:
	683	! ------------
[4649]	684	!> Fourier-transformation along y with subsequent transposition y --> x for a 1d-decomposition
	685	!> along x.
[1682]	686	!>
[4649]	687	!> @attention The performance of this routine is much faster on the NEC-SX6, if the first index of
	688	!> work_ffty_vec is odd. Otherwise memory bank conflicts may occur (especially if the
	689	!> index is a multiple of 128). That's why work_ffty_vec is dimensioned as 0:ny+1.
	690	!> Of course, this will not work if users are using an odd number of gridpoints along y.
	691	!--------------------------------------------------------------------------------------------------!
	692	SUBROUTINE ffty_tr_yx( f_in, f_out )
[1]	693
[4649]	694	USE control_parameters, &
	695	ONLY: loop_optimization
[1320]	696
[4649]	697	USE cpulog, &
	698	ONLY: cpu_log, &
	699	log_point_s
[1320]	700
[4649]	701	USE kinds
[1320]	702
[4649]	703	USE pegrid
[1]	704
[4649]	705	IMPLICIT NONE
[1]	706
[4649]	707	INTEGER(iwp) :: i !<
	708	INTEGER(iwp) :: iend !<
	709	INTEGER(iwp) :: iouter !<
	710	INTEGER(iwp) :: ir !<
	711	INTEGER(iwp) :: j !<
	712	INTEGER(iwp) :: k !<
[1]	713
[4649]	714	INTEGER(iwp), PARAMETER :: stridex = 4 !<
[1320]	715
[4649]	716	REAL(wp), DIMENSION(1:nz,0:ny,nxl:nxr) :: f_in !<
	717	REAL(wp), DIMENSION(nnx,1:nz,nys_x:nyn_x,pdims(1)) :: f_out !<
	718	REAL(wp), DIMENSION(nxl:nxr,1:nz,0:ny) :: work !<
[1]	719
[4649]	720	REAL(wp), DIMENSION(:,:), ALLOCATABLE :: work_ffty !<
	721	REAL(wp), DIMENSION(:,:,:), ALLOCATABLE :: work_ffty_vec !<
[2300]	722
[1]	723	!
[4649]	724	!-- Carry out the FFT along y, where all data are present due to the 1d-decomposition along x.
	725	!-- Resort the data in a way that x becomes the first index.
	726	CALL cpu_log( log_point_s(7), 'fft_y_1d', 'start' )
[1]	727
[4649]	728	IF ( loop_optimization == 'vector' ) THEN
[2300]	729
[4649]	730	ALLOCATE( work_ffty_vec(0:ny+1,1:nz,nxl:nxr) )
[1]	731	!
[4649]	732	!-- Code optimized for vector processors
	733	!$OMP PARALLEL PRIVATE ( i, j, k )
	734	!$OMP DO
	735	DO i = nxl, nxr
[1]	736
[4649]	737	DO j = 0, ny
	738	DO k = 1, nz
	739	work_ffty_vec(j,k,i) = f_in(k,j,i)
[1]	740	ENDDO
[4649]	741	ENDDO
[1]	742
[4649]	743	CALL fft_y_m( work_ffty_vec(:,:,i), ny+1, 'forward' )
[1]	744
[4649]	745	ENDDO
[1]	746
[4649]	747	!$OMP DO
	748	DO k = 1, nz
	749	DO j = 0, ny
	750	DO i = nxl, nxr
	751	work(i,k,j) = work_ffty_vec(j,k,i)
[1]	752	ENDDO
	753	ENDDO
[4649]	754	ENDDO
	755	!$OMP END PARALLEL
[1]	756
[4649]	757	DEALLOCATE( work_ffty_vec )
[2300]	758
[4649]	759	ELSE
[1]	760	!
[4649]	761	!-- Cache optimized code.
	762	ALLOCATE( work_ffty(0:ny,stridex) )
[2300]	763	!
[4649]	764	!-- The i-(x-)direction is split into a strided outer loop and an inner loop for better cache
	765	!-- performance
	766	!$OMP PARALLEL PRIVATE (i,iend,iouter,ir,j,k,work_ffty)
	767	!$OMP DO
	768	DO iouter = nxl, nxr, stridex
[1]	769
[4649]	770	iend = MIN( iouter+stridex-1, nxr ) ! Upper bound for inner i loop
[1]	771
[4649]	772	DO k = 1, nz
[1]	773
[4649]	774	DO i = iouter, iend
[1]	775
[4649]	776	ir = i-iouter+1 ! Counter within a stride
	777	DO j = 0, ny
	778	work_ffty(j,ir) = f_in(k,j,i)
	779	ENDDO
[1]	780	!
[4649]	781	!-- FFT along y
	782	CALL fft_y_1d( work_ffty(:,ir), 'forward' )
[1]	783
[4649]	784	ENDDO
[1]	785
	786	!
[4649]	787	!-- Resort
	788	DO j = 0, ny
	789	DO i = iouter, iend
	790	work(i,k,j) = work_ffty(j,i-iouter+1)
[1]	791	ENDDO
	792	ENDDO
	793
	794	ENDDO
	795
[4649]	796	ENDDO
	797	!$OMP END PARALLEL
[2300]	798
[4649]	799	DEALLOCATE( work_ffty )
[2300]	800
[4649]	801	ENDIF
[1]	802
[4649]	803	CALL cpu_log( log_point_s(7), 'fft_y_1d', 'pause' )
	804
[1]	805	!
[4649]	806	!-- Transpose array
[1111]	807	#if defined( __parallel )
[4649]	808	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
	809	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
	810	CALL MPI_ALLTOALL( work(nxl,1,0), sendrecvcount_xy, MPI_REAL, f_out(1,1,nys_x,1), &
	811	sendrecvcount_xy, MPI_REAL, comm1dx, ierr )
	812	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
[4429]	813	#else
	814	!
[4649]	815	!-- Next line required to avoid compile error about unused dummy argument in serial mode
	816	i = SIZE( f_out )
[1111]	817	#endif
[1]	818
[4649]	819	END SUBROUTINE ffty_tr_yx
[1]	820
	821
[4649]	822	!--------------------------------------------------------------------------------------------------!
[1682]	823	! Description:
	824	! ------------
[4649]	825	!> Transposition x --> y with a subsequent backward Fourier transformation for a 1d-decomposition
	826	!> along x
	827	!--------------------------------------------------------------------------------------------------!
	828	SUBROUTINE tr_xy_ffty( f_in, f_out )
[1]	829
[4649]	830	USE control_parameters, &
	831	ONLY: loop_optimization
[1320]	832
[4649]	833	USE cpulog, &
	834	ONLY: cpu_log, &
	835	log_point_s
[1320]	836
[4649]	837	USE kinds
[1320]	838
[4649]	839	USE pegrid
[1]	840
[4649]	841	IMPLICIT NONE
[1]	842
[4649]	843	INTEGER(iwp) :: i !<
	844	INTEGER(iwp) :: iend !<
	845	INTEGER(iwp) :: iouter !<
	846	INTEGER(iwp) :: ir !<
	847	INTEGER(iwp) :: j !<
	848	INTEGER(iwp) :: k !<
[1]	849
[4649]	850	INTEGER(iwp), PARAMETER :: stridex = 4 !<
[1320]	851
[4649]	852	REAL(wp), DIMENSION(nnx,1:nz,nys_x:nyn_x,pdims(1)) :: f_in !<
	853	REAL(wp), DIMENSION(1:nz,0:ny,nxl:nxr) :: f_out !<
	854	REAL(wp), DIMENSION(nxl:nxr,1:nz,0:ny) :: work !<
[1]	855
[4649]	856	REAL(wp), DIMENSION(:,:), ALLOCATABLE :: work_ffty !<
	857	REAL(wp), DIMENSION(:,:,:), ALLOCATABLE :: work_ffty_vec !<
[2300]	858
[1]	859	!
[4649]	860	!-- Transpose array
[1111]	861	#if defined( __parallel )
[4649]	862	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
	863	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
	864	CALL MPI_ALLTOALL( f_in(1,1,nys_x,1), sendrecvcount_xy, MPI_REAL, work(nxl,1,0), &
	865	sendrecvcount_xy, MPI_REAL, comm1dx, ierr )
	866	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
[4429]	867	#else
	868	!
[4649]	869	!-- Next line required to avoid compile error about unused dummy argument in serial mode
	870	i = SIZE( f_in )
[1111]	871	#endif
[1]	872
	873	!
[4649]	874	!-- Resort the data in a way that y becomes the first index and carry out the backward fft along y.
	875	CALL cpu_log( log_point_s(7), 'fft_y_1d', 'continue' )
[1]	876
[4649]	877	IF ( loop_optimization == 'vector' ) THEN
[2300]	878
[4649]	879	ALLOCATE( work_ffty_vec(0:ny+1,1:nz,nxl:nxr) )
[1]	880	!
[4649]	881	!-- Code optimized for vector processors
	882	!$OMP PARALLEL PRIVATE ( i, j, k )
	883	!$OMP DO
	884	DO k = 1, nz
	885	DO j = 0, ny
	886	DO i = nxl, nxr
	887	work_ffty_vec(j,k,i) = work(i,k,j)
[1]	888	ENDDO
	889	ENDDO
[4649]	890	ENDDO
[1]	891
[4649]	892	!$OMP DO
	893	DO i = nxl, nxr
[1]	894
[4649]	895	CALL fft_y_m( work_ffty_vec(:,:,i), ny+1, 'backward' )
[1]	896
[4649]	897	DO j = 0, ny
	898	DO k = 1, nz
	899	f_out(k,j,i) = work_ffty_vec(j,k,i)
[1]	900	ENDDO
	901	ENDDO
	902
[4649]	903	ENDDO
	904	!$OMP END PARALLEL
[2300]	905
[4649]	906	DEALLOCATE( work_ffty_vec )
	907
	908	ELSE
[1]	909	!
[4649]	910	!-- Cache optimized code.
	911	ALLOCATE( work_ffty(0:ny,stridex) )
[2300]	912	!
[4649]	913	!-- The i-(x-)direction is split into a strided outer loop and an inner loop for better cache
	914	!-- performance
	915	!$OMP PARALLEL PRIVATE ( i, iend, iouter, ir, j, k, work_ffty )
	916	!$OMP DO
	917	DO iouter = nxl, nxr, stridex
[1]	918
[4649]	919	iend = MIN( iouter+stridex-1, nxr ) ! Upper bound for inner i loop
[1]	920
[4649]	921	DO k = 1, nz
[1]	922	!
[4649]	923	!-- Resort
	924	DO j = 0, ny
	925	DO i = iouter, iend
	926	work_ffty(j,i-iouter+1) = work(i,k,j)
[1]	927	ENDDO
[4649]	928	ENDDO
[1]	929
[4649]	930	DO i = iouter, iend
[1]	931
	932	!
[4649]	933	!-- FFT along y
	934	ir = i-iouter+1 ! Counter within a stride
	935	CALL fft_y_1d( work_ffty(:,ir), 'backward' )
[1]	936
[4649]	937	DO j = 0, ny
	938	f_out(k,j,i) = work_ffty(j,ir)
[1]	939	ENDDO
	940	ENDDO
	941
	942	ENDDO
	943
[4649]	944	ENDDO
	945	!$OMP END PARALLEL
[2300]	946
[4649]	947	DEALLOCATE( work_ffty )
[1]	948
[4649]	949	ENDIF
[1]	950
[4649]	951	CALL cpu_log( log_point_s(7), 'fft_y_1d', 'stop' )
[1]	952
[4649]	953	END SUBROUTINE tr_xy_ffty
[1]	954
[4649]	955
	956	!--------------------------------------------------------------------------------------------------!
[1682]	957	! Description:
	958	! ------------
[4649]	959	!> FFT along x, solution of the tridiagonal system and backward FFT for a 1d-decomposition along x
[1682]	960	!>
[4649]	961	!> @warning This subroutine may still not work for hybrid parallelization with OpenMP (for possible
	962	!> necessary changes see the original routine poisfft_hybrid, developed by Klaus Ketelsen,
	963	!> May 2002)
	964	!--------------------------------------------------------------------------------------------------!
	965	SUBROUTINE fftx_tri_fftx( ar )
[1]	966
[4649]	967	USE control_parameters, &
	968	ONLY: loop_optimization
[1320]	969
[4649]	970	USE cpulog, &
	971	ONLY: cpu_log, &
	972	log_point_s
[1320]	973
[4649]	974	USE grid_variables, &
	975	ONLY: ddx2, &
	976	ddy2
[1320]	977
[4649]	978	USE kinds
[1320]	979
[4649]	980	USE pegrid
[1]	981
[4649]	982	IMPLICIT NONE
[1]	983
[4649]	984	INTEGER(iwp) :: i !<
	985	INTEGER(iwp) :: j !<
	986	INTEGER(iwp) :: k !<
	987	INTEGER(iwp) :: m !<
	988	INTEGER(iwp) :: n !<
[4671]	989	!$ INTEGER(iwp) :: omp_get_thread_num !<
[4649]	990	INTEGER(iwp) :: tn !<
[1]	991
[4649]	992	REAL(wp), DIMENSION(0:nx) :: work_fftx !<
	993	REAL(wp), DIMENSION(0:nx,1:nz) :: work_trix !<
	994	REAL(wp), DIMENSION(nnx,1:nz,nys_x:nyn_x,pdims(1)) :: ar !<
	995	REAL(wp), DIMENSION(:,:,:,:), ALLOCATABLE :: tri !<
[1]	996
	997
[4649]	998	CALL cpu_log( log_point_s(33), 'fft_x_1d + tridia', 'start' )
[1]	999
[4649]	1000	ALLOCATE( tri(5,0:nx,0:nz-1,0:threads_per_task-1) )
[1]	1001
[4649]	1002	tn = 0 ! Default thread number in case of one thread
[1]	1003	!$OMP PARALLEL DO PRIVATE ( i, j, k, m, n, tn, work_fftx, work_trix )
[4649]	1004	DO j = nys_x, nyn_x
[1]	1005
[4649]	1006	!$ tn = omp_get_thread_num()
[1]	1007
[4649]	1008	IF ( loop_optimization == 'vector' ) THEN
[1]	1009	!
[4649]	1010	!-- Code optimized for vector processors
	1011	DO k = 1, nz
[1]	1012
[4649]	1013	m = 0
	1014	DO n = 1, pdims(1)
	1015	DO i = 1, nnx
	1016	work_trix(m,k) = ar(i,k,j,n)
	1017	m = m + 1
[1]	1018	ENDDO
	1019	ENDDO
	1020
[4649]	1021	ENDDO
[1]	1022
[4649]	1023	CALL fft_x_m( work_trix, 'forward' )
	1024
	1025	ELSE
[1]	1026	!
[4649]	1027	!-- Cache optimized code
	1028	DO k = 1, nz
[1]	1029
[4649]	1030	m = 0
	1031	DO n = 1, pdims(1)
	1032	DO i = 1, nnx
	1033	work_fftx(m) = ar(i,k,j,n)
	1034	m = m + 1
[1]	1035	ENDDO
[4649]	1036	ENDDO
[1]	1037
[4649]	1038	CALL fft_x_1d( work_fftx, 'forward' )
[1]	1039
[4649]	1040	DO i = 0, nx
	1041	work_trix(i,k) = work_fftx(i)
[1]	1042	ENDDO
	1043
[4649]	1044	ENDDO
[1]	1045
[4649]	1046	ENDIF
	1047
[1]	1048	!
[4649]	1049	!-- Solve the linear equation system
	1050	CALL tridia_1dd( ddx2, ddy2, nx, ny, j, work_trix, tri(:,:,:,tn) )
[1]	1051
[4649]	1052	IF ( loop_optimization == 'vector' ) THEN
[1]	1053	!
[4649]	1054	!-- Code optimized for vector processors
	1055	CALL fft_x_m( work_trix, 'backward' )
[1]	1056
[4649]	1057	DO k = 1, nz
[1]	1058
[4649]	1059	m = 0
	1060	DO n = 1, pdims(1)
	1061	DO i = 1, nnx
	1062	ar(i,k,j,n) = work_trix(m,k)
	1063	m = m + 1
[1]	1064	ENDDO
	1065	ENDDO
	1066
[4649]	1067	ENDDO
	1068
	1069	ELSE
[1]	1070	!
[4649]	1071	!-- Cache optimized code
	1072	DO k = 1, nz
[1]	1073
[4649]	1074	DO i = 0, nx
	1075	work_fftx(i) = work_trix(i,k)
	1076	ENDDO
[1]	1077
[4649]	1078	CALL fft_x_1d( work_fftx, 'backward' )
[1]	1079
[4649]	1080	m = 0
	1081	DO n = 1, pdims(1)
	1082	DO i = 1, nnx
	1083	ar(i,k,j,n) = work_fftx(m)
	1084	m = m + 1
[1]	1085	ENDDO
	1086	ENDDO
	1087
[4649]	1088	ENDDO
[1]	1089
[4649]	1090	ENDIF
[1]	1091
[4649]	1092	ENDDO
[1]	1093
[4649]	1094	DEALLOCATE( tri )
[1]	1095
[4649]	1096	CALL cpu_log( log_point_s(33), 'fft_x_1d + tridia', 'stop' )
[1]	1097
[4649]	1098	END SUBROUTINE fftx_tri_fftx
[1]	1099
[4649]	1100
	1101	!--------------------------------------------------------------------------------------------------!
[1682]	1102	! Description:
	1103	! ------------
[4649]	1104	!> Fourier-transformation along x with subsequent transposition x --> y for a 1d-decomposition
	1105	!> along y.
[1682]	1106	!>
[4649]	1107	!> @attention NEC-branch of this routine may significantly profit from further optimizations. So
	1108	!> far, performance is much worse than for routine ffty_tr_yx (more than three times
	1109	!> slower).
	1110	!--------------------------------------------------------------------------------------------------!
	1111	SUBROUTINE fftx_tr_xy( f_in, f_out )
[1]	1112
[1682]	1113
[4649]	1114	USE control_parameters, &
	1115	ONLY: loop_optimization
[1320]	1116
[4649]	1117	USE cpulog, &
	1118	ONLY: cpu_log, &
	1119	log_point_s
[1320]	1120
[4649]	1121	USE kinds
[1320]	1122
[4649]	1123	USE pegrid
[1]	1124
[4649]	1125	IMPLICIT NONE
[1]	1126
[4649]	1127	INTEGER(iwp) :: i !<
	1128	INTEGER(iwp) :: j !<
	1129	INTEGER(iwp) :: k !<
[1]	1130
[4649]	1131	REAL(wp), DIMENSION(0:nx,1:nz,nys:nyn) :: work_fftx !<
	1132	REAL(wp), DIMENSION(1:nz,nys:nyn,0:nx) :: f_in !<
	1133	REAL(wp), DIMENSION(nny,1:nz,nxl_y:nxr_y,pdims(2)) :: f_out !<
	1134	REAL(wp), DIMENSION(nys:nyn,1:nz,0:nx) :: work !<
[1]	1135
	1136	!
[4649]	1137	!-- Carry out the FFT along x, where all data are present due to the 1d-decomposition along y.
	1138	!-- Resort the data in a way that y becomes the first index.
	1139	CALL cpu_log( log_point_s(4), 'fft_x_1d', 'start' )
[1]	1140
[4649]	1141	IF ( loop_optimization == 'vector' ) THEN
[1]	1142	!
[4649]	1143	!-- Code for vector processors
	1144	!$OMP PARALLEL PRIVATE ( i, j, k )
	1145	!$OMP DO
	1146	DO i = 0, nx
[1]	1147
[4649]	1148	DO j = nys, nyn
	1149	DO k = 1, nz
	1150	work_fftx(i,k,j) = f_in(k,j,i)
[1]	1151	ENDDO
	1152	ENDDO
	1153
[4649]	1154	ENDDO
[1]	1155
[4649]	1156	!$OMP DO
	1157	DO j = nys, nyn
[1]	1158
[4649]	1159	CALL fft_x_m( work_fftx(:,:,j), 'forward' )
	1160
	1161	DO k = 1, nz
	1162	DO i = 0, nx
	1163	work(j,k,i) = work_fftx(i,k,j)
[1]	1164	ENDDO
	1165	ENDDO
	1166
[4649]	1167	ENDDO
	1168	!$OMP END PARALLEL
[1]	1169
[4649]	1170	ELSE
	1171
[1]	1172	!
[4649]	1173	!-- Cache optimized code (there might still be a potential for better optimization).
	1174	!$OMP PARALLEL PRIVATE (i,j,k)
	1175	!$OMP DO
	1176	DO i = 0, nx
[1]	1177
[4649]	1178	DO j = nys, nyn
	1179	DO k = 1, nz
	1180	work_fftx(i,k,j) = f_in(k,j,i)
[1]	1181	ENDDO
	1182	ENDDO
	1183
[4649]	1184	ENDDO
[1]	1185
[4649]	1186	!$OMP DO
	1187	DO j = nys, nyn
	1188	DO k = 1, nz
[1]	1189
[4649]	1190	CALL fft_x_1d( work_fftx(0:nx,k,j), 'forward' )
	1191
	1192	DO i = 0, nx
	1193	work(j,k,i) = work_fftx(i,k,j)
[1]	1194	ENDDO
	1195	ENDDO
	1196
[4649]	1197	ENDDO
	1198	!$OMP END PARALLEL
[1]	1199
[4649]	1200	ENDIF
	1201	CALL cpu_log( log_point_s(4), 'fft_x_1d', 'pause' )
	1202
[1]	1203	!
[4649]	1204	!-- Transpose array
[1111]	1205	#if defined( __parallel )
[4649]	1206	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
	1207	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
	1208	CALL MPI_ALLTOALL( work(nys,1,0), sendrecvcount_xy, MPI_REAL, f_out(1,1,nxl_y,1), &
	1209	sendrecvcount_xy, MPI_REAL, comm1dy, ierr )
	1210	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
[4429]	1211	#else
	1212	!
[4649]	1213	!-- Next line required to avoid compile error about unused dummy argument in serial mode
	1214	i = SIZE( f_out )
[1111]	1215	#endif
[1]	1216
[4649]	1217	END SUBROUTINE fftx_tr_xy
[1]	1218
	1219
[4649]	1220	!--------------------------------------------------------------------------------------------------!
[1682]	1221	! Description:
	1222	! ------------
[4649]	1223	!> Transposition y --> x with a subsequent backward Fourier transformation for a 1d-decomposition
	1224	!> along x.
	1225	!--------------------------------------------------------------------------------------------------!
	1226	SUBROUTINE tr_yx_fftx( f_in, f_out )
[1]	1227
[1682]	1228
[4649]	1229	USE control_parameters, &
	1230	ONLY: loop_optimization
[1320]	1231
[4649]	1232	USE cpulog, &
	1233	ONLY: cpu_log, &
	1234	log_point_s
[1320]	1235
[4649]	1236	USE kinds
[1320]	1237
[4649]	1238	USE pegrid
[1]	1239
[4649]	1240	IMPLICIT NONE
[1]	1241
[4649]	1242	INTEGER(iwp) :: i !<
	1243	INTEGER(iwp) :: j !<
	1244	INTEGER(iwp) :: k !<
[1]	1245
[4649]	1246	REAL(wp), DIMENSION(nny,1:nz,nxl_y:nxr_y,pdims(2)) :: f_in !<
	1247	REAL(wp), DIMENSION(1:nz,nys:nyn,0:nx) :: f_out !<
	1248	REAL(wp), DIMENSION(nys:nyn,1:nz,0:nx) :: work !<
	1249	REAL(wp), DIMENSION(0:nx,1:nz,nys:nyn) :: work_fftx !<
[1]	1250
[4429]	1251
[1]	1252	!
[4649]	1253	!-- Transpose array
[1111]	1254	#if defined( __parallel )
[4649]	1255	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
	1256	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
	1257	CALL MPI_ALLTOALL( f_in(1,1,nxl_y,1), sendrecvcount_xy, MPI_REAL, work(nys,1,0), &
	1258	sendrecvcount_xy, MPI_REAL, comm1dy, ierr )
	1259	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
[4429]	1260	#else
	1261	!
[4649]	1262	!-- Next line required to avoid compile error about unused dummy argument in serial mode
	1263	i = SIZE( f_in )
[1111]	1264	#endif
[1]	1265
	1266	!
[4649]	1267	!-- Carry out the FFT along x, where all data are present due to the 1d-decomposition along y.
	1268	!-- Resort the data in a way that y becomes the first index.
	1269	CALL cpu_log( log_point_s(4), 'fft_x_1d', 'continue' )
[1]	1270
[4649]	1271	IF ( loop_optimization == 'vector' ) THEN
[1]	1272	!
[4649]	1273	!-- Code optimized for vector processors
	1274	!$OMP PARALLEL PRIVATE ( i, j, k )
	1275	!$OMP DO
	1276	DO j = nys, nyn
[1]	1277
[4649]	1278	DO k = 1, nz
	1279	DO i = 0, nx
	1280	work_fftx(i,k,j) = work(j,k,i)
[1]	1281	ENDDO
[4649]	1282	ENDDO
[1]	1283
[4649]	1284	CALL fft_x_m( work_fftx(:,:,j), 'backward' )
[1]	1285
[4649]	1286	ENDDO
[1]	1287
[4649]	1288	!$OMP DO
	1289	DO i = 0, nx
	1290	DO j = nys, nyn
	1291	DO k = 1, nz
	1292	f_out(k,j,i) = work_fftx(i,k,j)
[1]	1293	ENDDO
	1294	ENDDO
[4649]	1295	ENDDO
	1296	!$OMP END PARALLEL
[1]	1297
[4649]	1298	ELSE
[1]	1299
	1300	!
[4649]	1301	!-- Cache optimized code (there might be still a potential for better optimization).
	1302	!$OMP PARALLEL PRIVATE (i,j,k)
	1303	!$OMP DO
	1304	DO j = nys, nyn
	1305	DO k = 1, nz
[1]	1306
[4649]	1307	DO i = 0, nx
	1308	work_fftx(i,k,j) = work(j,k,i)
	1309	ENDDO
[1]	1310
[4649]	1311	CALL fft_x_1d( work_fftx(0:nx,k,j), 'backward' )
[1]	1312
	1313	ENDDO
[4649]	1314	ENDDO
[1]	1315
[4649]	1316	!$OMP DO
	1317	DO i = 0, nx
	1318	DO j = nys, nyn
	1319	DO k = 1, nz
	1320	f_out(k,j,i) = work_fftx(i,k,j)
[1]	1321	ENDDO
	1322	ENDDO
[4649]	1323	ENDDO
	1324	!$OMP END PARALLEL
[1]	1325
[4649]	1326	ENDIF
	1327	CALL cpu_log( log_point_s(4), 'fft_x_1d', 'stop' )
[1]	1328
[4649]	1329	END SUBROUTINE tr_yx_fftx
[1]	1330
	1331
[4649]	1332	!--------------------------------------------------------------------------------------------------!
[1682]	1333	! Description:
	1334	! ------------
[4649]	1335	!> FFT along y, solution of the tridiagonal system and backward FFT for a 1d-decomposition along y.
[1682]	1336	!>
[4649]	1337	!> @warning This subroutine may still not work for hybrid parallelization with OpenMP (for possible
	1338	!> necessary changes see the original routine poisfft_hybrid, developed by Klaus Ketelsen,
	1339	!> May 2002)
	1340	!--------------------------------------------------------------------------------------------------!
	1341	SUBROUTINE ffty_tri_ffty( ar )
[1]	1342
[1682]	1343
[4649]	1344	USE control_parameters, &
	1345	ONLY: loop_optimization
[1320]	1346
[4649]	1347	USE cpulog, &
	1348	ONLY: cpu_log, &
	1349	log_point_s
[1320]	1350
[4649]	1351	USE grid_variables, &
	1352	ONLY: ddx2, &
	1353	ddy2
[1320]	1354
[4649]	1355	USE kinds
[1320]	1356
[4649]	1357	USE pegrid
[1]	1358
[4649]	1359	IMPLICIT NONE
[1]	1360
[4649]	1361	INTEGER(iwp) :: i !<
	1362	INTEGER(iwp) :: j !<
	1363	INTEGER(iwp) :: k !<
	1364	INTEGER(iwp) :: m !<
	1365	INTEGER(iwp) :: n !<
	1366	!$ INTEGER(iwp) :: omp_get_thread_num !<
	1367	INTEGER(iwp) :: tn !<
[1]	1368
[4649]	1369	REAL(wp), DIMENSION(0:ny) :: work_ffty !<
	1370	REAL(wp), DIMENSION(0:ny,1:nz) :: work_triy !<
	1371	REAL(wp), DIMENSION(nny,1:nz,nxl_y:nxr_y,pdims(2)) :: ar !<
	1372	REAL(wp), DIMENSION(:,:,:,:), ALLOCATABLE :: tri !<
[1]	1373
	1374
[4649]	1375	CALL cpu_log( log_point_s(39), 'fft_y_1d + tridia', 'start' )
[1]	1376
[4649]	1377	ALLOCATE( tri(5,0:ny,0:nz-1,0:threads_per_task-1) )
[1]	1378
[4649]	1379	tn = 0 ! Default thread number in case of one thread
	1380	!$OMP PARALLEL DO PRIVATE ( i, j, k, m, n, tn, work_ffty, work_triy )
	1381	DO i = nxl_y, nxr_y
[1]	1382
[4649]	1383	!$ tn = omp_get_thread_num()
[1]	1384
[4649]	1385	IF ( loop_optimization == 'vector' ) THEN
[1]	1386	!
[4649]	1387	!-- Code optimized for vector processors
	1388	DO k = 1, nz
[1]	1389
[4649]	1390	m = 0
	1391	DO n = 1, pdims(2)
	1392	DO j = 1, nny
	1393	work_triy(m,k) = ar(j,k,i,n)
	1394	m = m + 1
[1]	1395	ENDDO
	1396	ENDDO
	1397
[4649]	1398	ENDDO
[1]	1399
[4649]	1400	CALL fft_y_m( work_triy, ny, 'forward' )
	1401
	1402	ELSE
[1]	1403	!
[4649]	1404	!-- Cache optimized code
	1405	DO k = 1, nz
[1]	1406
[4649]	1407	m = 0
	1408	DO n = 1, pdims(2)
	1409	DO j = 1, nny
	1410	work_ffty(m) = ar(j,k,i,n)
	1411	m = m + 1
[1]	1412	ENDDO
[4649]	1413	ENDDO
[1]	1414
[4649]	1415	CALL fft_y_1d( work_ffty, 'forward' )
[1]	1416
[4649]	1417	DO j = 0, ny
	1418	work_triy(j,k) = work_ffty(j)
[1]	1419	ENDDO
	1420
[4649]	1421	ENDDO
[1]	1422
[4649]	1423	ENDIF
	1424
[1]	1425	!
[4649]	1426	!-- Solve the linear equation system
	1427	CALL tridia_1dd( ddy2, ddx2, ny, nx, i, work_triy, tri(:,:,:,tn) )
[1]	1428
[4649]	1429	IF ( loop_optimization == 'vector' ) THEN
[1]	1430	!
[4649]	1431	!-- Code optimized for vector processors
	1432	CALL fft_y_m( work_triy, ny, 'backward' )
[1]	1433
[4649]	1434	DO k = 1, nz
[1]	1435
[4649]	1436	m = 0
	1437	DO n = 1, pdims(2)
	1438	DO j = 1, nny
	1439	ar(j,k,i,n) = work_triy(m,k)
	1440	m = m + 1
[1]	1441	ENDDO
	1442	ENDDO
	1443
[4649]	1444	ENDDO
	1445
	1446	ELSE
[1]	1447	!
[4649]	1448	!-- Cache optimized code
	1449	DO k = 1, nz
[1]	1450
[4649]	1451	DO j = 0, ny
	1452	work_ffty(j) = work_triy(j,k)
	1453	ENDDO
[1]	1454
[4649]	1455	CALL fft_y_1d( work_ffty, 'backward' )
[1]	1456
[4649]	1457	m = 0
	1458	DO n = 1, pdims(2)
	1459	DO j = 1, nny
	1460	ar(j,k,i,n) = work_ffty(m)
	1461	m = m + 1
[1]	1462	ENDDO
	1463	ENDDO
	1464
[4649]	1465	ENDDO
[1]	1466
[4649]	1467	ENDIF
[1]	1468
[4649]	1469	ENDDO
[1]	1470
[4649]	1471	DEALLOCATE( tri )
[1]	1472
[4649]	1473	CALL cpu_log( log_point_s(39), 'fft_y_1d + tridia', 'stop' )
[1]	1474
[4649]	1475	END SUBROUTINE ffty_tri_ffty
	1476
[4671]	1477	END MODULE poisfft_mod

Note: See TracBrowser for help on using the repository browser.

Download in other formats:

| Impressum | ©Leibniz Universität Hannover |