Home

Context Navigation

source: palm/trunk/SOURCE/poisfft_mod.f90 @ 4662

Last change on this file since 4662 was 4649, checked in by raasch, 4 years ago
files re-formatted to follow the PALM coding standard
Property svn:keywords set to `Id`
File size: 45.5 KB

Rev	Line
[1850]	1	!> @file poisfft_mod.f90
[4649]	2	!--------------------------------------------------------------------------------------------------!
[2696]	3	! This file is part of the PALM model system.
[1036]	4	!
[4649]	5	! PALM is free software: you can redistribute it and/or modify it under the terms of the GNU General
	6	! Public License as published by the Free Software Foundation, either version 3 of the License, or
	7	! (at your option) any later version.
[1036]	8	!
[4649]	9	! PALM is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the
	10	! implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General
	11	! Public License for more details.
[1036]	12	!
[4649]	13	! You should have received a copy of the GNU General Public License along with PALM. If not, see
	14	! <http://www.gnu.org/licenses/>.
[1036]	15	!
[4360]	16	! Copyright 1997-2020 Leibniz Universitaet Hannover
[4649]	17	!--------------------------------------------------------------------------------------------------!
[1036]	18	!
[4649]	19	!
[484]	20	! Current revisions:
[1]	21	! -----------------
[4649]	22	!
	23	!
[1321]	24	! Former revisions:
	25	! -----------------
	26	! $Id: poisfft_mod.f90 4649 2020-08-25 12:11:17Z pavelkrc $
[4649]	27	! File re-formatted to follow the PALM coding standard
	28	!
	29	!
	30	! 4429 2020-02-27 15:24:30Z raasch
[4429]	31	! Statements added to avoid compile errors due to unused dummy arguments in serial mode
[4649]	32	!
[4429]	33	! 4366 2020-01-09 08:12:43Z raasch
[4649]	34	! Modification concerning NEC vectorizatio
	35	!
[4366]	36	! 4360 2020-01-07 11:25:50Z suehring
[4182]	37	! Corrected "Former revisions" section
[4649]	38	!
[4182]	39	! 3690 2019-01-22 22:56:42Z knoop
[3634]	40	! OpenACC port for SPEC
[2716]	41	!
[4182]	42	! Revision 1.1 1997/07/24 11:24:14 raasch
	43	! Initial revision
	44	!
	45	!
[4649]	46	!--------------------------------------------------------------------------------------------------!
[1]	47	! Description:
	48	! ------------
[1682]	49	!> Solves the Poisson equation with a 2D spectral method
	50	!> d^2 p / dx^2 + d^2 p / dy^2 + d^2 p / dz^2 = s
	51	!>
	52	!> Input:
[4649]	53	!> real ar contains (nnz,nny,nnx) elements of the velocity divergence, starting from (1,nys,nxl)
[1682]	54	!>
	55	!> Output:
[4649]	56	!> real ar contains the solution for perturbation pressure p
	57	!--------------------------------------------------------------------------------------------------!
[1682]	58	MODULE poisfft_mod
[1]	59
[4649]	60
	61	USE fft_xy, &
	62	ONLY: fft_init, &
	63	fft_y, &
	64	fft_y_1d, &
	65	fft_y_m, &
	66	fft_x, &
	67	fft_x_1d, &
	68	fft_x_m, &
[4366]	69	temperton_fft_vec
[1]	70
[4649]	71	USE indices, &
	72	ONLY: nnx, &
	73	nny, &
	74	nx, &
	75	nxl, &
	76	nxr, &
	77	ny, &
	78	nys, &
	79	nyn, &
	80	nz
[1320]	81
[4649]	82	USE transpose_indices, &
	83	ONLY: nxl_y, &
	84	nxl_z, &
	85	nxr_y, &
	86	nxr_z, &
	87	nys_x, &
	88	nys_z, &
	89	nyn_x, &
	90	nyn_z, &
	91	nzb_x, &
	92	nzb_y, &
	93	nzt_x, &
	94	nzt_y
[1320]	95
[4649]	96	USE tridia_solver, &
	97	ONLY: tridia_1dd, &
	98	tridia_init, &
	99	tridia_substi, &
	100	tridia_substi_overlap
[1320]	101
[1]	102	IMPLICIT NONE
	103
[4649]	104	LOGICAL, SAVE :: poisfft_initialized = .FALSE. !<
[1111]	105
[1]	106	PRIVATE
[807]	107
[1]	108	PUBLIC poisfft, poisfft_init
	109
	110	INTERFACE poisfft
	111	MODULE PROCEDURE poisfft
	112	END INTERFACE poisfft
	113
	114	INTERFACE poisfft_init
	115	MODULE PROCEDURE poisfft_init
	116	END INTERFACE poisfft_init
	117
[807]	118
[1]	119	CONTAINS
	120
[4649]	121	!--------------------------------------------------------------------------------------------------!
[1682]	122	! Description:
	123	! ------------
[3241]	124	!> Setup coefficients for FFT and the tridiagonal solver
[4649]	125	!--------------------------------------------------------------------------------------------------!
	126	SUBROUTINE poisfft_init
[1]	127
[4649]	128	IMPLICIT NONE
[1111]	129
	130
[4649]	131	CALL fft_init
[1]	132
[4649]	133	CALL tridia_init
[1111]	134
[4649]	135	poisfft_initialized = .TRUE.
[1111]	136
[4649]	137	END SUBROUTINE poisfft_init
[1]	138
[1111]	139
[1804]	140
[4649]	141	!--------------------------------------------------------------------------------------------------!
[1682]	142	! Description:
	143	! ------------
	144	!> Two-dimensional Fourier Transformation in x- and y-direction.
[4649]	145	!--------------------------------------------------------------------------------------------------!
	146	SUBROUTINE poisfft( ar )
[1]	147
[4649]	148	USE control_parameters, &
	149	ONLY: transpose_compute_overlap
[1320]	150
[4649]	151	USE cpulog, &
	152	ONLY: cpu_log, &
	153	cpu_log_nowait, &
	154	log_point_s
[1320]	155
[4649]	156	USE kinds
[1320]	157
[4649]	158	USE pegrid
[1]	159
[4649]	160	IMPLICIT NONE
[1]	161
[4649]	162	INTEGER(iwp) :: ii !<
	163	INTEGER(iwp) :: iind !<
	164	INTEGER(iwp) :: inew !<
	165	INTEGER(iwp) :: jj !<
	166	INTEGER(iwp) :: jind !<
	167	INTEGER(iwp) :: jnew !<
	168	INTEGER(iwp) :: ki !<
	169	INTEGER(iwp) :: kk !<
	170	INTEGER(iwp) :: knew !<
	171	INTEGER(iwp) :: n !<
	172	INTEGER(iwp) :: nblk !<
	173	INTEGER(iwp) :: nnx_y !<
	174	INTEGER(iwp) :: nny_z !<
	175	INTEGER(iwp) :: nnz_x !<
	176	INTEGER(iwp) :: nxl_y_bound !<
	177	INTEGER(iwp) :: nxr_y_bound !<
[1]	178
[4649]	179	INTEGER(iwp), DIMENSION(4) :: isave !<
[1320]	180
[4649]	181	REAL(wp), DIMENSION(1:nz,nys:nyn,nxl:nxr) :: ar !<
	182	REAL(wp), DIMENSION(nys:nyn,nxl:nxr,1:nz) :: ar_inv !<
[3690]	183
	184	#define __acc_fft_device ( defined( _OPENACC ) && ( defined ( __cuda_fft ) ) )
	185	#if __acc_fft_device
[4649]	186	!$ACC DECLARE CREATE(ar_inv)
[3690]	187	#endif
[1]	188
[4649]	189	REAL(wp), DIMENSION(:,:,:), ALLOCATABLE :: ar1 !<
	190	REAL(wp), DIMENSION(:,:,:), ALLOCATABLE :: f_in !<
	191	REAL(wp), DIMENSION(:,:,:), ALLOCATABLE :: f_inv !<
	192	REAL(wp), DIMENSION(:,:,:), ALLOCATABLE :: f_out_y !<
	193	REAL(wp), DIMENSION(:,:,:), ALLOCATABLE :: f_out_z !<
[1216]	194
	195
[4649]	196	CALL cpu_log( log_point_s(3), 'poisfft', 'start' )
[1]	197
[4649]	198	IF ( .NOT. poisfft_initialized ) CALL poisfft_init
[1111]	199
[3690]	200	#if !__acc_fft_device
[4649]	201	!$ACC UPDATE HOST(ar)
[3690]	202	#endif
	203
[3634]	204	#ifndef _OPENACC
[1]	205	!
[4649]	206	!-- Two-dimensional Fourier Transformation in x- and y-direction.
	207	IF ( pdims(2) == 1 .AND. pdims(1) > 1 ) THEN
[1]	208
	209	!
[4649]	210	!-- 1d-domain-decomposition along x:
	211	!-- FFT along y and transposition y --> x
	212	CALL ffty_tr_yx( ar, ar )
[1]	213
	214	!
[4649]	215	!-- FFT along x, solving the tridiagonal system and backward FFT
	216	CALL fftx_tri_fftx( ar )
[1]	217
	218	!
[4649]	219	!-- Transposition x --> y and backward FFT along y
	220	CALL tr_xy_ffty( ar, ar )
[1]	221
[4649]	222	ELSEIF ( pdims(1) == 1 .AND. pdims(2) > 1 ) THEN
[1]	223
	224	!
[4649]	225	!-- 1d-domain-decomposition along y:
	226	!-- FFT along x and transposition x --> y
	227	CALL fftx_tr_xy( ar, ar )
[1]	228
	229	!
[4649]	230	!-- FFT along y, solving the tridiagonal system and backward FFT
	231	CALL ffty_tri_ffty( ar )
[1]	232
	233	!
[4649]	234	!-- Transposition y --> x and backward FFT along x
	235	CALL tr_yx_fftx( ar, ar )
[1]	236
[4649]	237	ELSEIF ( .NOT. transpose_compute_overlap ) THEN
[3634]	238	#endif
[1]	239
	240	!
[4649]	241	!-- 2d-domain-decomposition or no decomposition (1 PE run)
	242	!-- Transposition z --> x
	243	CALL cpu_log( log_point_s(5), 'transpo forward', 'start' )
	244	CALL resort_for_zx( ar, ar_inv )
	245	CALL transpose_zx( ar_inv, ar )
	246	CALL cpu_log( log_point_s(5), 'transpo forward', 'pause' )
[1]	247
[4649]	248	CALL cpu_log( log_point_s(4), 'fft_x', 'start' )
	249	IF ( temperton_fft_vec ) THEN
[4366]	250	!
[4649]	251	!-- Vector version outputs a transformed array ar_inv that does not require resorting
	252	!-- (which is done for ar further below)
	253	CALL fft_x( ar, 'forward', ar_inv=ar_inv)
	254	ELSE
	255	CALL fft_x( ar, 'forward')
	256	ENDIF
	257	CALL cpu_log( log_point_s(4), 'fft_x', 'pause' )
[1]	258
	259	!
[4649]	260	!-- Transposition x --> y
	261	CALL cpu_log( log_point_s(5), 'transpo forward', 'continue' )
	262	IF( .NOT. temperton_fft_vec ) CALL resort_for_xy( ar, ar_inv )
	263	CALL transpose_xy( ar_inv, ar )
	264	CALL cpu_log( log_point_s(5), 'transpo forward', 'pause' )
[1]	265
[4649]	266	CALL cpu_log( log_point_s(7), 'fft_y', 'start' )
	267	IF ( temperton_fft_vec ) THEN
[4366]	268	!
[4649]	269	!-- Input array ar_inv from fft_x can be directly used here.
	270	!-- The output (also in array ar_inv) does not require resorting below.
	271	CALL fft_y( ar, 'forward', ar_inv = ar_inv, nxl_y_bound = nxl_y, nxr_y_bound = nxr_y, &
	272	nxl_y_l = nxl_y, nxr_y_l = nxr_y )
	273	ELSE
	274	CALL fft_y( ar, 'forward', ar_tr = ar, nxl_y_bound = nxl_y, nxr_y_bound = nxr_y, &
	275	nxl_y_l = nxl_y, nxr_y_l = nxr_y )
	276	ENDIF
	277	CALL cpu_log( log_point_s(7), 'fft_y', 'pause' )
[1]	278
	279	!
[4649]	280	!-- Transposition y --> z
	281	CALL cpu_log( log_point_s(5), 'transpo forward', 'continue' )
	282	IF ( .NOT. temperton_fft_vec ) CALL resort_for_yz( ar, ar_inv )
	283	CALL transpose_yz( ar_inv, ar )
	284	CALL cpu_log( log_point_s(5), 'transpo forward', 'stop' )
[1]	285
	286	!
[4649]	287	!-- Solve the tridiagonal equation system along z
	288	CALL cpu_log( log_point_s(6), 'tridia', 'start' )
	289	CALL tridia_substi( ar )
	290	CALL cpu_log( log_point_s(6), 'tridia', 'stop' )
[1]	291
	292	!
[4649]	293	!-- Inverse Fourier Transformation
	294	!-- Transposition z --> y
	295	CALL cpu_log( log_point_s(8), 'transpo invers', 'start' )
	296	CALL transpose_zy( ar, ar_inv )
[4366]	297	!
[4649]	298	!-- The fft_y below (vector branch) can directly process ar_inv (i.e. does not require a
	299	!-- resorting)
	300	IF ( .NOT. temperton_fft_vec ) CALL resort_for_zy( ar_inv, ar )
	301	CALL cpu_log( log_point_s(8), 'transpo invers', 'pause' )
[1]	302
[4649]	303	CALL cpu_log( log_point_s(7), 'fft_y', 'continue' )
	304	IF ( temperton_fft_vec ) THEN
[4366]	305	!
[4649]	306	!-- Output array ar_inv can be used as input to the below fft_x routine without resorting
	307	CALL fft_y( ar, 'backward', ar_inv = ar_inv, nxl_y_bound = nxl_y, nxr_y_bound = nxr_y, &
	308	nxl_y_l = nxl_y, nxr_y_l = nxr_y )
	309	ELSE
	310	CALL fft_y( ar, 'backward', ar_tr = ar, nxl_y_bound = nxl_y, nxr_y_bound = nxr_y, &
	311	nxl_y_l = nxl_y, nxr_y_l = nxr_y )
	312	ENDIF
[4366]	313
[4649]	314	CALL cpu_log( log_point_s(7), 'fft_y', 'stop' )
[1]	315
	316	!
[4649]	317	!-- Transposition y --> x
	318	CALL cpu_log( log_point_s(8), 'transpo invers', 'continue' )
	319	CALL transpose_yx( ar, ar_inv )
	320	IF ( .NOT. temperton_fft_vec ) CALL resort_for_yx( ar_inv, ar )
	321	CALL cpu_log( log_point_s(8), 'transpo invers', 'pause' )
[1]	322
[4649]	323	CALL cpu_log( log_point_s(4), 'fft_x', 'continue' )
	324	IF ( temperton_fft_vec ) THEN
	325	CALL fft_x( ar, 'backward', ar_inv=ar_inv )
	326	ELSE
	327	CALL fft_x( ar, 'backward' )
	328	ENDIF
	329	CALL cpu_log( log_point_s(4), 'fft_x', 'stop' )
[1]	330
	331	!
[4649]	332	!-- Transposition x --> z
	333	CALL cpu_log( log_point_s(8), 'transpo invers', 'continue' )
	334	CALL transpose_xz( ar, ar_inv )
	335	CALL resort_for_xz( ar_inv, ar )
	336	CALL cpu_log( log_point_s(8), 'transpo invers', 'stop' )
[1]	337
[3634]	338	#ifndef _OPENACC
[4649]	339	ELSE
[1216]	340
	341	!
[4649]	342	!-- 2d-domain-decomposition or no decomposition (1 PE run) with overlapping transposition / fft
	343	!-- cputime logging must not use barriers, which would prevent overlapping
	344	ALLOCATE( f_out_y(0:ny,nxl_y:nxr_y,nzb_y:nzt_y), &
	345	f_out_z(0:nx,nys_x:nyn_x,nzb_x:nzt_x) )
[1216]	346	!
[4649]	347	!-- Transposition z --> x + subsequent fft along x
	348	ALLOCATE( f_inv(nys:nyn,nxl:nxr,1:nz) )
	349	CALL resort_for_zx( ar, f_inv )
[1216]	350	!
[4649]	351	!-- Save original indices and gridpoint counter
	352	isave(1) = nz
	353	isave(2) = nzb_x
	354	isave(3) = nzt_x
	355	isave(4) = sendrecvcount_zx
[1216]	356	!
[4649]	357	!-- Set new indices for transformation
	358	nblk = nz / pdims(1)
	359	nz = pdims(1)
	360	nnz_x = 1
	361	nzb_x = 1 + myidx * nnz_x
	362	nzt_x = ( myidx + 1 ) * nnz_x
	363	sendrecvcount_zx = nnx * nny * nnz_x
[1216]	364
[4649]	365	ALLOCATE( ar1(0:nx,nys_x:nyn_x,nzb_x:nzt_x) )
	366	ALLOCATE( f_in(nys:nyn,nxl:nxr,1:nz) )
[1216]	367
[4649]	368	DO kk = 1, nblk
[1216]	369
[4649]	370	IF ( kk == 1 ) THEN
	371	CALL cpu_log( log_point_s(5), 'transpo forward', 'start', cpu_log_nowait )
	372	ELSE
	373	CALL cpu_log( log_point_s(5), 'transpo forward', 'continue', cpu_log_nowait )
	374	ENDIF
[1216]	375
[4649]	376	DO knew = 1, nz
	377	ki = kk + nblk * ( knew - 1 )
	378	f_in(:,:,knew) = f_inv(:,:,ki)
	379	ENDDO
[1216]	380
[4649]	381	CALL transpose_zx( f_in, ar1(:,:,:))
	382	CALL cpu_log( log_point_s(5), 'transpo forward', 'pause' )
[1216]	383
[4649]	384	IF ( kk == 1 ) THEN
	385	CALL cpu_log( log_point_s(4), 'fft_x', 'start', cpu_log_nowait )
	386	ELSE
	387	CALL cpu_log( log_point_s(4), 'fft_x', 'continue', cpu_log_nowait )
	388	ENDIF
[1216]	389
[4649]	390	n = isave(2) + kk - 1
	391	CALL fft_x( ar1(:,:,:), 'forward', ar_2d = f_out_z(:,:,n) )
	392	CALL cpu_log( log_point_s(4), 'fft_x', 'pause' )
[1216]	393
[4649]	394	ENDDO
[1216]	395	!
[4649]	396	!-- Restore original indices/counters
	397	nz = isave(1)
	398	nzb_x = isave(2)
	399	nzt_x = isave(3)
	400	sendrecvcount_zx = isave(4)
[1216]	401
[4649]	402	DEALLOCATE( ar1, f_in, f_inv )
[1216]	403
	404	!
[4649]	405	!-- Transposition x --> y + subsequent fft along y
	406	ALLOCATE( f_inv(nys_x:nyn_x,nzb_x:nzt_x,0:nx) )
	407	CALL resort_for_xy( f_out_z, f_inv )
[1216]	408	!
[4649]	409	!-- Save original indices and gridpoint counter
	410	isave(1) = nx
	411	isave(2) = nxl_y
	412	isave(3) = nxr_y
	413	isave(4) = sendrecvcount_xy
[1216]	414	!
[4649]	415	!-- Set new indices for transformation
	416	nblk = ( ( nx+1 ) / pdims(2) ) - 1
	417	nx = pdims(2)
	418	nnx_y = 1
	419	nxl_y = myidy * nnx_y
	420	nxr_y = ( myidy + 1 ) * nnx_y - 1
	421	sendrecvcount_xy = nnx_y * ( nyn_x-nys_x+1 ) * ( nzt_x-nzb_x+1 )
[1216]	422
[4649]	423	ALLOCATE( ar1(0:ny,nxl_y:nxr_y,nzb_y:nzt_y) )
	424	ALLOCATE( f_in(nys_x:nyn_x,nzb_x:nzt_x,0:nx) )
[1216]	425
[4649]	426	DO ii = 0, nblk
[1216]	427
[4649]	428	CALL cpu_log( log_point_s(5), 'transpo forward', 'continue', cpu_log_nowait )
[1216]	429
[4649]	430	DO inew = 0, nx-1
	431	iind = ii + ( nblk + 1 ) * inew
	432	f_in(:,:,inew) = f_inv(:,:,iind)
	433	ENDDO
[1216]	434
[4649]	435	CALL transpose_xy( f_in, ar1(:,:,:) )
[1216]	436
[4649]	437	CALL cpu_log( log_point_s(5), 'transpo forward', 'pause' )
[1216]	438
[4649]	439	IF ( ii == 1 ) THEN
	440	CALL cpu_log( log_point_s(7), 'fft_y', 'start', cpu_log_nowait )
	441	ELSE
	442	CALL cpu_log( log_point_s(7), 'fft_y', 'continue', cpu_log_nowait )
	443	ENDIF
[1216]	444
[4649]	445	nxl_y_bound = isave(2)
	446	nxr_y_bound = isave(3)
	447	n = isave(2) + ii
	448	CALL fft_y( ar1(:,:,:), 'forward', ar_tr = f_out_y, nxl_y_bound = nxl_y_bound, &
	449	nxr_y_bound = nxr_y_bound, nxl_y_l = n, nxr_y_l = n )
[1216]	450
[4649]	451	CALL cpu_log( log_point_s(7), 'fft_y', 'pause' )
[1216]	452
[4649]	453	ENDDO
[1216]	454	!
[4649]	455	!-- Restore original indices/counters
	456	nx = isave(1)
	457	nxl_y = isave(2)
	458	nxr_y = isave(3)
	459	sendrecvcount_xy = isave(4)
[1216]	460
[4649]	461	DEALLOCATE( ar1, f_in, f_inv )
[1216]	462
	463	!
[4649]	464	!-- Transposition y --> z + subsequent tridia + resort for z --> y
	465	ALLOCATE( f_inv(nxl_y:nxr_y,nzb_y:nzt_y,0:ny) )
	466	CALL resort_for_yz( f_out_y, f_inv )
[1216]	467	!
[4649]	468	!-- Save original indices and gridpoint counter
	469	isave(1) = ny
	470	isave(2) = nys_z
	471	isave(3) = nyn_z
	472	isave(4) = sendrecvcount_yz
[1216]	473	!
[4649]	474	!-- Set new indices for transformation
	475	nblk = ( ( ny+1 ) / pdims(1) ) - 1
	476	ny = pdims(1)
	477	nny_z = 1
	478	nys_z = myidx * nny_z
	479	nyn_z = ( myidx + 1 ) * nny_z - 1
	480	sendrecvcount_yz = ( nxr_y-nxl_y+1 ) * nny_z * ( nzt_y-nzb_y+1 )
[1216]	481
[4649]	482	ALLOCATE( ar1(nxl_z:nxr_z,nys_z:nyn_z,1:nz) )
	483	ALLOCATE( f_in(nxl_y:nxr_y,nzb_y:nzt_y,0:ny) )
[1216]	484
[4649]	485	DO jj = 0, nblk
[1216]	486	!
[4649]	487	!-- Forward Fourier Transformation
	488	!-- Transposition y --> z
	489	CALL cpu_log( log_point_s(5), 'transpo forward', 'continue', cpu_log_nowait )
[1216]	490
[4649]	491	DO jnew = 0, ny-1
	492	jind = jj + ( nblk + 1 ) * jnew
	493	f_in(:,:,jnew) = f_inv(:,:,jind)
	494	ENDDO
[1216]	495
[4649]	496	CALL transpose_yz( f_in, ar1(:,:,:) )
[1216]	497
[4649]	498	IF ( jj == nblk ) THEN
	499	CALL cpu_log( log_point_s(5), 'transpo forward', 'stop' )
	500	ELSE
	501	CALL cpu_log( log_point_s(5), 'transpo forward', 'pause' )
	502	ENDIF
[1216]	503
	504	!
[4649]	505	!-- Solve the tridiagonal equation system along z
	506	CALL cpu_log( log_point_s(6), 'tridia', 'start', cpu_log_nowait )
[1216]	507
[4649]	508	n = isave(2) + jj
	509	CALL tridia_substi_overlap( ar1(:,:,:), n )
[1216]	510
[4649]	511	CALL cpu_log( log_point_s(6), 'tridia', 'stop' )
[1216]	512
[1306]	513	!
[4649]	514	!-- Inverse Fourier Transformation
	515	!-- Transposition z --> y
	516	!-- Only one thread should call MPI routines, therefore forward and backward tranpose are in
	517	!-- the same section
	518	IF ( jj == 0 ) THEN
	519	CALL cpu_log( log_point_s(8), 'transpo invers', 'start', cpu_log_nowait )
	520	ELSE
	521	CALL cpu_log( log_point_s(8), 'transpo invers', 'continue', cpu_log_nowait )
	522	ENDIF
[1216]	523
[4649]	524	CALL transpose_zy( ar1(:,:,:), f_in )
[1216]	525
[4649]	526	DO jnew = 0, ny-1
	527	jind = jj + ( nblk + 1 ) * jnew
	528	f_inv(:,:,jind) = f_in(:,:,jnew)
	529	ENDDO
[1216]	530
[4649]	531	CALL cpu_log( log_point_s(8), 'transpo invers', 'pause' )
[1216]	532
[4649]	533	ENDDO
[1216]	534	!
[4649]	535	!-- Restore original indices/counters
	536	ny = isave(1)
	537	nys_z = isave(2)
	538	nyn_z = isave(3)
	539	sendrecvcount_yz = isave(4)
[1216]	540
[4649]	541	CALL resort_for_zy( f_inv, f_out_y )
[1216]	542
[4649]	543	DEALLOCATE( ar1, f_in, f_inv )
[1216]	544
	545	!
[4649]	546	!-- fft along y backward + subsequent transposition y --> x
	547	ALLOCATE( f_inv(nys_x:nyn_x,nzb_x:nzt_x,0:nx) )
[1216]	548	!
[4649]	549	!-- Save original indices and gridpoint counter
	550	isave(1) = nx
	551	isave(2) = nxl_y
	552	isave(3) = nxr_y
	553	isave(4) = sendrecvcount_xy
[1216]	554	!
[4649]	555	!-- Set new indices for transformation
	556	nblk = ( ( nx+1 ) / pdims(2) ) - 1
	557	nx = pdims(2)
	558	nnx_y = 1
	559	nxl_y = myidy * nnx_y
	560	nxr_y = ( myidy + 1 ) * nnx_y - 1
	561	sendrecvcount_xy = nnx_y * ( nyn_x-nys_x+1 ) * ( nzt_x-nzb_x+1 )
[1216]	562
[4649]	563	ALLOCATE( ar1(0:ny,nxl_y:nxr_y,nzb_y:nzt_y) )
	564	ALLOCATE( f_in(nys_x:nyn_x,nzb_x:nzt_x,0:nx) )
[1216]	565
[4649]	566	DO ii = 0, nblk
[1216]	567
[4649]	568	CALL cpu_log( log_point_s(7), 'fft_y', 'continue', cpu_log_nowait )
[1216]	569
[4649]	570	n = isave(2) + ii
	571	nxl_y_bound = isave(2)
	572	nxr_y_bound = isave(3)
[1216]	573
[4649]	574	CALL fft_y( ar1(:,:,:), 'backward', ar_tr = f_out_y, nxl_y_bound = nxl_y_bound, &
	575	nxr_y_bound = nxr_y_bound, nxl_y_l = n, nxr_y_l = n )
[1216]	576
[4649]	577	IF ( ii == nblk ) THEN
	578	CALL cpu_log( log_point_s(7), 'fft_y', 'stop' )
	579	ELSE
	580	CALL cpu_log( log_point_s(7), 'fft_y', 'pause' )
	581	ENDIF
[1216]	582
[4649]	583	CALL cpu_log( log_point_s(8), 'transpo invers', 'continue', cpu_log_nowait )
[1216]	584
[4649]	585	CALL transpose_yx( ar1(:,:,:), f_in )
[1216]	586
[4649]	587	DO inew = 0, nx-1
	588	iind = ii + (nblk+1) * inew
	589	f_inv(:,:,iind) = f_in(:,:,inew)
	590	ENDDO
[1216]	591
[4649]	592	CALL cpu_log( log_point_s(8), 'transpo invers', 'pause' )
[1216]	593
[4649]	594	ENDDO
[1216]	595	!
[4649]	596	!-- Restore original indices/counters
	597	nx = isave(1)
	598	nxl_y = isave(2)
	599	nxr_y = isave(3)
	600	sendrecvcount_xy = isave(4)
[1216]	601
[4649]	602	CALL resort_for_yx( f_inv, f_out_z )
[1216]	603
[4649]	604	DEALLOCATE( ar1, f_in, f_inv )
[1216]	605
	606	!
[4649]	607	!-- fft along x backward + subsequent final transposition x --> z
	608	ALLOCATE( f_inv(nys:nyn,nxl:nxr,1:nz) )
[1216]	609	!
[4649]	610	!-- Save original indices and gridpoint counter
	611	isave(1) = nz
	612	isave(2) = nzb_x
	613	isave(3) = nzt_x
	614	isave(4) = sendrecvcount_zx
[1216]	615	!
[4649]	616	!-- Set new indices for transformation
	617	nblk = nz / pdims(1)
	618	nz = pdims(1)
	619	nnz_x = 1
	620	nzb_x = 1 + myidx * nnz_x
	621	nzt_x = ( myidx + 1 ) * nnz_x
	622	sendrecvcount_zx = nnx * nny * nnz_x
[1216]	623
[4649]	624	ALLOCATE( ar1(0:nx,nys_x:nyn_x,nzb_x:nzt_x) )
	625	ALLOCATE( f_in(nys:nyn,nxl:nxr,1:nz) )
[1216]	626
[4649]	627	DO kk = 1, nblk
[1216]	628
[4649]	629	CALL cpu_log( log_point_s(4), 'fft_x', 'continue', cpu_log_nowait )
[1216]	630
[4649]	631	n = isave(2) + kk - 1
	632	CALL fft_x( ar1(:,:,:), 'backward', f_out_z(:,:,n) )
[1216]	633
[4649]	634	IF ( kk == nblk ) THEN
	635	CALL cpu_log( log_point_s(4), 'fft_x', 'stop' )
	636	ELSE
	637	CALL cpu_log( log_point_s(4), 'fft_x', 'pause' )
	638	ENDIF
[1216]	639
[4649]	640	CALL cpu_log( log_point_s(8), 'transpo invers', 'continue', cpu_log_nowait )
[1216]	641
[4649]	642	CALL transpose_xz( ar1(:,:,:), f_in )
[1216]	643
[4649]	644	DO knew = 1, nz
	645	ki = kk + nblk * ( knew - 1 )
	646	f_inv(:,:,ki) = f_in(:,:,knew)
	647	ENDDO
[1216]	648
[4649]	649	IF ( kk == nblk ) THEN
	650	CALL cpu_log( log_point_s(8), 'transpo invers', 'stop' )
	651	ELSE
	652	CALL cpu_log( log_point_s(8), 'transpo invers', 'pause' )
	653	ENDIF
[1216]	654
[4649]	655	ENDDO
[1216]	656	!
[4649]	657	!-- Restore original indices/counters
	658	nz = isave(1)
	659	nzb_x = isave(2)
	660	nzt_x = isave(3)
	661	sendrecvcount_zx = isave(4)
[1216]	662
[4649]	663	CALL resort_for_xz( f_inv, ar )
[1216]	664
[4649]	665	DEALLOCATE( ar1, f_in, f_inv )
[1216]	666
[4649]	667	ENDIF
[3634]	668	#endif
[1]	669
[3690]	670	#if !__acc_fft_device
[4649]	671	!$ACC UPDATE DEVICE(ar)
[3690]	672	#endif
	673
[4649]	674	CALL cpu_log( log_point_s(3), 'poisfft', 'stop' )
[1]	675
[4649]	676	END SUBROUTINE poisfft
[1]	677
	678
[4649]	679	!--------------------------------------------------------------------------------------------------!
[1682]	680	! Description:
	681	! ------------
[4649]	682	!> Fourier-transformation along y with subsequent transposition y --> x for a 1d-decomposition
	683	!> along x.
[1682]	684	!>
[4649]	685	!> @attention The performance of this routine is much faster on the NEC-SX6, if the first index of
	686	!> work_ffty_vec is odd. Otherwise memory bank conflicts may occur (especially if the
	687	!> index is a multiple of 128). That's why work_ffty_vec is dimensioned as 0:ny+1.
	688	!> Of course, this will not work if users are using an odd number of gridpoints along y.
	689	!--------------------------------------------------------------------------------------------------!
	690	SUBROUTINE ffty_tr_yx( f_in, f_out )
[1]	691
[4649]	692	USE control_parameters, &
	693	ONLY: loop_optimization
[1320]	694
[4649]	695	USE cpulog, &
	696	ONLY: cpu_log, &
	697	log_point_s
[1320]	698
[4649]	699	USE kinds
[1320]	700
[4649]	701	USE pegrid
[1]	702
[4649]	703	IMPLICIT NONE
[1]	704
[4649]	705	INTEGER(iwp) :: i !<
	706	INTEGER(iwp) :: iend !<
	707	INTEGER(iwp) :: iouter !<
	708	INTEGER(iwp) :: ir !<
	709	INTEGER(iwp) :: j !<
	710	INTEGER(iwp) :: k !<
[1]	711
[4649]	712	INTEGER(iwp), PARAMETER :: stridex = 4 !<
[1320]	713
[4649]	714	REAL(wp), DIMENSION(1:nz,0:ny,nxl:nxr) :: f_in !<
	715	REAL(wp), DIMENSION(nnx,1:nz,nys_x:nyn_x,pdims(1)) :: f_out !<
	716	REAL(wp), DIMENSION(nxl:nxr,1:nz,0:ny) :: work !<
[1]	717
[4649]	718	REAL(wp), DIMENSION(:,:), ALLOCATABLE :: work_ffty !<
	719	REAL(wp), DIMENSION(:,:,:), ALLOCATABLE :: work_ffty_vec !<
[2300]	720
[1]	721	!
[4649]	722	!-- Carry out the FFT along y, where all data are present due to the 1d-decomposition along x.
	723	!-- Resort the data in a way that x becomes the first index.
	724	CALL cpu_log( log_point_s(7), 'fft_y_1d', 'start' )
[1]	725
[4649]	726	IF ( loop_optimization == 'vector' ) THEN
[2300]	727
[4649]	728	ALLOCATE( work_ffty_vec(0:ny+1,1:nz,nxl:nxr) )
[1]	729	!
[4649]	730	!-- Code optimized for vector processors
	731	!$OMP PARALLEL PRIVATE ( i, j, k )
	732	!$OMP DO
	733	DO i = nxl, nxr
[1]	734
[4649]	735	DO j = 0, ny
	736	DO k = 1, nz
	737	work_ffty_vec(j,k,i) = f_in(k,j,i)
[1]	738	ENDDO
[4649]	739	ENDDO
[1]	740
[4649]	741	CALL fft_y_m( work_ffty_vec(:,:,i), ny+1, 'forward' )
[1]	742
[4649]	743	ENDDO
[1]	744
[4649]	745	!$OMP DO
	746	DO k = 1, nz
	747	DO j = 0, ny
	748	DO i = nxl, nxr
	749	work(i,k,j) = work_ffty_vec(j,k,i)
[1]	750	ENDDO
	751	ENDDO
[4649]	752	ENDDO
	753	!$OMP END PARALLEL
[1]	754
[4649]	755	DEALLOCATE( work_ffty_vec )
[2300]	756
[4649]	757	ELSE
[1]	758	!
[4649]	759	!-- Cache optimized code.
	760	ALLOCATE( work_ffty(0:ny,stridex) )
[2300]	761	!
[4649]	762	!-- The i-(x-)direction is split into a strided outer loop and an inner loop for better cache
	763	!-- performance
	764	!$OMP PARALLEL PRIVATE (i,iend,iouter,ir,j,k,work_ffty)
	765	!$OMP DO
	766	DO iouter = nxl, nxr, stridex
[1]	767
[4649]	768	iend = MIN( iouter+stridex-1, nxr ) ! Upper bound for inner i loop
[1]	769
[4649]	770	DO k = 1, nz
[1]	771
[4649]	772	DO i = iouter, iend
[1]	773
[4649]	774	ir = i-iouter+1 ! Counter within a stride
	775	DO j = 0, ny
	776	work_ffty(j,ir) = f_in(k,j,i)
	777	ENDDO
[1]	778	!
[4649]	779	!-- FFT along y
	780	CALL fft_y_1d( work_ffty(:,ir), 'forward' )
[1]	781
[4649]	782	ENDDO
[1]	783
	784	!
[4649]	785	!-- Resort
	786	DO j = 0, ny
	787	DO i = iouter, iend
	788	work(i,k,j) = work_ffty(j,i-iouter+1)
[1]	789	ENDDO
	790	ENDDO
	791
	792	ENDDO
	793
[4649]	794	ENDDO
	795	!$OMP END PARALLEL
[2300]	796
[4649]	797	DEALLOCATE( work_ffty )
[2300]	798
[4649]	799	ENDIF
[1]	800
[4649]	801	CALL cpu_log( log_point_s(7), 'fft_y_1d', 'pause' )
	802
[1]	803	!
[4649]	804	!-- Transpose array
[1111]	805	#if defined( __parallel )
[4649]	806	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
	807	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
	808	CALL MPI_ALLTOALL( work(nxl,1,0), sendrecvcount_xy, MPI_REAL, f_out(1,1,nys_x,1), &
	809	sendrecvcount_xy, MPI_REAL, comm1dx, ierr )
	810	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
[4429]	811	#else
	812	!
[4649]	813	!-- Next line required to avoid compile error about unused dummy argument in serial mode
	814	i = SIZE( f_out )
[1111]	815	#endif
[1]	816
[4649]	817	END SUBROUTINE ffty_tr_yx
[1]	818
	819
[4649]	820	!--------------------------------------------------------------------------------------------------!
[1682]	821	! Description:
	822	! ------------
[4649]	823	!> Transposition x --> y with a subsequent backward Fourier transformation for a 1d-decomposition
	824	!> along x
	825	!--------------------------------------------------------------------------------------------------!
	826	SUBROUTINE tr_xy_ffty( f_in, f_out )
[1]	827
[4649]	828	USE control_parameters, &
	829	ONLY: loop_optimization
[1320]	830
[4649]	831	USE cpulog, &
	832	ONLY: cpu_log, &
	833	log_point_s
[1320]	834
[4649]	835	USE kinds
[1320]	836
[4649]	837	USE pegrid
[1]	838
[4649]	839	IMPLICIT NONE
[1]	840
[4649]	841	INTEGER(iwp) :: i !<
	842	INTEGER(iwp) :: iend !<
	843	INTEGER(iwp) :: iouter !<
	844	INTEGER(iwp) :: ir !<
	845	INTEGER(iwp) :: j !<
	846	INTEGER(iwp) :: k !<
[1]	847
[4649]	848	INTEGER(iwp), PARAMETER :: stridex = 4 !<
[1320]	849
[4649]	850	REAL(wp), DIMENSION(nnx,1:nz,nys_x:nyn_x,pdims(1)) :: f_in !<
	851	REAL(wp), DIMENSION(1:nz,0:ny,nxl:nxr) :: f_out !<
	852	REAL(wp), DIMENSION(nxl:nxr,1:nz,0:ny) :: work !<
[1]	853
[4649]	854	REAL(wp), DIMENSION(:,:), ALLOCATABLE :: work_ffty !<
	855	REAL(wp), DIMENSION(:,:,:), ALLOCATABLE :: work_ffty_vec !<
[2300]	856
[1]	857	!
[4649]	858	!-- Transpose array
[1111]	859	#if defined( __parallel )
[4649]	860	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
	861	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
	862	CALL MPI_ALLTOALL( f_in(1,1,nys_x,1), sendrecvcount_xy, MPI_REAL, work(nxl,1,0), &
	863	sendrecvcount_xy, MPI_REAL, comm1dx, ierr )
	864	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
[4429]	865	#else
	866	!
[4649]	867	!-- Next line required to avoid compile error about unused dummy argument in serial mode
	868	i = SIZE( f_in )
[1111]	869	#endif
[1]	870
	871	!
[4649]	872	!-- Resort the data in a way that y becomes the first index and carry out the backward fft along y.
	873	CALL cpu_log( log_point_s(7), 'fft_y_1d', 'continue' )
[1]	874
[4649]	875	IF ( loop_optimization == 'vector' ) THEN
[2300]	876
[4649]	877	ALLOCATE( work_ffty_vec(0:ny+1,1:nz,nxl:nxr) )
[1]	878	!
[4649]	879	!-- Code optimized for vector processors
	880	!$OMP PARALLEL PRIVATE ( i, j, k )
	881	!$OMP DO
	882	DO k = 1, nz
	883	DO j = 0, ny
	884	DO i = nxl, nxr
	885	work_ffty_vec(j,k,i) = work(i,k,j)
[1]	886	ENDDO
	887	ENDDO
[4649]	888	ENDDO
[1]	889
[4649]	890	!$OMP DO
	891	DO i = nxl, nxr
[1]	892
[4649]	893	CALL fft_y_m( work_ffty_vec(:,:,i), ny+1, 'backward' )
[1]	894
[4649]	895	DO j = 0, ny
	896	DO k = 1, nz
	897	f_out(k,j,i) = work_ffty_vec(j,k,i)
[1]	898	ENDDO
	899	ENDDO
	900
[4649]	901	ENDDO
	902	!$OMP END PARALLEL
[2300]	903
[4649]	904	DEALLOCATE( work_ffty_vec )
	905
	906	ELSE
[1]	907	!
[4649]	908	!-- Cache optimized code.
	909	ALLOCATE( work_ffty(0:ny,stridex) )
[2300]	910	!
[4649]	911	!-- The i-(x-)direction is split into a strided outer loop and an inner loop for better cache
	912	!-- performance
	913	!$OMP PARALLEL PRIVATE ( i, iend, iouter, ir, j, k, work_ffty )
	914	!$OMP DO
	915	DO iouter = nxl, nxr, stridex
[1]	916
[4649]	917	iend = MIN( iouter+stridex-1, nxr ) ! Upper bound for inner i loop
[1]	918
[4649]	919	DO k = 1, nz
[1]	920	!
[4649]	921	!-- Resort
	922	DO j = 0, ny
	923	DO i = iouter, iend
	924	work_ffty(j,i-iouter+1) = work(i,k,j)
[1]	925	ENDDO
[4649]	926	ENDDO
[1]	927
[4649]	928	DO i = iouter, iend
[1]	929
	930	!
[4649]	931	!-- FFT along y
	932	ir = i-iouter+1 ! Counter within a stride
	933	CALL fft_y_1d( work_ffty(:,ir), 'backward' )
[1]	934
[4649]	935	DO j = 0, ny
	936	f_out(k,j,i) = work_ffty(j,ir)
[1]	937	ENDDO
	938	ENDDO
	939
	940	ENDDO
	941
[4649]	942	ENDDO
	943	!$OMP END PARALLEL
[2300]	944
[4649]	945	DEALLOCATE( work_ffty )
[1]	946
[4649]	947	ENDIF
[1]	948
[4649]	949	CALL cpu_log( log_point_s(7), 'fft_y_1d', 'stop' )
[1]	950
[4649]	951	END SUBROUTINE tr_xy_ffty
[1]	952
[4649]	953
	954	!--------------------------------------------------------------------------------------------------!
[1682]	955	! Description:
	956	! ------------
[4649]	957	!> FFT along x, solution of the tridiagonal system and backward FFT for a 1d-decomposition along x
[1682]	958	!>
[4649]	959	!> @warning This subroutine may still not work for hybrid parallelization with OpenMP (for possible
	960	!> necessary changes see the original routine poisfft_hybrid, developed by Klaus Ketelsen,
	961	!> May 2002)
	962	!--------------------------------------------------------------------------------------------------!
	963	SUBROUTINE fftx_tri_fftx( ar )
[1]	964
[4649]	965	USE control_parameters, &
	966	ONLY: loop_optimization
[1320]	967
[4649]	968	USE cpulog, &
	969	ONLY: cpu_log, &
	970	log_point_s
[1320]	971
[4649]	972	USE grid_variables, &
	973	ONLY: ddx2, &
	974	ddy2
[1320]	975
[4649]	976	USE kinds
[1320]	977
[4649]	978	USE pegrid
[1]	979
[4649]	980	IMPLICIT NONE
[1]	981
[4649]	982	INTEGER(iwp) :: i !<
	983	INTEGER(iwp) :: j !<
	984	INTEGER(iwp) :: k !<
	985	INTEGER(iwp) :: m !<
	986	INTEGER(iwp) :: n !<
	987	! INTEGER(iwp) :: omp_get_thread_num !<
	988	INTEGER(iwp) :: tn !<
[1]	989
[4649]	990	REAL(wp), DIMENSION(0:nx) :: work_fftx !<
	991	REAL(wp), DIMENSION(0:nx,1:nz) :: work_trix !<
	992	REAL(wp), DIMENSION(nnx,1:nz,nys_x:nyn_x,pdims(1)) :: ar !<
	993	REAL(wp), DIMENSION(:,:,:,:), ALLOCATABLE :: tri !<
[1]	994
	995
[4649]	996	CALL cpu_log( log_point_s(33), 'fft_x_1d + tridia', 'start' )
[1]	997
[4649]	998	ALLOCATE( tri(5,0:nx,0:nz-1,0:threads_per_task-1) )
[1]	999
[4649]	1000	tn = 0 ! Default thread number in case of one thread
[1]	1001	!$OMP PARALLEL DO PRIVATE ( i, j, k, m, n, tn, work_fftx, work_trix )
[4649]	1002	DO j = nys_x, nyn_x
[1]	1003
[4649]	1004	!$ tn = omp_get_thread_num()
[1]	1005
[4649]	1006	IF ( loop_optimization == 'vector' ) THEN
[1]	1007	!
[4649]	1008	!-- Code optimized for vector processors
	1009	DO k = 1, nz
[1]	1010
[4649]	1011	m = 0
	1012	DO n = 1, pdims(1)
	1013	DO i = 1, nnx
	1014	work_trix(m,k) = ar(i,k,j,n)
	1015	m = m + 1
[1]	1016	ENDDO
	1017	ENDDO
	1018
[4649]	1019	ENDDO
[1]	1020
[4649]	1021	CALL fft_x_m( work_trix, 'forward' )
	1022
	1023	ELSE
[1]	1024	!
[4649]	1025	!-- Cache optimized code
	1026	DO k = 1, nz
[1]	1027
[4649]	1028	m = 0
	1029	DO n = 1, pdims(1)
	1030	DO i = 1, nnx
	1031	work_fftx(m) = ar(i,k,j,n)
	1032	m = m + 1
[1]	1033	ENDDO
[4649]	1034	ENDDO
[1]	1035
[4649]	1036	CALL fft_x_1d( work_fftx, 'forward' )
[1]	1037
[4649]	1038	DO i = 0, nx
	1039	work_trix(i,k) = work_fftx(i)
[1]	1040	ENDDO
	1041
[4649]	1042	ENDDO
[1]	1043
[4649]	1044	ENDIF
	1045
[1]	1046	!
[4649]	1047	!-- Solve the linear equation system
	1048	CALL tridia_1dd( ddx2, ddy2, nx, ny, j, work_trix, tri(:,:,:,tn) )
[1]	1049
[4649]	1050	IF ( loop_optimization == 'vector' ) THEN
[1]	1051	!
[4649]	1052	!-- Code optimized for vector processors
	1053	CALL fft_x_m( work_trix, 'backward' )
[1]	1054
[4649]	1055	DO k = 1, nz
[1]	1056
[4649]	1057	m = 0
	1058	DO n = 1, pdims(1)
	1059	DO i = 1, nnx
	1060	ar(i,k,j,n) = work_trix(m,k)
	1061	m = m + 1
[1]	1062	ENDDO
	1063	ENDDO
	1064
[4649]	1065	ENDDO
	1066
	1067	ELSE
[1]	1068	!
[4649]	1069	!-- Cache optimized code
	1070	DO k = 1, nz
[1]	1071
[4649]	1072	DO i = 0, nx
	1073	work_fftx(i) = work_trix(i,k)
	1074	ENDDO
[1]	1075
[4649]	1076	CALL fft_x_1d( work_fftx, 'backward' )
[1]	1077
[4649]	1078	m = 0
	1079	DO n = 1, pdims(1)
	1080	DO i = 1, nnx
	1081	ar(i,k,j,n) = work_fftx(m)
	1082	m = m + 1
[1]	1083	ENDDO
	1084	ENDDO
	1085
[4649]	1086	ENDDO
[1]	1087
[4649]	1088	ENDIF
[1]	1089
[4649]	1090	ENDDO
[1]	1091
[4649]	1092	DEALLOCATE( tri )
[1]	1093
[4649]	1094	CALL cpu_log( log_point_s(33), 'fft_x_1d + tridia', 'stop' )
[1]	1095
[4649]	1096	END SUBROUTINE fftx_tri_fftx
[1]	1097
[4649]	1098
	1099	!--------------------------------------------------------------------------------------------------!
[1682]	1100	! Description:
	1101	! ------------
[4649]	1102	!> Fourier-transformation along x with subsequent transposition x --> y for a 1d-decomposition
	1103	!> along y.
[1682]	1104	!>
[4649]	1105	!> @attention NEC-branch of this routine may significantly profit from further optimizations. So
	1106	!> far, performance is much worse than for routine ffty_tr_yx (more than three times
	1107	!> slower).
	1108	!--------------------------------------------------------------------------------------------------!
	1109	SUBROUTINE fftx_tr_xy( f_in, f_out )
[1]	1110
[1682]	1111
[4649]	1112	USE control_parameters, &
	1113	ONLY: loop_optimization
[1320]	1114
[4649]	1115	USE cpulog, &
	1116	ONLY: cpu_log, &
	1117	log_point_s
[1320]	1118
[4649]	1119	USE kinds
[1320]	1120
[4649]	1121	USE pegrid
[1]	1122
[4649]	1123	IMPLICIT NONE
[1]	1124
[4649]	1125	INTEGER(iwp) :: i !<
	1126	INTEGER(iwp) :: j !<
	1127	INTEGER(iwp) :: k !<
[1]	1128
[4649]	1129	REAL(wp), DIMENSION(0:nx,1:nz,nys:nyn) :: work_fftx !<
	1130	REAL(wp), DIMENSION(1:nz,nys:nyn,0:nx) :: f_in !<
	1131	REAL(wp), DIMENSION(nny,1:nz,nxl_y:nxr_y,pdims(2)) :: f_out !<
	1132	REAL(wp), DIMENSION(nys:nyn,1:nz,0:nx) :: work !<
[1]	1133
	1134	!
[4649]	1135	!-- Carry out the FFT along x, where all data are present due to the 1d-decomposition along y.
	1136	!-- Resort the data in a way that y becomes the first index.
	1137	CALL cpu_log( log_point_s(4), 'fft_x_1d', 'start' )
[1]	1138
[4649]	1139	IF ( loop_optimization == 'vector' ) THEN
[1]	1140	!
[4649]	1141	!-- Code for vector processors
	1142	!$OMP PARALLEL PRIVATE ( i, j, k )
	1143	!$OMP DO
	1144	DO i = 0, nx
[1]	1145
[4649]	1146	DO j = nys, nyn
	1147	DO k = 1, nz
	1148	work_fftx(i,k,j) = f_in(k,j,i)
[1]	1149	ENDDO
	1150	ENDDO
	1151
[4649]	1152	ENDDO
[1]	1153
[4649]	1154	!$OMP DO
	1155	DO j = nys, nyn
[1]	1156
[4649]	1157	CALL fft_x_m( work_fftx(:,:,j), 'forward' )
	1158
	1159	DO k = 1, nz
	1160	DO i = 0, nx
	1161	work(j,k,i) = work_fftx(i,k,j)
[1]	1162	ENDDO
	1163	ENDDO
	1164
[4649]	1165	ENDDO
	1166	!$OMP END PARALLEL
[1]	1167
[4649]	1168	ELSE
	1169
[1]	1170	!
[4649]	1171	!-- Cache optimized code (there might still be a potential for better optimization).
	1172	!$OMP PARALLEL PRIVATE (i,j,k)
	1173	!$OMP DO
	1174	DO i = 0, nx
[1]	1175
[4649]	1176	DO j = nys, nyn
	1177	DO k = 1, nz
	1178	work_fftx(i,k,j) = f_in(k,j,i)
[1]	1179	ENDDO
	1180	ENDDO
	1181
[4649]	1182	ENDDO
[1]	1183
[4649]	1184	!$OMP DO
	1185	DO j = nys, nyn
	1186	DO k = 1, nz
[1]	1187
[4649]	1188	CALL fft_x_1d( work_fftx(0:nx,k,j), 'forward' )
	1189
	1190	DO i = 0, nx
	1191	work(j,k,i) = work_fftx(i,k,j)
[1]	1192	ENDDO
	1193	ENDDO
	1194
[4649]	1195	ENDDO
	1196	!$OMP END PARALLEL
[1]	1197
[4649]	1198	ENDIF
	1199	CALL cpu_log( log_point_s(4), 'fft_x_1d', 'pause' )
	1200
[1]	1201	!
[4649]	1202	!-- Transpose array
[1111]	1203	#if defined( __parallel )
[4649]	1204	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
	1205	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
	1206	CALL MPI_ALLTOALL( work(nys,1,0), sendrecvcount_xy, MPI_REAL, f_out(1,1,nxl_y,1), &
	1207	sendrecvcount_xy, MPI_REAL, comm1dy, ierr )
	1208	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
[4429]	1209	#else
	1210	!
[4649]	1211	!-- Next line required to avoid compile error about unused dummy argument in serial mode
	1212	i = SIZE( f_out )
[1111]	1213	#endif
[1]	1214
[4649]	1215	END SUBROUTINE fftx_tr_xy
[1]	1216
	1217
[4649]	1218	!--------------------------------------------------------------------------------------------------!
[1682]	1219	! Description:
	1220	! ------------
[4649]	1221	!> Transposition y --> x with a subsequent backward Fourier transformation for a 1d-decomposition
	1222	!> along x.
	1223	!--------------------------------------------------------------------------------------------------!
	1224	SUBROUTINE tr_yx_fftx( f_in, f_out )
[1]	1225
[1682]	1226
[4649]	1227	USE control_parameters, &
	1228	ONLY: loop_optimization
[1320]	1229
[4649]	1230	USE cpulog, &
	1231	ONLY: cpu_log, &
	1232	log_point_s
[1320]	1233
[4649]	1234	USE kinds
[1320]	1235
[4649]	1236	USE pegrid
[1]	1237
[4649]	1238	IMPLICIT NONE
[1]	1239
[4649]	1240	INTEGER(iwp) :: i !<
	1241	INTEGER(iwp) :: j !<
	1242	INTEGER(iwp) :: k !<
[1]	1243
[4649]	1244	REAL(wp), DIMENSION(nny,1:nz,nxl_y:nxr_y,pdims(2)) :: f_in !<
	1245	REAL(wp), DIMENSION(1:nz,nys:nyn,0:nx) :: f_out !<
	1246	REAL(wp), DIMENSION(nys:nyn,1:nz,0:nx) :: work !<
	1247	REAL(wp), DIMENSION(0:nx,1:nz,nys:nyn) :: work_fftx !<
[1]	1248
[4429]	1249
[1]	1250	!
[4649]	1251	!-- Transpose array
[1111]	1252	#if defined( __parallel )
[4649]	1253	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
	1254	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
	1255	CALL MPI_ALLTOALL( f_in(1,1,nxl_y,1), sendrecvcount_xy, MPI_REAL, work(nys,1,0), &
	1256	sendrecvcount_xy, MPI_REAL, comm1dy, ierr )
	1257	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
[4429]	1258	#else
	1259	!
[4649]	1260	!-- Next line required to avoid compile error about unused dummy argument in serial mode
	1261	i = SIZE( f_in )
[1111]	1262	#endif
[1]	1263
	1264	!
[4649]	1265	!-- Carry out the FFT along x, where all data are present due to the 1d-decomposition along y.
	1266	!-- Resort the data in a way that y becomes the first index.
	1267	CALL cpu_log( log_point_s(4), 'fft_x_1d', 'continue' )
[1]	1268
[4649]	1269	IF ( loop_optimization == 'vector' ) THEN
[1]	1270	!
[4649]	1271	!-- Code optimized for vector processors
	1272	!$OMP PARALLEL PRIVATE ( i, j, k )
	1273	!$OMP DO
	1274	DO j = nys, nyn
[1]	1275
[4649]	1276	DO k = 1, nz
	1277	DO i = 0, nx
	1278	work_fftx(i,k,j) = work(j,k,i)
[1]	1279	ENDDO
[4649]	1280	ENDDO
[1]	1281
[4649]	1282	CALL fft_x_m( work_fftx(:,:,j), 'backward' )
[1]	1283
[4649]	1284	ENDDO
[1]	1285
[4649]	1286	!$OMP DO
	1287	DO i = 0, nx
	1288	DO j = nys, nyn
	1289	DO k = 1, nz
	1290	f_out(k,j,i) = work_fftx(i,k,j)
[1]	1291	ENDDO
	1292	ENDDO
[4649]	1293	ENDDO
	1294	!$OMP END PARALLEL
[1]	1295
[4649]	1296	ELSE
[1]	1297
	1298	!
[4649]	1299	!-- Cache optimized code (there might be still a potential for better optimization).
	1300	!$OMP PARALLEL PRIVATE (i,j,k)
	1301	!$OMP DO
	1302	DO j = nys, nyn
	1303	DO k = 1, nz
[1]	1304
[4649]	1305	DO i = 0, nx
	1306	work_fftx(i,k,j) = work(j,k,i)
	1307	ENDDO
[1]	1308
[4649]	1309	CALL fft_x_1d( work_fftx(0:nx,k,j), 'backward' )
[1]	1310
	1311	ENDDO
[4649]	1312	ENDDO
[1]	1313
[4649]	1314	!$OMP DO
	1315	DO i = 0, nx
	1316	DO j = nys, nyn
	1317	DO k = 1, nz
	1318	f_out(k,j,i) = work_fftx(i,k,j)
[1]	1319	ENDDO
	1320	ENDDO
[4649]	1321	ENDDO
	1322	!$OMP END PARALLEL
[1]	1323
[4649]	1324	ENDIF
	1325	CALL cpu_log( log_point_s(4), 'fft_x_1d', 'stop' )
[1]	1326
[4649]	1327	END SUBROUTINE tr_yx_fftx
[1]	1328
	1329
[4649]	1330	!--------------------------------------------------------------------------------------------------!
[1682]	1331	! Description:
	1332	! ------------
[4649]	1333	!> FFT along y, solution of the tridiagonal system and backward FFT for a 1d-decomposition along y.
[1682]	1334	!>
[4649]	1335	!> @warning This subroutine may still not work for hybrid parallelization with OpenMP (for possible
	1336	!> necessary changes see the original routine poisfft_hybrid, developed by Klaus Ketelsen,
	1337	!> May 2002)
	1338	!--------------------------------------------------------------------------------------------------!
	1339	SUBROUTINE ffty_tri_ffty( ar )
[1]	1340
[1682]	1341
[4649]	1342	USE control_parameters, &
	1343	ONLY: loop_optimization
[1320]	1344
[4649]	1345	USE cpulog, &
	1346	ONLY: cpu_log, &
	1347	log_point_s
[1320]	1348
[4649]	1349	USE grid_variables, &
	1350	ONLY: ddx2, &
	1351	ddy2
[1320]	1352
[4649]	1353	USE kinds
[1320]	1354
[4649]	1355	USE pegrid
[1]	1356
[4649]	1357	IMPLICIT NONE
[1]	1358
[4649]	1359	INTEGER(iwp) :: i !<
	1360	INTEGER(iwp) :: j !<
	1361	INTEGER(iwp) :: k !<
	1362	INTEGER(iwp) :: m !<
	1363	INTEGER(iwp) :: n !<
	1364	!$ INTEGER(iwp) :: omp_get_thread_num !<
	1365	INTEGER(iwp) :: tn !<
[1]	1366
[4649]	1367	REAL(wp), DIMENSION(0:ny) :: work_ffty !<
	1368	REAL(wp), DIMENSION(0:ny,1:nz) :: work_triy !<
	1369	REAL(wp), DIMENSION(nny,1:nz,nxl_y:nxr_y,pdims(2)) :: ar !<
	1370	REAL(wp), DIMENSION(:,:,:,:), ALLOCATABLE :: tri !<
[1]	1371
	1372
[4649]	1373	CALL cpu_log( log_point_s(39), 'fft_y_1d + tridia', 'start' )
[1]	1374
[4649]	1375	ALLOCATE( tri(5,0:ny,0:nz-1,0:threads_per_task-1) )
[1]	1376
[4649]	1377	tn = 0 ! Default thread number in case of one thread
	1378	!$OMP PARALLEL DO PRIVATE ( i, j, k, m, n, tn, work_ffty, work_triy )
	1379	DO i = nxl_y, nxr_y
[1]	1380
[4649]	1381	!$ tn = omp_get_thread_num()
[1]	1382
[4649]	1383	IF ( loop_optimization == 'vector' ) THEN
[1]	1384	!
[4649]	1385	!-- Code optimized for vector processors
	1386	DO k = 1, nz
[1]	1387
[4649]	1388	m = 0
	1389	DO n = 1, pdims(2)
	1390	DO j = 1, nny
	1391	work_triy(m,k) = ar(j,k,i,n)
	1392	m = m + 1
[1]	1393	ENDDO
	1394	ENDDO
	1395
[4649]	1396	ENDDO
[1]	1397
[4649]	1398	CALL fft_y_m( work_triy, ny, 'forward' )
	1399
	1400	ELSE
[1]	1401	!
[4649]	1402	!-- Cache optimized code
	1403	DO k = 1, nz
[1]	1404
[4649]	1405	m = 0
	1406	DO n = 1, pdims(2)
	1407	DO j = 1, nny
	1408	work_ffty(m) = ar(j,k,i,n)
	1409	m = m + 1
[1]	1410	ENDDO
[4649]	1411	ENDDO
[1]	1412
[4649]	1413	CALL fft_y_1d( work_ffty, 'forward' )
[1]	1414
[4649]	1415	DO j = 0, ny
	1416	work_triy(j,k) = work_ffty(j)
[1]	1417	ENDDO
	1418
[4649]	1419	ENDDO
[1]	1420
[4649]	1421	ENDIF
	1422
[1]	1423	!
[4649]	1424	!-- Solve the linear equation system
	1425	CALL tridia_1dd( ddy2, ddx2, ny, nx, i, work_triy, tri(:,:,:,tn) )
[1]	1426
[4649]	1427	IF ( loop_optimization == 'vector' ) THEN
[1]	1428	!
[4649]	1429	!-- Code optimized for vector processors
	1430	CALL fft_y_m( work_triy, ny, 'backward' )
[1]	1431
[4649]	1432	DO k = 1, nz
[1]	1433
[4649]	1434	m = 0
	1435	DO n = 1, pdims(2)
	1436	DO j = 1, nny
	1437	ar(j,k,i,n) = work_triy(m,k)
	1438	m = m + 1
[1]	1439	ENDDO
	1440	ENDDO
	1441
[4649]	1442	ENDDO
	1443
	1444	ELSE
[1]	1445	!
[4649]	1446	!-- Cache optimized code
	1447	DO k = 1, nz
[1]	1448
[4649]	1449	DO j = 0, ny
	1450	work_ffty(j) = work_triy(j,k)
	1451	ENDDO
[1]	1452
[4649]	1453	CALL fft_y_1d( work_ffty, 'backward' )
[1]	1454
[4649]	1455	m = 0
	1456	DO n = 1, pdims(2)
	1457	DO j = 1, nny
	1458	ar(j,k,i,n) = work_ffty(m)
	1459	m = m + 1
[1]	1460	ENDDO
	1461	ENDDO
	1462
[4649]	1463	ENDDO
[1]	1464
[4649]	1465	ENDIF
[1]	1466
[4649]	1467	ENDDO
[1]	1468
[4649]	1469	DEALLOCATE( tri )
[1]	1470
[4649]	1471	CALL cpu_log( log_point_s(39), 'fft_y_1d + tridia', 'stop' )
[1]	1472
[4649]	1473	END SUBROUTINE ffty_tri_ffty
	1474
	1475	END MODULE poisfft_mod

Note: See TracBrowser for help on using the repository browser.

Download in other formats:

| Impressum | ©Leibniz Universität Hannover |