Home

Context Navigation

source: palm/trunk/SOURCE/poisfft_mod.f90 @ 4180

Last change on this file since 4180 was 4180, checked in by scharf, 5 years ago
removed comments in 'Former revisions' section that are older than 01.01.2019
Property svn:keywords set to `Id`
File size: 41.8 KB

Rev	Line
[1850]	1	!> @file poisfft_mod.f90
[2000]	2	!------------------------------------------------------------------------------!
[2696]	3	! This file is part of the PALM model system.
[1036]	4	!
[2000]	5	! PALM is free software: you can redistribute it and/or modify it under the
	6	! terms of the GNU General Public License as published by the Free Software
	7	! Foundation, either version 3 of the License, or (at your option) any later
	8	! version.
[1036]	9	!
	10	! PALM is distributed in the hope that it will be useful, but WITHOUT ANY
	11	! WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
	12	! A PARTICULAR PURPOSE. See the GNU General Public License for more details.
	13	!
	14	! You should have received a copy of the GNU General Public License along with
	15	! PALM. If not, see <http://www.gnu.org/licenses/>.
	16	!
[3655]	17	! Copyright 1997-2019 Leibniz Universitaet Hannover
[2000]	18	!------------------------------------------------------------------------------!
[1036]	19	!
[484]	20	! Current revisions:
[1]	21	! -----------------
[1683]	22	!
[2119]	23	!
[1321]	24	! Former revisions:
	25	! -----------------
	26	! $Id: poisfft_mod.f90 4180 2019-08-21 14:37:54Z scharf $
[3634]	27	! OpenACC port for SPEC
	28	!
[2716]	29	!
[1]	30	! Description:
	31	! ------------
[1682]	32	!> Solves the Poisson equation with a 2D spectral method
	33	!> d^2 p / dx^2 + d^2 p / dy^2 + d^2 p / dz^2 = s
	34	!>
	35	!> Input:
	36	!> real ar contains (nnz,nny,nnx) elements of the velocity divergence,
	37	!> starting from (1,nys,nxl)
	38	!>
	39	!> Output:
	40	!> real ar contains the solution for perturbation pressure p
[1]	41	!------------------------------------------------------------------------------!
[1682]	42	MODULE poisfft_mod
	43
[1]	44
[1320]	45	USE fft_xy, &
	46	ONLY: fft_init, fft_y, fft_y_1d, fft_y_m, fft_x, fft_x_1d, fft_x_m
[1]	47
[1320]	48	USE indices, &
	49	ONLY: nnx, nny, nx, nxl, nxr, ny, nys, nyn, nz
	50
	51	USE transpose_indices, &
	52	ONLY: nxl_y, nxl_z, nxr_y, nxr_z, nys_x, nys_z, nyn_x, nyn_z, nzb_x, &
	53	nzb_y, nzt_x, nzt_y
	54
	55	USE tridia_solver, &
	56	ONLY: tridia_1dd, tridia_init, tridia_substi, tridia_substi_overlap
	57
[1]	58	IMPLICIT NONE
	59
[1111]	60	LOGICAL, SAVE :: poisfft_initialized = .FALSE.
	61
[1]	62	PRIVATE
[807]	63
[1]	64	PUBLIC poisfft, poisfft_init
	65
	66	INTERFACE poisfft
	67	MODULE PROCEDURE poisfft
	68	END INTERFACE poisfft
	69
	70	INTERFACE poisfft_init
	71	MODULE PROCEDURE poisfft_init
	72	END INTERFACE poisfft_init
	73
[807]	74
[1]	75	CONTAINS
	76
[1682]	77	!------------------------------------------------------------------------------!
	78	! Description:
	79	! ------------
[3241]	80	!> Setup coefficients for FFT and the tridiagonal solver
[1682]	81	!------------------------------------------------------------------------------!
[1]	82	SUBROUTINE poisfft_init
	83
[1111]	84	IMPLICIT NONE
	85
	86
[1]	87	CALL fft_init
	88
[1212]	89	CALL tridia_init
[1111]	90
	91	poisfft_initialized = .TRUE.
	92
[1]	93	END SUBROUTINE poisfft_init
	94
[1111]	95
[1804]	96
[1682]	97	!------------------------------------------------------------------------------!
	98	! Description:
	99	! ------------
	100	!> Two-dimensional Fourier Transformation in x- and y-direction.
	101	!------------------------------------------------------------------------------!
[1306]	102	SUBROUTINE poisfft( ar )
[1]	103
[1320]	104	USE control_parameters, &
[3241]	105	ONLY: transpose_compute_overlap
[1320]	106
	107	USE cpulog, &
	108	ONLY: cpu_log, cpu_log_nowait, log_point_s
	109
	110	USE kinds
	111
[1]	112	USE pegrid
	113
	114	IMPLICIT NONE
	115
[1682]	116	INTEGER(iwp) :: ii !<
	117	INTEGER(iwp) :: iind !<
	118	INTEGER(iwp) :: inew !<
	119	INTEGER(iwp) :: jj !<
	120	INTEGER(iwp) :: jind !<
	121	INTEGER(iwp) :: jnew !<
	122	INTEGER(iwp) :: ki !<
	123	INTEGER(iwp) :: kk !<
	124	INTEGER(iwp) :: knew !<
	125	INTEGER(iwp) :: n !<
	126	INTEGER(iwp) :: nblk !<
	127	INTEGER(iwp) :: nnx_y !<
	128	INTEGER(iwp) :: nny_z !<
	129	INTEGER(iwp) :: nnz_x !<
	130	INTEGER(iwp) :: nxl_y_bound !<
	131	INTEGER(iwp) :: nxr_y_bound !<
[1]	132
[1682]	133	INTEGER(iwp), DIMENSION(4) :: isave !<
[1320]	134
[1682]	135	REAL(wp), DIMENSION(1:nz,nys:nyn,nxl:nxr) :: ar !<
	136	REAL(wp), DIMENSION(nys:nyn,nxl:nxr,1:nz) :: ar_inv !<
[3690]	137
	138	#define __acc_fft_device ( defined( _OPENACC ) && ( defined ( __cuda_fft ) ) )
	139	#if __acc_fft_device
[3634]	140	!$ACC DECLARE CREATE(ar_inv)
[3690]	141	#endif
[1]	142
[1682]	143	REAL(wp), DIMENSION(:,:,:), ALLOCATABLE :: ar1 !<
	144	REAL(wp), DIMENSION(:,:,:), ALLOCATABLE :: f_in !<
	145	REAL(wp), DIMENSION(:,:,:), ALLOCATABLE :: f_inv !<
	146	REAL(wp), DIMENSION(:,:,:), ALLOCATABLE :: f_out_y !<
	147	REAL(wp), DIMENSION(:,:,:), ALLOCATABLE :: f_out_z !<
[1216]	148
	149
[1]	150	CALL cpu_log( log_point_s(3), 'poisfft', 'start' )
	151
[1111]	152	IF ( .NOT. poisfft_initialized ) CALL poisfft_init
	153
[3690]	154	#if !__acc_fft_device
	155	!$ACC UPDATE HOST(ar)
	156	#endif
	157
[3634]	158	#ifndef _OPENACC
[1]	159	!
	160	!-- Two-dimensional Fourier Transformation in x- and y-direction.
[2118]	161	IF ( pdims(2) == 1 .AND. pdims(1) > 1 ) THEN
[1]	162
	163	!
	164	!-- 1d-domain-decomposition along x:
	165	!-- FFT along y and transposition y --> x
[1216]	166	CALL ffty_tr_yx( ar, ar )
[1]	167
	168	!
	169	!-- FFT along x, solving the tridiagonal system and backward FFT
	170	CALL fftx_tri_fftx( ar )
	171
	172	!
	173	!-- Transposition x --> y and backward FFT along y
[1216]	174	CALL tr_xy_ffty( ar, ar )
[1]	175
[2118]	176	ELSEIF ( pdims(1) == 1 .AND. pdims(2) > 1 ) THEN
[1]	177
	178	!
	179	!-- 1d-domain-decomposition along y:
	180	!-- FFT along x and transposition x --> y
[1216]	181	CALL fftx_tr_xy( ar, ar )
[1]	182
	183	!
	184	!-- FFT along y, solving the tridiagonal system and backward FFT
	185	CALL ffty_tri_ffty( ar )
	186
	187	!
	188	!-- Transposition y --> x and backward FFT along x
[1216]	189	CALL tr_yx_fftx( ar, ar )
[1]	190
[1216]	191	ELSEIF ( .NOT. transpose_compute_overlap ) THEN
[3634]	192	#endif
[1]	193
	194	!
[1111]	195	!-- 2d-domain-decomposition or no decomposition (1 PE run)
[1]	196	!-- Transposition z --> x
	197	CALL cpu_log( log_point_s(5), 'transpo forward', 'start' )
[1216]	198	CALL resort_for_zx( ar, ar_inv )
	199	CALL transpose_zx( ar_inv, ar )
[1]	200	CALL cpu_log( log_point_s(5), 'transpo forward', 'pause' )
	201
	202	CALL cpu_log( log_point_s(4), 'fft_x', 'start' )
[1106]	203	CALL fft_x( ar, 'forward' )
[1]	204	CALL cpu_log( log_point_s(4), 'fft_x', 'pause' )
	205
	206	!
	207	!-- Transposition x --> y
	208	CALL cpu_log( log_point_s(5), 'transpo forward', 'continue' )
[1216]	209	CALL resort_for_xy( ar, ar_inv )
	210	CALL transpose_xy( ar_inv, ar )
[1]	211	CALL cpu_log( log_point_s(5), 'transpo forward', 'pause' )
	212
	213	CALL cpu_log( log_point_s(7), 'fft_y', 'start' )
[1216]	214	CALL fft_y( ar, 'forward', ar_tr = ar, &
	215	nxl_y_bound = nxl_y, nxr_y_bound = nxr_y, &
	216	nxl_y_l = nxl_y, nxr_y_l = nxr_y )
[1]	217	CALL cpu_log( log_point_s(7), 'fft_y', 'pause' )
	218
	219	!
	220	!-- Transposition y --> z
	221	CALL cpu_log( log_point_s(5), 'transpo forward', 'continue' )
[1216]	222	CALL resort_for_yz( ar, ar_inv )
	223	CALL transpose_yz( ar_inv, ar )
[1]	224	CALL cpu_log( log_point_s(5), 'transpo forward', 'stop' )
	225
	226	!
[1106]	227	!-- Solve the tridiagonal equation system along z
[1]	228	CALL cpu_log( log_point_s(6), 'tridia', 'start' )
[1212]	229	CALL tridia_substi( ar )
[1]	230	CALL cpu_log( log_point_s(6), 'tridia', 'stop' )
	231
	232	!
	233	!-- Inverse Fourier Transformation
	234	!-- Transposition z --> y
	235	CALL cpu_log( log_point_s(8), 'transpo invers', 'start' )
[1216]	236	CALL transpose_zy( ar, ar_inv )
	237	CALL resort_for_zy( ar_inv, ar )
[1]	238	CALL cpu_log( log_point_s(8), 'transpo invers', 'pause' )
	239
	240	CALL cpu_log( log_point_s(7), 'fft_y', 'continue' )
[1216]	241	CALL fft_y( ar, 'backward', ar_tr = ar, &
	242	nxl_y_bound = nxl_y, nxr_y_bound = nxr_y, &
	243	nxl_y_l = nxl_y, nxr_y_l = nxr_y )
[1]	244	CALL cpu_log( log_point_s(7), 'fft_y', 'stop' )
	245
	246	!
	247	!-- Transposition y --> x
	248	CALL cpu_log( log_point_s(8), 'transpo invers', 'continue' )
[1216]	249	CALL transpose_yx( ar, ar_inv )
	250	CALL resort_for_yx( ar_inv, ar )
[1]	251	CALL cpu_log( log_point_s(8), 'transpo invers', 'pause' )
	252
	253	CALL cpu_log( log_point_s(4), 'fft_x', 'continue' )
[1106]	254	CALL fft_x( ar, 'backward' )
[1]	255	CALL cpu_log( log_point_s(4), 'fft_x', 'stop' )
	256
	257	!
	258	!-- Transposition x --> z
	259	CALL cpu_log( log_point_s(8), 'transpo invers', 'continue' )
[1216]	260	CALL transpose_xz( ar, ar_inv )
	261	CALL resort_for_xz( ar_inv, ar )
[1]	262	CALL cpu_log( log_point_s(8), 'transpo invers', 'stop' )
	263
[3634]	264	#ifndef _OPENACC
[1216]	265	ELSE
	266
	267	!
	268	!-- 2d-domain-decomposition or no decomposition (1 PE run) with
	269	!-- overlapping transposition / fft
[1318]	270	!-- cputime logging must not use barriers, which would prevent overlapping
[1216]	271	ALLOCATE( f_out_y(0:ny,nxl_y:nxr_y,nzb_y:nzt_y), &
	272	f_out_z(0:nx,nys_x:nyn_x,nzb_x:nzt_x) )
	273	!
	274	!-- Transposition z --> x + subsequent fft along x
	275	ALLOCATE( f_inv(nys:nyn,nxl:nxr,1:nz) )
	276	CALL resort_for_zx( ar, f_inv )
	277	!
	278	!-- Save original indices and gridpoint counter
	279	isave(1) = nz
	280	isave(2) = nzb_x
	281	isave(3) = nzt_x
	282	isave(4) = sendrecvcount_zx
	283	!
	284	!-- Set new indices for transformation
	285	nblk = nz / pdims(1)
	286	nz = pdims(1)
	287	nnz_x = 1
	288	nzb_x = 1 + myidx * nnz_x
	289	nzt_x = ( myidx + 1 ) * nnz_x
	290	sendrecvcount_zx = nnx * nny * nnz_x
	291
[1306]	292	ALLOCATE( ar1(0:nx,nys_x:nyn_x,nzb_x:nzt_x) )
[1216]	293	ALLOCATE( f_in(nys:nyn,nxl:nxr,1:nz) )
	294
[1306]	295	DO kk = 1, nblk
[1216]	296
[1306]	297	IF ( kk == 1 ) THEN
[1318]	298	CALL cpu_log( log_point_s(5), 'transpo forward', 'start', cpu_log_nowait )
[1306]	299	ELSE
[1318]	300	CALL cpu_log( log_point_s(5), 'transpo forward', 'continue', cpu_log_nowait )
[1306]	301	ENDIF
[1216]	302
[1306]	303	DO knew = 1, nz
	304	ki = kk + nblk * ( knew - 1 )
	305	f_in(:,:,knew) = f_inv(:,:,ki)
	306	ENDDO
[1216]	307
[1306]	308	CALL transpose_zx( f_in, ar1(:,:,:))
	309	CALL cpu_log( log_point_s(5), 'transpo forward', 'pause' )
[1216]	310
[1306]	311	IF ( kk == 1 ) THEN
[1318]	312	CALL cpu_log( log_point_s(4), 'fft_x', 'start', cpu_log_nowait )
[1306]	313	ELSE
[1318]	314	CALL cpu_log( log_point_s(4), 'fft_x', 'continue', cpu_log_nowait )
[1216]	315	ENDIF
	316
[1306]	317	n = isave(2) + kk - 1
	318	CALL fft_x( ar1(:,:,:), 'forward', ar_2d = f_out_z(:,:,n))
	319	CALL cpu_log( log_point_s(4), 'fft_x', 'pause' )
[1216]	320
	321	ENDDO
	322	!
	323	!-- Restore original indices/counters
	324	nz = isave(1)
	325	nzb_x = isave(2)
	326	nzt_x = isave(3)
	327	sendrecvcount_zx = isave(4)
	328
	329	DEALLOCATE( ar1, f_in, f_inv )
	330
	331	!
	332	!-- Transposition x --> y + subsequent fft along y
	333	ALLOCATE( f_inv(nys_x:nyn_x,nzb_x:nzt_x,0:nx) )
	334	CALL resort_for_xy( f_out_z, f_inv )
	335	!
	336	!-- Save original indices and gridpoint counter
	337	isave(1) = nx
	338	isave(2) = nxl_y
	339	isave(3) = nxr_y
	340	isave(4) = sendrecvcount_xy
	341	!
	342	!-- Set new indices for transformation
	343	nblk = ( ( nx+1 ) / pdims(2) ) - 1
	344	nx = pdims(2)
	345	nnx_y = 1
	346	nxl_y = myidy * nnx_y
	347	nxr_y = ( myidy + 1 ) * nnx_y - 1
	348	sendrecvcount_xy = nnx_y * ( nyn_x-nys_x+1 ) * ( nzt_x-nzb_x+1 )
	349
[1306]	350	ALLOCATE( ar1(0:ny,nxl_y:nxr_y,nzb_y:nzt_y) )
[1216]	351	ALLOCATE( f_in(nys_x:nyn_x,nzb_x:nzt_x,0:nx) )
	352
[1306]	353	DO ii = 0, nblk
[1216]	354
[1318]	355	CALL cpu_log( log_point_s(5), 'transpo forward', 'continue', cpu_log_nowait )
[1216]	356
[1306]	357	DO inew = 0, nx-1
	358	iind = ii + ( nblk + 1 ) * inew
	359	f_in(:,:,inew) = f_inv(:,:,iind)
	360	ENDDO
[1216]	361
[1306]	362	CALL transpose_xy( f_in, ar1(:,:,:) )
[1216]	363
[1306]	364	CALL cpu_log( log_point_s(5), 'transpo forward', 'pause' )
[1216]	365
[1306]	366	IF ( ii == 1 ) THEN
[1318]	367	CALL cpu_log( log_point_s(7), 'fft_y', 'start', cpu_log_nowait )
[1306]	368	ELSE
[1318]	369	CALL cpu_log( log_point_s(7), 'fft_y', 'continue', cpu_log_nowait )
[1216]	370	ENDIF
	371
[1306]	372	nxl_y_bound = isave(2)
	373	nxr_y_bound = isave(3)
	374	n = isave(2) + ii
	375	CALL fft_y( ar1(:,:,:), 'forward', ar_tr = f_out_y, &
	376	nxl_y_bound = nxl_y_bound, nxr_y_bound = nxr_y_bound, &
	377	nxl_y_l = n, nxr_y_l = n )
[1216]	378
[1306]	379	CALL cpu_log( log_point_s(7), 'fft_y', 'pause' )
[1216]	380
	381	ENDDO
	382	!
	383	!-- Restore original indices/counters
	384	nx = isave(1)
	385	nxl_y = isave(2)
	386	nxr_y = isave(3)
	387	sendrecvcount_xy = isave(4)
	388
	389	DEALLOCATE( ar1, f_in, f_inv )
	390
	391	!
	392	!-- Transposition y --> z + subsequent tridia + resort for z --> y
	393	ALLOCATE( f_inv(nxl_y:nxr_y,nzb_y:nzt_y,0:ny) )
	394	CALL resort_for_yz( f_out_y, f_inv )
	395	!
	396	!-- Save original indices and gridpoint counter
	397	isave(1) = ny
	398	isave(2) = nys_z
	399	isave(3) = nyn_z
	400	isave(4) = sendrecvcount_yz
	401	!
	402	!-- Set new indices for transformation
	403	nblk = ( ( ny+1 ) / pdims(1) ) - 1
	404	ny = pdims(1)
	405	nny_z = 1
	406	nys_z = myidx * nny_z
	407	nyn_z = ( myidx + 1 ) * nny_z - 1
	408	sendrecvcount_yz = ( nxr_y-nxl_y+1 ) * nny_z * ( nzt_y-nzb_y+1 )
	409
[1306]	410	ALLOCATE( ar1(nxl_z:nxr_z,nys_z:nyn_z,1:nz) )
[1216]	411	ALLOCATE( f_in(nxl_y:nxr_y,nzb_y:nzt_y,0:ny) )
	412
[1306]	413	DO jj = 0, nblk
[1216]	414	!
[1306]	415	!-- Forward Fourier Transformation
	416	!-- Transposition y --> z
[1318]	417	CALL cpu_log( log_point_s(5), 'transpo forward', 'continue', cpu_log_nowait )
[1216]	418
[1306]	419	DO jnew = 0, ny-1
	420	jind = jj + ( nblk + 1 ) * jnew
	421	f_in(:,:,jnew) = f_inv(:,:,jind)
	422	ENDDO
[1216]	423
[1306]	424	CALL transpose_yz( f_in, ar1(:,:,:) )
[1216]	425
[1306]	426	IF ( jj == nblk ) THEN
	427	CALL cpu_log( log_point_s(5), 'transpo forward', 'stop' )
	428	ELSE
	429	CALL cpu_log( log_point_s(5), 'transpo forward', 'pause' )
[1216]	430	ENDIF
	431
	432	!
[1306]	433	!-- Solve the tridiagonal equation system along z
[1318]	434	CALL cpu_log( log_point_s(6), 'tridia', 'start', cpu_log_nowait )
[1216]	435
[1306]	436	n = isave(2) + jj
	437	CALL tridia_substi_overlap( ar1(:,:,:), n )
[1216]	438
[1306]	439	CALL cpu_log( log_point_s(6), 'tridia', 'stop' )
[1216]	440
[1306]	441	!
	442	!-- Inverse Fourier Transformation
	443	!-- Transposition z --> y
	444	!-- Only one thread should call MPI routines, therefore forward and
	445	!-- backward tranpose are in the same section
	446	IF ( jj == 0 ) THEN
[1318]	447	CALL cpu_log( log_point_s(8), 'transpo invers', 'start', cpu_log_nowait )
[1306]	448	ELSE
[1318]	449	CALL cpu_log( log_point_s(8), 'transpo invers', 'continue', cpu_log_nowait )
[1216]	450	ENDIF
	451
[1306]	452	CALL transpose_zy( ar1(:,:,:), f_in )
[1216]	453
[1306]	454	DO jnew = 0, ny-1
	455	jind = jj + ( nblk + 1 ) * jnew
	456	f_inv(:,:,jind) = f_in(:,:,jnew)
	457	ENDDO
[1216]	458
[1306]	459	CALL cpu_log( log_point_s(8), 'transpo invers', 'pause' )
[1216]	460
	461	ENDDO
	462	!
	463	!-- Restore original indices/counters
	464	ny = isave(1)
	465	nys_z = isave(2)
	466	nyn_z = isave(3)
	467	sendrecvcount_yz = isave(4)
	468
	469	CALL resort_for_zy( f_inv, f_out_y )
	470
	471	DEALLOCATE( ar1, f_in, f_inv )
	472
	473	!
	474	!-- fft along y backward + subsequent transposition y --> x
	475	ALLOCATE( f_inv(nys_x:nyn_x,nzb_x:nzt_x,0:nx) )
	476	!
	477	!-- Save original indices and gridpoint counter
	478	isave(1) = nx
	479	isave(2) = nxl_y
	480	isave(3) = nxr_y
	481	isave(4) = sendrecvcount_xy
	482	!
	483	!-- Set new indices for transformation
	484	nblk = (( nx+1 ) / pdims(2) ) - 1
	485	nx = pdims(2)
	486	nnx_y = 1
	487	nxl_y = myidy * nnx_y
	488	nxr_y = ( myidy + 1 ) * nnx_y - 1
	489	sendrecvcount_xy = nnx_y * ( nyn_x-nys_x+1 ) * ( nzt_x-nzb_x+1 )
	490
[1306]	491	ALLOCATE( ar1(0:ny,nxl_y:nxr_y,nzb_y:nzt_y) )
[1216]	492	ALLOCATE( f_in(nys_x:nyn_x,nzb_x:nzt_x,0:nx) )
	493
[1306]	494	DO ii = 0, nblk
[1216]	495
[1318]	496	CALL cpu_log( log_point_s(7), 'fft_y', 'continue', cpu_log_nowait )
[1216]	497
[1306]	498	n = isave(2) + ii
	499	nxl_y_bound = isave(2)
	500	nxr_y_bound = isave(3)
[1216]	501
[1306]	502	CALL fft_y( ar1(:,:,:), 'backward', ar_tr = f_out_y, &
	503	nxl_y_bound = nxl_y_bound, nxr_y_bound = nxr_y_bound, &
	504	nxl_y_l = n, nxr_y_l = n )
[1216]	505
[1306]	506	IF ( ii == nblk ) THEN
	507	CALL cpu_log( log_point_s(7), 'fft_y', 'stop' )
	508	ELSE
	509	CALL cpu_log( log_point_s(7), 'fft_y', 'pause' )
[1216]	510	ENDIF
	511
[1318]	512	CALL cpu_log( log_point_s(8), 'transpo invers', 'continue', cpu_log_nowait )
[1216]	513
[1306]	514	CALL transpose_yx( ar1(:,:,:), f_in )
[1216]	515
[1306]	516	DO inew = 0, nx-1
	517	iind = ii + (nblk+1) * inew
	518	f_inv(:,:,iind) = f_in(:,:,inew)
	519	ENDDO
[1216]	520
[1306]	521	CALL cpu_log( log_point_s(8), 'transpo invers', 'pause' )
[1216]	522
	523	ENDDO
	524	!
	525	!-- Restore original indices/counters
	526	nx = isave(1)
	527	nxl_y = isave(2)
	528	nxr_y = isave(3)
	529	sendrecvcount_xy = isave(4)
	530
	531	CALL resort_for_yx( f_inv, f_out_z )
	532
	533	DEALLOCATE( ar1, f_in, f_inv )
	534
	535	!
	536	!-- fft along x backward + subsequent final transposition x --> z
	537	ALLOCATE( f_inv(nys:nyn,nxl:nxr,1:nz) )
	538	!
	539	!-- Save original indices and gridpoint counter
	540	isave(1) = nz
	541	isave(2) = nzb_x
	542	isave(3) = nzt_x
	543	isave(4) = sendrecvcount_zx
	544	!
	545	!-- Set new indices for transformation
	546	nblk = nz / pdims(1)
	547	nz = pdims(1)
	548	nnz_x = 1
	549	nzb_x = 1 + myidx * nnz_x
	550	nzt_x = ( myidx + 1 ) * nnz_x
	551	sendrecvcount_zx = nnx * nny * nnz_x
	552
[1306]	553	ALLOCATE( ar1(0:nx,nys_x:nyn_x,nzb_x:nzt_x) )
[1216]	554	ALLOCATE( f_in(nys:nyn,nxl:nxr,1:nz) )
	555
[1306]	556	DO kk = 1, nblk
[1216]	557
[1318]	558	CALL cpu_log( log_point_s(4), 'fft_x', 'continue', cpu_log_nowait )
[1216]	559
[1306]	560	n = isave(2) + kk - 1
	561	CALL fft_x( ar1(:,:,:), 'backward', f_out_z(:,:,n))
[1216]	562
[1306]	563	IF ( kk == nblk ) THEN
	564	CALL cpu_log( log_point_s(4), 'fft_x', 'stop' )
	565	ELSE
	566	CALL cpu_log( log_point_s(4), 'fft_x', 'pause' )
[1216]	567	ENDIF
	568
[1318]	569	CALL cpu_log( log_point_s(8), 'transpo invers', 'continue', cpu_log_nowait )
[1216]	570
[1306]	571	CALL transpose_xz( ar1(:,:,:), f_in )
[1216]	572
[1306]	573	DO knew = 1, nz
	574	ki = kk + nblk * (knew-1)
	575	f_inv(:,:,ki) = f_in(:,:,knew)
	576	ENDDO
[1216]	577
[1306]	578	IF ( kk == nblk ) THEN
	579	CALL cpu_log( log_point_s(8), 'transpo invers', 'stop' )
	580	ELSE
	581	CALL cpu_log( log_point_s(8), 'transpo invers', 'pause' )
[1216]	582	ENDIF
	583
	584	ENDDO
	585	!
	586	!-- Restore original indices/counters
	587	nz = isave(1)
	588	nzb_x = isave(2)
	589	nzt_x = isave(3)
	590	sendrecvcount_zx = isave(4)
	591
	592	CALL resort_for_xz( f_inv, ar )
	593
	594	DEALLOCATE( ar1, f_in, f_inv )
	595
[1]	596	ENDIF
[3634]	597	#endif
[1]	598
[3690]	599	#if !__acc_fft_device
	600	!$ACC UPDATE DEVICE(ar)
	601	#endif
	602
[1]	603	CALL cpu_log( log_point_s(3), 'poisfft', 'stop' )
	604
	605	END SUBROUTINE poisfft
	606
	607
	608	!------------------------------------------------------------------------------!
[1682]	609	! Description:
	610	! ------------
	611	!> Fourier-transformation along y with subsequent transposition y --> x for
	612	!> a 1d-decomposition along x.
	613	!>
	614	!> @attention The performance of this routine is much faster on the NEC-SX6,
	615	!> if the first index of work_ffty_vec is odd. Otherwise
	616	!> memory bank conflicts may occur (especially if the index is a
	617	!> multiple of 128). That's why work_ffty_vec is dimensioned as
	618	!> 0:ny+1.
	619	!> Of course, this will not work if users are using an odd number
	620	!> of gridpoints along y.
[1]	621	!------------------------------------------------------------------------------!
[1682]	622	SUBROUTINE ffty_tr_yx( f_in, f_out )
[1]	623
[1320]	624	USE control_parameters, &
[2300]	625	ONLY: loop_optimization
[1320]	626
	627	USE cpulog, &
	628	ONLY: cpu_log, log_point_s
	629
	630	USE kinds
	631
[1]	632	USE pegrid
	633
	634	IMPLICIT NONE
	635
[1682]	636	INTEGER(iwp) :: i !<
	637	INTEGER(iwp) :: iend !<
	638	INTEGER(iwp) :: iouter !<
	639	INTEGER(iwp) :: ir !<
	640	INTEGER(iwp) :: j !<
	641	INTEGER(iwp) :: k !<
[1]	642
[1682]	643	INTEGER(iwp), PARAMETER :: stridex = 4 !<
[1320]	644
[1682]	645	REAL(wp), DIMENSION(1:nz,0:ny,nxl:nxr) :: f_in !<
	646	REAL(wp), DIMENSION(nnx,1:nz,nys_x:nyn_x,pdims(1)) :: f_out !<
	647	REAL(wp), DIMENSION(nxl:nxr,1:nz,0:ny) :: work !<
[1]	648
[2300]	649	REAL(wp), DIMENSION(:,:), ALLOCATABLE :: work_ffty !<
	650	REAL(wp), DIMENSION(:,:,:), ALLOCATABLE :: work_ffty_vec !<
	651
[1]	652	!
	653	!-- Carry out the FFT along y, where all data are present due to the
	654	!-- 1d-decomposition along x. Resort the data in a way that x becomes
	655	!-- the first index.
[1106]	656	CALL cpu_log( log_point_s(7), 'fft_y_1d', 'start' )
[1]	657
[2300]	658	IF ( loop_optimization == 'vector' ) THEN
	659
	660	ALLOCATE( work_ffty_vec(0:ny+1,1:nz,nxl:nxr) )
[1]	661	!
	662	!-- Code optimized for vector processors
[2300]	663	!$OMP PARALLEL PRIVATE ( i, j, k )
	664	!$OMP DO
[1]	665	DO i = nxl, nxr
	666
	667	DO j = 0, ny
	668	DO k = 1, nz
	669	work_ffty_vec(j,k,i) = f_in(k,j,i)
	670	ENDDO
	671	ENDDO
	672
	673	CALL fft_y_m( work_ffty_vec(:,:,i), ny+1, 'forward' )
	674
	675	ENDDO
	676
[2300]	677	!$OMP DO
[1]	678	DO k = 1, nz
	679	DO j = 0, ny
	680	DO i = nxl, nxr
	681	work(i,k,j) = work_ffty_vec(j,k,i)
	682	ENDDO
	683	ENDDO
	684	ENDDO
[2300]	685	!$OMP END PARALLEL
[1]	686
[2300]	687	DEALLOCATE( work_ffty_vec )
	688
[1]	689	ELSE
	690	!
	691	!-- Cache optimized code.
[2300]	692	ALLOCATE( work_ffty(0:ny,stridex) )
	693	!
[1]	694	!-- The i-(x-)direction is split into a strided outer loop and an inner
	695	!-- loop for better cache performance
[2300]	696	!$OMP PARALLEL PRIVATE (i,iend,iouter,ir,j,k,work_ffty)
	697	!$OMP DO
[1]	698	DO iouter = nxl, nxr, stridex
	699
	700	iend = MIN( iouter+stridex-1, nxr ) ! Upper bound for inner i loop
	701
	702	DO k = 1, nz
	703
	704	DO i = iouter, iend
	705
	706	ir = i-iouter+1 ! counter within a stride
	707	DO j = 0, ny
	708	work_ffty(j,ir) = f_in(k,j,i)
	709	ENDDO
	710	!
	711	!-- FFT along y
[1106]	712	CALL fft_y_1d( work_ffty(:,ir), 'forward' )
[1]	713
	714	ENDDO
	715
	716	!
	717	!-- Resort
	718	DO j = 0, ny
	719	DO i = iouter, iend
	720	work(i,k,j) = work_ffty(j,i-iouter+1)
	721	ENDDO
	722	ENDDO
	723
	724	ENDDO
	725
	726	ENDDO
[2300]	727	!$OMP END PARALLEL
[1]	728
[2300]	729	DEALLOCATE( work_ffty )
	730
[1]	731	ENDIF
[2300]	732
[1106]	733	CALL cpu_log( log_point_s(7), 'fft_y_1d', 'pause' )
[1]	734
	735	!
	736	!-- Transpose array
[1111]	737	#if defined( __parallel )
[1]	738	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
[622]	739	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
[1]	740	CALL MPI_ALLTOALL( work(nxl,1,0), sendrecvcount_xy, MPI_REAL, &
	741	f_out(1,1,nys_x,1), sendrecvcount_xy, MPI_REAL, &
	742	comm1dx, ierr )
	743	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
[1111]	744	#endif
[1]	745
	746	END SUBROUTINE ffty_tr_yx
	747
	748
	749	!------------------------------------------------------------------------------!
[1682]	750	! Description:
	751	! ------------
	752	!> Transposition x --> y with a subsequent backward Fourier transformation for
	753	!> a 1d-decomposition along x
[1]	754	!------------------------------------------------------------------------------!
[1682]	755	SUBROUTINE tr_xy_ffty( f_in, f_out )
[1]	756
[1320]	757	USE control_parameters, &
[2300]	758	ONLY: loop_optimization
[1320]	759
	760	USE cpulog, &
	761	ONLY: cpu_log, log_point_s
	762
	763	USE kinds
	764
[1]	765	USE pegrid
	766
	767	IMPLICIT NONE
	768
[1682]	769	INTEGER(iwp) :: i !<
	770	INTEGER(iwp) :: iend !<
	771	INTEGER(iwp) :: iouter !<
	772	INTEGER(iwp) :: ir !<
	773	INTEGER(iwp) :: j !<
	774	INTEGER(iwp) :: k !<
[1]	775
[1682]	776	INTEGER(iwp), PARAMETER :: stridex = 4 !<
[1320]	777
[1682]	778	REAL(wp), DIMENSION(nnx,1:nz,nys_x:nyn_x,pdims(1)) :: f_in !<
	779	REAL(wp), DIMENSION(1:nz,0:ny,nxl:nxr) :: f_out !<
	780	REAL(wp), DIMENSION(nxl:nxr,1:nz,0:ny) :: work !<
[1]	781
[2300]	782	REAL(wp), DIMENSION(:,:), ALLOCATABLE :: work_ffty !<
	783	REAL(wp), DIMENSION(:,:,:), ALLOCATABLE :: work_ffty_vec !<
	784
[1]	785	!
	786	!-- Transpose array
[1111]	787	#if defined( __parallel )
[1]	788	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
[622]	789	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
[1]	790	CALL MPI_ALLTOALL( f_in(1,1,nys_x,1), sendrecvcount_xy, MPI_REAL, &
	791	work(nxl,1,0), sendrecvcount_xy, MPI_REAL, &
	792	comm1dx, ierr )
	793	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
[1111]	794	#endif
[1]	795
	796	!
	797	!-- Resort the data in a way that y becomes the first index and carry out the
	798	!-- backward fft along y.
[1106]	799	CALL cpu_log( log_point_s(7), 'fft_y_1d', 'continue' )
[1]	800
[2300]	801	IF ( loop_optimization == 'vector' ) THEN
	802
	803	ALLOCATE( work_ffty_vec(0:ny+1,1:nz,nxl:nxr) )
[1]	804	!
	805	!-- Code optimized for vector processors
[2300]	806	!$OMP PARALLEL PRIVATE ( i, j, k )
	807	!$OMP DO
[1]	808	DO k = 1, nz
	809	DO j = 0, ny
	810	DO i = nxl, nxr
	811	work_ffty_vec(j,k,i) = work(i,k,j)
	812	ENDDO
	813	ENDDO
	814	ENDDO
	815
[2300]	816	!$OMP DO
[1]	817	DO i = nxl, nxr
	818
	819	CALL fft_y_m( work_ffty_vec(:,:,i), ny+1, 'backward' )
	820
	821	DO j = 0, ny
	822	DO k = 1, nz
	823	f_out(k,j,i) = work_ffty_vec(j,k,i)
	824	ENDDO
	825	ENDDO
	826
	827	ENDDO
[2300]	828	!$OMP END PARALLEL
[1]	829
[2300]	830	DEALLOCATE( work_ffty_vec )
	831
[1]	832	ELSE
	833	!
	834	!-- Cache optimized code.
[2300]	835	ALLOCATE( work_ffty(0:ny,stridex) )
	836	!
[1]	837	!-- The i-(x-)direction is split into a strided outer loop and an inner
	838	!-- loop for better cache performance
[2300]	839	!$OMP PARALLEL PRIVATE ( i, iend, iouter, ir, j, k, work_ffty )
	840	!$OMP DO
[1]	841	DO iouter = nxl, nxr, stridex
	842
	843	iend = MIN( iouter+stridex-1, nxr ) ! Upper bound for inner i loop
	844
	845	DO k = 1, nz
	846	!
	847	!-- Resort
	848	DO j = 0, ny
	849	DO i = iouter, iend
	850	work_ffty(j,i-iouter+1) = work(i,k,j)
	851	ENDDO
	852	ENDDO
	853
	854	DO i = iouter, iend
	855
	856	!
	857	!-- FFT along y
	858	ir = i-iouter+1 ! counter within a stride
[1106]	859	CALL fft_y_1d( work_ffty(:,ir), 'backward' )
[1]	860
	861	DO j = 0, ny
	862	f_out(k,j,i) = work_ffty(j,ir)
	863	ENDDO
	864	ENDDO
	865
	866	ENDDO
	867
	868	ENDDO
[2300]	869	!$OMP END PARALLEL
[1]	870
[2300]	871	DEALLOCATE( work_ffty )
	872
[1]	873	ENDIF
	874
[1106]	875	CALL cpu_log( log_point_s(7), 'fft_y_1d', 'stop' )
[1]	876
	877	END SUBROUTINE tr_xy_ffty
	878
	879
	880	!------------------------------------------------------------------------------!
[1682]	881	! Description:
	882	! ------------
	883	!> FFT along x, solution of the tridiagonal system and backward FFT for
	884	!> a 1d-decomposition along x
	885	!>
	886	!> @warning this subroutine may still not work for hybrid parallelization
	887	!> with OpenMP (for possible necessary changes see the original
	888	!> routine poisfft_hybrid, developed by Klaus Ketelsen, May 2002)
[1]	889	!------------------------------------------------------------------------------!
[1682]	890	SUBROUTINE fftx_tri_fftx( ar )
[1]	891
[1320]	892	USE control_parameters, &
[2300]	893	ONLY: loop_optimization
[1320]	894
	895	USE cpulog, &
	896	ONLY: cpu_log, log_point_s
	897
	898	USE grid_variables, &
	899	ONLY: ddx2, ddy2
	900
	901	USE kinds
	902
[1]	903	USE pegrid
	904
	905	IMPLICIT NONE
	906
[1682]	907	INTEGER(iwp) :: i !<
	908	INTEGER(iwp) :: j !<
	909	INTEGER(iwp) :: k !<
	910	INTEGER(iwp) :: m !<
	911	INTEGER(iwp) :: n !<
[3241]	912	!$ INTEGER(iwp) :: omp_get_thread_num !<
[1682]	913	INTEGER(iwp) :: tn !<
[1]	914
[1682]	915	REAL(wp), DIMENSION(0:nx) :: work_fftx !<
	916	REAL(wp), DIMENSION(0:nx,1:nz) :: work_trix !<
	917	REAL(wp), DIMENSION(nnx,1:nz,nys_x:nyn_x,pdims(1)) :: ar !<
	918	REAL(wp), DIMENSION(:,:,:,:), ALLOCATABLE :: tri !<
[1]	919
	920
[1106]	921	CALL cpu_log( log_point_s(33), 'fft_x_1d + tridia', 'start' )
[1]	922
	923	ALLOCATE( tri(5,0:nx,0:nz-1,0:threads_per_task-1) )
	924
	925	tn = 0 ! Default thread number in case of one thread
	926	!$OMP PARALLEL DO PRIVATE ( i, j, k, m, n, tn, work_fftx, work_trix )
	927	DO j = nys_x, nyn_x
	928
	929	!$ tn = omp_get_thread_num()
	930
[2300]	931	IF ( loop_optimization == 'vector' ) THEN
[1]	932	!
	933	!-- Code optimized for vector processors
	934	DO k = 1, nz
	935
	936	m = 0
	937	DO n = 1, pdims(1)
[1003]	938	DO i = 1, nnx
[1]	939	work_trix(m,k) = ar(i,k,j,n)
	940	m = m + 1
	941	ENDDO
	942	ENDDO
	943
	944	ENDDO
	945
	946	CALL fft_x_m( work_trix, 'forward' )
	947
	948	ELSE
	949	!
	950	!-- Cache optimized code
	951	DO k = 1, nz
	952
	953	m = 0
	954	DO n = 1, pdims(1)
[1003]	955	DO i = 1, nnx
[1]	956	work_fftx(m) = ar(i,k,j,n)
	957	m = m + 1
	958	ENDDO
	959	ENDDO
	960
[1106]	961	CALL fft_x_1d( work_fftx, 'forward' )
[1]	962
	963	DO i = 0, nx
	964	work_trix(i,k) = work_fftx(i)
	965	ENDDO
	966
	967	ENDDO
	968
	969	ENDIF
	970
	971	!
	972	!-- Solve the linear equation system
	973	CALL tridia_1dd( ddx2, ddy2, nx, ny, j, work_trix, tri(:,:,:,tn) )
	974
[2300]	975	IF ( loop_optimization == 'vector' ) THEN
[1]	976	!
	977	!-- Code optimized for vector processors
	978	CALL fft_x_m( work_trix, 'backward' )
	979
	980	DO k = 1, nz
	981
	982	m = 0
	983	DO n = 1, pdims(1)
[1003]	984	DO i = 1, nnx
[1]	985	ar(i,k,j,n) = work_trix(m,k)
	986	m = m + 1
	987	ENDDO
	988	ENDDO
	989
	990	ENDDO
	991
	992	ELSE
	993	!
	994	!-- Cache optimized code
	995	DO k = 1, nz
	996
	997	DO i = 0, nx
	998	work_fftx(i) = work_trix(i,k)
	999	ENDDO
	1000
[1106]	1001	CALL fft_x_1d( work_fftx, 'backward' )
[1]	1002
	1003	m = 0
	1004	DO n = 1, pdims(1)
[1003]	1005	DO i = 1, nnx
[1]	1006	ar(i,k,j,n) = work_fftx(m)
	1007	m = m + 1
	1008	ENDDO
	1009	ENDDO
	1010
	1011	ENDDO
	1012
	1013	ENDIF
	1014
	1015	ENDDO
	1016
	1017	DEALLOCATE( tri )
	1018
[1106]	1019	CALL cpu_log( log_point_s(33), 'fft_x_1d + tridia', 'stop' )
[1]	1020
	1021	END SUBROUTINE fftx_tri_fftx
	1022
	1023
	1024	!------------------------------------------------------------------------------!
[1682]	1025	! Description:
	1026	! ------------
	1027	!> Fourier-transformation along x with subsequent transposition x --> y for
	1028	!> a 1d-decomposition along y.
	1029	!>
	1030	!> @attention NEC-branch of this routine may significantly profit from
	1031	!> further optimizations. So far, performance is much worse than
	1032	!> for routine ffty_tr_yx (more than three times slower).
[1]	1033	!------------------------------------------------------------------------------!
[1682]	1034	SUBROUTINE fftx_tr_xy( f_in, f_out )
[1]	1035
[1682]	1036
[1320]	1037	USE control_parameters, &
[2300]	1038	ONLY: loop_optimization
[1320]	1039
	1040	USE cpulog, &
	1041	ONLY: cpu_log, log_point_s
	1042
	1043	USE kinds
	1044
[1]	1045	USE pegrid
	1046
	1047	IMPLICIT NONE
	1048
[1682]	1049	INTEGER(iwp) :: i !<
	1050	INTEGER(iwp) :: j !<
	1051	INTEGER(iwp) :: k !<
[1]	1052
[1682]	1053	REAL(wp), DIMENSION(0:nx,1:nz,nys:nyn) :: work_fftx !<
	1054	REAL(wp), DIMENSION(1:nz,nys:nyn,0:nx) :: f_in !<
	1055	REAL(wp), DIMENSION(nny,1:nz,nxl_y:nxr_y,pdims(2)) :: f_out !<
	1056	REAL(wp), DIMENSION(nys:nyn,1:nz,0:nx) :: work !<
[1]	1057
	1058	!
	1059	!-- Carry out the FFT along x, where all data are present due to the
	1060	!-- 1d-decomposition along y. Resort the data in a way that y becomes
	1061	!-- the first index.
[1106]	1062	CALL cpu_log( log_point_s(4), 'fft_x_1d', 'start' )
[1]	1063
[2300]	1064	IF ( loop_optimization == 'vector' ) THEN
[1]	1065	!
	1066	!-- Code for vector processors
[85]	1067	!$OMP PARALLEL PRIVATE ( i, j, k )
[1]	1068	!$OMP DO
	1069	DO i = 0, nx
	1070
	1071	DO j = nys, nyn
	1072	DO k = 1, nz
	1073	work_fftx(i,k,j) = f_in(k,j,i)
	1074	ENDDO
	1075	ENDDO
	1076
	1077	ENDDO
	1078
	1079	!$OMP DO
	1080	DO j = nys, nyn
	1081
	1082	CALL fft_x_m( work_fftx(:,:,j), 'forward' )
	1083
	1084	DO k = 1, nz
	1085	DO i = 0, nx
	1086	work(j,k,i) = work_fftx(i,k,j)
	1087	ENDDO
	1088	ENDDO
	1089
	1090	ENDDO
	1091	!$OMP END PARALLEL
	1092
	1093	ELSE
	1094
	1095	!
	1096	!-- Cache optimized code (there might be still a potential for better
	1097	!-- optimization).
[696]	1098	!$OMP PARALLEL PRIVATE (i,j,k)
[1]	1099	!$OMP DO
	1100	DO i = 0, nx
	1101
	1102	DO j = nys, nyn
	1103	DO k = 1, nz
	1104	work_fftx(i,k,j) = f_in(k,j,i)
	1105	ENDDO
	1106	ENDDO
	1107
	1108	ENDDO
	1109
	1110	!$OMP DO
	1111	DO j = nys, nyn
	1112	DO k = 1, nz
	1113
[1106]	1114	CALL fft_x_1d( work_fftx(0:nx,k,j), 'forward' )
[1]	1115
	1116	DO i = 0, nx
	1117	work(j,k,i) = work_fftx(i,k,j)
	1118	ENDDO
	1119	ENDDO
	1120
	1121	ENDDO
	1122	!$OMP END PARALLEL
	1123
	1124	ENDIF
[1106]	1125	CALL cpu_log( log_point_s(4), 'fft_x_1d', 'pause' )
[1]	1126
	1127	!
	1128	!-- Transpose array
[1111]	1129	#if defined( __parallel )
[1]	1130	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
[622]	1131	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
[1]	1132	CALL MPI_ALLTOALL( work(nys,1,0), sendrecvcount_xy, MPI_REAL, &
	1133	f_out(1,1,nxl_y,1), sendrecvcount_xy, MPI_REAL, &
	1134	comm1dy, ierr )
	1135	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
[1111]	1136	#endif
[1]	1137
	1138	END SUBROUTINE fftx_tr_xy
	1139
	1140
	1141	!------------------------------------------------------------------------------!
[1682]	1142	! Description:
	1143	! ------------
	1144	!> Transposition y --> x with a subsequent backward Fourier transformation for
	1145	!> a 1d-decomposition along x.
[1]	1146	!------------------------------------------------------------------------------!
[1682]	1147	SUBROUTINE tr_yx_fftx( f_in, f_out )
[1]	1148
[1682]	1149
[1320]	1150	USE control_parameters, &
[2300]	1151	ONLY: loop_optimization
[1320]	1152
	1153	USE cpulog, &
	1154	ONLY: cpu_log, log_point_s
	1155
	1156	USE kinds
	1157
[1]	1158	USE pegrid
	1159
	1160	IMPLICIT NONE
	1161
[1682]	1162	INTEGER(iwp) :: i !<
	1163	INTEGER(iwp) :: j !<
	1164	INTEGER(iwp) :: k !<
[1]	1165
[1682]	1166	REAL(wp), DIMENSION(0:nx,1:nz,nys:nyn) :: work_fftx !<
	1167	REAL(wp), DIMENSION(nny,1:nz,nxl_y:nxr_y,pdims(2)) :: f_in !<
	1168	REAL(wp), DIMENSION(1:nz,nys:nyn,0:nx) :: f_out !<
	1169	REAL(wp), DIMENSION(nys:nyn,1:nz,0:nx) :: work !<
[1]	1170
	1171	!
	1172	!-- Transpose array
[1111]	1173	#if defined( __parallel )
[1]	1174	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
[622]	1175	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
[1]	1176	CALL MPI_ALLTOALL( f_in(1,1,nxl_y,1), sendrecvcount_xy, MPI_REAL, &
	1177	work(nys,1,0), sendrecvcount_xy, MPI_REAL, &
	1178	comm1dy, ierr )
	1179	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
[1111]	1180	#endif
[1]	1181
	1182	!
	1183	!-- Carry out the FFT along x, where all data are present due to the
	1184	!-- 1d-decomposition along y. Resort the data in a way that y becomes
	1185	!-- the first index.
[1106]	1186	CALL cpu_log( log_point_s(4), 'fft_x_1d', 'continue' )
[1]	1187
[2300]	1188	IF ( loop_optimization == 'vector' ) THEN
[1]	1189	!
	1190	!-- Code optimized for vector processors
[85]	1191	!$OMP PARALLEL PRIVATE ( i, j, k )
[1]	1192	!$OMP DO
	1193	DO j = nys, nyn
	1194
	1195	DO k = 1, nz
	1196	DO i = 0, nx
	1197	work_fftx(i,k,j) = work(j,k,i)
	1198	ENDDO
	1199	ENDDO
	1200
	1201	CALL fft_x_m( work_fftx(:,:,j), 'backward' )
	1202
	1203	ENDDO
	1204
	1205	!$OMP DO
	1206	DO i = 0, nx
	1207	DO j = nys, nyn
	1208	DO k = 1, nz
	1209	f_out(k,j,i) = work_fftx(i,k,j)
	1210	ENDDO
	1211	ENDDO
	1212	ENDDO
	1213	!$OMP END PARALLEL
	1214
	1215	ELSE
	1216
	1217	!
	1218	!-- Cache optimized code (there might be still a potential for better
	1219	!-- optimization).
[696]	1220	!$OMP PARALLEL PRIVATE (i,j,k)
[1]	1221	!$OMP DO
	1222	DO j = nys, nyn
	1223	DO k = 1, nz
	1224
	1225	DO i = 0, nx
	1226	work_fftx(i,k,j) = work(j,k,i)
	1227	ENDDO
	1228
[1106]	1229	CALL fft_x_1d( work_fftx(0:nx,k,j), 'backward' )
[1]	1230
	1231	ENDDO
	1232	ENDDO
	1233
	1234	!$OMP DO
	1235	DO i = 0, nx
	1236	DO j = nys, nyn
	1237	DO k = 1, nz
	1238	f_out(k,j,i) = work_fftx(i,k,j)
	1239	ENDDO
	1240	ENDDO
	1241	ENDDO
	1242	!$OMP END PARALLEL
	1243
	1244	ENDIF
[1106]	1245	CALL cpu_log( log_point_s(4), 'fft_x_1d', 'stop' )
[1]	1246
	1247	END SUBROUTINE tr_yx_fftx
	1248
	1249
	1250	!------------------------------------------------------------------------------!
[1682]	1251	! Description:
	1252	! ------------
	1253	!> FFT along y, solution of the tridiagonal system and backward FFT for
	1254	!> a 1d-decomposition along y.
	1255	!>
	1256	!> @warning this subroutine may still not work for hybrid parallelization
	1257	!> with OpenMP (for possible necessary changes see the original
	1258	!> routine poisfft_hybrid, developed by Klaus Ketelsen, May 2002)
[1]	1259	!------------------------------------------------------------------------------!
[1682]	1260	SUBROUTINE ffty_tri_ffty( ar )
[1]	1261
[1682]	1262
[1320]	1263	USE control_parameters, &
[2300]	1264	ONLY: loop_optimization
[1320]	1265
	1266	USE cpulog, &
	1267	ONLY: cpu_log, log_point_s
	1268
	1269	USE grid_variables, &
	1270	ONLY: ddx2, ddy2
	1271
	1272	USE kinds
	1273
[1]	1274	USE pegrid
	1275
	1276	IMPLICIT NONE
	1277
[1682]	1278	INTEGER(iwp) :: i !<
	1279	INTEGER(iwp) :: j !<
	1280	INTEGER(iwp) :: k !<
	1281	INTEGER(iwp) :: m !<
	1282	INTEGER(iwp) :: n !<
[3241]	1283	!$ INTEGER(iwp) :: omp_get_thread_num !<
[1682]	1284	INTEGER(iwp) :: tn !<
[1]	1285
[1682]	1286	REAL(wp), DIMENSION(0:ny) :: work_ffty !<
	1287	REAL(wp), DIMENSION(0:ny,1:nz) :: work_triy !<
	1288	REAL(wp), DIMENSION(nny,1:nz,nxl_y:nxr_y,pdims(2)) :: ar !<
	1289	REAL(wp), DIMENSION(:,:,:,:), ALLOCATABLE :: tri !<
[1]	1290
	1291
[1106]	1292	CALL cpu_log( log_point_s(39), 'fft_y_1d + tridia', 'start' )
[1]	1293
	1294	ALLOCATE( tri(5,0:ny,0:nz-1,0:threads_per_task-1) )
	1295
	1296	tn = 0 ! Default thread number in case of one thread
[696]	1297	!$OMP PARALLEL DO PRIVATE ( i, j, k, m, n, tn, work_ffty, work_triy )
[1]	1298	DO i = nxl_y, nxr_y
	1299
	1300	!$ tn = omp_get_thread_num()
	1301
[2300]	1302	IF ( loop_optimization == 'vector' ) THEN
[1]	1303	!
	1304	!-- Code optimized for vector processors
	1305	DO k = 1, nz
	1306
	1307	m = 0
	1308	DO n = 1, pdims(2)
[1003]	1309	DO j = 1, nny
[1]	1310	work_triy(m,k) = ar(j,k,i,n)
	1311	m = m + 1
	1312	ENDDO
	1313	ENDDO
	1314
	1315	ENDDO
	1316
	1317	CALL fft_y_m( work_triy, ny, 'forward' )
	1318
	1319	ELSE
	1320	!
	1321	!-- Cache optimized code
	1322	DO k = 1, nz
	1323
	1324	m = 0
	1325	DO n = 1, pdims(2)
[1003]	1326	DO j = 1, nny
[1]	1327	work_ffty(m) = ar(j,k,i,n)
	1328	m = m + 1
	1329	ENDDO
	1330	ENDDO
	1331
[1106]	1332	CALL fft_y_1d( work_ffty, 'forward' )
[1]	1333
	1334	DO j = 0, ny
	1335	work_triy(j,k) = work_ffty(j)
	1336	ENDDO
	1337
	1338	ENDDO
	1339
	1340	ENDIF
	1341
	1342	!
	1343	!-- Solve the linear equation system
	1344	CALL tridia_1dd( ddy2, ddx2, ny, nx, i, work_triy, tri(:,:,:,tn) )
	1345
[2300]	1346	IF ( loop_optimization == 'vector' ) THEN
[1]	1347	!
	1348	!-- Code optimized for vector processors
	1349	CALL fft_y_m( work_triy, ny, 'backward' )
	1350
	1351	DO k = 1, nz
	1352
	1353	m = 0
	1354	DO n = 1, pdims(2)
[1003]	1355	DO j = 1, nny
[1]	1356	ar(j,k,i,n) = work_triy(m,k)
	1357	m = m + 1
	1358	ENDDO
	1359	ENDDO
	1360
	1361	ENDDO
	1362
	1363	ELSE
	1364	!
	1365	!-- Cache optimized code
	1366	DO k = 1, nz
	1367
	1368	DO j = 0, ny
	1369	work_ffty(j) = work_triy(j,k)
	1370	ENDDO
	1371
[1106]	1372	CALL fft_y_1d( work_ffty, 'backward' )
[1]	1373
	1374	m = 0
	1375	DO n = 1, pdims(2)
[1003]	1376	DO j = 1, nny
[1]	1377	ar(j,k,i,n) = work_ffty(m)
	1378	m = m + 1
	1379	ENDDO
	1380	ENDDO
	1381
	1382	ENDDO
	1383
	1384	ENDIF
	1385
	1386	ENDDO
	1387
	1388	DEALLOCATE( tri )
	1389
[1106]	1390	CALL cpu_log( log_point_s(39), 'fft_y_1d + tridia', 'stop' )
[1]	1391
	1392	END SUBROUTINE ffty_tri_ffty
	1393
	1394	END MODULE poisfft_mod

Note: See TracBrowser for help on using the repository browser.

Download in other formats:

| Impressum | ©Leibniz Universität Hannover |