Home

Context Navigation

source: palm/trunk/SOURCE/poisfft_mod.f90 @ 4311

Last change on this file since 4311 was 4182, checked in by scharf, 5 years ago
corrected "Former revisions" section minor formatting in "Former revisions" section added "Author" section
Property svn:keywords set to `Id`
File size: 41.9 KB

Rev	Line
[1850]	1	!> @file poisfft_mod.f90
[2000]	2	!------------------------------------------------------------------------------!
[2696]	3	! This file is part of the PALM model system.
[1036]	4	!
[2000]	5	! PALM is free software: you can redistribute it and/or modify it under the
	6	! terms of the GNU General Public License as published by the Free Software
	7	! Foundation, either version 3 of the License, or (at your option) any later
	8	! version.
[1036]	9	!
	10	! PALM is distributed in the hope that it will be useful, but WITHOUT ANY
	11	! WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
	12	! A PARTICULAR PURPOSE. See the GNU General Public License for more details.
	13	!
	14	! You should have received a copy of the GNU General Public License along with
	15	! PALM. If not, see <http://www.gnu.org/licenses/>.
	16	!
[3655]	17	! Copyright 1997-2019 Leibniz Universitaet Hannover
[2000]	18	!------------------------------------------------------------------------------!
[1036]	19	!
[484]	20	! Current revisions:
[1]	21	! -----------------
[1683]	22	!
[2119]	23	!
[1321]	24	! Former revisions:
	25	! -----------------
	26	! $Id: poisfft_mod.f90 4182 2019-08-22 15:20:23Z maronga $
[4182]	27	! Corrected "Former revisions" section
	28	!
	29	! 3690 2019-01-22 22:56:42Z knoop
[3634]	30	! OpenACC port for SPEC
[2716]	31	!
[4182]	32	! Revision 1.1 1997/07/24 11:24:14 raasch
	33	! Initial revision
	34	!
	35	!
[1]	36	! Description:
	37	! ------------
[1682]	38	!> Solves the Poisson equation with a 2D spectral method
	39	!> d^2 p / dx^2 + d^2 p / dy^2 + d^2 p / dz^2 = s
	40	!>
	41	!> Input:
	42	!> real ar contains (nnz,nny,nnx) elements of the velocity divergence,
	43	!> starting from (1,nys,nxl)
	44	!>
	45	!> Output:
	46	!> real ar contains the solution for perturbation pressure p
[1]	47	!------------------------------------------------------------------------------!
[1682]	48	MODULE poisfft_mod
	49
[1]	50
[1320]	51	USE fft_xy, &
	52	ONLY: fft_init, fft_y, fft_y_1d, fft_y_m, fft_x, fft_x_1d, fft_x_m
[1]	53
[1320]	54	USE indices, &
	55	ONLY: nnx, nny, nx, nxl, nxr, ny, nys, nyn, nz
	56
	57	USE transpose_indices, &
	58	ONLY: nxl_y, nxl_z, nxr_y, nxr_z, nys_x, nys_z, nyn_x, nyn_z, nzb_x, &
	59	nzb_y, nzt_x, nzt_y
	60
	61	USE tridia_solver, &
	62	ONLY: tridia_1dd, tridia_init, tridia_substi, tridia_substi_overlap
	63
[1]	64	IMPLICIT NONE
	65
[1111]	66	LOGICAL, SAVE :: poisfft_initialized = .FALSE.
	67
[1]	68	PRIVATE
[807]	69
[1]	70	PUBLIC poisfft, poisfft_init
	71
	72	INTERFACE poisfft
	73	MODULE PROCEDURE poisfft
	74	END INTERFACE poisfft
	75
	76	INTERFACE poisfft_init
	77	MODULE PROCEDURE poisfft_init
	78	END INTERFACE poisfft_init
	79
[807]	80
[1]	81	CONTAINS
	82
[1682]	83	!------------------------------------------------------------------------------!
	84	! Description:
	85	! ------------
[3241]	86	!> Setup coefficients for FFT and the tridiagonal solver
[1682]	87	!------------------------------------------------------------------------------!
[1]	88	SUBROUTINE poisfft_init
	89
[1111]	90	IMPLICIT NONE
	91
	92
[1]	93	CALL fft_init
	94
[1212]	95	CALL tridia_init
[1111]	96
	97	poisfft_initialized = .TRUE.
	98
[1]	99	END SUBROUTINE poisfft_init
	100
[1111]	101
[1804]	102
[1682]	103	!------------------------------------------------------------------------------!
	104	! Description:
	105	! ------------
	106	!> Two-dimensional Fourier Transformation in x- and y-direction.
	107	!------------------------------------------------------------------------------!
[1306]	108	SUBROUTINE poisfft( ar )
[1]	109
[1320]	110	USE control_parameters, &
[3241]	111	ONLY: transpose_compute_overlap
[1320]	112
	113	USE cpulog, &
	114	ONLY: cpu_log, cpu_log_nowait, log_point_s
	115
	116	USE kinds
	117
[1]	118	USE pegrid
	119
	120	IMPLICIT NONE
	121
[1682]	122	INTEGER(iwp) :: ii !<
	123	INTEGER(iwp) :: iind !<
	124	INTEGER(iwp) :: inew !<
	125	INTEGER(iwp) :: jj !<
	126	INTEGER(iwp) :: jind !<
	127	INTEGER(iwp) :: jnew !<
	128	INTEGER(iwp) :: ki !<
	129	INTEGER(iwp) :: kk !<
	130	INTEGER(iwp) :: knew !<
	131	INTEGER(iwp) :: n !<
	132	INTEGER(iwp) :: nblk !<
	133	INTEGER(iwp) :: nnx_y !<
	134	INTEGER(iwp) :: nny_z !<
	135	INTEGER(iwp) :: nnz_x !<
	136	INTEGER(iwp) :: nxl_y_bound !<
	137	INTEGER(iwp) :: nxr_y_bound !<
[1]	138
[1682]	139	INTEGER(iwp), DIMENSION(4) :: isave !<
[1320]	140
[1682]	141	REAL(wp), DIMENSION(1:nz,nys:nyn,nxl:nxr) :: ar !<
	142	REAL(wp), DIMENSION(nys:nyn,nxl:nxr,1:nz) :: ar_inv !<
[3690]	143
	144	#define __acc_fft_device ( defined( _OPENACC ) && ( defined ( __cuda_fft ) ) )
	145	#if __acc_fft_device
[3634]	146	!$ACC DECLARE CREATE(ar_inv)
[3690]	147	#endif
[1]	148
[1682]	149	REAL(wp), DIMENSION(:,:,:), ALLOCATABLE :: ar1 !<
	150	REAL(wp), DIMENSION(:,:,:), ALLOCATABLE :: f_in !<
	151	REAL(wp), DIMENSION(:,:,:), ALLOCATABLE :: f_inv !<
	152	REAL(wp), DIMENSION(:,:,:), ALLOCATABLE :: f_out_y !<
	153	REAL(wp), DIMENSION(:,:,:), ALLOCATABLE :: f_out_z !<
[1216]	154
	155
[1]	156	CALL cpu_log( log_point_s(3), 'poisfft', 'start' )
	157
[1111]	158	IF ( .NOT. poisfft_initialized ) CALL poisfft_init
	159
[3690]	160	#if !__acc_fft_device
	161	!$ACC UPDATE HOST(ar)
	162	#endif
	163
[3634]	164	#ifndef _OPENACC
[1]	165	!
	166	!-- Two-dimensional Fourier Transformation in x- and y-direction.
[2118]	167	IF ( pdims(2) == 1 .AND. pdims(1) > 1 ) THEN
[1]	168
	169	!
	170	!-- 1d-domain-decomposition along x:
	171	!-- FFT along y and transposition y --> x
[1216]	172	CALL ffty_tr_yx( ar, ar )
[1]	173
	174	!
	175	!-- FFT along x, solving the tridiagonal system and backward FFT
	176	CALL fftx_tri_fftx( ar )
	177
	178	!
	179	!-- Transposition x --> y and backward FFT along y
[1216]	180	CALL tr_xy_ffty( ar, ar )
[1]	181
[2118]	182	ELSEIF ( pdims(1) == 1 .AND. pdims(2) > 1 ) THEN
[1]	183
	184	!
	185	!-- 1d-domain-decomposition along y:
	186	!-- FFT along x and transposition x --> y
[1216]	187	CALL fftx_tr_xy( ar, ar )
[1]	188
	189	!
	190	!-- FFT along y, solving the tridiagonal system and backward FFT
	191	CALL ffty_tri_ffty( ar )
	192
	193	!
	194	!-- Transposition y --> x and backward FFT along x
[1216]	195	CALL tr_yx_fftx( ar, ar )
[1]	196
[1216]	197	ELSEIF ( .NOT. transpose_compute_overlap ) THEN
[3634]	198	#endif
[1]	199
	200	!
[1111]	201	!-- 2d-domain-decomposition or no decomposition (1 PE run)
[1]	202	!-- Transposition z --> x
	203	CALL cpu_log( log_point_s(5), 'transpo forward', 'start' )
[1216]	204	CALL resort_for_zx( ar, ar_inv )
	205	CALL transpose_zx( ar_inv, ar )
[1]	206	CALL cpu_log( log_point_s(5), 'transpo forward', 'pause' )
	207
	208	CALL cpu_log( log_point_s(4), 'fft_x', 'start' )
[1106]	209	CALL fft_x( ar, 'forward' )
[1]	210	CALL cpu_log( log_point_s(4), 'fft_x', 'pause' )
	211
	212	!
	213	!-- Transposition x --> y
	214	CALL cpu_log( log_point_s(5), 'transpo forward', 'continue' )
[1216]	215	CALL resort_for_xy( ar, ar_inv )
	216	CALL transpose_xy( ar_inv, ar )
[1]	217	CALL cpu_log( log_point_s(5), 'transpo forward', 'pause' )
	218
	219	CALL cpu_log( log_point_s(7), 'fft_y', 'start' )
[1216]	220	CALL fft_y( ar, 'forward', ar_tr = ar, &
	221	nxl_y_bound = nxl_y, nxr_y_bound = nxr_y, &
	222	nxl_y_l = nxl_y, nxr_y_l = nxr_y )
[1]	223	CALL cpu_log( log_point_s(7), 'fft_y', 'pause' )
	224
	225	!
	226	!-- Transposition y --> z
	227	CALL cpu_log( log_point_s(5), 'transpo forward', 'continue' )
[1216]	228	CALL resort_for_yz( ar, ar_inv )
	229	CALL transpose_yz( ar_inv, ar )
[1]	230	CALL cpu_log( log_point_s(5), 'transpo forward', 'stop' )
	231
	232	!
[1106]	233	!-- Solve the tridiagonal equation system along z
[1]	234	CALL cpu_log( log_point_s(6), 'tridia', 'start' )
[1212]	235	CALL tridia_substi( ar )
[1]	236	CALL cpu_log( log_point_s(6), 'tridia', 'stop' )
	237
	238	!
	239	!-- Inverse Fourier Transformation
	240	!-- Transposition z --> y
	241	CALL cpu_log( log_point_s(8), 'transpo invers', 'start' )
[1216]	242	CALL transpose_zy( ar, ar_inv )
	243	CALL resort_for_zy( ar_inv, ar )
[1]	244	CALL cpu_log( log_point_s(8), 'transpo invers', 'pause' )
	245
	246	CALL cpu_log( log_point_s(7), 'fft_y', 'continue' )
[1216]	247	CALL fft_y( ar, 'backward', ar_tr = ar, &
	248	nxl_y_bound = nxl_y, nxr_y_bound = nxr_y, &
	249	nxl_y_l = nxl_y, nxr_y_l = nxr_y )
[1]	250	CALL cpu_log( log_point_s(7), 'fft_y', 'stop' )
	251
	252	!
	253	!-- Transposition y --> x
	254	CALL cpu_log( log_point_s(8), 'transpo invers', 'continue' )
[1216]	255	CALL transpose_yx( ar, ar_inv )
	256	CALL resort_for_yx( ar_inv, ar )
[1]	257	CALL cpu_log( log_point_s(8), 'transpo invers', 'pause' )
	258
	259	CALL cpu_log( log_point_s(4), 'fft_x', 'continue' )
[1106]	260	CALL fft_x( ar, 'backward' )
[1]	261	CALL cpu_log( log_point_s(4), 'fft_x', 'stop' )
	262
	263	!
	264	!-- Transposition x --> z
	265	CALL cpu_log( log_point_s(8), 'transpo invers', 'continue' )
[1216]	266	CALL transpose_xz( ar, ar_inv )
	267	CALL resort_for_xz( ar_inv, ar )
[1]	268	CALL cpu_log( log_point_s(8), 'transpo invers', 'stop' )
	269
[3634]	270	#ifndef _OPENACC
[1216]	271	ELSE
	272
	273	!
	274	!-- 2d-domain-decomposition or no decomposition (1 PE run) with
	275	!-- overlapping transposition / fft
[1318]	276	!-- cputime logging must not use barriers, which would prevent overlapping
[1216]	277	ALLOCATE( f_out_y(0:ny,nxl_y:nxr_y,nzb_y:nzt_y), &
	278	f_out_z(0:nx,nys_x:nyn_x,nzb_x:nzt_x) )
	279	!
	280	!-- Transposition z --> x + subsequent fft along x
	281	ALLOCATE( f_inv(nys:nyn,nxl:nxr,1:nz) )
	282	CALL resort_for_zx( ar, f_inv )
	283	!
	284	!-- Save original indices and gridpoint counter
	285	isave(1) = nz
	286	isave(2) = nzb_x
	287	isave(3) = nzt_x
	288	isave(4) = sendrecvcount_zx
	289	!
	290	!-- Set new indices for transformation
	291	nblk = nz / pdims(1)
	292	nz = pdims(1)
	293	nnz_x = 1
	294	nzb_x = 1 + myidx * nnz_x
	295	nzt_x = ( myidx + 1 ) * nnz_x
	296	sendrecvcount_zx = nnx * nny * nnz_x
	297
[1306]	298	ALLOCATE( ar1(0:nx,nys_x:nyn_x,nzb_x:nzt_x) )
[1216]	299	ALLOCATE( f_in(nys:nyn,nxl:nxr,1:nz) )
	300
[1306]	301	DO kk = 1, nblk
[1216]	302
[1306]	303	IF ( kk == 1 ) THEN
[1318]	304	CALL cpu_log( log_point_s(5), 'transpo forward', 'start', cpu_log_nowait )
[1306]	305	ELSE
[1318]	306	CALL cpu_log( log_point_s(5), 'transpo forward', 'continue', cpu_log_nowait )
[1306]	307	ENDIF
[1216]	308
[1306]	309	DO knew = 1, nz
	310	ki = kk + nblk * ( knew - 1 )
	311	f_in(:,:,knew) = f_inv(:,:,ki)
	312	ENDDO
[1216]	313
[1306]	314	CALL transpose_zx( f_in, ar1(:,:,:))
	315	CALL cpu_log( log_point_s(5), 'transpo forward', 'pause' )
[1216]	316
[1306]	317	IF ( kk == 1 ) THEN
[1318]	318	CALL cpu_log( log_point_s(4), 'fft_x', 'start', cpu_log_nowait )
[1306]	319	ELSE
[1318]	320	CALL cpu_log( log_point_s(4), 'fft_x', 'continue', cpu_log_nowait )
[1216]	321	ENDIF
	322
[1306]	323	n = isave(2) + kk - 1
	324	CALL fft_x( ar1(:,:,:), 'forward', ar_2d = f_out_z(:,:,n))
	325	CALL cpu_log( log_point_s(4), 'fft_x', 'pause' )
[1216]	326
	327	ENDDO
	328	!
	329	!-- Restore original indices/counters
	330	nz = isave(1)
	331	nzb_x = isave(2)
	332	nzt_x = isave(3)
	333	sendrecvcount_zx = isave(4)
	334
	335	DEALLOCATE( ar1, f_in, f_inv )
	336
	337	!
	338	!-- Transposition x --> y + subsequent fft along y
	339	ALLOCATE( f_inv(nys_x:nyn_x,nzb_x:nzt_x,0:nx) )
	340	CALL resort_for_xy( f_out_z, f_inv )
	341	!
	342	!-- Save original indices and gridpoint counter
	343	isave(1) = nx
	344	isave(2) = nxl_y
	345	isave(3) = nxr_y
	346	isave(4) = sendrecvcount_xy
	347	!
	348	!-- Set new indices for transformation
	349	nblk = ( ( nx+1 ) / pdims(2) ) - 1
	350	nx = pdims(2)
	351	nnx_y = 1
	352	nxl_y = myidy * nnx_y
	353	nxr_y = ( myidy + 1 ) * nnx_y - 1
	354	sendrecvcount_xy = nnx_y * ( nyn_x-nys_x+1 ) * ( nzt_x-nzb_x+1 )
	355
[1306]	356	ALLOCATE( ar1(0:ny,nxl_y:nxr_y,nzb_y:nzt_y) )
[1216]	357	ALLOCATE( f_in(nys_x:nyn_x,nzb_x:nzt_x,0:nx) )
	358
[1306]	359	DO ii = 0, nblk
[1216]	360
[1318]	361	CALL cpu_log( log_point_s(5), 'transpo forward', 'continue', cpu_log_nowait )
[1216]	362
[1306]	363	DO inew = 0, nx-1
	364	iind = ii + ( nblk + 1 ) * inew
	365	f_in(:,:,inew) = f_inv(:,:,iind)
	366	ENDDO
[1216]	367
[1306]	368	CALL transpose_xy( f_in, ar1(:,:,:) )
[1216]	369
[1306]	370	CALL cpu_log( log_point_s(5), 'transpo forward', 'pause' )
[1216]	371
[1306]	372	IF ( ii == 1 ) THEN
[1318]	373	CALL cpu_log( log_point_s(7), 'fft_y', 'start', cpu_log_nowait )
[1306]	374	ELSE
[1318]	375	CALL cpu_log( log_point_s(7), 'fft_y', 'continue', cpu_log_nowait )
[1216]	376	ENDIF
	377
[1306]	378	nxl_y_bound = isave(2)
	379	nxr_y_bound = isave(3)
	380	n = isave(2) + ii
	381	CALL fft_y( ar1(:,:,:), 'forward', ar_tr = f_out_y, &
	382	nxl_y_bound = nxl_y_bound, nxr_y_bound = nxr_y_bound, &
	383	nxl_y_l = n, nxr_y_l = n )
[1216]	384
[1306]	385	CALL cpu_log( log_point_s(7), 'fft_y', 'pause' )
[1216]	386
	387	ENDDO
	388	!
	389	!-- Restore original indices/counters
	390	nx = isave(1)
	391	nxl_y = isave(2)
	392	nxr_y = isave(3)
	393	sendrecvcount_xy = isave(4)
	394
	395	DEALLOCATE( ar1, f_in, f_inv )
	396
	397	!
	398	!-- Transposition y --> z + subsequent tridia + resort for z --> y
	399	ALLOCATE( f_inv(nxl_y:nxr_y,nzb_y:nzt_y,0:ny) )
	400	CALL resort_for_yz( f_out_y, f_inv )
	401	!
	402	!-- Save original indices and gridpoint counter
	403	isave(1) = ny
	404	isave(2) = nys_z
	405	isave(3) = nyn_z
	406	isave(4) = sendrecvcount_yz
	407	!
	408	!-- Set new indices for transformation
	409	nblk = ( ( ny+1 ) / pdims(1) ) - 1
	410	ny = pdims(1)
	411	nny_z = 1
	412	nys_z = myidx * nny_z
	413	nyn_z = ( myidx + 1 ) * nny_z - 1
	414	sendrecvcount_yz = ( nxr_y-nxl_y+1 ) * nny_z * ( nzt_y-nzb_y+1 )
	415
[1306]	416	ALLOCATE( ar1(nxl_z:nxr_z,nys_z:nyn_z,1:nz) )
[1216]	417	ALLOCATE( f_in(nxl_y:nxr_y,nzb_y:nzt_y,0:ny) )
	418
[1306]	419	DO jj = 0, nblk
[1216]	420	!
[1306]	421	!-- Forward Fourier Transformation
	422	!-- Transposition y --> z
[1318]	423	CALL cpu_log( log_point_s(5), 'transpo forward', 'continue', cpu_log_nowait )
[1216]	424
[1306]	425	DO jnew = 0, ny-1
	426	jind = jj + ( nblk + 1 ) * jnew
	427	f_in(:,:,jnew) = f_inv(:,:,jind)
	428	ENDDO
[1216]	429
[1306]	430	CALL transpose_yz( f_in, ar1(:,:,:) )
[1216]	431
[1306]	432	IF ( jj == nblk ) THEN
	433	CALL cpu_log( log_point_s(5), 'transpo forward', 'stop' )
	434	ELSE
	435	CALL cpu_log( log_point_s(5), 'transpo forward', 'pause' )
[1216]	436	ENDIF
	437
	438	!
[1306]	439	!-- Solve the tridiagonal equation system along z
[1318]	440	CALL cpu_log( log_point_s(6), 'tridia', 'start', cpu_log_nowait )
[1216]	441
[1306]	442	n = isave(2) + jj
	443	CALL tridia_substi_overlap( ar1(:,:,:), n )
[1216]	444
[1306]	445	CALL cpu_log( log_point_s(6), 'tridia', 'stop' )
[1216]	446
[1306]	447	!
	448	!-- Inverse Fourier Transformation
	449	!-- Transposition z --> y
	450	!-- Only one thread should call MPI routines, therefore forward and
	451	!-- backward tranpose are in the same section
	452	IF ( jj == 0 ) THEN
[1318]	453	CALL cpu_log( log_point_s(8), 'transpo invers', 'start', cpu_log_nowait )
[1306]	454	ELSE
[1318]	455	CALL cpu_log( log_point_s(8), 'transpo invers', 'continue', cpu_log_nowait )
[1216]	456	ENDIF
	457
[1306]	458	CALL transpose_zy( ar1(:,:,:), f_in )
[1216]	459
[1306]	460	DO jnew = 0, ny-1
	461	jind = jj + ( nblk + 1 ) * jnew
	462	f_inv(:,:,jind) = f_in(:,:,jnew)
	463	ENDDO
[1216]	464
[1306]	465	CALL cpu_log( log_point_s(8), 'transpo invers', 'pause' )
[1216]	466
	467	ENDDO
	468	!
	469	!-- Restore original indices/counters
	470	ny = isave(1)
	471	nys_z = isave(2)
	472	nyn_z = isave(3)
	473	sendrecvcount_yz = isave(4)
	474
	475	CALL resort_for_zy( f_inv, f_out_y )
	476
	477	DEALLOCATE( ar1, f_in, f_inv )
	478
	479	!
	480	!-- fft along y backward + subsequent transposition y --> x
	481	ALLOCATE( f_inv(nys_x:nyn_x,nzb_x:nzt_x,0:nx) )
	482	!
	483	!-- Save original indices and gridpoint counter
	484	isave(1) = nx
	485	isave(2) = nxl_y
	486	isave(3) = nxr_y
	487	isave(4) = sendrecvcount_xy
	488	!
	489	!-- Set new indices for transformation
	490	nblk = (( nx+1 ) / pdims(2) ) - 1
	491	nx = pdims(2)
	492	nnx_y = 1
	493	nxl_y = myidy * nnx_y
	494	nxr_y = ( myidy + 1 ) * nnx_y - 1
	495	sendrecvcount_xy = nnx_y * ( nyn_x-nys_x+1 ) * ( nzt_x-nzb_x+1 )
	496
[1306]	497	ALLOCATE( ar1(0:ny,nxl_y:nxr_y,nzb_y:nzt_y) )
[1216]	498	ALLOCATE( f_in(nys_x:nyn_x,nzb_x:nzt_x,0:nx) )
	499
[1306]	500	DO ii = 0, nblk
[1216]	501
[1318]	502	CALL cpu_log( log_point_s(7), 'fft_y', 'continue', cpu_log_nowait )
[1216]	503
[1306]	504	n = isave(2) + ii
	505	nxl_y_bound = isave(2)
	506	nxr_y_bound = isave(3)
[1216]	507
[1306]	508	CALL fft_y( ar1(:,:,:), 'backward', ar_tr = f_out_y, &
	509	nxl_y_bound = nxl_y_bound, nxr_y_bound = nxr_y_bound, &
	510	nxl_y_l = n, nxr_y_l = n )
[1216]	511
[1306]	512	IF ( ii == nblk ) THEN
	513	CALL cpu_log( log_point_s(7), 'fft_y', 'stop' )
	514	ELSE
	515	CALL cpu_log( log_point_s(7), 'fft_y', 'pause' )
[1216]	516	ENDIF
	517
[1318]	518	CALL cpu_log( log_point_s(8), 'transpo invers', 'continue', cpu_log_nowait )
[1216]	519
[1306]	520	CALL transpose_yx( ar1(:,:,:), f_in )
[1216]	521
[1306]	522	DO inew = 0, nx-1
	523	iind = ii + (nblk+1) * inew
	524	f_inv(:,:,iind) = f_in(:,:,inew)
	525	ENDDO
[1216]	526
[1306]	527	CALL cpu_log( log_point_s(8), 'transpo invers', 'pause' )
[1216]	528
	529	ENDDO
	530	!
	531	!-- Restore original indices/counters
	532	nx = isave(1)
	533	nxl_y = isave(2)
	534	nxr_y = isave(3)
	535	sendrecvcount_xy = isave(4)
	536
	537	CALL resort_for_yx( f_inv, f_out_z )
	538
	539	DEALLOCATE( ar1, f_in, f_inv )
	540
	541	!
	542	!-- fft along x backward + subsequent final transposition x --> z
	543	ALLOCATE( f_inv(nys:nyn,nxl:nxr,1:nz) )
	544	!
	545	!-- Save original indices and gridpoint counter
	546	isave(1) = nz
	547	isave(2) = nzb_x
	548	isave(3) = nzt_x
	549	isave(4) = sendrecvcount_zx
	550	!
	551	!-- Set new indices for transformation
	552	nblk = nz / pdims(1)
	553	nz = pdims(1)
	554	nnz_x = 1
	555	nzb_x = 1 + myidx * nnz_x
	556	nzt_x = ( myidx + 1 ) * nnz_x
	557	sendrecvcount_zx = nnx * nny * nnz_x
	558
[1306]	559	ALLOCATE( ar1(0:nx,nys_x:nyn_x,nzb_x:nzt_x) )
[1216]	560	ALLOCATE( f_in(nys:nyn,nxl:nxr,1:nz) )
	561
[1306]	562	DO kk = 1, nblk
[1216]	563
[1318]	564	CALL cpu_log( log_point_s(4), 'fft_x', 'continue', cpu_log_nowait )
[1216]	565
[1306]	566	n = isave(2) + kk - 1
	567	CALL fft_x( ar1(:,:,:), 'backward', f_out_z(:,:,n))
[1216]	568
[1306]	569	IF ( kk == nblk ) THEN
	570	CALL cpu_log( log_point_s(4), 'fft_x', 'stop' )
	571	ELSE
	572	CALL cpu_log( log_point_s(4), 'fft_x', 'pause' )
[1216]	573	ENDIF
	574
[1318]	575	CALL cpu_log( log_point_s(8), 'transpo invers', 'continue', cpu_log_nowait )
[1216]	576
[1306]	577	CALL transpose_xz( ar1(:,:,:), f_in )
[1216]	578
[1306]	579	DO knew = 1, nz
	580	ki = kk + nblk * (knew-1)
	581	f_inv(:,:,ki) = f_in(:,:,knew)
	582	ENDDO
[1216]	583
[1306]	584	IF ( kk == nblk ) THEN
	585	CALL cpu_log( log_point_s(8), 'transpo invers', 'stop' )
	586	ELSE
	587	CALL cpu_log( log_point_s(8), 'transpo invers', 'pause' )
[1216]	588	ENDIF
	589
	590	ENDDO
	591	!
	592	!-- Restore original indices/counters
	593	nz = isave(1)
	594	nzb_x = isave(2)
	595	nzt_x = isave(3)
	596	sendrecvcount_zx = isave(4)
	597
	598	CALL resort_for_xz( f_inv, ar )
	599
	600	DEALLOCATE( ar1, f_in, f_inv )
	601
[1]	602	ENDIF
[3634]	603	#endif
[1]	604
[3690]	605	#if !__acc_fft_device
	606	!$ACC UPDATE DEVICE(ar)
	607	#endif
	608
[1]	609	CALL cpu_log( log_point_s(3), 'poisfft', 'stop' )
	610
	611	END SUBROUTINE poisfft
	612
	613
	614	!------------------------------------------------------------------------------!
[1682]	615	! Description:
	616	! ------------
	617	!> Fourier-transformation along y with subsequent transposition y --> x for
	618	!> a 1d-decomposition along x.
	619	!>
	620	!> @attention The performance of this routine is much faster on the NEC-SX6,
	621	!> if the first index of work_ffty_vec is odd. Otherwise
	622	!> memory bank conflicts may occur (especially if the index is a
	623	!> multiple of 128). That's why work_ffty_vec is dimensioned as
	624	!> 0:ny+1.
	625	!> Of course, this will not work if users are using an odd number
	626	!> of gridpoints along y.
[1]	627	!------------------------------------------------------------------------------!
[1682]	628	SUBROUTINE ffty_tr_yx( f_in, f_out )
[1]	629
[1320]	630	USE control_parameters, &
[2300]	631	ONLY: loop_optimization
[1320]	632
	633	USE cpulog, &
	634	ONLY: cpu_log, log_point_s
	635
	636	USE kinds
	637
[1]	638	USE pegrid
	639
	640	IMPLICIT NONE
	641
[1682]	642	INTEGER(iwp) :: i !<
	643	INTEGER(iwp) :: iend !<
	644	INTEGER(iwp) :: iouter !<
	645	INTEGER(iwp) :: ir !<
	646	INTEGER(iwp) :: j !<
	647	INTEGER(iwp) :: k !<
[1]	648
[1682]	649	INTEGER(iwp), PARAMETER :: stridex = 4 !<
[1320]	650
[1682]	651	REAL(wp), DIMENSION(1:nz,0:ny,nxl:nxr) :: f_in !<
	652	REAL(wp), DIMENSION(nnx,1:nz,nys_x:nyn_x,pdims(1)) :: f_out !<
	653	REAL(wp), DIMENSION(nxl:nxr,1:nz,0:ny) :: work !<
[1]	654
[2300]	655	REAL(wp), DIMENSION(:,:), ALLOCATABLE :: work_ffty !<
	656	REAL(wp), DIMENSION(:,:,:), ALLOCATABLE :: work_ffty_vec !<
	657
[1]	658	!
	659	!-- Carry out the FFT along y, where all data are present due to the
	660	!-- 1d-decomposition along x. Resort the data in a way that x becomes
	661	!-- the first index.
[1106]	662	CALL cpu_log( log_point_s(7), 'fft_y_1d', 'start' )
[1]	663
[2300]	664	IF ( loop_optimization == 'vector' ) THEN
	665
	666	ALLOCATE( work_ffty_vec(0:ny+1,1:nz,nxl:nxr) )
[1]	667	!
	668	!-- Code optimized for vector processors
[2300]	669	!$OMP PARALLEL PRIVATE ( i, j, k )
	670	!$OMP DO
[1]	671	DO i = nxl, nxr
	672
	673	DO j = 0, ny
	674	DO k = 1, nz
	675	work_ffty_vec(j,k,i) = f_in(k,j,i)
	676	ENDDO
	677	ENDDO
	678
	679	CALL fft_y_m( work_ffty_vec(:,:,i), ny+1, 'forward' )
	680
	681	ENDDO
	682
[2300]	683	!$OMP DO
[1]	684	DO k = 1, nz
	685	DO j = 0, ny
	686	DO i = nxl, nxr
	687	work(i,k,j) = work_ffty_vec(j,k,i)
	688	ENDDO
	689	ENDDO
	690	ENDDO
[2300]	691	!$OMP END PARALLEL
[1]	692
[2300]	693	DEALLOCATE( work_ffty_vec )
	694
[1]	695	ELSE
	696	!
	697	!-- Cache optimized code.
[2300]	698	ALLOCATE( work_ffty(0:ny,stridex) )
	699	!
[1]	700	!-- The i-(x-)direction is split into a strided outer loop and an inner
	701	!-- loop for better cache performance
[2300]	702	!$OMP PARALLEL PRIVATE (i,iend,iouter,ir,j,k,work_ffty)
	703	!$OMP DO
[1]	704	DO iouter = nxl, nxr, stridex
	705
	706	iend = MIN( iouter+stridex-1, nxr ) ! Upper bound for inner i loop
	707
	708	DO k = 1, nz
	709
	710	DO i = iouter, iend
	711
	712	ir = i-iouter+1 ! counter within a stride
	713	DO j = 0, ny
	714	work_ffty(j,ir) = f_in(k,j,i)
	715	ENDDO
	716	!
	717	!-- FFT along y
[1106]	718	CALL fft_y_1d( work_ffty(:,ir), 'forward' )
[1]	719
	720	ENDDO
	721
	722	!
	723	!-- Resort
	724	DO j = 0, ny
	725	DO i = iouter, iend
	726	work(i,k,j) = work_ffty(j,i-iouter+1)
	727	ENDDO
	728	ENDDO
	729
	730	ENDDO
	731
	732	ENDDO
[2300]	733	!$OMP END PARALLEL
[1]	734
[2300]	735	DEALLOCATE( work_ffty )
	736
[1]	737	ENDIF
[2300]	738
[1106]	739	CALL cpu_log( log_point_s(7), 'fft_y_1d', 'pause' )
[1]	740
	741	!
	742	!-- Transpose array
[1111]	743	#if defined( __parallel )
[1]	744	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
[622]	745	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
[1]	746	CALL MPI_ALLTOALL( work(nxl,1,0), sendrecvcount_xy, MPI_REAL, &
	747	f_out(1,1,nys_x,1), sendrecvcount_xy, MPI_REAL, &
	748	comm1dx, ierr )
	749	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
[1111]	750	#endif
[1]	751
	752	END SUBROUTINE ffty_tr_yx
	753
	754
	755	!------------------------------------------------------------------------------!
[1682]	756	! Description:
	757	! ------------
	758	!> Transposition x --> y with a subsequent backward Fourier transformation for
	759	!> a 1d-decomposition along x
[1]	760	!------------------------------------------------------------------------------!
[1682]	761	SUBROUTINE tr_xy_ffty( f_in, f_out )
[1]	762
[1320]	763	USE control_parameters, &
[2300]	764	ONLY: loop_optimization
[1320]	765
	766	USE cpulog, &
	767	ONLY: cpu_log, log_point_s
	768
	769	USE kinds
	770
[1]	771	USE pegrid
	772
	773	IMPLICIT NONE
	774
[1682]	775	INTEGER(iwp) :: i !<
	776	INTEGER(iwp) :: iend !<
	777	INTEGER(iwp) :: iouter !<
	778	INTEGER(iwp) :: ir !<
	779	INTEGER(iwp) :: j !<
	780	INTEGER(iwp) :: k !<
[1]	781
[1682]	782	INTEGER(iwp), PARAMETER :: stridex = 4 !<
[1320]	783
[1682]	784	REAL(wp), DIMENSION(nnx,1:nz,nys_x:nyn_x,pdims(1)) :: f_in !<
	785	REAL(wp), DIMENSION(1:nz,0:ny,nxl:nxr) :: f_out !<
	786	REAL(wp), DIMENSION(nxl:nxr,1:nz,0:ny) :: work !<
[1]	787
[2300]	788	REAL(wp), DIMENSION(:,:), ALLOCATABLE :: work_ffty !<
	789	REAL(wp), DIMENSION(:,:,:), ALLOCATABLE :: work_ffty_vec !<
	790
[1]	791	!
	792	!-- Transpose array
[1111]	793	#if defined( __parallel )
[1]	794	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
[622]	795	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
[1]	796	CALL MPI_ALLTOALL( f_in(1,1,nys_x,1), sendrecvcount_xy, MPI_REAL, &
	797	work(nxl,1,0), sendrecvcount_xy, MPI_REAL, &
	798	comm1dx, ierr )
	799	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
[1111]	800	#endif
[1]	801
	802	!
	803	!-- Resort the data in a way that y becomes the first index and carry out the
	804	!-- backward fft along y.
[1106]	805	CALL cpu_log( log_point_s(7), 'fft_y_1d', 'continue' )
[1]	806
[2300]	807	IF ( loop_optimization == 'vector' ) THEN
	808
	809	ALLOCATE( work_ffty_vec(0:ny+1,1:nz,nxl:nxr) )
[1]	810	!
	811	!-- Code optimized for vector processors
[2300]	812	!$OMP PARALLEL PRIVATE ( i, j, k )
	813	!$OMP DO
[1]	814	DO k = 1, nz
	815	DO j = 0, ny
	816	DO i = nxl, nxr
	817	work_ffty_vec(j,k,i) = work(i,k,j)
	818	ENDDO
	819	ENDDO
	820	ENDDO
	821
[2300]	822	!$OMP DO
[1]	823	DO i = nxl, nxr
	824
	825	CALL fft_y_m( work_ffty_vec(:,:,i), ny+1, 'backward' )
	826
	827	DO j = 0, ny
	828	DO k = 1, nz
	829	f_out(k,j,i) = work_ffty_vec(j,k,i)
	830	ENDDO
	831	ENDDO
	832
	833	ENDDO
[2300]	834	!$OMP END PARALLEL
[1]	835
[2300]	836	DEALLOCATE( work_ffty_vec )
	837
[1]	838	ELSE
	839	!
	840	!-- Cache optimized code.
[2300]	841	ALLOCATE( work_ffty(0:ny,stridex) )
	842	!
[1]	843	!-- The i-(x-)direction is split into a strided outer loop and an inner
	844	!-- loop for better cache performance
[2300]	845	!$OMP PARALLEL PRIVATE ( i, iend, iouter, ir, j, k, work_ffty )
	846	!$OMP DO
[1]	847	DO iouter = nxl, nxr, stridex
	848
	849	iend = MIN( iouter+stridex-1, nxr ) ! Upper bound for inner i loop
	850
	851	DO k = 1, nz
	852	!
	853	!-- Resort
	854	DO j = 0, ny
	855	DO i = iouter, iend
	856	work_ffty(j,i-iouter+1) = work(i,k,j)
	857	ENDDO
	858	ENDDO
	859
	860	DO i = iouter, iend
	861
	862	!
	863	!-- FFT along y
	864	ir = i-iouter+1 ! counter within a stride
[1106]	865	CALL fft_y_1d( work_ffty(:,ir), 'backward' )
[1]	866
	867	DO j = 0, ny
	868	f_out(k,j,i) = work_ffty(j,ir)
	869	ENDDO
	870	ENDDO
	871
	872	ENDDO
	873
	874	ENDDO
[2300]	875	!$OMP END PARALLEL
[1]	876
[2300]	877	DEALLOCATE( work_ffty )
	878
[1]	879	ENDIF
	880
[1106]	881	CALL cpu_log( log_point_s(7), 'fft_y_1d', 'stop' )
[1]	882
	883	END SUBROUTINE tr_xy_ffty
	884
	885
	886	!------------------------------------------------------------------------------!
[1682]	887	! Description:
	888	! ------------
	889	!> FFT along x, solution of the tridiagonal system and backward FFT for
	890	!> a 1d-decomposition along x
	891	!>
	892	!> @warning this subroutine may still not work for hybrid parallelization
	893	!> with OpenMP (for possible necessary changes see the original
	894	!> routine poisfft_hybrid, developed by Klaus Ketelsen, May 2002)
[1]	895	!------------------------------------------------------------------------------!
[1682]	896	SUBROUTINE fftx_tri_fftx( ar )
[1]	897
[1320]	898	USE control_parameters, &
[2300]	899	ONLY: loop_optimization
[1320]	900
	901	USE cpulog, &
	902	ONLY: cpu_log, log_point_s
	903
	904	USE grid_variables, &
	905	ONLY: ddx2, ddy2
	906
	907	USE kinds
	908
[1]	909	USE pegrid
	910
	911	IMPLICIT NONE
	912
[1682]	913	INTEGER(iwp) :: i !<
	914	INTEGER(iwp) :: j !<
	915	INTEGER(iwp) :: k !<
	916	INTEGER(iwp) :: m !<
	917	INTEGER(iwp) :: n !<
[3241]	918	!$ INTEGER(iwp) :: omp_get_thread_num !<
[1682]	919	INTEGER(iwp) :: tn !<
[1]	920
[1682]	921	REAL(wp), DIMENSION(0:nx) :: work_fftx !<
	922	REAL(wp), DIMENSION(0:nx,1:nz) :: work_trix !<
	923	REAL(wp), DIMENSION(nnx,1:nz,nys_x:nyn_x,pdims(1)) :: ar !<
	924	REAL(wp), DIMENSION(:,:,:,:), ALLOCATABLE :: tri !<
[1]	925
	926
[1106]	927	CALL cpu_log( log_point_s(33), 'fft_x_1d + tridia', 'start' )
[1]	928
	929	ALLOCATE( tri(5,0:nx,0:nz-1,0:threads_per_task-1) )
	930
	931	tn = 0 ! Default thread number in case of one thread
	932	!$OMP PARALLEL DO PRIVATE ( i, j, k, m, n, tn, work_fftx, work_trix )
	933	DO j = nys_x, nyn_x
	934
	935	!$ tn = omp_get_thread_num()
	936
[2300]	937	IF ( loop_optimization == 'vector' ) THEN
[1]	938	!
	939	!-- Code optimized for vector processors
	940	DO k = 1, nz
	941
	942	m = 0
	943	DO n = 1, pdims(1)
[1003]	944	DO i = 1, nnx
[1]	945	work_trix(m,k) = ar(i,k,j,n)
	946	m = m + 1
	947	ENDDO
	948	ENDDO
	949
	950	ENDDO
	951
	952	CALL fft_x_m( work_trix, 'forward' )
	953
	954	ELSE
	955	!
	956	!-- Cache optimized code
	957	DO k = 1, nz
	958
	959	m = 0
	960	DO n = 1, pdims(1)
[1003]	961	DO i = 1, nnx
[1]	962	work_fftx(m) = ar(i,k,j,n)
	963	m = m + 1
	964	ENDDO
	965	ENDDO
	966
[1106]	967	CALL fft_x_1d( work_fftx, 'forward' )
[1]	968
	969	DO i = 0, nx
	970	work_trix(i,k) = work_fftx(i)
	971	ENDDO
	972
	973	ENDDO
	974
	975	ENDIF
	976
	977	!
	978	!-- Solve the linear equation system
	979	CALL tridia_1dd( ddx2, ddy2, nx, ny, j, work_trix, tri(:,:,:,tn) )
	980
[2300]	981	IF ( loop_optimization == 'vector' ) THEN
[1]	982	!
	983	!-- Code optimized for vector processors
	984	CALL fft_x_m( work_trix, 'backward' )
	985
	986	DO k = 1, nz
	987
	988	m = 0
	989	DO n = 1, pdims(1)
[1003]	990	DO i = 1, nnx
[1]	991	ar(i,k,j,n) = work_trix(m,k)
	992	m = m + 1
	993	ENDDO
	994	ENDDO
	995
	996	ENDDO
	997
	998	ELSE
	999	!
	1000	!-- Cache optimized code
	1001	DO k = 1, nz
	1002
	1003	DO i = 0, nx
	1004	work_fftx(i) = work_trix(i,k)
	1005	ENDDO
	1006
[1106]	1007	CALL fft_x_1d( work_fftx, 'backward' )
[1]	1008
	1009	m = 0
	1010	DO n = 1, pdims(1)
[1003]	1011	DO i = 1, nnx
[1]	1012	ar(i,k,j,n) = work_fftx(m)
	1013	m = m + 1
	1014	ENDDO
	1015	ENDDO
	1016
	1017	ENDDO
	1018
	1019	ENDIF
	1020
	1021	ENDDO
	1022
	1023	DEALLOCATE( tri )
	1024
[1106]	1025	CALL cpu_log( log_point_s(33), 'fft_x_1d + tridia', 'stop' )
[1]	1026
	1027	END SUBROUTINE fftx_tri_fftx
	1028
	1029
	1030	!------------------------------------------------------------------------------!
[1682]	1031	! Description:
	1032	! ------------
	1033	!> Fourier-transformation along x with subsequent transposition x --> y for
	1034	!> a 1d-decomposition along y.
	1035	!>
	1036	!> @attention NEC-branch of this routine may significantly profit from
	1037	!> further optimizations. So far, performance is much worse than
	1038	!> for routine ffty_tr_yx (more than three times slower).
[1]	1039	!------------------------------------------------------------------------------!
[1682]	1040	SUBROUTINE fftx_tr_xy( f_in, f_out )
[1]	1041
[1682]	1042
[1320]	1043	USE control_parameters, &
[2300]	1044	ONLY: loop_optimization
[1320]	1045
	1046	USE cpulog, &
	1047	ONLY: cpu_log, log_point_s
	1048
	1049	USE kinds
	1050
[1]	1051	USE pegrid
	1052
	1053	IMPLICIT NONE
	1054
[1682]	1055	INTEGER(iwp) :: i !<
	1056	INTEGER(iwp) :: j !<
	1057	INTEGER(iwp) :: k !<
[1]	1058
[1682]	1059	REAL(wp), DIMENSION(0:nx,1:nz,nys:nyn) :: work_fftx !<
	1060	REAL(wp), DIMENSION(1:nz,nys:nyn,0:nx) :: f_in !<
	1061	REAL(wp), DIMENSION(nny,1:nz,nxl_y:nxr_y,pdims(2)) :: f_out !<
	1062	REAL(wp), DIMENSION(nys:nyn,1:nz,0:nx) :: work !<
[1]	1063
	1064	!
	1065	!-- Carry out the FFT along x, where all data are present due to the
	1066	!-- 1d-decomposition along y. Resort the data in a way that y becomes
	1067	!-- the first index.
[1106]	1068	CALL cpu_log( log_point_s(4), 'fft_x_1d', 'start' )
[1]	1069
[2300]	1070	IF ( loop_optimization == 'vector' ) THEN
[1]	1071	!
	1072	!-- Code for vector processors
[85]	1073	!$OMP PARALLEL PRIVATE ( i, j, k )
[1]	1074	!$OMP DO
	1075	DO i = 0, nx
	1076
	1077	DO j = nys, nyn
	1078	DO k = 1, nz
	1079	work_fftx(i,k,j) = f_in(k,j,i)
	1080	ENDDO
	1081	ENDDO
	1082
	1083	ENDDO
	1084
	1085	!$OMP DO
	1086	DO j = nys, nyn
	1087
	1088	CALL fft_x_m( work_fftx(:,:,j), 'forward' )
	1089
	1090	DO k = 1, nz
	1091	DO i = 0, nx
	1092	work(j,k,i) = work_fftx(i,k,j)
	1093	ENDDO
	1094	ENDDO
	1095
	1096	ENDDO
	1097	!$OMP END PARALLEL
	1098
	1099	ELSE
	1100
	1101	!
	1102	!-- Cache optimized code (there might be still a potential for better
	1103	!-- optimization).
[696]	1104	!$OMP PARALLEL PRIVATE (i,j,k)
[1]	1105	!$OMP DO
	1106	DO i = 0, nx
	1107
	1108	DO j = nys, nyn
	1109	DO k = 1, nz
	1110	work_fftx(i,k,j) = f_in(k,j,i)
	1111	ENDDO
	1112	ENDDO
	1113
	1114	ENDDO
	1115
	1116	!$OMP DO
	1117	DO j = nys, nyn
	1118	DO k = 1, nz
	1119
[1106]	1120	CALL fft_x_1d( work_fftx(0:nx,k,j), 'forward' )
[1]	1121
	1122	DO i = 0, nx
	1123	work(j,k,i) = work_fftx(i,k,j)
	1124	ENDDO
	1125	ENDDO
	1126
	1127	ENDDO
	1128	!$OMP END PARALLEL
	1129
	1130	ENDIF
[1106]	1131	CALL cpu_log( log_point_s(4), 'fft_x_1d', 'pause' )
[1]	1132
	1133	!
	1134	!-- Transpose array
[1111]	1135	#if defined( __parallel )
[1]	1136	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
[622]	1137	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
[1]	1138	CALL MPI_ALLTOALL( work(nys,1,0), sendrecvcount_xy, MPI_REAL, &
	1139	f_out(1,1,nxl_y,1), sendrecvcount_xy, MPI_REAL, &
	1140	comm1dy, ierr )
	1141	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
[1111]	1142	#endif
[1]	1143
	1144	END SUBROUTINE fftx_tr_xy
	1145
	1146
	1147	!------------------------------------------------------------------------------!
[1682]	1148	! Description:
	1149	! ------------
	1150	!> Transposition y --> x with a subsequent backward Fourier transformation for
	1151	!> a 1d-decomposition along x.
[1]	1152	!------------------------------------------------------------------------------!
[1682]	1153	SUBROUTINE tr_yx_fftx( f_in, f_out )
[1]	1154
[1682]	1155
[1320]	1156	USE control_parameters, &
[2300]	1157	ONLY: loop_optimization
[1320]	1158
	1159	USE cpulog, &
	1160	ONLY: cpu_log, log_point_s
	1161
	1162	USE kinds
	1163
[1]	1164	USE pegrid
	1165
	1166	IMPLICIT NONE
	1167
[1682]	1168	INTEGER(iwp) :: i !<
	1169	INTEGER(iwp) :: j !<
	1170	INTEGER(iwp) :: k !<
[1]	1171
[1682]	1172	REAL(wp), DIMENSION(0:nx,1:nz,nys:nyn) :: work_fftx !<
	1173	REAL(wp), DIMENSION(nny,1:nz,nxl_y:nxr_y,pdims(2)) :: f_in !<
	1174	REAL(wp), DIMENSION(1:nz,nys:nyn,0:nx) :: f_out !<
	1175	REAL(wp), DIMENSION(nys:nyn,1:nz,0:nx) :: work !<
[1]	1176
	1177	!
	1178	!-- Transpose array
[1111]	1179	#if defined( __parallel )
[1]	1180	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
[622]	1181	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
[1]	1182	CALL MPI_ALLTOALL( f_in(1,1,nxl_y,1), sendrecvcount_xy, MPI_REAL, &
	1183	work(nys,1,0), sendrecvcount_xy, MPI_REAL, &
	1184	comm1dy, ierr )
	1185	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
[1111]	1186	#endif
[1]	1187
	1188	!
	1189	!-- Carry out the FFT along x, where all data are present due to the
	1190	!-- 1d-decomposition along y. Resort the data in a way that y becomes
	1191	!-- the first index.
[1106]	1192	CALL cpu_log( log_point_s(4), 'fft_x_1d', 'continue' )
[1]	1193
[2300]	1194	IF ( loop_optimization == 'vector' ) THEN
[1]	1195	!
	1196	!-- Code optimized for vector processors
[85]	1197	!$OMP PARALLEL PRIVATE ( i, j, k )
[1]	1198	!$OMP DO
	1199	DO j = nys, nyn
	1200
	1201	DO k = 1, nz
	1202	DO i = 0, nx
	1203	work_fftx(i,k,j) = work(j,k,i)
	1204	ENDDO
	1205	ENDDO
	1206
	1207	CALL fft_x_m( work_fftx(:,:,j), 'backward' )
	1208
	1209	ENDDO
	1210
	1211	!$OMP DO
	1212	DO i = 0, nx
	1213	DO j = nys, nyn
	1214	DO k = 1, nz
	1215	f_out(k,j,i) = work_fftx(i,k,j)
	1216	ENDDO
	1217	ENDDO
	1218	ENDDO
	1219	!$OMP END PARALLEL
	1220
	1221	ELSE
	1222
	1223	!
	1224	!-- Cache optimized code (there might be still a potential for better
	1225	!-- optimization).
[696]	1226	!$OMP PARALLEL PRIVATE (i,j,k)
[1]	1227	!$OMP DO
	1228	DO j = nys, nyn
	1229	DO k = 1, nz
	1230
	1231	DO i = 0, nx
	1232	work_fftx(i,k,j) = work(j,k,i)
	1233	ENDDO
	1234
[1106]	1235	CALL fft_x_1d( work_fftx(0:nx,k,j), 'backward' )
[1]	1236
	1237	ENDDO
	1238	ENDDO
	1239
	1240	!$OMP DO
	1241	DO i = 0, nx
	1242	DO j = nys, nyn
	1243	DO k = 1, nz
	1244	f_out(k,j,i) = work_fftx(i,k,j)
	1245	ENDDO
	1246	ENDDO
	1247	ENDDO
	1248	!$OMP END PARALLEL
	1249
	1250	ENDIF
[1106]	1251	CALL cpu_log( log_point_s(4), 'fft_x_1d', 'stop' )
[1]	1252
	1253	END SUBROUTINE tr_yx_fftx
	1254
	1255
	1256	!------------------------------------------------------------------------------!
[1682]	1257	! Description:
	1258	! ------------
	1259	!> FFT along y, solution of the tridiagonal system and backward FFT for
	1260	!> a 1d-decomposition along y.
	1261	!>
	1262	!> @warning this subroutine may still not work for hybrid parallelization
	1263	!> with OpenMP (for possible necessary changes see the original
	1264	!> routine poisfft_hybrid, developed by Klaus Ketelsen, May 2002)
[1]	1265	!------------------------------------------------------------------------------!
[1682]	1266	SUBROUTINE ffty_tri_ffty( ar )
[1]	1267
[1682]	1268
[1320]	1269	USE control_parameters, &
[2300]	1270	ONLY: loop_optimization
[1320]	1271
	1272	USE cpulog, &
	1273	ONLY: cpu_log, log_point_s
	1274
	1275	USE grid_variables, &
	1276	ONLY: ddx2, ddy2
	1277
	1278	USE kinds
	1279
[1]	1280	USE pegrid
	1281
	1282	IMPLICIT NONE
	1283
[1682]	1284	INTEGER(iwp) :: i !<
	1285	INTEGER(iwp) :: j !<
	1286	INTEGER(iwp) :: k !<
	1287	INTEGER(iwp) :: m !<
	1288	INTEGER(iwp) :: n !<
[3241]	1289	!$ INTEGER(iwp) :: omp_get_thread_num !<
[1682]	1290	INTEGER(iwp) :: tn !<
[1]	1291
[1682]	1292	REAL(wp), DIMENSION(0:ny) :: work_ffty !<
	1293	REAL(wp), DIMENSION(0:ny,1:nz) :: work_triy !<
	1294	REAL(wp), DIMENSION(nny,1:nz,nxl_y:nxr_y,pdims(2)) :: ar !<
	1295	REAL(wp), DIMENSION(:,:,:,:), ALLOCATABLE :: tri !<
[1]	1296
	1297
[1106]	1298	CALL cpu_log( log_point_s(39), 'fft_y_1d + tridia', 'start' )
[1]	1299
	1300	ALLOCATE( tri(5,0:ny,0:nz-1,0:threads_per_task-1) )
	1301
	1302	tn = 0 ! Default thread number in case of one thread
[696]	1303	!$OMP PARALLEL DO PRIVATE ( i, j, k, m, n, tn, work_ffty, work_triy )
[1]	1304	DO i = nxl_y, nxr_y
	1305
	1306	!$ tn = omp_get_thread_num()
	1307
[2300]	1308	IF ( loop_optimization == 'vector' ) THEN
[1]	1309	!
	1310	!-- Code optimized for vector processors
	1311	DO k = 1, nz
	1312
	1313	m = 0
	1314	DO n = 1, pdims(2)
[1003]	1315	DO j = 1, nny
[1]	1316	work_triy(m,k) = ar(j,k,i,n)
	1317	m = m + 1
	1318	ENDDO
	1319	ENDDO
	1320
	1321	ENDDO
	1322
	1323	CALL fft_y_m( work_triy, ny, 'forward' )
	1324
	1325	ELSE
	1326	!
	1327	!-- Cache optimized code
	1328	DO k = 1, nz
	1329
	1330	m = 0
	1331	DO n = 1, pdims(2)
[1003]	1332	DO j = 1, nny
[1]	1333	work_ffty(m) = ar(j,k,i,n)
	1334	m = m + 1
	1335	ENDDO
	1336	ENDDO
	1337
[1106]	1338	CALL fft_y_1d( work_ffty, 'forward' )
[1]	1339
	1340	DO j = 0, ny
	1341	work_triy(j,k) = work_ffty(j)
	1342	ENDDO
	1343
	1344	ENDDO
	1345
	1346	ENDIF
	1347
	1348	!
	1349	!-- Solve the linear equation system
	1350	CALL tridia_1dd( ddy2, ddx2, ny, nx, i, work_triy, tri(:,:,:,tn) )
	1351
[2300]	1352	IF ( loop_optimization == 'vector' ) THEN
[1]	1353	!
	1354	!-- Code optimized for vector processors
	1355	CALL fft_y_m( work_triy, ny, 'backward' )
	1356
	1357	DO k = 1, nz
	1358
	1359	m = 0
	1360	DO n = 1, pdims(2)
[1003]	1361	DO j = 1, nny
[1]	1362	ar(j,k,i,n) = work_triy(m,k)
	1363	m = m + 1
	1364	ENDDO
	1365	ENDDO
	1366
	1367	ENDDO
	1368
	1369	ELSE
	1370	!
	1371	!-- Cache optimized code
	1372	DO k = 1, nz
	1373
	1374	DO j = 0, ny
	1375	work_ffty(j) = work_triy(j,k)
	1376	ENDDO
	1377
[1106]	1378	CALL fft_y_1d( work_ffty, 'backward' )
[1]	1379
	1380	m = 0
	1381	DO n = 1, pdims(2)
[1003]	1382	DO j = 1, nny
[1]	1383	ar(j,k,i,n) = work_ffty(m)
	1384	m = m + 1
	1385	ENDDO
	1386	ENDDO
	1387
	1388	ENDDO
	1389
	1390	ENDIF
	1391
	1392	ENDDO
	1393
	1394	DEALLOCATE( tri )
	1395
[1106]	1396	CALL cpu_log( log_point_s(39), 'fft_y_1d + tridia', 'stop' )
[1]	1397
	1398	END SUBROUTINE ffty_tri_ffty
	1399
	1400	END MODULE poisfft_mod

Note: See TracBrowser for help on using the repository browser.

Download in other formats:

| Impressum | ©Leibniz Universität Hannover |