Home

Context Navigation

source: palm/trunk/SOURCE/poisfft_mod.f90 @ 4098

Last change on this file since 4098 was 3690, checked in by knoop, 6 years ago
Enabled OpenACC usage without using the cudaFFT library. Added respective palmtest build configuration and testcase.
Property svn:keywords set to `Id`
File size: 45.9 KB

Rev	Line
[1850]	1	!> @file poisfft_mod.f90
[2000]	2	!------------------------------------------------------------------------------!
[2696]	3	! This file is part of the PALM model system.
[1036]	4	!
[2000]	5	! PALM is free software: you can redistribute it and/or modify it under the
	6	! terms of the GNU General Public License as published by the Free Software
	7	! Foundation, either version 3 of the License, or (at your option) any later
	8	! version.
[1036]	9	!
	10	! PALM is distributed in the hope that it will be useful, but WITHOUT ANY
	11	! WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
	12	! A PARTICULAR PURPOSE. See the GNU General Public License for more details.
	13	!
	14	! You should have received a copy of the GNU General Public License along with
	15	! PALM. If not, see <http://www.gnu.org/licenses/>.
	16	!
[3655]	17	! Copyright 1997-2019 Leibniz Universitaet Hannover
[2000]	18	!------------------------------------------------------------------------------!
[1036]	19	!
[484]	20	! Current revisions:
[1]	21	! -----------------
[1683]	22	!
[2119]	23	!
[1321]	24	! Former revisions:
	25	! -----------------
	26	! $Id: poisfft_mod.f90 3690 2019-01-22 22:56:42Z Giersch $
[3634]	27	! OpenACC port for SPEC
	28	!
	29	! 3241 2018-09-12 15:02:00Z raasch
[3241]	30	! unused variables removed,
	31	! declarations of omp_get_thread_num now as omp-directive
	32	!
	33	! 2718 2018-01-02 08:49:38Z maronga
[2716]	34	! Corrected "Former revisions" section
	35	!
	36	! 2696 2017-12-14 17:12:51Z kanani
	37	! Change in file header (GPL part)
	38	!
	39	! 2300 2017-06-29 13:31:14Z raasch
[2300]	40	! settings depending on host variable removed or replaced by loop_optimization
	41	!
	42	! 2119 2017-01-17 16:51:50Z raasch
[1321]	43	!
[2119]	44	! 2118 2017-01-17 16:38:49Z raasch
	45	! OpenACC directives and related code removed
	46	!
[2001]	47	! 2000 2016-08-20 18:09:15Z knoop
	48	! Forced header and separation lines into 80 columns
	49	!
[1851]	50	! 1850 2016-04-08 13:29:27Z maronga
	51	! Module renamed
	52	!
	53	!
[1805]	54	! 1804 2016-04-05 16:30:18Z maronga
	55	! Removed code for parameter file check (__check)
	56	!
[1683]	57	! 1682 2015-10-07 23:56:08Z knoop
	58	! Code annotations made doxygen readable
	59	!
[1483]	60	! 1482 2014-10-18 12:34:45Z raasch
	61	! use 2d-decomposition, if accelerator boards are used
	62	!
[1407]	63	! 1406 2014-05-16 13:47:01Z raasch
	64	! bugfix for pgi 14.4: declare create moved after array declaration
	65	!
[1321]	66	! 1320 2014-03-20 08:40:49Z raasch
[1320]	67	! ONLY-attribute added to USE-statements,
	68	! kind-parameters added to all INTEGER and REAL declaration statements,
	69	! kinds are defined in new module kinds,
	70	! old module precision_kind is removed,
	71	! revision history before 2012 removed,
	72	! comment fields (!:) to be used for variable explanations added to
	73	! all variable declaration statements
[1217]	74	!
[1319]	75	! 1318 2014-03-17 13:35:16Z raasch
	76	! module interfaces removed
	77	!
[1307]	78	! 1306 2014-03-13 14:30:59Z raasch
	79	! openmp sections removed from the overlap branch,
	80	! second argument removed from parameter list
	81	!
[1217]	82	! 1216 2013-08-26 09:31:42Z raasch
[1216]	83	! resorting of arrays moved to separate routines resort_for_...,
	84	! one argument, used as temporary work array, removed from all transpose
	85	! routines
	86	! overlapping fft / transposition implemented
[1112]	87	!
[1213]	88	! 1212 2013-08-15 08:46:27Z raasch
	89	! tridia routines moved to seperate module tridia_solver
	90	!
[1209]	91	! 1208 2013-08-13 06:41:49Z raasch
	92	! acc-update clauses added for "ar" so that ffts other than cufft can also be
	93	! used (although they are not ported and will give a poor performance)
	94	!
[1112]	95	! 1111 2013-03-08 23:54:10Z raasch
[1111]	96	! further openACC porting of non-parallel (MPI) branch:
	97	! tridiagonal routines split into extermal subroutines (instead using CONTAINS),
	98	! no distinction between parallel/non-parallel in poisfft and tridia any more,
[1112]	99	! tridia routines moved to end of file because of probable bug in PGI compiler 12.5
[1111]	100	! (otherwise "invalid device function" is indicated during runtime),
	101	! optimization of tridia routines: constant elements and coefficients of tri are
	102	! stored in seperate arrays ddzuw and tric, last dimension of tri reduced from 5
	103	! to 2,
	104	! poisfft_init is now called internally from poisfft, maketri is called from
	105	! poisfft_init,
	106	! ibc_p_b = 2 removed
[1]	107	!
[1107]	108	! 1106 2013-03-04 05:31:38Z raasch
	109	! routines fftx, ffty, fftxp, fftyp removed, calls replaced by fft_x, fft_y,
	110	! in the 1D-decomposition routines fft_x, ffty are replaced by fft_x_1d,
	111	! fft_y_1d
	112	!
[1104]	113	! 1103 2013-02-20 02:15:53Z raasch
	114	! tri, ar, and ar1 arguments in tridia-routines (2d) are removed because they
	115	! sometimes cause segmentation faults with intel 12.1 compiler
	116	!
[1093]	117	! 1092 2013-02-02 11:24:22Z raasch
	118	! unused variables removed
	119	!
[1037]	120	! 1036 2012-10-22 13:43:42Z raasch
	121	! code put under GPL (PALM 3.9)
	122	!
[1014]	123	! 2012-09-21 07:03:55Z raasch
	124	! FLOAT type conversion replaced by REAL
	125	!
[1004]	126	! 1003 2012-09-14 14:35:53Z raasch
	127	! indices nxa, nya, etc. replaced by nx, ny, etc.
	128	!
[941]	129	! 940 2012-07-09 14:31:00Z raasch
	130	! special handling of tri-array as an argument in tridia_1dd routines switched
	131	! off because it caused segmentation faults with intel 12.1 compiler
	132	!
[878]	133	! 877 2012-04-03 11:21:44Z suehring
	134	! Bugfix: Avoid divisions by zero in case of using a 'neumann' bc for the
	135	! pressure at the top of the model domain.
	136	!
[810]	137	! 809 2012-01-30 13:32:58Z maronga
	138	! Bugfix: replaced .AND. and .NOT. with && and ! in the preprocessor directives
	139	!
[808]	140	! 807 2012-01-25 11:53:51Z maronga
	141	! New cpp directive "__check" implemented which is used by check_namelist_files
	142	! (most of the code is unneeded by check_namelist_files).
	143	!
[1]	144	! Revision 1.1 1997/07/24 11:24:14 raasch
	145	! Initial revision
	146	!
	147	!
	148	! Description:
	149	! ------------
[1682]	150	!> Solves the Poisson equation with a 2D spectral method
	151	!> d^2 p / dx^2 + d^2 p / dy^2 + d^2 p / dz^2 = s
	152	!>
	153	!> Input:
	154	!> real ar contains (nnz,nny,nnx) elements of the velocity divergence,
	155	!> starting from (1,nys,nxl)
	156	!>
	157	!> Output:
	158	!> real ar contains the solution for perturbation pressure p
[1]	159	!------------------------------------------------------------------------------!
[1682]	160	MODULE poisfft_mod
	161
[1]	162
[1320]	163	USE fft_xy, &
	164	ONLY: fft_init, fft_y, fft_y_1d, fft_y_m, fft_x, fft_x_1d, fft_x_m
[1]	165
[1320]	166	USE indices, &
	167	ONLY: nnx, nny, nx, nxl, nxr, ny, nys, nyn, nz
	168
	169	USE transpose_indices, &
	170	ONLY: nxl_y, nxl_z, nxr_y, nxr_z, nys_x, nys_z, nyn_x, nyn_z, nzb_x, &
	171	nzb_y, nzt_x, nzt_y
	172
	173	USE tridia_solver, &
	174	ONLY: tridia_1dd, tridia_init, tridia_substi, tridia_substi_overlap
	175
[1]	176	IMPLICIT NONE
	177
[1111]	178	LOGICAL, SAVE :: poisfft_initialized = .FALSE.
	179
[1]	180	PRIVATE
[807]	181
[1]	182	PUBLIC poisfft, poisfft_init
	183
	184	INTERFACE poisfft
	185	MODULE PROCEDURE poisfft
	186	END INTERFACE poisfft
	187
	188	INTERFACE poisfft_init
	189	MODULE PROCEDURE poisfft_init
	190	END INTERFACE poisfft_init
	191
[807]	192
[1]	193	CONTAINS
	194
[1682]	195	!------------------------------------------------------------------------------!
	196	! Description:
	197	! ------------
[3241]	198	!> Setup coefficients for FFT and the tridiagonal solver
[1682]	199	!------------------------------------------------------------------------------!
[1]	200	SUBROUTINE poisfft_init
	201
[1111]	202	IMPLICIT NONE
	203
	204
[1]	205	CALL fft_init
	206
[1212]	207	CALL tridia_init
[1111]	208
	209	poisfft_initialized = .TRUE.
	210
[1]	211	END SUBROUTINE poisfft_init
	212
[1111]	213
[1804]	214
[1682]	215	!------------------------------------------------------------------------------!
	216	! Description:
	217	! ------------
	218	!> Two-dimensional Fourier Transformation in x- and y-direction.
	219	!------------------------------------------------------------------------------!
[1306]	220	SUBROUTINE poisfft( ar )
[1]	221
[1320]	222	USE control_parameters, &
[3241]	223	ONLY: transpose_compute_overlap
[1320]	224
	225	USE cpulog, &
	226	ONLY: cpu_log, cpu_log_nowait, log_point_s
	227
	228	USE kinds
	229
[1]	230	USE pegrid
	231
	232	IMPLICIT NONE
	233
[1682]	234	INTEGER(iwp) :: ii !<
	235	INTEGER(iwp) :: iind !<
	236	INTEGER(iwp) :: inew !<
	237	INTEGER(iwp) :: jj !<
	238	INTEGER(iwp) :: jind !<
	239	INTEGER(iwp) :: jnew !<
	240	INTEGER(iwp) :: ki !<
	241	INTEGER(iwp) :: kk !<
	242	INTEGER(iwp) :: knew !<
	243	INTEGER(iwp) :: n !<
	244	INTEGER(iwp) :: nblk !<
	245	INTEGER(iwp) :: nnx_y !<
	246	INTEGER(iwp) :: nny_z !<
	247	INTEGER(iwp) :: nnz_x !<
	248	INTEGER(iwp) :: nxl_y_bound !<
	249	INTEGER(iwp) :: nxr_y_bound !<
[1]	250
[1682]	251	INTEGER(iwp), DIMENSION(4) :: isave !<
[1320]	252
[1682]	253	REAL(wp), DIMENSION(1:nz,nys:nyn,nxl:nxr) :: ar !<
	254	REAL(wp), DIMENSION(nys:nyn,nxl:nxr,1:nz) :: ar_inv !<
[3690]	255
	256	#define __acc_fft_device ( defined( _OPENACC ) && ( defined ( __cuda_fft ) ) )
	257	#if __acc_fft_device
[3634]	258	!$ACC DECLARE CREATE(ar_inv)
[3690]	259	#endif
[1]	260
[1682]	261	REAL(wp), DIMENSION(:,:,:), ALLOCATABLE :: ar1 !<
	262	REAL(wp), DIMENSION(:,:,:), ALLOCATABLE :: f_in !<
	263	REAL(wp), DIMENSION(:,:,:), ALLOCATABLE :: f_inv !<
	264	REAL(wp), DIMENSION(:,:,:), ALLOCATABLE :: f_out_y !<
	265	REAL(wp), DIMENSION(:,:,:), ALLOCATABLE :: f_out_z !<
[1216]	266
	267
[1]	268	CALL cpu_log( log_point_s(3), 'poisfft', 'start' )
	269
[1111]	270	IF ( .NOT. poisfft_initialized ) CALL poisfft_init
	271
[3690]	272	#if !__acc_fft_device
	273	!$ACC UPDATE HOST(ar)
	274	#endif
	275
[3634]	276	#ifndef _OPENACC
[1]	277	!
	278	!-- Two-dimensional Fourier Transformation in x- and y-direction.
[2118]	279	IF ( pdims(2) == 1 .AND. pdims(1) > 1 ) THEN
[1]	280
	281	!
	282	!-- 1d-domain-decomposition along x:
	283	!-- FFT along y and transposition y --> x
[1216]	284	CALL ffty_tr_yx( ar, ar )
[1]	285
	286	!
	287	!-- FFT along x, solving the tridiagonal system and backward FFT
	288	CALL fftx_tri_fftx( ar )
	289
	290	!
	291	!-- Transposition x --> y and backward FFT along y
[1216]	292	CALL tr_xy_ffty( ar, ar )
[1]	293
[2118]	294	ELSEIF ( pdims(1) == 1 .AND. pdims(2) > 1 ) THEN
[1]	295
	296	!
	297	!-- 1d-domain-decomposition along y:
	298	!-- FFT along x and transposition x --> y
[1216]	299	CALL fftx_tr_xy( ar, ar )
[1]	300
	301	!
	302	!-- FFT along y, solving the tridiagonal system and backward FFT
	303	CALL ffty_tri_ffty( ar )
	304
	305	!
	306	!-- Transposition y --> x and backward FFT along x
[1216]	307	CALL tr_yx_fftx( ar, ar )
[1]	308
[1216]	309	ELSEIF ( .NOT. transpose_compute_overlap ) THEN
[3634]	310	#endif
[1]	311
	312	!
[1111]	313	!-- 2d-domain-decomposition or no decomposition (1 PE run)
[1]	314	!-- Transposition z --> x
	315	CALL cpu_log( log_point_s(5), 'transpo forward', 'start' )
[1216]	316	CALL resort_for_zx( ar, ar_inv )
	317	CALL transpose_zx( ar_inv, ar )
[1]	318	CALL cpu_log( log_point_s(5), 'transpo forward', 'pause' )
	319
	320	CALL cpu_log( log_point_s(4), 'fft_x', 'start' )
[1106]	321	CALL fft_x( ar, 'forward' )
[1]	322	CALL cpu_log( log_point_s(4), 'fft_x', 'pause' )
	323
	324	!
	325	!-- Transposition x --> y
	326	CALL cpu_log( log_point_s(5), 'transpo forward', 'continue' )
[1216]	327	CALL resort_for_xy( ar, ar_inv )
	328	CALL transpose_xy( ar_inv, ar )
[1]	329	CALL cpu_log( log_point_s(5), 'transpo forward', 'pause' )
	330
	331	CALL cpu_log( log_point_s(7), 'fft_y', 'start' )
[1216]	332	CALL fft_y( ar, 'forward', ar_tr = ar, &
	333	nxl_y_bound = nxl_y, nxr_y_bound = nxr_y, &
	334	nxl_y_l = nxl_y, nxr_y_l = nxr_y )
[1]	335	CALL cpu_log( log_point_s(7), 'fft_y', 'pause' )
	336
	337	!
	338	!-- Transposition y --> z
	339	CALL cpu_log( log_point_s(5), 'transpo forward', 'continue' )
[1216]	340	CALL resort_for_yz( ar, ar_inv )
	341	CALL transpose_yz( ar_inv, ar )
[1]	342	CALL cpu_log( log_point_s(5), 'transpo forward', 'stop' )
	343
	344	!
[1106]	345	!-- Solve the tridiagonal equation system along z
[1]	346	CALL cpu_log( log_point_s(6), 'tridia', 'start' )
[1212]	347	CALL tridia_substi( ar )
[1]	348	CALL cpu_log( log_point_s(6), 'tridia', 'stop' )
	349
	350	!
	351	!-- Inverse Fourier Transformation
	352	!-- Transposition z --> y
	353	CALL cpu_log( log_point_s(8), 'transpo invers', 'start' )
[1216]	354	CALL transpose_zy( ar, ar_inv )
	355	CALL resort_for_zy( ar_inv, ar )
[1]	356	CALL cpu_log( log_point_s(8), 'transpo invers', 'pause' )
	357
	358	CALL cpu_log( log_point_s(7), 'fft_y', 'continue' )
[1216]	359	CALL fft_y( ar, 'backward', ar_tr = ar, &
	360	nxl_y_bound = nxl_y, nxr_y_bound = nxr_y, &
	361	nxl_y_l = nxl_y, nxr_y_l = nxr_y )
[1]	362	CALL cpu_log( log_point_s(7), 'fft_y', 'stop' )
	363
	364	!
	365	!-- Transposition y --> x
	366	CALL cpu_log( log_point_s(8), 'transpo invers', 'continue' )
[1216]	367	CALL transpose_yx( ar, ar_inv )
	368	CALL resort_for_yx( ar_inv, ar )
[1]	369	CALL cpu_log( log_point_s(8), 'transpo invers', 'pause' )
	370
	371	CALL cpu_log( log_point_s(4), 'fft_x', 'continue' )
[1106]	372	CALL fft_x( ar, 'backward' )
[1]	373	CALL cpu_log( log_point_s(4), 'fft_x', 'stop' )
	374
	375	!
	376	!-- Transposition x --> z
	377	CALL cpu_log( log_point_s(8), 'transpo invers', 'continue' )
[1216]	378	CALL transpose_xz( ar, ar_inv )
	379	CALL resort_for_xz( ar_inv, ar )
[1]	380	CALL cpu_log( log_point_s(8), 'transpo invers', 'stop' )
	381
[3634]	382	#ifndef _OPENACC
[1216]	383	ELSE
	384
	385	!
	386	!-- 2d-domain-decomposition or no decomposition (1 PE run) with
	387	!-- overlapping transposition / fft
[1318]	388	!-- cputime logging must not use barriers, which would prevent overlapping
[1216]	389	ALLOCATE( f_out_y(0:ny,nxl_y:nxr_y,nzb_y:nzt_y), &
	390	f_out_z(0:nx,nys_x:nyn_x,nzb_x:nzt_x) )
	391	!
	392	!-- Transposition z --> x + subsequent fft along x
	393	ALLOCATE( f_inv(nys:nyn,nxl:nxr,1:nz) )
	394	CALL resort_for_zx( ar, f_inv )
	395	!
	396	!-- Save original indices and gridpoint counter
	397	isave(1) = nz
	398	isave(2) = nzb_x
	399	isave(3) = nzt_x
	400	isave(4) = sendrecvcount_zx
	401	!
	402	!-- Set new indices for transformation
	403	nblk = nz / pdims(1)
	404	nz = pdims(1)
	405	nnz_x = 1
	406	nzb_x = 1 + myidx * nnz_x
	407	nzt_x = ( myidx + 1 ) * nnz_x
	408	sendrecvcount_zx = nnx * nny * nnz_x
	409
[1306]	410	ALLOCATE( ar1(0:nx,nys_x:nyn_x,nzb_x:nzt_x) )
[1216]	411	ALLOCATE( f_in(nys:nyn,nxl:nxr,1:nz) )
	412
[1306]	413	DO kk = 1, nblk
[1216]	414
[1306]	415	IF ( kk == 1 ) THEN
[1318]	416	CALL cpu_log( log_point_s(5), 'transpo forward', 'start', cpu_log_nowait )
[1306]	417	ELSE
[1318]	418	CALL cpu_log( log_point_s(5), 'transpo forward', 'continue', cpu_log_nowait )
[1306]	419	ENDIF
[1216]	420
[1306]	421	DO knew = 1, nz
	422	ki = kk + nblk * ( knew - 1 )
	423	f_in(:,:,knew) = f_inv(:,:,ki)
	424	ENDDO
[1216]	425
[1306]	426	CALL transpose_zx( f_in, ar1(:,:,:))
	427	CALL cpu_log( log_point_s(5), 'transpo forward', 'pause' )
[1216]	428
[1306]	429	IF ( kk == 1 ) THEN
[1318]	430	CALL cpu_log( log_point_s(4), 'fft_x', 'start', cpu_log_nowait )
[1306]	431	ELSE
[1318]	432	CALL cpu_log( log_point_s(4), 'fft_x', 'continue', cpu_log_nowait )
[1216]	433	ENDIF
	434
[1306]	435	n = isave(2) + kk - 1
	436	CALL fft_x( ar1(:,:,:), 'forward', ar_2d = f_out_z(:,:,n))
	437	CALL cpu_log( log_point_s(4), 'fft_x', 'pause' )
[1216]	438
	439	ENDDO
	440	!
	441	!-- Restore original indices/counters
	442	nz = isave(1)
	443	nzb_x = isave(2)
	444	nzt_x = isave(3)
	445	sendrecvcount_zx = isave(4)
	446
	447	DEALLOCATE( ar1, f_in, f_inv )
	448
	449	!
	450	!-- Transposition x --> y + subsequent fft along y
	451	ALLOCATE( f_inv(nys_x:nyn_x,nzb_x:nzt_x,0:nx) )
	452	CALL resort_for_xy( f_out_z, f_inv )
	453	!
	454	!-- Save original indices and gridpoint counter
	455	isave(1) = nx
	456	isave(2) = nxl_y
	457	isave(3) = nxr_y
	458	isave(4) = sendrecvcount_xy
	459	!
	460	!-- Set new indices for transformation
	461	nblk = ( ( nx+1 ) / pdims(2) ) - 1
	462	nx = pdims(2)
	463	nnx_y = 1
	464	nxl_y = myidy * nnx_y
	465	nxr_y = ( myidy + 1 ) * nnx_y - 1
	466	sendrecvcount_xy = nnx_y * ( nyn_x-nys_x+1 ) * ( nzt_x-nzb_x+1 )
	467
[1306]	468	ALLOCATE( ar1(0:ny,nxl_y:nxr_y,nzb_y:nzt_y) )
[1216]	469	ALLOCATE( f_in(nys_x:nyn_x,nzb_x:nzt_x,0:nx) )
	470
[1306]	471	DO ii = 0, nblk
[1216]	472
[1318]	473	CALL cpu_log( log_point_s(5), 'transpo forward', 'continue', cpu_log_nowait )
[1216]	474
[1306]	475	DO inew = 0, nx-1
	476	iind = ii + ( nblk + 1 ) * inew
	477	f_in(:,:,inew) = f_inv(:,:,iind)
	478	ENDDO
[1216]	479
[1306]	480	CALL transpose_xy( f_in, ar1(:,:,:) )
[1216]	481
[1306]	482	CALL cpu_log( log_point_s(5), 'transpo forward', 'pause' )
[1216]	483
[1306]	484	IF ( ii == 1 ) THEN
[1318]	485	CALL cpu_log( log_point_s(7), 'fft_y', 'start', cpu_log_nowait )
[1306]	486	ELSE
[1318]	487	CALL cpu_log( log_point_s(7), 'fft_y', 'continue', cpu_log_nowait )
[1216]	488	ENDIF
	489
[1306]	490	nxl_y_bound = isave(2)
	491	nxr_y_bound = isave(3)
	492	n = isave(2) + ii
	493	CALL fft_y( ar1(:,:,:), 'forward', ar_tr = f_out_y, &
	494	nxl_y_bound = nxl_y_bound, nxr_y_bound = nxr_y_bound, &
	495	nxl_y_l = n, nxr_y_l = n )
[1216]	496
[1306]	497	CALL cpu_log( log_point_s(7), 'fft_y', 'pause' )
[1216]	498
	499	ENDDO
	500	!
	501	!-- Restore original indices/counters
	502	nx = isave(1)
	503	nxl_y = isave(2)
	504	nxr_y = isave(3)
	505	sendrecvcount_xy = isave(4)
	506
	507	DEALLOCATE( ar1, f_in, f_inv )
	508
	509	!
	510	!-- Transposition y --> z + subsequent tridia + resort for z --> y
	511	ALLOCATE( f_inv(nxl_y:nxr_y,nzb_y:nzt_y,0:ny) )
	512	CALL resort_for_yz( f_out_y, f_inv )
	513	!
	514	!-- Save original indices and gridpoint counter
	515	isave(1) = ny
	516	isave(2) = nys_z
	517	isave(3) = nyn_z
	518	isave(4) = sendrecvcount_yz
	519	!
	520	!-- Set new indices for transformation
	521	nblk = ( ( ny+1 ) / pdims(1) ) - 1
	522	ny = pdims(1)
	523	nny_z = 1
	524	nys_z = myidx * nny_z
	525	nyn_z = ( myidx + 1 ) * nny_z - 1
	526	sendrecvcount_yz = ( nxr_y-nxl_y+1 ) * nny_z * ( nzt_y-nzb_y+1 )
	527
[1306]	528	ALLOCATE( ar1(nxl_z:nxr_z,nys_z:nyn_z,1:nz) )
[1216]	529	ALLOCATE( f_in(nxl_y:nxr_y,nzb_y:nzt_y,0:ny) )
	530
[1306]	531	DO jj = 0, nblk
[1216]	532	!
[1306]	533	!-- Forward Fourier Transformation
	534	!-- Transposition y --> z
[1318]	535	CALL cpu_log( log_point_s(5), 'transpo forward', 'continue', cpu_log_nowait )
[1216]	536
[1306]	537	DO jnew = 0, ny-1
	538	jind = jj + ( nblk + 1 ) * jnew
	539	f_in(:,:,jnew) = f_inv(:,:,jind)
	540	ENDDO
[1216]	541
[1306]	542	CALL transpose_yz( f_in, ar1(:,:,:) )
[1216]	543
[1306]	544	IF ( jj == nblk ) THEN
	545	CALL cpu_log( log_point_s(5), 'transpo forward', 'stop' )
	546	ELSE
	547	CALL cpu_log( log_point_s(5), 'transpo forward', 'pause' )
[1216]	548	ENDIF
	549
	550	!
[1306]	551	!-- Solve the tridiagonal equation system along z
[1318]	552	CALL cpu_log( log_point_s(6), 'tridia', 'start', cpu_log_nowait )
[1216]	553
[1306]	554	n = isave(2) + jj
	555	CALL tridia_substi_overlap( ar1(:,:,:), n )
[1216]	556
[1306]	557	CALL cpu_log( log_point_s(6), 'tridia', 'stop' )
[1216]	558
[1306]	559	!
	560	!-- Inverse Fourier Transformation
	561	!-- Transposition z --> y
	562	!-- Only one thread should call MPI routines, therefore forward and
	563	!-- backward tranpose are in the same section
	564	IF ( jj == 0 ) THEN
[1318]	565	CALL cpu_log( log_point_s(8), 'transpo invers', 'start', cpu_log_nowait )
[1306]	566	ELSE
[1318]	567	CALL cpu_log( log_point_s(8), 'transpo invers', 'continue', cpu_log_nowait )
[1216]	568	ENDIF
	569
[1306]	570	CALL transpose_zy( ar1(:,:,:), f_in )
[1216]	571
[1306]	572	DO jnew = 0, ny-1
	573	jind = jj + ( nblk + 1 ) * jnew
	574	f_inv(:,:,jind) = f_in(:,:,jnew)
	575	ENDDO
[1216]	576
[1306]	577	CALL cpu_log( log_point_s(8), 'transpo invers', 'pause' )
[1216]	578
	579	ENDDO
	580	!
	581	!-- Restore original indices/counters
	582	ny = isave(1)
	583	nys_z = isave(2)
	584	nyn_z = isave(3)
	585	sendrecvcount_yz = isave(4)
	586
	587	CALL resort_for_zy( f_inv, f_out_y )
	588
	589	DEALLOCATE( ar1, f_in, f_inv )
	590
	591	!
	592	!-- fft along y backward + subsequent transposition y --> x
	593	ALLOCATE( f_inv(nys_x:nyn_x,nzb_x:nzt_x,0:nx) )
	594	!
	595	!-- Save original indices and gridpoint counter
	596	isave(1) = nx
	597	isave(2) = nxl_y
	598	isave(3) = nxr_y
	599	isave(4) = sendrecvcount_xy
	600	!
	601	!-- Set new indices for transformation
	602	nblk = (( nx+1 ) / pdims(2) ) - 1
	603	nx = pdims(2)
	604	nnx_y = 1
	605	nxl_y = myidy * nnx_y
	606	nxr_y = ( myidy + 1 ) * nnx_y - 1
	607	sendrecvcount_xy = nnx_y * ( nyn_x-nys_x+1 ) * ( nzt_x-nzb_x+1 )
	608
[1306]	609	ALLOCATE( ar1(0:ny,nxl_y:nxr_y,nzb_y:nzt_y) )
[1216]	610	ALLOCATE( f_in(nys_x:nyn_x,nzb_x:nzt_x,0:nx) )
	611
[1306]	612	DO ii = 0, nblk
[1216]	613
[1318]	614	CALL cpu_log( log_point_s(7), 'fft_y', 'continue', cpu_log_nowait )
[1216]	615
[1306]	616	n = isave(2) + ii
	617	nxl_y_bound = isave(2)
	618	nxr_y_bound = isave(3)
[1216]	619
[1306]	620	CALL fft_y( ar1(:,:,:), 'backward', ar_tr = f_out_y, &
	621	nxl_y_bound = nxl_y_bound, nxr_y_bound = nxr_y_bound, &
	622	nxl_y_l = n, nxr_y_l = n )
[1216]	623
[1306]	624	IF ( ii == nblk ) THEN
	625	CALL cpu_log( log_point_s(7), 'fft_y', 'stop' )
	626	ELSE
	627	CALL cpu_log( log_point_s(7), 'fft_y', 'pause' )
[1216]	628	ENDIF
	629
[1318]	630	CALL cpu_log( log_point_s(8), 'transpo invers', 'continue', cpu_log_nowait )
[1216]	631
[1306]	632	CALL transpose_yx( ar1(:,:,:), f_in )
[1216]	633
[1306]	634	DO inew = 0, nx-1
	635	iind = ii + (nblk+1) * inew
	636	f_inv(:,:,iind) = f_in(:,:,inew)
	637	ENDDO
[1216]	638
[1306]	639	CALL cpu_log( log_point_s(8), 'transpo invers', 'pause' )
[1216]	640
	641	ENDDO
	642	!
	643	!-- Restore original indices/counters
	644	nx = isave(1)
	645	nxl_y = isave(2)
	646	nxr_y = isave(3)
	647	sendrecvcount_xy = isave(4)
	648
	649	CALL resort_for_yx( f_inv, f_out_z )
	650
	651	DEALLOCATE( ar1, f_in, f_inv )
	652
	653	!
	654	!-- fft along x backward + subsequent final transposition x --> z
	655	ALLOCATE( f_inv(nys:nyn,nxl:nxr,1:nz) )
	656	!
	657	!-- Save original indices and gridpoint counter
	658	isave(1) = nz
	659	isave(2) = nzb_x
	660	isave(3) = nzt_x
	661	isave(4) = sendrecvcount_zx
	662	!
	663	!-- Set new indices for transformation
	664	nblk = nz / pdims(1)
	665	nz = pdims(1)
	666	nnz_x = 1
	667	nzb_x = 1 + myidx * nnz_x
	668	nzt_x = ( myidx + 1 ) * nnz_x
	669	sendrecvcount_zx = nnx * nny * nnz_x
	670
[1306]	671	ALLOCATE( ar1(0:nx,nys_x:nyn_x,nzb_x:nzt_x) )
[1216]	672	ALLOCATE( f_in(nys:nyn,nxl:nxr,1:nz) )
	673
[1306]	674	DO kk = 1, nblk
[1216]	675
[1318]	676	CALL cpu_log( log_point_s(4), 'fft_x', 'continue', cpu_log_nowait )
[1216]	677
[1306]	678	n = isave(2) + kk - 1
	679	CALL fft_x( ar1(:,:,:), 'backward', f_out_z(:,:,n))
[1216]	680
[1306]	681	IF ( kk == nblk ) THEN
	682	CALL cpu_log( log_point_s(4), 'fft_x', 'stop' )
	683	ELSE
	684	CALL cpu_log( log_point_s(4), 'fft_x', 'pause' )
[1216]	685	ENDIF
	686
[1318]	687	CALL cpu_log( log_point_s(8), 'transpo invers', 'continue', cpu_log_nowait )
[1216]	688
[1306]	689	CALL transpose_xz( ar1(:,:,:), f_in )
[1216]	690
[1306]	691	DO knew = 1, nz
	692	ki = kk + nblk * (knew-1)
	693	f_inv(:,:,ki) = f_in(:,:,knew)
	694	ENDDO
[1216]	695
[1306]	696	IF ( kk == nblk ) THEN
	697	CALL cpu_log( log_point_s(8), 'transpo invers', 'stop' )
	698	ELSE
	699	CALL cpu_log( log_point_s(8), 'transpo invers', 'pause' )
[1216]	700	ENDIF
	701
	702	ENDDO
	703	!
	704	!-- Restore original indices/counters
	705	nz = isave(1)
	706	nzb_x = isave(2)
	707	nzt_x = isave(3)
	708	sendrecvcount_zx = isave(4)
	709
	710	CALL resort_for_xz( f_inv, ar )
	711
	712	DEALLOCATE( ar1, f_in, f_inv )
	713
[1]	714	ENDIF
[3634]	715	#endif
[1]	716
[3690]	717	#if !__acc_fft_device
	718	!$ACC UPDATE DEVICE(ar)
	719	#endif
	720
[1]	721	CALL cpu_log( log_point_s(3), 'poisfft', 'stop' )
	722
	723	END SUBROUTINE poisfft
	724
	725
	726	!------------------------------------------------------------------------------!
[1682]	727	! Description:
	728	! ------------
	729	!> Fourier-transformation along y with subsequent transposition y --> x for
	730	!> a 1d-decomposition along x.
	731	!>
	732	!> @attention The performance of this routine is much faster on the NEC-SX6,
	733	!> if the first index of work_ffty_vec is odd. Otherwise
	734	!> memory bank conflicts may occur (especially if the index is a
	735	!> multiple of 128). That's why work_ffty_vec is dimensioned as
	736	!> 0:ny+1.
	737	!> Of course, this will not work if users are using an odd number
	738	!> of gridpoints along y.
[1]	739	!------------------------------------------------------------------------------!
[1682]	740	SUBROUTINE ffty_tr_yx( f_in, f_out )
[1]	741
[1320]	742	USE control_parameters, &
[2300]	743	ONLY: loop_optimization
[1320]	744
	745	USE cpulog, &
	746	ONLY: cpu_log, log_point_s
	747
	748	USE kinds
	749
[1]	750	USE pegrid
	751
	752	IMPLICIT NONE
	753
[1682]	754	INTEGER(iwp) :: i !<
	755	INTEGER(iwp) :: iend !<
	756	INTEGER(iwp) :: iouter !<
	757	INTEGER(iwp) :: ir !<
	758	INTEGER(iwp) :: j !<
	759	INTEGER(iwp) :: k !<
[1]	760
[1682]	761	INTEGER(iwp), PARAMETER :: stridex = 4 !<
[1320]	762
[1682]	763	REAL(wp), DIMENSION(1:nz,0:ny,nxl:nxr) :: f_in !<
	764	REAL(wp), DIMENSION(nnx,1:nz,nys_x:nyn_x,pdims(1)) :: f_out !<
	765	REAL(wp), DIMENSION(nxl:nxr,1:nz,0:ny) :: work !<
[1]	766
[2300]	767	REAL(wp), DIMENSION(:,:), ALLOCATABLE :: work_ffty !<
	768	REAL(wp), DIMENSION(:,:,:), ALLOCATABLE :: work_ffty_vec !<
	769
[1]	770	!
	771	!-- Carry out the FFT along y, where all data are present due to the
	772	!-- 1d-decomposition along x. Resort the data in a way that x becomes
	773	!-- the first index.
[1106]	774	CALL cpu_log( log_point_s(7), 'fft_y_1d', 'start' )
[1]	775
[2300]	776	IF ( loop_optimization == 'vector' ) THEN
	777
	778	ALLOCATE( work_ffty_vec(0:ny+1,1:nz,nxl:nxr) )
[1]	779	!
	780	!-- Code optimized for vector processors
[2300]	781	!$OMP PARALLEL PRIVATE ( i, j, k )
	782	!$OMP DO
[1]	783	DO i = nxl, nxr
	784
	785	DO j = 0, ny
	786	DO k = 1, nz
	787	work_ffty_vec(j,k,i) = f_in(k,j,i)
	788	ENDDO
	789	ENDDO
	790
	791	CALL fft_y_m( work_ffty_vec(:,:,i), ny+1, 'forward' )
	792
	793	ENDDO
	794
[2300]	795	!$OMP DO
[1]	796	DO k = 1, nz
	797	DO j = 0, ny
	798	DO i = nxl, nxr
	799	work(i,k,j) = work_ffty_vec(j,k,i)
	800	ENDDO
	801	ENDDO
	802	ENDDO
[2300]	803	!$OMP END PARALLEL
[1]	804
[2300]	805	DEALLOCATE( work_ffty_vec )
	806
[1]	807	ELSE
	808	!
	809	!-- Cache optimized code.
[2300]	810	ALLOCATE( work_ffty(0:ny,stridex) )
	811	!
[1]	812	!-- The i-(x-)direction is split into a strided outer loop and an inner
	813	!-- loop for better cache performance
[2300]	814	!$OMP PARALLEL PRIVATE (i,iend,iouter,ir,j,k,work_ffty)
	815	!$OMP DO
[1]	816	DO iouter = nxl, nxr, stridex
	817
	818	iend = MIN( iouter+stridex-1, nxr ) ! Upper bound for inner i loop
	819
	820	DO k = 1, nz
	821
	822	DO i = iouter, iend
	823
	824	ir = i-iouter+1 ! counter within a stride
	825	DO j = 0, ny
	826	work_ffty(j,ir) = f_in(k,j,i)
	827	ENDDO
	828	!
	829	!-- FFT along y
[1106]	830	CALL fft_y_1d( work_ffty(:,ir), 'forward' )
[1]	831
	832	ENDDO
	833
	834	!
	835	!-- Resort
	836	DO j = 0, ny
	837	DO i = iouter, iend
	838	work(i,k,j) = work_ffty(j,i-iouter+1)
	839	ENDDO
	840	ENDDO
	841
	842	ENDDO
	843
	844	ENDDO
[2300]	845	!$OMP END PARALLEL
[1]	846
[2300]	847	DEALLOCATE( work_ffty )
	848
[1]	849	ENDIF
[2300]	850
[1106]	851	CALL cpu_log( log_point_s(7), 'fft_y_1d', 'pause' )
[1]	852
	853	!
	854	!-- Transpose array
[1111]	855	#if defined( __parallel )
[1]	856	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
[622]	857	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
[1]	858	CALL MPI_ALLTOALL( work(nxl,1,0), sendrecvcount_xy, MPI_REAL, &
	859	f_out(1,1,nys_x,1), sendrecvcount_xy, MPI_REAL, &
	860	comm1dx, ierr )
	861	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
[1111]	862	#endif
[1]	863
	864	END SUBROUTINE ffty_tr_yx
	865
	866
	867	!------------------------------------------------------------------------------!
[1682]	868	! Description:
	869	! ------------
	870	!> Transposition x --> y with a subsequent backward Fourier transformation for
	871	!> a 1d-decomposition along x
[1]	872	!------------------------------------------------------------------------------!
[1682]	873	SUBROUTINE tr_xy_ffty( f_in, f_out )
[1]	874
[1320]	875	USE control_parameters, &
[2300]	876	ONLY: loop_optimization
[1320]	877
	878	USE cpulog, &
	879	ONLY: cpu_log, log_point_s
	880
	881	USE kinds
	882
[1]	883	USE pegrid
	884
	885	IMPLICIT NONE
	886
[1682]	887	INTEGER(iwp) :: i !<
	888	INTEGER(iwp) :: iend !<
	889	INTEGER(iwp) :: iouter !<
	890	INTEGER(iwp) :: ir !<
	891	INTEGER(iwp) :: j !<
	892	INTEGER(iwp) :: k !<
[1]	893
[1682]	894	INTEGER(iwp), PARAMETER :: stridex = 4 !<
[1320]	895
[1682]	896	REAL(wp), DIMENSION(nnx,1:nz,nys_x:nyn_x,pdims(1)) :: f_in !<
	897	REAL(wp), DIMENSION(1:nz,0:ny,nxl:nxr) :: f_out !<
	898	REAL(wp), DIMENSION(nxl:nxr,1:nz,0:ny) :: work !<
[1]	899
[2300]	900	REAL(wp), DIMENSION(:,:), ALLOCATABLE :: work_ffty !<
	901	REAL(wp), DIMENSION(:,:,:), ALLOCATABLE :: work_ffty_vec !<
	902
[1]	903	!
	904	!-- Transpose array
[1111]	905	#if defined( __parallel )
[1]	906	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
[622]	907	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
[1]	908	CALL MPI_ALLTOALL( f_in(1,1,nys_x,1), sendrecvcount_xy, MPI_REAL, &
	909	work(nxl,1,0), sendrecvcount_xy, MPI_REAL, &
	910	comm1dx, ierr )
	911	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
[1111]	912	#endif
[1]	913
	914	!
	915	!-- Resort the data in a way that y becomes the first index and carry out the
	916	!-- backward fft along y.
[1106]	917	CALL cpu_log( log_point_s(7), 'fft_y_1d', 'continue' )
[1]	918
[2300]	919	IF ( loop_optimization == 'vector' ) THEN
	920
	921	ALLOCATE( work_ffty_vec(0:ny+1,1:nz,nxl:nxr) )
[1]	922	!
	923	!-- Code optimized for vector processors
[2300]	924	!$OMP PARALLEL PRIVATE ( i, j, k )
	925	!$OMP DO
[1]	926	DO k = 1, nz
	927	DO j = 0, ny
	928	DO i = nxl, nxr
	929	work_ffty_vec(j,k,i) = work(i,k,j)
	930	ENDDO
	931	ENDDO
	932	ENDDO
	933
[2300]	934	!$OMP DO
[1]	935	DO i = nxl, nxr
	936
	937	CALL fft_y_m( work_ffty_vec(:,:,i), ny+1, 'backward' )
	938
	939	DO j = 0, ny
	940	DO k = 1, nz
	941	f_out(k,j,i) = work_ffty_vec(j,k,i)
	942	ENDDO
	943	ENDDO
	944
	945	ENDDO
[2300]	946	!$OMP END PARALLEL
[1]	947
[2300]	948	DEALLOCATE( work_ffty_vec )
	949
[1]	950	ELSE
	951	!
	952	!-- Cache optimized code.
[2300]	953	ALLOCATE( work_ffty(0:ny,stridex) )
	954	!
[1]	955	!-- The i-(x-)direction is split into a strided outer loop and an inner
	956	!-- loop for better cache performance
[2300]	957	!$OMP PARALLEL PRIVATE ( i, iend, iouter, ir, j, k, work_ffty )
	958	!$OMP DO
[1]	959	DO iouter = nxl, nxr, stridex
	960
	961	iend = MIN( iouter+stridex-1, nxr ) ! Upper bound for inner i loop
	962
	963	DO k = 1, nz
	964	!
	965	!-- Resort
	966	DO j = 0, ny
	967	DO i = iouter, iend
	968	work_ffty(j,i-iouter+1) = work(i,k,j)
	969	ENDDO
	970	ENDDO
	971
	972	DO i = iouter, iend
	973
	974	!
	975	!-- FFT along y
	976	ir = i-iouter+1 ! counter within a stride
[1106]	977	CALL fft_y_1d( work_ffty(:,ir), 'backward' )
[1]	978
	979	DO j = 0, ny
	980	f_out(k,j,i) = work_ffty(j,ir)
	981	ENDDO
	982	ENDDO
	983
	984	ENDDO
	985
	986	ENDDO
[2300]	987	!$OMP END PARALLEL
[1]	988
[2300]	989	DEALLOCATE( work_ffty )
	990
[1]	991	ENDIF
	992
[1106]	993	CALL cpu_log( log_point_s(7), 'fft_y_1d', 'stop' )
[1]	994
	995	END SUBROUTINE tr_xy_ffty
	996
	997
	998	!------------------------------------------------------------------------------!
[1682]	999	! Description:
	1000	! ------------
	1001	!> FFT along x, solution of the tridiagonal system and backward FFT for
	1002	!> a 1d-decomposition along x
	1003	!>
	1004	!> @warning this subroutine may still not work for hybrid parallelization
	1005	!> with OpenMP (for possible necessary changes see the original
	1006	!> routine poisfft_hybrid, developed by Klaus Ketelsen, May 2002)
[1]	1007	!------------------------------------------------------------------------------!
[1682]	1008	SUBROUTINE fftx_tri_fftx( ar )
[1]	1009
[1320]	1010	USE control_parameters, &
[2300]	1011	ONLY: loop_optimization
[1320]	1012
	1013	USE cpulog, &
	1014	ONLY: cpu_log, log_point_s
	1015
	1016	USE grid_variables, &
	1017	ONLY: ddx2, ddy2
	1018
	1019	USE kinds
	1020
[1]	1021	USE pegrid
	1022
	1023	IMPLICIT NONE
	1024
[1682]	1025	INTEGER(iwp) :: i !<
	1026	INTEGER(iwp) :: j !<
	1027	INTEGER(iwp) :: k !<
	1028	INTEGER(iwp) :: m !<
	1029	INTEGER(iwp) :: n !<
[3241]	1030	!$ INTEGER(iwp) :: omp_get_thread_num !<
[1682]	1031	INTEGER(iwp) :: tn !<
[1]	1032
[1682]	1033	REAL(wp), DIMENSION(0:nx) :: work_fftx !<
	1034	REAL(wp), DIMENSION(0:nx,1:nz) :: work_trix !<
	1035	REAL(wp), DIMENSION(nnx,1:nz,nys_x:nyn_x,pdims(1)) :: ar !<
	1036	REAL(wp), DIMENSION(:,:,:,:), ALLOCATABLE :: tri !<
[1]	1037
	1038
[1106]	1039	CALL cpu_log( log_point_s(33), 'fft_x_1d + tridia', 'start' )
[1]	1040
	1041	ALLOCATE( tri(5,0:nx,0:nz-1,0:threads_per_task-1) )
	1042
	1043	tn = 0 ! Default thread number in case of one thread
	1044	!$OMP PARALLEL DO PRIVATE ( i, j, k, m, n, tn, work_fftx, work_trix )
	1045	DO j = nys_x, nyn_x
	1046
	1047	!$ tn = omp_get_thread_num()
	1048
[2300]	1049	IF ( loop_optimization == 'vector' ) THEN
[1]	1050	!
	1051	!-- Code optimized for vector processors
	1052	DO k = 1, nz
	1053
	1054	m = 0
	1055	DO n = 1, pdims(1)
[1003]	1056	DO i = 1, nnx
[1]	1057	work_trix(m,k) = ar(i,k,j,n)
	1058	m = m + 1
	1059	ENDDO
	1060	ENDDO
	1061
	1062	ENDDO
	1063
	1064	CALL fft_x_m( work_trix, 'forward' )
	1065
	1066	ELSE
	1067	!
	1068	!-- Cache optimized code
	1069	DO k = 1, nz
	1070
	1071	m = 0
	1072	DO n = 1, pdims(1)
[1003]	1073	DO i = 1, nnx
[1]	1074	work_fftx(m) = ar(i,k,j,n)
	1075	m = m + 1
	1076	ENDDO
	1077	ENDDO
	1078
[1106]	1079	CALL fft_x_1d( work_fftx, 'forward' )
[1]	1080
	1081	DO i = 0, nx
	1082	work_trix(i,k) = work_fftx(i)
	1083	ENDDO
	1084
	1085	ENDDO
	1086
	1087	ENDIF
	1088
	1089	!
	1090	!-- Solve the linear equation system
	1091	CALL tridia_1dd( ddx2, ddy2, nx, ny, j, work_trix, tri(:,:,:,tn) )
	1092
[2300]	1093	IF ( loop_optimization == 'vector' ) THEN
[1]	1094	!
	1095	!-- Code optimized for vector processors
	1096	CALL fft_x_m( work_trix, 'backward' )
	1097
	1098	DO k = 1, nz
	1099
	1100	m = 0
	1101	DO n = 1, pdims(1)
[1003]	1102	DO i = 1, nnx
[1]	1103	ar(i,k,j,n) = work_trix(m,k)
	1104	m = m + 1
	1105	ENDDO
	1106	ENDDO
	1107
	1108	ENDDO
	1109
	1110	ELSE
	1111	!
	1112	!-- Cache optimized code
	1113	DO k = 1, nz
	1114
	1115	DO i = 0, nx
	1116	work_fftx(i) = work_trix(i,k)
	1117	ENDDO
	1118
[1106]	1119	CALL fft_x_1d( work_fftx, 'backward' )
[1]	1120
	1121	m = 0
	1122	DO n = 1, pdims(1)
[1003]	1123	DO i = 1, nnx
[1]	1124	ar(i,k,j,n) = work_fftx(m)
	1125	m = m + 1
	1126	ENDDO
	1127	ENDDO
	1128
	1129	ENDDO
	1130
	1131	ENDIF
	1132
	1133	ENDDO
	1134
	1135	DEALLOCATE( tri )
	1136
[1106]	1137	CALL cpu_log( log_point_s(33), 'fft_x_1d + tridia', 'stop' )
[1]	1138
	1139	END SUBROUTINE fftx_tri_fftx
	1140
	1141
	1142	!------------------------------------------------------------------------------!
[1682]	1143	! Description:
	1144	! ------------
	1145	!> Fourier-transformation along x with subsequent transposition x --> y for
	1146	!> a 1d-decomposition along y.
	1147	!>
	1148	!> @attention NEC-branch of this routine may significantly profit from
	1149	!> further optimizations. So far, performance is much worse than
	1150	!> for routine ffty_tr_yx (more than three times slower).
[1]	1151	!------------------------------------------------------------------------------!
[1682]	1152	SUBROUTINE fftx_tr_xy( f_in, f_out )
[1]	1153
[1682]	1154
[1320]	1155	USE control_parameters, &
[2300]	1156	ONLY: loop_optimization
[1320]	1157
	1158	USE cpulog, &
	1159	ONLY: cpu_log, log_point_s
	1160
	1161	USE kinds
	1162
[1]	1163	USE pegrid
	1164
	1165	IMPLICIT NONE
	1166
[1682]	1167	INTEGER(iwp) :: i !<
	1168	INTEGER(iwp) :: j !<
	1169	INTEGER(iwp) :: k !<
[1]	1170
[1682]	1171	REAL(wp), DIMENSION(0:nx,1:nz,nys:nyn) :: work_fftx !<
	1172	REAL(wp), DIMENSION(1:nz,nys:nyn,0:nx) :: f_in !<
	1173	REAL(wp), DIMENSION(nny,1:nz,nxl_y:nxr_y,pdims(2)) :: f_out !<
	1174	REAL(wp), DIMENSION(nys:nyn,1:nz,0:nx) :: work !<
[1]	1175
	1176	!
	1177	!-- Carry out the FFT along x, where all data are present due to the
	1178	!-- 1d-decomposition along y. Resort the data in a way that y becomes
	1179	!-- the first index.
[1106]	1180	CALL cpu_log( log_point_s(4), 'fft_x_1d', 'start' )
[1]	1181
[2300]	1182	IF ( loop_optimization == 'vector' ) THEN
[1]	1183	!
	1184	!-- Code for vector processors
[85]	1185	!$OMP PARALLEL PRIVATE ( i, j, k )
[1]	1186	!$OMP DO
	1187	DO i = 0, nx
	1188
	1189	DO j = nys, nyn
	1190	DO k = 1, nz
	1191	work_fftx(i,k,j) = f_in(k,j,i)
	1192	ENDDO
	1193	ENDDO
	1194
	1195	ENDDO
	1196
	1197	!$OMP DO
	1198	DO j = nys, nyn
	1199
	1200	CALL fft_x_m( work_fftx(:,:,j), 'forward' )
	1201
	1202	DO k = 1, nz
	1203	DO i = 0, nx
	1204	work(j,k,i) = work_fftx(i,k,j)
	1205	ENDDO
	1206	ENDDO
	1207
	1208	ENDDO
	1209	!$OMP END PARALLEL
	1210
	1211	ELSE
	1212
	1213	!
	1214	!-- Cache optimized code (there might be still a potential for better
	1215	!-- optimization).
[696]	1216	!$OMP PARALLEL PRIVATE (i,j,k)
[1]	1217	!$OMP DO
	1218	DO i = 0, nx
	1219
	1220	DO j = nys, nyn
	1221	DO k = 1, nz
	1222	work_fftx(i,k,j) = f_in(k,j,i)
	1223	ENDDO
	1224	ENDDO
	1225
	1226	ENDDO
	1227
	1228	!$OMP DO
	1229	DO j = nys, nyn
	1230	DO k = 1, nz
	1231
[1106]	1232	CALL fft_x_1d( work_fftx(0:nx,k,j), 'forward' )
[1]	1233
	1234	DO i = 0, nx
	1235	work(j,k,i) = work_fftx(i,k,j)
	1236	ENDDO
	1237	ENDDO
	1238
	1239	ENDDO
	1240	!$OMP END PARALLEL
	1241
	1242	ENDIF
[1106]	1243	CALL cpu_log( log_point_s(4), 'fft_x_1d', 'pause' )
[1]	1244
	1245	!
	1246	!-- Transpose array
[1111]	1247	#if defined( __parallel )
[1]	1248	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
[622]	1249	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
[1]	1250	CALL MPI_ALLTOALL( work(nys,1,0), sendrecvcount_xy, MPI_REAL, &
	1251	f_out(1,1,nxl_y,1), sendrecvcount_xy, MPI_REAL, &
	1252	comm1dy, ierr )
	1253	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
[1111]	1254	#endif
[1]	1255
	1256	END SUBROUTINE fftx_tr_xy
	1257
	1258
	1259	!------------------------------------------------------------------------------!
[1682]	1260	! Description:
	1261	! ------------
	1262	!> Transposition y --> x with a subsequent backward Fourier transformation for
	1263	!> a 1d-decomposition along x.
[1]	1264	!------------------------------------------------------------------------------!
[1682]	1265	SUBROUTINE tr_yx_fftx( f_in, f_out )
[1]	1266
[1682]	1267
[1320]	1268	USE control_parameters, &
[2300]	1269	ONLY: loop_optimization
[1320]	1270
	1271	USE cpulog, &
	1272	ONLY: cpu_log, log_point_s
	1273
	1274	USE kinds
	1275
[1]	1276	USE pegrid
	1277
	1278	IMPLICIT NONE
	1279
[1682]	1280	INTEGER(iwp) :: i !<
	1281	INTEGER(iwp) :: j !<
	1282	INTEGER(iwp) :: k !<
[1]	1283
[1682]	1284	REAL(wp), DIMENSION(0:nx,1:nz,nys:nyn) :: work_fftx !<
	1285	REAL(wp), DIMENSION(nny,1:nz,nxl_y:nxr_y,pdims(2)) :: f_in !<
	1286	REAL(wp), DIMENSION(1:nz,nys:nyn,0:nx) :: f_out !<
	1287	REAL(wp), DIMENSION(nys:nyn,1:nz,0:nx) :: work !<
[1]	1288
	1289	!
	1290	!-- Transpose array
[1111]	1291	#if defined( __parallel )
[1]	1292	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
[622]	1293	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
[1]	1294	CALL MPI_ALLTOALL( f_in(1,1,nxl_y,1), sendrecvcount_xy, MPI_REAL, &
	1295	work(nys,1,0), sendrecvcount_xy, MPI_REAL, &
	1296	comm1dy, ierr )
	1297	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
[1111]	1298	#endif
[1]	1299
	1300	!
	1301	!-- Carry out the FFT along x, where all data are present due to the
	1302	!-- 1d-decomposition along y. Resort the data in a way that y becomes
	1303	!-- the first index.
[1106]	1304	CALL cpu_log( log_point_s(4), 'fft_x_1d', 'continue' )
[1]	1305
[2300]	1306	IF ( loop_optimization == 'vector' ) THEN
[1]	1307	!
	1308	!-- Code optimized for vector processors
[85]	1309	!$OMP PARALLEL PRIVATE ( i, j, k )
[1]	1310	!$OMP DO
	1311	DO j = nys, nyn
	1312
	1313	DO k = 1, nz
	1314	DO i = 0, nx
	1315	work_fftx(i,k,j) = work(j,k,i)
	1316	ENDDO
	1317	ENDDO
	1318
	1319	CALL fft_x_m( work_fftx(:,:,j), 'backward' )
	1320
	1321	ENDDO
	1322
	1323	!$OMP DO
	1324	DO i = 0, nx
	1325	DO j = nys, nyn
	1326	DO k = 1, nz
	1327	f_out(k,j,i) = work_fftx(i,k,j)
	1328	ENDDO
	1329	ENDDO
	1330	ENDDO
	1331	!$OMP END PARALLEL
	1332
	1333	ELSE
	1334
	1335	!
	1336	!-- Cache optimized code (there might be still a potential for better
	1337	!-- optimization).
[696]	1338	!$OMP PARALLEL PRIVATE (i,j,k)
[1]	1339	!$OMP DO
	1340	DO j = nys, nyn
	1341	DO k = 1, nz
	1342
	1343	DO i = 0, nx
	1344	work_fftx(i,k,j) = work(j,k,i)
	1345	ENDDO
	1346
[1106]	1347	CALL fft_x_1d( work_fftx(0:nx,k,j), 'backward' )
[1]	1348
	1349	ENDDO
	1350	ENDDO
	1351
	1352	!$OMP DO
	1353	DO i = 0, nx
	1354	DO j = nys, nyn
	1355	DO k = 1, nz
	1356	f_out(k,j,i) = work_fftx(i,k,j)
	1357	ENDDO
	1358	ENDDO
	1359	ENDDO
	1360	!$OMP END PARALLEL
	1361
	1362	ENDIF
[1106]	1363	CALL cpu_log( log_point_s(4), 'fft_x_1d', 'stop' )
[1]	1364
	1365	END SUBROUTINE tr_yx_fftx
	1366
	1367
	1368	!------------------------------------------------------------------------------!
[1682]	1369	! Description:
	1370	! ------------
	1371	!> FFT along y, solution of the tridiagonal system and backward FFT for
	1372	!> a 1d-decomposition along y.
	1373	!>
	1374	!> @warning this subroutine may still not work for hybrid parallelization
	1375	!> with OpenMP (for possible necessary changes see the original
	1376	!> routine poisfft_hybrid, developed by Klaus Ketelsen, May 2002)
[1]	1377	!------------------------------------------------------------------------------!
[1682]	1378	SUBROUTINE ffty_tri_ffty( ar )
[1]	1379
[1682]	1380
[1320]	1381	USE control_parameters, &
[2300]	1382	ONLY: loop_optimization
[1320]	1383
	1384	USE cpulog, &
	1385	ONLY: cpu_log, log_point_s
	1386
	1387	USE grid_variables, &
	1388	ONLY: ddx2, ddy2
	1389
	1390	USE kinds
	1391
[1]	1392	USE pegrid
	1393
	1394	IMPLICIT NONE
	1395
[1682]	1396	INTEGER(iwp) :: i !<
	1397	INTEGER(iwp) :: j !<
	1398	INTEGER(iwp) :: k !<
	1399	INTEGER(iwp) :: m !<
	1400	INTEGER(iwp) :: n !<
[3241]	1401	!$ INTEGER(iwp) :: omp_get_thread_num !<
[1682]	1402	INTEGER(iwp) :: tn !<
[1]	1403
[1682]	1404	REAL(wp), DIMENSION(0:ny) :: work_ffty !<
	1405	REAL(wp), DIMENSION(0:ny,1:nz) :: work_triy !<
	1406	REAL(wp), DIMENSION(nny,1:nz,nxl_y:nxr_y,pdims(2)) :: ar !<
	1407	REAL(wp), DIMENSION(:,:,:,:), ALLOCATABLE :: tri !<
[1]	1408
	1409
[1106]	1410	CALL cpu_log( log_point_s(39), 'fft_y_1d + tridia', 'start' )
[1]	1411
	1412	ALLOCATE( tri(5,0:ny,0:nz-1,0:threads_per_task-1) )
	1413
	1414	tn = 0 ! Default thread number in case of one thread
[696]	1415	!$OMP PARALLEL DO PRIVATE ( i, j, k, m, n, tn, work_ffty, work_triy )
[1]	1416	DO i = nxl_y, nxr_y
	1417
	1418	!$ tn = omp_get_thread_num()
	1419
[2300]	1420	IF ( loop_optimization == 'vector' ) THEN
[1]	1421	!
	1422	!-- Code optimized for vector processors
	1423	DO k = 1, nz
	1424
	1425	m = 0
	1426	DO n = 1, pdims(2)
[1003]	1427	DO j = 1, nny
[1]	1428	work_triy(m,k) = ar(j,k,i,n)
	1429	m = m + 1
	1430	ENDDO
	1431	ENDDO
	1432
	1433	ENDDO
	1434
	1435	CALL fft_y_m( work_triy, ny, 'forward' )
	1436
	1437	ELSE
	1438	!
	1439	!-- Cache optimized code
	1440	DO k = 1, nz
	1441
	1442	m = 0
	1443	DO n = 1, pdims(2)
[1003]	1444	DO j = 1, nny
[1]	1445	work_ffty(m) = ar(j,k,i,n)
	1446	m = m + 1
	1447	ENDDO
	1448	ENDDO
	1449
[1106]	1450	CALL fft_y_1d( work_ffty, 'forward' )
[1]	1451
	1452	DO j = 0, ny
	1453	work_triy(j,k) = work_ffty(j)
	1454	ENDDO
	1455
	1456	ENDDO
	1457
	1458	ENDIF
	1459
	1460	!
	1461	!-- Solve the linear equation system
	1462	CALL tridia_1dd( ddy2, ddx2, ny, nx, i, work_triy, tri(:,:,:,tn) )
	1463
[2300]	1464	IF ( loop_optimization == 'vector' ) THEN
[1]	1465	!
	1466	!-- Code optimized for vector processors
	1467	CALL fft_y_m( work_triy, ny, 'backward' )
	1468
	1469	DO k = 1, nz
	1470
	1471	m = 0
	1472	DO n = 1, pdims(2)
[1003]	1473	DO j = 1, nny
[1]	1474	ar(j,k,i,n) = work_triy(m,k)
	1475	m = m + 1
	1476	ENDDO
	1477	ENDDO
	1478
	1479	ENDDO
	1480
	1481	ELSE
	1482	!
	1483	!-- Cache optimized code
	1484	DO k = 1, nz
	1485
	1486	DO j = 0, ny
	1487	work_ffty(j) = work_triy(j,k)
	1488	ENDDO
	1489
[1106]	1490	CALL fft_y_1d( work_ffty, 'backward' )
[1]	1491
	1492	m = 0
	1493	DO n = 1, pdims(2)
[1003]	1494	DO j = 1, nny
[1]	1495	ar(j,k,i,n) = work_ffty(m)
	1496	m = m + 1
	1497	ENDDO
	1498	ENDDO
	1499
	1500	ENDDO
	1501
	1502	ENDIF
	1503
	1504	ENDDO
	1505
	1506	DEALLOCATE( tri )
	1507
[1106]	1508	CALL cpu_log( log_point_s(39), 'fft_y_1d + tridia', 'stop' )
[1]	1509
	1510	END SUBROUTINE ffty_tri_ffty
	1511
	1512	END MODULE poisfft_mod

Note: See TracBrowser for help on using the repository browser.

Download in other formats:

| Impressum | ©Leibniz Universität Hannover |