Home

Context Navigation

source: palm/trunk/SOURCE/poisfft_mod.f90 @ 3550

Last change on this file since 3550 was 3241, checked in by raasch, 6 years ago
various changes to avoid compiler warnings (mainly removal of unused variables)
Property svn:keywords set to `Id`
File size: 45.5 KB

Rev	Line
[1850]	1	!> @file poisfft_mod.f90
[2000]	2	!------------------------------------------------------------------------------!
[2696]	3	! This file is part of the PALM model system.
[1036]	4	!
[2000]	5	! PALM is free software: you can redistribute it and/or modify it under the
	6	! terms of the GNU General Public License as published by the Free Software
	7	! Foundation, either version 3 of the License, or (at your option) any later
	8	! version.
[1036]	9	!
	10	! PALM is distributed in the hope that it will be useful, but WITHOUT ANY
	11	! WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
	12	! A PARTICULAR PURPOSE. See the GNU General Public License for more details.
	13	!
	14	! You should have received a copy of the GNU General Public License along with
	15	! PALM. If not, see <http://www.gnu.org/licenses/>.
	16	!
[2718]	17	! Copyright 1997-2018 Leibniz Universitaet Hannover
[2000]	18	!------------------------------------------------------------------------------!
[1036]	19	!
[484]	20	! Current revisions:
[1]	21	! -----------------
[1683]	22	!
[2119]	23	!
[1321]	24	! Former revisions:
	25	! -----------------
	26	! $Id: poisfft_mod.f90 3241 2018-09-12 15:02:00Z gronemeier $
[3241]	27	! unused variables removed,
	28	! declarations of omp_get_thread_num now as omp-directive
	29	!
	30	! 2718 2018-01-02 08:49:38Z maronga
[2716]	31	! Corrected "Former revisions" section
	32	!
	33	! 2696 2017-12-14 17:12:51Z kanani
	34	! Change in file header (GPL part)
	35	!
	36	! 2300 2017-06-29 13:31:14Z raasch
[2300]	37	! settings depending on host variable removed or replaced by loop_optimization
	38	!
	39	! 2119 2017-01-17 16:51:50Z raasch
[1321]	40	!
[2119]	41	! 2118 2017-01-17 16:38:49Z raasch
	42	! OpenACC directives and related code removed
	43	!
[2001]	44	! 2000 2016-08-20 18:09:15Z knoop
	45	! Forced header and separation lines into 80 columns
	46	!
[1851]	47	! 1850 2016-04-08 13:29:27Z maronga
	48	! Module renamed
	49	!
	50	!
[1805]	51	! 1804 2016-04-05 16:30:18Z maronga
	52	! Removed code for parameter file check (__check)
	53	!
[1683]	54	! 1682 2015-10-07 23:56:08Z knoop
	55	! Code annotations made doxygen readable
	56	!
[1483]	57	! 1482 2014-10-18 12:34:45Z raasch
	58	! use 2d-decomposition, if accelerator boards are used
	59	!
[1407]	60	! 1406 2014-05-16 13:47:01Z raasch
	61	! bugfix for pgi 14.4: declare create moved after array declaration
	62	!
[1321]	63	! 1320 2014-03-20 08:40:49Z raasch
[1320]	64	! ONLY-attribute added to USE-statements,
	65	! kind-parameters added to all INTEGER and REAL declaration statements,
	66	! kinds are defined in new module kinds,
	67	! old module precision_kind is removed,
	68	! revision history before 2012 removed,
	69	! comment fields (!:) to be used for variable explanations added to
	70	! all variable declaration statements
[1217]	71	!
[1319]	72	! 1318 2014-03-17 13:35:16Z raasch
	73	! module interfaces removed
	74	!
[1307]	75	! 1306 2014-03-13 14:30:59Z raasch
	76	! openmp sections removed from the overlap branch,
	77	! second argument removed from parameter list
	78	!
[1217]	79	! 1216 2013-08-26 09:31:42Z raasch
[1216]	80	! resorting of arrays moved to separate routines resort_for_...,
	81	! one argument, used as temporary work array, removed from all transpose
	82	! routines
	83	! overlapping fft / transposition implemented
[1112]	84	!
[1213]	85	! 1212 2013-08-15 08:46:27Z raasch
	86	! tridia routines moved to seperate module tridia_solver
	87	!
[1209]	88	! 1208 2013-08-13 06:41:49Z raasch
	89	! acc-update clauses added for "ar" so that ffts other than cufft can also be
	90	! used (although they are not ported and will give a poor performance)
	91	!
[1112]	92	! 1111 2013-03-08 23:54:10Z raasch
[1111]	93	! further openACC porting of non-parallel (MPI) branch:
	94	! tridiagonal routines split into extermal subroutines (instead using CONTAINS),
	95	! no distinction between parallel/non-parallel in poisfft and tridia any more,
[1112]	96	! tridia routines moved to end of file because of probable bug in PGI compiler 12.5
[1111]	97	! (otherwise "invalid device function" is indicated during runtime),
	98	! optimization of tridia routines: constant elements and coefficients of tri are
	99	! stored in seperate arrays ddzuw and tric, last dimension of tri reduced from 5
	100	! to 2,
	101	! poisfft_init is now called internally from poisfft, maketri is called from
	102	! poisfft_init,
	103	! ibc_p_b = 2 removed
[1]	104	!
[1107]	105	! 1106 2013-03-04 05:31:38Z raasch
	106	! routines fftx, ffty, fftxp, fftyp removed, calls replaced by fft_x, fft_y,
	107	! in the 1D-decomposition routines fft_x, ffty are replaced by fft_x_1d,
	108	! fft_y_1d
	109	!
[1104]	110	! 1103 2013-02-20 02:15:53Z raasch
	111	! tri, ar, and ar1 arguments in tridia-routines (2d) are removed because they
	112	! sometimes cause segmentation faults with intel 12.1 compiler
	113	!
[1093]	114	! 1092 2013-02-02 11:24:22Z raasch
	115	! unused variables removed
	116	!
[1037]	117	! 1036 2012-10-22 13:43:42Z raasch
	118	! code put under GPL (PALM 3.9)
	119	!
[1014]	120	! 2012-09-21 07:03:55Z raasch
	121	! FLOAT type conversion replaced by REAL
	122	!
[1004]	123	! 1003 2012-09-14 14:35:53Z raasch
	124	! indices nxa, nya, etc. replaced by nx, ny, etc.
	125	!
[941]	126	! 940 2012-07-09 14:31:00Z raasch
	127	! special handling of tri-array as an argument in tridia_1dd routines switched
	128	! off because it caused segmentation faults with intel 12.1 compiler
	129	!
[878]	130	! 877 2012-04-03 11:21:44Z suehring
	131	! Bugfix: Avoid divisions by zero in case of using a 'neumann' bc for the
	132	! pressure at the top of the model domain.
	133	!
[810]	134	! 809 2012-01-30 13:32:58Z maronga
	135	! Bugfix: replaced .AND. and .NOT. with && and ! in the preprocessor directives
	136	!
[808]	137	! 807 2012-01-25 11:53:51Z maronga
	138	! New cpp directive "__check" implemented which is used by check_namelist_files
	139	! (most of the code is unneeded by check_namelist_files).
	140	!
[1]	141	! Revision 1.1 1997/07/24 11:24:14 raasch
	142	! Initial revision
	143	!
	144	!
	145	! Description:
	146	! ------------
[1682]	147	!> Solves the Poisson equation with a 2D spectral method
	148	!> d^2 p / dx^2 + d^2 p / dy^2 + d^2 p / dz^2 = s
	149	!>
	150	!> Input:
	151	!> real ar contains (nnz,nny,nnx) elements of the velocity divergence,
	152	!> starting from (1,nys,nxl)
	153	!>
	154	!> Output:
	155	!> real ar contains the solution for perturbation pressure p
[1]	156	!------------------------------------------------------------------------------!
[1682]	157	MODULE poisfft_mod
	158
[1]	159
[1320]	160	USE fft_xy, &
	161	ONLY: fft_init, fft_y, fft_y_1d, fft_y_m, fft_x, fft_x_1d, fft_x_m
[1]	162
[1320]	163	USE indices, &
	164	ONLY: nnx, nny, nx, nxl, nxr, ny, nys, nyn, nz
	165
	166	USE transpose_indices, &
	167	ONLY: nxl_y, nxl_z, nxr_y, nxr_z, nys_x, nys_z, nyn_x, nyn_z, nzb_x, &
	168	nzb_y, nzt_x, nzt_y
	169
	170	USE tridia_solver, &
	171	ONLY: tridia_1dd, tridia_init, tridia_substi, tridia_substi_overlap
	172
[1]	173	IMPLICIT NONE
	174
[1111]	175	LOGICAL, SAVE :: poisfft_initialized = .FALSE.
	176
[1]	177	PRIVATE
[807]	178
[1]	179	PUBLIC poisfft, poisfft_init
	180
	181	INTERFACE poisfft
	182	MODULE PROCEDURE poisfft
	183	END INTERFACE poisfft
	184
	185	INTERFACE poisfft_init
	186	MODULE PROCEDURE poisfft_init
	187	END INTERFACE poisfft_init
	188
[807]	189
[1]	190	CONTAINS
	191
[1682]	192	!------------------------------------------------------------------------------!
	193	! Description:
	194	! ------------
[3241]	195	!> Setup coefficients for FFT and the tridiagonal solver
[1682]	196	!------------------------------------------------------------------------------!
[1]	197	SUBROUTINE poisfft_init
	198
[1111]	199	IMPLICIT NONE
	200
	201
[1]	202	CALL fft_init
	203
[1212]	204	CALL tridia_init
[1111]	205
	206	poisfft_initialized = .TRUE.
	207
[1]	208	END SUBROUTINE poisfft_init
	209
[1111]	210
[1804]	211
[1682]	212	!------------------------------------------------------------------------------!
	213	! Description:
	214	! ------------
	215	!> Two-dimensional Fourier Transformation in x- and y-direction.
	216	!------------------------------------------------------------------------------!
[1306]	217	SUBROUTINE poisfft( ar )
[1]	218
[1320]	219	USE control_parameters, &
[3241]	220	ONLY: transpose_compute_overlap
[1320]	221
	222	USE cpulog, &
	223	ONLY: cpu_log, cpu_log_nowait, log_point_s
	224
	225	USE kinds
	226
[1]	227	USE pegrid
	228
	229	IMPLICIT NONE
	230
[1682]	231	INTEGER(iwp) :: ii !<
	232	INTEGER(iwp) :: iind !<
	233	INTEGER(iwp) :: inew !<
	234	INTEGER(iwp) :: jj !<
	235	INTEGER(iwp) :: jind !<
	236	INTEGER(iwp) :: jnew !<
	237	INTEGER(iwp) :: ki !<
	238	INTEGER(iwp) :: kk !<
	239	INTEGER(iwp) :: knew !<
	240	INTEGER(iwp) :: n !<
	241	INTEGER(iwp) :: nblk !<
	242	INTEGER(iwp) :: nnx_y !<
	243	INTEGER(iwp) :: nny_z !<
	244	INTEGER(iwp) :: nnz_x !<
	245	INTEGER(iwp) :: nxl_y_bound !<
	246	INTEGER(iwp) :: nxr_y_bound !<
[1]	247
[1682]	248	INTEGER(iwp), DIMENSION(4) :: isave !<
[1320]	249
[1682]	250	REAL(wp), DIMENSION(1:nz,nys:nyn,nxl:nxr) :: ar !<
	251	REAL(wp), DIMENSION(nys:nyn,nxl:nxr,1:nz) :: ar_inv !<
[1]	252
[1682]	253	REAL(wp), DIMENSION(:,:,:), ALLOCATABLE :: ar1 !<
	254	REAL(wp), DIMENSION(:,:,:), ALLOCATABLE :: f_in !<
	255	REAL(wp), DIMENSION(:,:,:), ALLOCATABLE :: f_inv !<
	256	REAL(wp), DIMENSION(:,:,:), ALLOCATABLE :: f_out_y !<
	257	REAL(wp), DIMENSION(:,:,:), ALLOCATABLE :: f_out_z !<
[1216]	258
	259
[1]	260	CALL cpu_log( log_point_s(3), 'poisfft', 'start' )
	261
[1111]	262	IF ( .NOT. poisfft_initialized ) CALL poisfft_init
	263
[1]	264	!
	265	!-- Two-dimensional Fourier Transformation in x- and y-direction.
[2118]	266	IF ( pdims(2) == 1 .AND. pdims(1) > 1 ) THEN
[1]	267
	268	!
	269	!-- 1d-domain-decomposition along x:
	270	!-- FFT along y and transposition y --> x
[1216]	271	CALL ffty_tr_yx( ar, ar )
[1]	272
	273	!
	274	!-- FFT along x, solving the tridiagonal system and backward FFT
	275	CALL fftx_tri_fftx( ar )
	276
	277	!
	278	!-- Transposition x --> y and backward FFT along y
[1216]	279	CALL tr_xy_ffty( ar, ar )
[1]	280
[2118]	281	ELSEIF ( pdims(1) == 1 .AND. pdims(2) > 1 ) THEN
[1]	282
	283	!
	284	!-- 1d-domain-decomposition along y:
	285	!-- FFT along x and transposition x --> y
[1216]	286	CALL fftx_tr_xy( ar, ar )
[1]	287
	288	!
	289	!-- FFT along y, solving the tridiagonal system and backward FFT
	290	CALL ffty_tri_ffty( ar )
	291
	292	!
	293	!-- Transposition y --> x and backward FFT along x
[1216]	294	CALL tr_yx_fftx( ar, ar )
[1]	295
[1216]	296	ELSEIF ( .NOT. transpose_compute_overlap ) THEN
[1]	297
	298	!
[1111]	299	!-- 2d-domain-decomposition or no decomposition (1 PE run)
[1]	300	!-- Transposition z --> x
	301	CALL cpu_log( log_point_s(5), 'transpo forward', 'start' )
[1216]	302	CALL resort_for_zx( ar, ar_inv )
	303	CALL transpose_zx( ar_inv, ar )
[1]	304	CALL cpu_log( log_point_s(5), 'transpo forward', 'pause' )
	305
	306	CALL cpu_log( log_point_s(4), 'fft_x', 'start' )
[1106]	307	CALL fft_x( ar, 'forward' )
[1]	308	CALL cpu_log( log_point_s(4), 'fft_x', 'pause' )
	309
	310	!
	311	!-- Transposition x --> y
	312	CALL cpu_log( log_point_s(5), 'transpo forward', 'continue' )
[1216]	313	CALL resort_for_xy( ar, ar_inv )
	314	CALL transpose_xy( ar_inv, ar )
[1]	315	CALL cpu_log( log_point_s(5), 'transpo forward', 'pause' )
	316
	317	CALL cpu_log( log_point_s(7), 'fft_y', 'start' )
[1216]	318	CALL fft_y( ar, 'forward', ar_tr = ar, &
	319	nxl_y_bound = nxl_y, nxr_y_bound = nxr_y, &
	320	nxl_y_l = nxl_y, nxr_y_l = nxr_y )
[1]	321	CALL cpu_log( log_point_s(7), 'fft_y', 'pause' )
	322
	323	!
	324	!-- Transposition y --> z
	325	CALL cpu_log( log_point_s(5), 'transpo forward', 'continue' )
[1216]	326	CALL resort_for_yz( ar, ar_inv )
	327	CALL transpose_yz( ar_inv, ar )
[1]	328	CALL cpu_log( log_point_s(5), 'transpo forward', 'stop' )
	329
	330	!
[1106]	331	!-- Solve the tridiagonal equation system along z
[1]	332	CALL cpu_log( log_point_s(6), 'tridia', 'start' )
[1212]	333	CALL tridia_substi( ar )
[1]	334	CALL cpu_log( log_point_s(6), 'tridia', 'stop' )
	335
	336	!
	337	!-- Inverse Fourier Transformation
	338	!-- Transposition z --> y
	339	CALL cpu_log( log_point_s(8), 'transpo invers', 'start' )
[1216]	340	CALL transpose_zy( ar, ar_inv )
	341	CALL resort_for_zy( ar_inv, ar )
[1]	342	CALL cpu_log( log_point_s(8), 'transpo invers', 'pause' )
	343
	344	CALL cpu_log( log_point_s(7), 'fft_y', 'continue' )
[1216]	345	CALL fft_y( ar, 'backward', ar_tr = ar, &
	346	nxl_y_bound = nxl_y, nxr_y_bound = nxr_y, &
	347	nxl_y_l = nxl_y, nxr_y_l = nxr_y )
[1]	348	CALL cpu_log( log_point_s(7), 'fft_y', 'stop' )
	349
	350	!
	351	!-- Transposition y --> x
	352	CALL cpu_log( log_point_s(8), 'transpo invers', 'continue' )
[1216]	353	CALL transpose_yx( ar, ar_inv )
	354	CALL resort_for_yx( ar_inv, ar )
[1]	355	CALL cpu_log( log_point_s(8), 'transpo invers', 'pause' )
	356
	357	CALL cpu_log( log_point_s(4), 'fft_x', 'continue' )
[1106]	358	CALL fft_x( ar, 'backward' )
[1]	359	CALL cpu_log( log_point_s(4), 'fft_x', 'stop' )
	360
	361	!
	362	!-- Transposition x --> z
	363	CALL cpu_log( log_point_s(8), 'transpo invers', 'continue' )
[1216]	364	CALL transpose_xz( ar, ar_inv )
	365	CALL resort_for_xz( ar_inv, ar )
[1]	366	CALL cpu_log( log_point_s(8), 'transpo invers', 'stop' )
	367
[1216]	368	ELSE
	369
	370	!
	371	!-- 2d-domain-decomposition or no decomposition (1 PE run) with
	372	!-- overlapping transposition / fft
[1318]	373	!-- cputime logging must not use barriers, which would prevent overlapping
[1216]	374	ALLOCATE( f_out_y(0:ny,nxl_y:nxr_y,nzb_y:nzt_y), &
	375	f_out_z(0:nx,nys_x:nyn_x,nzb_x:nzt_x) )
	376	!
	377	!-- Transposition z --> x + subsequent fft along x
	378	ALLOCATE( f_inv(nys:nyn,nxl:nxr,1:nz) )
	379	CALL resort_for_zx( ar, f_inv )
	380	!
	381	!-- Save original indices and gridpoint counter
	382	isave(1) = nz
	383	isave(2) = nzb_x
	384	isave(3) = nzt_x
	385	isave(4) = sendrecvcount_zx
	386	!
	387	!-- Set new indices for transformation
	388	nblk = nz / pdims(1)
	389	nz = pdims(1)
	390	nnz_x = 1
	391	nzb_x = 1 + myidx * nnz_x
	392	nzt_x = ( myidx + 1 ) * nnz_x
	393	sendrecvcount_zx = nnx * nny * nnz_x
	394
[1306]	395	ALLOCATE( ar1(0:nx,nys_x:nyn_x,nzb_x:nzt_x) )
[1216]	396	ALLOCATE( f_in(nys:nyn,nxl:nxr,1:nz) )
	397
[1306]	398	DO kk = 1, nblk
[1216]	399
[1306]	400	IF ( kk == 1 ) THEN
[1318]	401	CALL cpu_log( log_point_s(5), 'transpo forward', 'start', cpu_log_nowait )
[1306]	402	ELSE
[1318]	403	CALL cpu_log( log_point_s(5), 'transpo forward', 'continue', cpu_log_nowait )
[1306]	404	ENDIF
[1216]	405
[1306]	406	DO knew = 1, nz
	407	ki = kk + nblk * ( knew - 1 )
	408	f_in(:,:,knew) = f_inv(:,:,ki)
	409	ENDDO
[1216]	410
[1306]	411	CALL transpose_zx( f_in, ar1(:,:,:))
	412	CALL cpu_log( log_point_s(5), 'transpo forward', 'pause' )
[1216]	413
[1306]	414	IF ( kk == 1 ) THEN
[1318]	415	CALL cpu_log( log_point_s(4), 'fft_x', 'start', cpu_log_nowait )
[1306]	416	ELSE
[1318]	417	CALL cpu_log( log_point_s(4), 'fft_x', 'continue', cpu_log_nowait )
[1216]	418	ENDIF
	419
[1306]	420	n = isave(2) + kk - 1
	421	CALL fft_x( ar1(:,:,:), 'forward', ar_2d = f_out_z(:,:,n))
	422	CALL cpu_log( log_point_s(4), 'fft_x', 'pause' )
[1216]	423
	424	ENDDO
	425	!
	426	!-- Restore original indices/counters
	427	nz = isave(1)
	428	nzb_x = isave(2)
	429	nzt_x = isave(3)
	430	sendrecvcount_zx = isave(4)
	431
	432	DEALLOCATE( ar1, f_in, f_inv )
	433
	434	!
	435	!-- Transposition x --> y + subsequent fft along y
	436	ALLOCATE( f_inv(nys_x:nyn_x,nzb_x:nzt_x,0:nx) )
	437	CALL resort_for_xy( f_out_z, f_inv )
	438	!
	439	!-- Save original indices and gridpoint counter
	440	isave(1) = nx
	441	isave(2) = nxl_y
	442	isave(3) = nxr_y
	443	isave(4) = sendrecvcount_xy
	444	!
	445	!-- Set new indices for transformation
	446	nblk = ( ( nx+1 ) / pdims(2) ) - 1
	447	nx = pdims(2)
	448	nnx_y = 1
	449	nxl_y = myidy * nnx_y
	450	nxr_y = ( myidy + 1 ) * nnx_y - 1
	451	sendrecvcount_xy = nnx_y * ( nyn_x-nys_x+1 ) * ( nzt_x-nzb_x+1 )
	452
[1306]	453	ALLOCATE( ar1(0:ny,nxl_y:nxr_y,nzb_y:nzt_y) )
[1216]	454	ALLOCATE( f_in(nys_x:nyn_x,nzb_x:nzt_x,0:nx) )
	455
[1306]	456	DO ii = 0, nblk
[1216]	457
[1318]	458	CALL cpu_log( log_point_s(5), 'transpo forward', 'continue', cpu_log_nowait )
[1216]	459
[1306]	460	DO inew = 0, nx-1
	461	iind = ii + ( nblk + 1 ) * inew
	462	f_in(:,:,inew) = f_inv(:,:,iind)
	463	ENDDO
[1216]	464
[1306]	465	CALL transpose_xy( f_in, ar1(:,:,:) )
[1216]	466
[1306]	467	CALL cpu_log( log_point_s(5), 'transpo forward', 'pause' )
[1216]	468
[1306]	469	IF ( ii == 1 ) THEN
[1318]	470	CALL cpu_log( log_point_s(7), 'fft_y', 'start', cpu_log_nowait )
[1306]	471	ELSE
[1318]	472	CALL cpu_log( log_point_s(7), 'fft_y', 'continue', cpu_log_nowait )
[1216]	473	ENDIF
	474
[1306]	475	nxl_y_bound = isave(2)
	476	nxr_y_bound = isave(3)
	477	n = isave(2) + ii
	478	CALL fft_y( ar1(:,:,:), 'forward', ar_tr = f_out_y, &
	479	nxl_y_bound = nxl_y_bound, nxr_y_bound = nxr_y_bound, &
	480	nxl_y_l = n, nxr_y_l = n )
[1216]	481
[1306]	482	CALL cpu_log( log_point_s(7), 'fft_y', 'pause' )
[1216]	483
	484	ENDDO
	485	!
	486	!-- Restore original indices/counters
	487	nx = isave(1)
	488	nxl_y = isave(2)
	489	nxr_y = isave(3)
	490	sendrecvcount_xy = isave(4)
	491
	492	DEALLOCATE( ar1, f_in, f_inv )
	493
	494	!
	495	!-- Transposition y --> z + subsequent tridia + resort for z --> y
	496	ALLOCATE( f_inv(nxl_y:nxr_y,nzb_y:nzt_y,0:ny) )
	497	CALL resort_for_yz( f_out_y, f_inv )
	498	!
	499	!-- Save original indices and gridpoint counter
	500	isave(1) = ny
	501	isave(2) = nys_z
	502	isave(3) = nyn_z
	503	isave(4) = sendrecvcount_yz
	504	!
	505	!-- Set new indices for transformation
	506	nblk = ( ( ny+1 ) / pdims(1) ) - 1
	507	ny = pdims(1)
	508	nny_z = 1
	509	nys_z = myidx * nny_z
	510	nyn_z = ( myidx + 1 ) * nny_z - 1
	511	sendrecvcount_yz = ( nxr_y-nxl_y+1 ) * nny_z * ( nzt_y-nzb_y+1 )
	512
[1306]	513	ALLOCATE( ar1(nxl_z:nxr_z,nys_z:nyn_z,1:nz) )
[1216]	514	ALLOCATE( f_in(nxl_y:nxr_y,nzb_y:nzt_y,0:ny) )
	515
[1306]	516	DO jj = 0, nblk
[1216]	517	!
[1306]	518	!-- Forward Fourier Transformation
	519	!-- Transposition y --> z
[1318]	520	CALL cpu_log( log_point_s(5), 'transpo forward', 'continue', cpu_log_nowait )
[1216]	521
[1306]	522	DO jnew = 0, ny-1
	523	jind = jj + ( nblk + 1 ) * jnew
	524	f_in(:,:,jnew) = f_inv(:,:,jind)
	525	ENDDO
[1216]	526
[1306]	527	CALL transpose_yz( f_in, ar1(:,:,:) )
[1216]	528
[1306]	529	IF ( jj == nblk ) THEN
	530	CALL cpu_log( log_point_s(5), 'transpo forward', 'stop' )
	531	ELSE
	532	CALL cpu_log( log_point_s(5), 'transpo forward', 'pause' )
[1216]	533	ENDIF
	534
	535	!
[1306]	536	!-- Solve the tridiagonal equation system along z
[1318]	537	CALL cpu_log( log_point_s(6), 'tridia', 'start', cpu_log_nowait )
[1216]	538
[1306]	539	n = isave(2) + jj
	540	CALL tridia_substi_overlap( ar1(:,:,:), n )
[1216]	541
[1306]	542	CALL cpu_log( log_point_s(6), 'tridia', 'stop' )
[1216]	543
[1306]	544	!
	545	!-- Inverse Fourier Transformation
	546	!-- Transposition z --> y
	547	!-- Only one thread should call MPI routines, therefore forward and
	548	!-- backward tranpose are in the same section
	549	IF ( jj == 0 ) THEN
[1318]	550	CALL cpu_log( log_point_s(8), 'transpo invers', 'start', cpu_log_nowait )
[1306]	551	ELSE
[1318]	552	CALL cpu_log( log_point_s(8), 'transpo invers', 'continue', cpu_log_nowait )
[1216]	553	ENDIF
	554
[1306]	555	CALL transpose_zy( ar1(:,:,:), f_in )
[1216]	556
[1306]	557	DO jnew = 0, ny-1
	558	jind = jj + ( nblk + 1 ) * jnew
	559	f_inv(:,:,jind) = f_in(:,:,jnew)
	560	ENDDO
[1216]	561
[1306]	562	CALL cpu_log( log_point_s(8), 'transpo invers', 'pause' )
[1216]	563
	564	ENDDO
	565	!
	566	!-- Restore original indices/counters
	567	ny = isave(1)
	568	nys_z = isave(2)
	569	nyn_z = isave(3)
	570	sendrecvcount_yz = isave(4)
	571
	572	CALL resort_for_zy( f_inv, f_out_y )
	573
	574	DEALLOCATE( ar1, f_in, f_inv )
	575
	576	!
	577	!-- fft along y backward + subsequent transposition y --> x
	578	ALLOCATE( f_inv(nys_x:nyn_x,nzb_x:nzt_x,0:nx) )
	579	!
	580	!-- Save original indices and gridpoint counter
	581	isave(1) = nx
	582	isave(2) = nxl_y
	583	isave(3) = nxr_y
	584	isave(4) = sendrecvcount_xy
	585	!
	586	!-- Set new indices for transformation
	587	nblk = (( nx+1 ) / pdims(2) ) - 1
	588	nx = pdims(2)
	589	nnx_y = 1
	590	nxl_y = myidy * nnx_y
	591	nxr_y = ( myidy + 1 ) * nnx_y - 1
	592	sendrecvcount_xy = nnx_y * ( nyn_x-nys_x+1 ) * ( nzt_x-nzb_x+1 )
	593
[1306]	594	ALLOCATE( ar1(0:ny,nxl_y:nxr_y,nzb_y:nzt_y) )
[1216]	595	ALLOCATE( f_in(nys_x:nyn_x,nzb_x:nzt_x,0:nx) )
	596
[1306]	597	DO ii = 0, nblk
[1216]	598
[1318]	599	CALL cpu_log( log_point_s(7), 'fft_y', 'continue', cpu_log_nowait )
[1216]	600
[1306]	601	n = isave(2) + ii
	602	nxl_y_bound = isave(2)
	603	nxr_y_bound = isave(3)
[1216]	604
[1306]	605	CALL fft_y( ar1(:,:,:), 'backward', ar_tr = f_out_y, &
	606	nxl_y_bound = nxl_y_bound, nxr_y_bound = nxr_y_bound, &
	607	nxl_y_l = n, nxr_y_l = n )
[1216]	608
[1306]	609	IF ( ii == nblk ) THEN
	610	CALL cpu_log( log_point_s(7), 'fft_y', 'stop' )
	611	ELSE
	612	CALL cpu_log( log_point_s(7), 'fft_y', 'pause' )
[1216]	613	ENDIF
	614
[1318]	615	CALL cpu_log( log_point_s(8), 'transpo invers', 'continue', cpu_log_nowait )
[1216]	616
[1306]	617	CALL transpose_yx( ar1(:,:,:), f_in )
[1216]	618
[1306]	619	DO inew = 0, nx-1
	620	iind = ii + (nblk+1) * inew
	621	f_inv(:,:,iind) = f_in(:,:,inew)
	622	ENDDO
[1216]	623
[1306]	624	CALL cpu_log( log_point_s(8), 'transpo invers', 'pause' )
[1216]	625
	626	ENDDO
	627	!
	628	!-- Restore original indices/counters
	629	nx = isave(1)
	630	nxl_y = isave(2)
	631	nxr_y = isave(3)
	632	sendrecvcount_xy = isave(4)
	633
	634	CALL resort_for_yx( f_inv, f_out_z )
	635
	636	DEALLOCATE( ar1, f_in, f_inv )
	637
	638	!
	639	!-- fft along x backward + subsequent final transposition x --> z
	640	ALLOCATE( f_inv(nys:nyn,nxl:nxr,1:nz) )
	641	!
	642	!-- Save original indices and gridpoint counter
	643	isave(1) = nz
	644	isave(2) = nzb_x
	645	isave(3) = nzt_x
	646	isave(4) = sendrecvcount_zx
	647	!
	648	!-- Set new indices for transformation
	649	nblk = nz / pdims(1)
	650	nz = pdims(1)
	651	nnz_x = 1
	652	nzb_x = 1 + myidx * nnz_x
	653	nzt_x = ( myidx + 1 ) * nnz_x
	654	sendrecvcount_zx = nnx * nny * nnz_x
	655
[1306]	656	ALLOCATE( ar1(0:nx,nys_x:nyn_x,nzb_x:nzt_x) )
[1216]	657	ALLOCATE( f_in(nys:nyn,nxl:nxr,1:nz) )
	658
[1306]	659	DO kk = 1, nblk
[1216]	660
[1318]	661	CALL cpu_log( log_point_s(4), 'fft_x', 'continue', cpu_log_nowait )
[1216]	662
[1306]	663	n = isave(2) + kk - 1
	664	CALL fft_x( ar1(:,:,:), 'backward', f_out_z(:,:,n))
[1216]	665
[1306]	666	IF ( kk == nblk ) THEN
	667	CALL cpu_log( log_point_s(4), 'fft_x', 'stop' )
	668	ELSE
	669	CALL cpu_log( log_point_s(4), 'fft_x', 'pause' )
[1216]	670	ENDIF
	671
[1318]	672	CALL cpu_log( log_point_s(8), 'transpo invers', 'continue', cpu_log_nowait )
[1216]	673
[1306]	674	CALL transpose_xz( ar1(:,:,:), f_in )
[1216]	675
[1306]	676	DO knew = 1, nz
	677	ki = kk + nblk * (knew-1)
	678	f_inv(:,:,ki) = f_in(:,:,knew)
	679	ENDDO
[1216]	680
[1306]	681	IF ( kk == nblk ) THEN
	682	CALL cpu_log( log_point_s(8), 'transpo invers', 'stop' )
	683	ELSE
	684	CALL cpu_log( log_point_s(8), 'transpo invers', 'pause' )
[1216]	685	ENDIF
	686
	687	ENDDO
	688	!
	689	!-- Restore original indices/counters
	690	nz = isave(1)
	691	nzb_x = isave(2)
	692	nzt_x = isave(3)
	693	sendrecvcount_zx = isave(4)
	694
	695	CALL resort_for_xz( f_inv, ar )
	696
	697	DEALLOCATE( ar1, f_in, f_inv )
	698
[1]	699	ENDIF
	700
	701	CALL cpu_log( log_point_s(3), 'poisfft', 'stop' )
	702
	703	END SUBROUTINE poisfft
	704
	705
	706	!------------------------------------------------------------------------------!
[1682]	707	! Description:
	708	! ------------
	709	!> Fourier-transformation along y with subsequent transposition y --> x for
	710	!> a 1d-decomposition along x.
	711	!>
	712	!> @attention The performance of this routine is much faster on the NEC-SX6,
	713	!> if the first index of work_ffty_vec is odd. Otherwise
	714	!> memory bank conflicts may occur (especially if the index is a
	715	!> multiple of 128). That's why work_ffty_vec is dimensioned as
	716	!> 0:ny+1.
	717	!> Of course, this will not work if users are using an odd number
	718	!> of gridpoints along y.
[1]	719	!------------------------------------------------------------------------------!
[1682]	720	SUBROUTINE ffty_tr_yx( f_in, f_out )
[1]	721
[1320]	722	USE control_parameters, &
[2300]	723	ONLY: loop_optimization
[1320]	724
	725	USE cpulog, &
	726	ONLY: cpu_log, log_point_s
	727
	728	USE kinds
	729
[1]	730	USE pegrid
	731
	732	IMPLICIT NONE
	733
[1682]	734	INTEGER(iwp) :: i !<
	735	INTEGER(iwp) :: iend !<
	736	INTEGER(iwp) :: iouter !<
	737	INTEGER(iwp) :: ir !<
	738	INTEGER(iwp) :: j !<
	739	INTEGER(iwp) :: k !<
[1]	740
[1682]	741	INTEGER(iwp), PARAMETER :: stridex = 4 !<
[1320]	742
[1682]	743	REAL(wp), DIMENSION(1:nz,0:ny,nxl:nxr) :: f_in !<
	744	REAL(wp), DIMENSION(nnx,1:nz,nys_x:nyn_x,pdims(1)) :: f_out !<
	745	REAL(wp), DIMENSION(nxl:nxr,1:nz,0:ny) :: work !<
[1]	746
[2300]	747	REAL(wp), DIMENSION(:,:), ALLOCATABLE :: work_ffty !<
	748	REAL(wp), DIMENSION(:,:,:), ALLOCATABLE :: work_ffty_vec !<
	749
[1]	750	!
	751	!-- Carry out the FFT along y, where all data are present due to the
	752	!-- 1d-decomposition along x. Resort the data in a way that x becomes
	753	!-- the first index.
[1106]	754	CALL cpu_log( log_point_s(7), 'fft_y_1d', 'start' )
[1]	755
[2300]	756	IF ( loop_optimization == 'vector' ) THEN
	757
	758	ALLOCATE( work_ffty_vec(0:ny+1,1:nz,nxl:nxr) )
[1]	759	!
	760	!-- Code optimized for vector processors
[2300]	761	!$OMP PARALLEL PRIVATE ( i, j, k )
	762	!$OMP DO
[1]	763	DO i = nxl, nxr
	764
	765	DO j = 0, ny
	766	DO k = 1, nz
	767	work_ffty_vec(j,k,i) = f_in(k,j,i)
	768	ENDDO
	769	ENDDO
	770
	771	CALL fft_y_m( work_ffty_vec(:,:,i), ny+1, 'forward' )
	772
	773	ENDDO
	774
[2300]	775	!$OMP DO
[1]	776	DO k = 1, nz
	777	DO j = 0, ny
	778	DO i = nxl, nxr
	779	work(i,k,j) = work_ffty_vec(j,k,i)
	780	ENDDO
	781	ENDDO
	782	ENDDO
[2300]	783	!$OMP END PARALLEL
[1]	784
[2300]	785	DEALLOCATE( work_ffty_vec )
	786
[1]	787	ELSE
	788	!
	789	!-- Cache optimized code.
[2300]	790	ALLOCATE( work_ffty(0:ny,stridex) )
	791	!
[1]	792	!-- The i-(x-)direction is split into a strided outer loop and an inner
	793	!-- loop for better cache performance
[2300]	794	!$OMP PARALLEL PRIVATE (i,iend,iouter,ir,j,k,work_ffty)
	795	!$OMP DO
[1]	796	DO iouter = nxl, nxr, stridex
	797
	798	iend = MIN( iouter+stridex-1, nxr ) ! Upper bound for inner i loop
	799
	800	DO k = 1, nz
	801
	802	DO i = iouter, iend
	803
	804	ir = i-iouter+1 ! counter within a stride
	805	DO j = 0, ny
	806	work_ffty(j,ir) = f_in(k,j,i)
	807	ENDDO
	808	!
	809	!-- FFT along y
[1106]	810	CALL fft_y_1d( work_ffty(:,ir), 'forward' )
[1]	811
	812	ENDDO
	813
	814	!
	815	!-- Resort
	816	DO j = 0, ny
	817	DO i = iouter, iend
	818	work(i,k,j) = work_ffty(j,i-iouter+1)
	819	ENDDO
	820	ENDDO
	821
	822	ENDDO
	823
	824	ENDDO
[2300]	825	!$OMP END PARALLEL
[1]	826
[2300]	827	DEALLOCATE( work_ffty )
	828
[1]	829	ENDIF
[2300]	830
[1106]	831	CALL cpu_log( log_point_s(7), 'fft_y_1d', 'pause' )
[1]	832
	833	!
	834	!-- Transpose array
[1111]	835	#if defined( __parallel )
[1]	836	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
[622]	837	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
[1]	838	CALL MPI_ALLTOALL( work(nxl,1,0), sendrecvcount_xy, MPI_REAL, &
	839	f_out(1,1,nys_x,1), sendrecvcount_xy, MPI_REAL, &
	840	comm1dx, ierr )
	841	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
[1111]	842	#endif
[1]	843
	844	END SUBROUTINE ffty_tr_yx
	845
	846
	847	!------------------------------------------------------------------------------!
[1682]	848	! Description:
	849	! ------------
	850	!> Transposition x --> y with a subsequent backward Fourier transformation for
	851	!> a 1d-decomposition along x
[1]	852	!------------------------------------------------------------------------------!
[1682]	853	SUBROUTINE tr_xy_ffty( f_in, f_out )
[1]	854
[1320]	855	USE control_parameters, &
[2300]	856	ONLY: loop_optimization
[1320]	857
	858	USE cpulog, &
	859	ONLY: cpu_log, log_point_s
	860
	861	USE kinds
	862
[1]	863	USE pegrid
	864
	865	IMPLICIT NONE
	866
[1682]	867	INTEGER(iwp) :: i !<
	868	INTEGER(iwp) :: iend !<
	869	INTEGER(iwp) :: iouter !<
	870	INTEGER(iwp) :: ir !<
	871	INTEGER(iwp) :: j !<
	872	INTEGER(iwp) :: k !<
[1]	873
[1682]	874	INTEGER(iwp), PARAMETER :: stridex = 4 !<
[1320]	875
[1682]	876	REAL(wp), DIMENSION(nnx,1:nz,nys_x:nyn_x,pdims(1)) :: f_in !<
	877	REAL(wp), DIMENSION(1:nz,0:ny,nxl:nxr) :: f_out !<
	878	REAL(wp), DIMENSION(nxl:nxr,1:nz,0:ny) :: work !<
[1]	879
[2300]	880	REAL(wp), DIMENSION(:,:), ALLOCATABLE :: work_ffty !<
	881	REAL(wp), DIMENSION(:,:,:), ALLOCATABLE :: work_ffty_vec !<
	882
[1]	883	!
	884	!-- Transpose array
[1111]	885	#if defined( __parallel )
[1]	886	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
[622]	887	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
[1]	888	CALL MPI_ALLTOALL( f_in(1,1,nys_x,1), sendrecvcount_xy, MPI_REAL, &
	889	work(nxl,1,0), sendrecvcount_xy, MPI_REAL, &
	890	comm1dx, ierr )
	891	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
[1111]	892	#endif
[1]	893
	894	!
	895	!-- Resort the data in a way that y becomes the first index and carry out the
	896	!-- backward fft along y.
[1106]	897	CALL cpu_log( log_point_s(7), 'fft_y_1d', 'continue' )
[1]	898
[2300]	899	IF ( loop_optimization == 'vector' ) THEN
	900
	901	ALLOCATE( work_ffty_vec(0:ny+1,1:nz,nxl:nxr) )
[1]	902	!
	903	!-- Code optimized for vector processors
[2300]	904	!$OMP PARALLEL PRIVATE ( i, j, k )
	905	!$OMP DO
[1]	906	DO k = 1, nz
	907	DO j = 0, ny
	908	DO i = nxl, nxr
	909	work_ffty_vec(j,k,i) = work(i,k,j)
	910	ENDDO
	911	ENDDO
	912	ENDDO
	913
[2300]	914	!$OMP DO
[1]	915	DO i = nxl, nxr
	916
	917	CALL fft_y_m( work_ffty_vec(:,:,i), ny+1, 'backward' )
	918
	919	DO j = 0, ny
	920	DO k = 1, nz
	921	f_out(k,j,i) = work_ffty_vec(j,k,i)
	922	ENDDO
	923	ENDDO
	924
	925	ENDDO
[2300]	926	!$OMP END PARALLEL
[1]	927
[2300]	928	DEALLOCATE( work_ffty_vec )
	929
[1]	930	ELSE
	931	!
	932	!-- Cache optimized code.
[2300]	933	ALLOCATE( work_ffty(0:ny,stridex) )
	934	!
[1]	935	!-- The i-(x-)direction is split into a strided outer loop and an inner
	936	!-- loop for better cache performance
[2300]	937	!$OMP PARALLEL PRIVATE ( i, iend, iouter, ir, j, k, work_ffty )
	938	!$OMP DO
[1]	939	DO iouter = nxl, nxr, stridex
	940
	941	iend = MIN( iouter+stridex-1, nxr ) ! Upper bound for inner i loop
	942
	943	DO k = 1, nz
	944	!
	945	!-- Resort
	946	DO j = 0, ny
	947	DO i = iouter, iend
	948	work_ffty(j,i-iouter+1) = work(i,k,j)
	949	ENDDO
	950	ENDDO
	951
	952	DO i = iouter, iend
	953
	954	!
	955	!-- FFT along y
	956	ir = i-iouter+1 ! counter within a stride
[1106]	957	CALL fft_y_1d( work_ffty(:,ir), 'backward' )
[1]	958
	959	DO j = 0, ny
	960	f_out(k,j,i) = work_ffty(j,ir)
	961	ENDDO
	962	ENDDO
	963
	964	ENDDO
	965
	966	ENDDO
[2300]	967	!$OMP END PARALLEL
[1]	968
[2300]	969	DEALLOCATE( work_ffty )
	970
[1]	971	ENDIF
	972
[1106]	973	CALL cpu_log( log_point_s(7), 'fft_y_1d', 'stop' )
[1]	974
	975	END SUBROUTINE tr_xy_ffty
	976
	977
	978	!------------------------------------------------------------------------------!
[1682]	979	! Description:
	980	! ------------
	981	!> FFT along x, solution of the tridiagonal system and backward FFT for
	982	!> a 1d-decomposition along x
	983	!>
	984	!> @warning this subroutine may still not work for hybrid parallelization
	985	!> with OpenMP (for possible necessary changes see the original
	986	!> routine poisfft_hybrid, developed by Klaus Ketelsen, May 2002)
[1]	987	!------------------------------------------------------------------------------!
[1682]	988	SUBROUTINE fftx_tri_fftx( ar )
[1]	989
[1320]	990	USE control_parameters, &
[2300]	991	ONLY: loop_optimization
[1320]	992
	993	USE cpulog, &
	994	ONLY: cpu_log, log_point_s
	995
	996	USE grid_variables, &
	997	ONLY: ddx2, ddy2
	998
	999	USE kinds
	1000
[1]	1001	USE pegrid
	1002
	1003	IMPLICIT NONE
	1004
[1682]	1005	INTEGER(iwp) :: i !<
	1006	INTEGER(iwp) :: j !<
	1007	INTEGER(iwp) :: k !<
	1008	INTEGER(iwp) :: m !<
	1009	INTEGER(iwp) :: n !<
[3241]	1010	!$ INTEGER(iwp) :: omp_get_thread_num !<
[1682]	1011	INTEGER(iwp) :: tn !<
[1]	1012
[1682]	1013	REAL(wp), DIMENSION(0:nx) :: work_fftx !<
	1014	REAL(wp), DIMENSION(0:nx,1:nz) :: work_trix !<
	1015	REAL(wp), DIMENSION(nnx,1:nz,nys_x:nyn_x,pdims(1)) :: ar !<
	1016	REAL(wp), DIMENSION(:,:,:,:), ALLOCATABLE :: tri !<
[1]	1017
	1018
[1106]	1019	CALL cpu_log( log_point_s(33), 'fft_x_1d + tridia', 'start' )
[1]	1020
	1021	ALLOCATE( tri(5,0:nx,0:nz-1,0:threads_per_task-1) )
	1022
	1023	tn = 0 ! Default thread number in case of one thread
	1024	!$OMP PARALLEL DO PRIVATE ( i, j, k, m, n, tn, work_fftx, work_trix )
	1025	DO j = nys_x, nyn_x
	1026
	1027	!$ tn = omp_get_thread_num()
	1028
[2300]	1029	IF ( loop_optimization == 'vector' ) THEN
[1]	1030	!
	1031	!-- Code optimized for vector processors
	1032	DO k = 1, nz
	1033
	1034	m = 0
	1035	DO n = 1, pdims(1)
[1003]	1036	DO i = 1, nnx
[1]	1037	work_trix(m,k) = ar(i,k,j,n)
	1038	m = m + 1
	1039	ENDDO
	1040	ENDDO
	1041
	1042	ENDDO
	1043
	1044	CALL fft_x_m( work_trix, 'forward' )
	1045
	1046	ELSE
	1047	!
	1048	!-- Cache optimized code
	1049	DO k = 1, nz
	1050
	1051	m = 0
	1052	DO n = 1, pdims(1)
[1003]	1053	DO i = 1, nnx
[1]	1054	work_fftx(m) = ar(i,k,j,n)
	1055	m = m + 1
	1056	ENDDO
	1057	ENDDO
	1058
[1106]	1059	CALL fft_x_1d( work_fftx, 'forward' )
[1]	1060
	1061	DO i = 0, nx
	1062	work_trix(i,k) = work_fftx(i)
	1063	ENDDO
	1064
	1065	ENDDO
	1066
	1067	ENDIF
	1068
	1069	!
	1070	!-- Solve the linear equation system
	1071	CALL tridia_1dd( ddx2, ddy2, nx, ny, j, work_trix, tri(:,:,:,tn) )
	1072
[2300]	1073	IF ( loop_optimization == 'vector' ) THEN
[1]	1074	!
	1075	!-- Code optimized for vector processors
	1076	CALL fft_x_m( work_trix, 'backward' )
	1077
	1078	DO k = 1, nz
	1079
	1080	m = 0
	1081	DO n = 1, pdims(1)
[1003]	1082	DO i = 1, nnx
[1]	1083	ar(i,k,j,n) = work_trix(m,k)
	1084	m = m + 1
	1085	ENDDO
	1086	ENDDO
	1087
	1088	ENDDO
	1089
	1090	ELSE
	1091	!
	1092	!-- Cache optimized code
	1093	DO k = 1, nz
	1094
	1095	DO i = 0, nx
	1096	work_fftx(i) = work_trix(i,k)
	1097	ENDDO
	1098
[1106]	1099	CALL fft_x_1d( work_fftx, 'backward' )
[1]	1100
	1101	m = 0
	1102	DO n = 1, pdims(1)
[1003]	1103	DO i = 1, nnx
[1]	1104	ar(i,k,j,n) = work_fftx(m)
	1105	m = m + 1
	1106	ENDDO
	1107	ENDDO
	1108
	1109	ENDDO
	1110
	1111	ENDIF
	1112
	1113	ENDDO
	1114
	1115	DEALLOCATE( tri )
	1116
[1106]	1117	CALL cpu_log( log_point_s(33), 'fft_x_1d + tridia', 'stop' )
[1]	1118
	1119	END SUBROUTINE fftx_tri_fftx
	1120
	1121
	1122	!------------------------------------------------------------------------------!
[1682]	1123	! Description:
	1124	! ------------
	1125	!> Fourier-transformation along x with subsequent transposition x --> y for
	1126	!> a 1d-decomposition along y.
	1127	!>
	1128	!> @attention NEC-branch of this routine may significantly profit from
	1129	!> further optimizations. So far, performance is much worse than
	1130	!> for routine ffty_tr_yx (more than three times slower).
[1]	1131	!------------------------------------------------------------------------------!
[1682]	1132	SUBROUTINE fftx_tr_xy( f_in, f_out )
[1]	1133
[1682]	1134
[1320]	1135	USE control_parameters, &
[2300]	1136	ONLY: loop_optimization
[1320]	1137
	1138	USE cpulog, &
	1139	ONLY: cpu_log, log_point_s
	1140
	1141	USE kinds
	1142
[1]	1143	USE pegrid
	1144
	1145	IMPLICIT NONE
	1146
[1682]	1147	INTEGER(iwp) :: i !<
	1148	INTEGER(iwp) :: j !<
	1149	INTEGER(iwp) :: k !<
[1]	1150
[1682]	1151	REAL(wp), DIMENSION(0:nx,1:nz,nys:nyn) :: work_fftx !<
	1152	REAL(wp), DIMENSION(1:nz,nys:nyn,0:nx) :: f_in !<
	1153	REAL(wp), DIMENSION(nny,1:nz,nxl_y:nxr_y,pdims(2)) :: f_out !<
	1154	REAL(wp), DIMENSION(nys:nyn,1:nz,0:nx) :: work !<
[1]	1155
	1156	!
	1157	!-- Carry out the FFT along x, where all data are present due to the
	1158	!-- 1d-decomposition along y. Resort the data in a way that y becomes
	1159	!-- the first index.
[1106]	1160	CALL cpu_log( log_point_s(4), 'fft_x_1d', 'start' )
[1]	1161
[2300]	1162	IF ( loop_optimization == 'vector' ) THEN
[1]	1163	!
	1164	!-- Code for vector processors
[85]	1165	!$OMP PARALLEL PRIVATE ( i, j, k )
[1]	1166	!$OMP DO
	1167	DO i = 0, nx
	1168
	1169	DO j = nys, nyn
	1170	DO k = 1, nz
	1171	work_fftx(i,k,j) = f_in(k,j,i)
	1172	ENDDO
	1173	ENDDO
	1174
	1175	ENDDO
	1176
	1177	!$OMP DO
	1178	DO j = nys, nyn
	1179
	1180	CALL fft_x_m( work_fftx(:,:,j), 'forward' )
	1181
	1182	DO k = 1, nz
	1183	DO i = 0, nx
	1184	work(j,k,i) = work_fftx(i,k,j)
	1185	ENDDO
	1186	ENDDO
	1187
	1188	ENDDO
	1189	!$OMP END PARALLEL
	1190
	1191	ELSE
	1192
	1193	!
	1194	!-- Cache optimized code (there might be still a potential for better
	1195	!-- optimization).
[696]	1196	!$OMP PARALLEL PRIVATE (i,j,k)
[1]	1197	!$OMP DO
	1198	DO i = 0, nx
	1199
	1200	DO j = nys, nyn
	1201	DO k = 1, nz
	1202	work_fftx(i,k,j) = f_in(k,j,i)
	1203	ENDDO
	1204	ENDDO
	1205
	1206	ENDDO
	1207
	1208	!$OMP DO
	1209	DO j = nys, nyn
	1210	DO k = 1, nz
	1211
[1106]	1212	CALL fft_x_1d( work_fftx(0:nx,k,j), 'forward' )
[1]	1213
	1214	DO i = 0, nx
	1215	work(j,k,i) = work_fftx(i,k,j)
	1216	ENDDO
	1217	ENDDO
	1218
	1219	ENDDO
	1220	!$OMP END PARALLEL
	1221
	1222	ENDIF
[1106]	1223	CALL cpu_log( log_point_s(4), 'fft_x_1d', 'pause' )
[1]	1224
	1225	!
	1226	!-- Transpose array
[1111]	1227	#if defined( __parallel )
[1]	1228	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
[622]	1229	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
[1]	1230	CALL MPI_ALLTOALL( work(nys,1,0), sendrecvcount_xy, MPI_REAL, &
	1231	f_out(1,1,nxl_y,1), sendrecvcount_xy, MPI_REAL, &
	1232	comm1dy, ierr )
	1233	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
[1111]	1234	#endif
[1]	1235
	1236	END SUBROUTINE fftx_tr_xy
	1237
	1238
	1239	!------------------------------------------------------------------------------!
[1682]	1240	! Description:
	1241	! ------------
	1242	!> Transposition y --> x with a subsequent backward Fourier transformation for
	1243	!> a 1d-decomposition along x.
[1]	1244	!------------------------------------------------------------------------------!
[1682]	1245	SUBROUTINE tr_yx_fftx( f_in, f_out )
[1]	1246
[1682]	1247
[1320]	1248	USE control_parameters, &
[2300]	1249	ONLY: loop_optimization
[1320]	1250
	1251	USE cpulog, &
	1252	ONLY: cpu_log, log_point_s
	1253
	1254	USE kinds
	1255
[1]	1256	USE pegrid
	1257
	1258	IMPLICIT NONE
	1259
[1682]	1260	INTEGER(iwp) :: i !<
	1261	INTEGER(iwp) :: j !<
	1262	INTEGER(iwp) :: k !<
[1]	1263
[1682]	1264	REAL(wp), DIMENSION(0:nx,1:nz,nys:nyn) :: work_fftx !<
	1265	REAL(wp), DIMENSION(nny,1:nz,nxl_y:nxr_y,pdims(2)) :: f_in !<
	1266	REAL(wp), DIMENSION(1:nz,nys:nyn,0:nx) :: f_out !<
	1267	REAL(wp), DIMENSION(nys:nyn,1:nz,0:nx) :: work !<
[1]	1268
	1269	!
	1270	!-- Transpose array
[1111]	1271	#if defined( __parallel )
[1]	1272	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
[622]	1273	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
[1]	1274	CALL MPI_ALLTOALL( f_in(1,1,nxl_y,1), sendrecvcount_xy, MPI_REAL, &
	1275	work(nys,1,0), sendrecvcount_xy, MPI_REAL, &
	1276	comm1dy, ierr )
	1277	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
[1111]	1278	#endif
[1]	1279
	1280	!
	1281	!-- Carry out the FFT along x, where all data are present due to the
	1282	!-- 1d-decomposition along y. Resort the data in a way that y becomes
	1283	!-- the first index.
[1106]	1284	CALL cpu_log( log_point_s(4), 'fft_x_1d', 'continue' )
[1]	1285
[2300]	1286	IF ( loop_optimization == 'vector' ) THEN
[1]	1287	!
	1288	!-- Code optimized for vector processors
[85]	1289	!$OMP PARALLEL PRIVATE ( i, j, k )
[1]	1290	!$OMP DO
	1291	DO j = nys, nyn
	1292
	1293	DO k = 1, nz
	1294	DO i = 0, nx
	1295	work_fftx(i,k,j) = work(j,k,i)
	1296	ENDDO
	1297	ENDDO
	1298
	1299	CALL fft_x_m( work_fftx(:,:,j), 'backward' )
	1300
	1301	ENDDO
	1302
	1303	!$OMP DO
	1304	DO i = 0, nx
	1305	DO j = nys, nyn
	1306	DO k = 1, nz
	1307	f_out(k,j,i) = work_fftx(i,k,j)
	1308	ENDDO
	1309	ENDDO
	1310	ENDDO
	1311	!$OMP END PARALLEL
	1312
	1313	ELSE
	1314
	1315	!
	1316	!-- Cache optimized code (there might be still a potential for better
	1317	!-- optimization).
[696]	1318	!$OMP PARALLEL PRIVATE (i,j,k)
[1]	1319	!$OMP DO
	1320	DO j = nys, nyn
	1321	DO k = 1, nz
	1322
	1323	DO i = 0, nx
	1324	work_fftx(i,k,j) = work(j,k,i)
	1325	ENDDO
	1326
[1106]	1327	CALL fft_x_1d( work_fftx(0:nx,k,j), 'backward' )
[1]	1328
	1329	ENDDO
	1330	ENDDO
	1331
	1332	!$OMP DO
	1333	DO i = 0, nx
	1334	DO j = nys, nyn
	1335	DO k = 1, nz
	1336	f_out(k,j,i) = work_fftx(i,k,j)
	1337	ENDDO
	1338	ENDDO
	1339	ENDDO
	1340	!$OMP END PARALLEL
	1341
	1342	ENDIF
[1106]	1343	CALL cpu_log( log_point_s(4), 'fft_x_1d', 'stop' )
[1]	1344
	1345	END SUBROUTINE tr_yx_fftx
	1346
	1347
	1348	!------------------------------------------------------------------------------!
[1682]	1349	! Description:
	1350	! ------------
	1351	!> FFT along y, solution of the tridiagonal system and backward FFT for
	1352	!> a 1d-decomposition along y.
	1353	!>
	1354	!> @warning this subroutine may still not work for hybrid parallelization
	1355	!> with OpenMP (for possible necessary changes see the original
	1356	!> routine poisfft_hybrid, developed by Klaus Ketelsen, May 2002)
[1]	1357	!------------------------------------------------------------------------------!
[1682]	1358	SUBROUTINE ffty_tri_ffty( ar )
[1]	1359
[1682]	1360
[1320]	1361	USE control_parameters, &
[2300]	1362	ONLY: loop_optimization
[1320]	1363
	1364	USE cpulog, &
	1365	ONLY: cpu_log, log_point_s
	1366
	1367	USE grid_variables, &
	1368	ONLY: ddx2, ddy2
	1369
	1370	USE kinds
	1371
[1]	1372	USE pegrid
	1373
	1374	IMPLICIT NONE
	1375
[1682]	1376	INTEGER(iwp) :: i !<
	1377	INTEGER(iwp) :: j !<
	1378	INTEGER(iwp) :: k !<
	1379	INTEGER(iwp) :: m !<
	1380	INTEGER(iwp) :: n !<
[3241]	1381	!$ INTEGER(iwp) :: omp_get_thread_num !<
[1682]	1382	INTEGER(iwp) :: tn !<
[1]	1383
[1682]	1384	REAL(wp), DIMENSION(0:ny) :: work_ffty !<
	1385	REAL(wp), DIMENSION(0:ny,1:nz) :: work_triy !<
	1386	REAL(wp), DIMENSION(nny,1:nz,nxl_y:nxr_y,pdims(2)) :: ar !<
	1387	REAL(wp), DIMENSION(:,:,:,:), ALLOCATABLE :: tri !<
[1]	1388
	1389
[1106]	1390	CALL cpu_log( log_point_s(39), 'fft_y_1d + tridia', 'start' )
[1]	1391
	1392	ALLOCATE( tri(5,0:ny,0:nz-1,0:threads_per_task-1) )
	1393
	1394	tn = 0 ! Default thread number in case of one thread
[696]	1395	!$OMP PARALLEL DO PRIVATE ( i, j, k, m, n, tn, work_ffty, work_triy )
[1]	1396	DO i = nxl_y, nxr_y
	1397
	1398	!$ tn = omp_get_thread_num()
	1399
[2300]	1400	IF ( loop_optimization == 'vector' ) THEN
[1]	1401	!
	1402	!-- Code optimized for vector processors
	1403	DO k = 1, nz
	1404
	1405	m = 0
	1406	DO n = 1, pdims(2)
[1003]	1407	DO j = 1, nny
[1]	1408	work_triy(m,k) = ar(j,k,i,n)
	1409	m = m + 1
	1410	ENDDO
	1411	ENDDO
	1412
	1413	ENDDO
	1414
	1415	CALL fft_y_m( work_triy, ny, 'forward' )
	1416
	1417	ELSE
	1418	!
	1419	!-- Cache optimized code
	1420	DO k = 1, nz
	1421
	1422	m = 0
	1423	DO n = 1, pdims(2)
[1003]	1424	DO j = 1, nny
[1]	1425	work_ffty(m) = ar(j,k,i,n)
	1426	m = m + 1
	1427	ENDDO
	1428	ENDDO
	1429
[1106]	1430	CALL fft_y_1d( work_ffty, 'forward' )
[1]	1431
	1432	DO j = 0, ny
	1433	work_triy(j,k) = work_ffty(j)
	1434	ENDDO
	1435
	1436	ENDDO
	1437
	1438	ENDIF
	1439
	1440	!
	1441	!-- Solve the linear equation system
	1442	CALL tridia_1dd( ddy2, ddx2, ny, nx, i, work_triy, tri(:,:,:,tn) )
	1443
[2300]	1444	IF ( loop_optimization == 'vector' ) THEN
[1]	1445	!
	1446	!-- Code optimized for vector processors
	1447	CALL fft_y_m( work_triy, ny, 'backward' )
	1448
	1449	DO k = 1, nz
	1450
	1451	m = 0
	1452	DO n = 1, pdims(2)
[1003]	1453	DO j = 1, nny
[1]	1454	ar(j,k,i,n) = work_triy(m,k)
	1455	m = m + 1
	1456	ENDDO
	1457	ENDDO
	1458
	1459	ENDDO
	1460
	1461	ELSE
	1462	!
	1463	!-- Cache optimized code
	1464	DO k = 1, nz
	1465
	1466	DO j = 0, ny
	1467	work_ffty(j) = work_triy(j,k)
	1468	ENDDO
	1469
[1106]	1470	CALL fft_y_1d( work_ffty, 'backward' )
[1]	1471
	1472	m = 0
	1473	DO n = 1, pdims(2)
[1003]	1474	DO j = 1, nny
[1]	1475	ar(j,k,i,n) = work_ffty(m)
	1476	m = m + 1
	1477	ENDDO
	1478	ENDDO
	1479
	1480	ENDDO
	1481
	1482	ENDIF
	1483
	1484	ENDDO
	1485
	1486	DEALLOCATE( tri )
	1487
[1106]	1488	CALL cpu_log( log_point_s(39), 'fft_y_1d + tridia', 'stop' )
[1]	1489
	1490	END SUBROUTINE ffty_tri_ffty
	1491
	1492	END MODULE poisfft_mod

Note: See TracBrowser for help on using the repository browser.

Download in other formats:

| Impressum | ©Leibniz Universität Hannover |