Home

Context Navigation

source: palm/trunk/SOURCE/poisfft_mod.f90 @ 2300

Last change on this file since 2300 was 2300, checked in by raasch, 7 years ago
NEC related code partly removed, host variable partly removed, host specific code completely removed, default values for host, loop_optimization and termination time_needed changed
Property svn:keywords set to `Id`
File size: 45.4 KB

Rev	Line
[1850]	1	!> @file poisfft_mod.f90
[2000]	2	!------------------------------------------------------------------------------!
[1036]	3	! This file is part of PALM.
	4	!
[2000]	5	! PALM is free software: you can redistribute it and/or modify it under the
	6	! terms of the GNU General Public License as published by the Free Software
	7	! Foundation, either version 3 of the License, or (at your option) any later
	8	! version.
[1036]	9	!
	10	! PALM is distributed in the hope that it will be useful, but WITHOUT ANY
	11	! WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
	12	! A PARTICULAR PURPOSE. See the GNU General Public License for more details.
	13	!
	14	! You should have received a copy of the GNU General Public License along with
	15	! PALM. If not, see <http://www.gnu.org/licenses/>.
	16	!
[2101]	17	! Copyright 1997-2017 Leibniz Universitaet Hannover
[2000]	18	!------------------------------------------------------------------------------!
[1036]	19	!
[484]	20	! Current revisions:
[1]	21	! -----------------
[1683]	22	!
[2119]	23	!
[1321]	24	! Former revisions:
	25	! -----------------
	26	! $Id: poisfft_mod.f90 2300 2017-06-29 13:31:14Z raasch $
[2300]	27	! settings depending on host variable removed or replaced by loop_optimization
	28	!
	29	! 2119 2017-01-17 16:51:50Z raasch
[1321]	30	!
[2119]	31	! 2118 2017-01-17 16:38:49Z raasch
	32	! OpenACC directives and related code removed
	33	!
[2001]	34	! 2000 2016-08-20 18:09:15Z knoop
	35	! Forced header and separation lines into 80 columns
	36	!
[1851]	37	! 1850 2016-04-08 13:29:27Z maronga
	38	! Module renamed
	39	!
	40	!
[1805]	41	! 1804 2016-04-05 16:30:18Z maronga
	42	! Removed code for parameter file check (__check)
	43	!
[1683]	44	! 1682 2015-10-07 23:56:08Z knoop
	45	! Code annotations made doxygen readable
	46	!
[1483]	47	! 1482 2014-10-18 12:34:45Z raasch
	48	! use 2d-decomposition, if accelerator boards are used
	49	!
[1407]	50	! 1406 2014-05-16 13:47:01Z raasch
	51	! bugfix for pgi 14.4: declare create moved after array declaration
	52	!
[1321]	53	! 1320 2014-03-20 08:40:49Z raasch
[1320]	54	! ONLY-attribute added to USE-statements,
	55	! kind-parameters added to all INTEGER and REAL declaration statements,
	56	! kinds are defined in new module kinds,
	57	! old module precision_kind is removed,
	58	! revision history before 2012 removed,
	59	! comment fields (!:) to be used for variable explanations added to
	60	! all variable declaration statements
[1217]	61	!
[1319]	62	! 1318 2014-03-17 13:35:16Z raasch
	63	! module interfaces removed
	64	!
[1307]	65	! 1306 2014-03-13 14:30:59Z raasch
	66	! openmp sections removed from the overlap branch,
	67	! second argument removed from parameter list
	68	!
[1217]	69	! 1216 2013-08-26 09:31:42Z raasch
[1216]	70	! resorting of arrays moved to separate routines resort_for_...,
	71	! one argument, used as temporary work array, removed from all transpose
	72	! routines
	73	! overlapping fft / transposition implemented
[1112]	74	!
[1213]	75	! 1212 2013-08-15 08:46:27Z raasch
	76	! tridia routines moved to seperate module tridia_solver
	77	!
[1209]	78	! 1208 2013-08-13 06:41:49Z raasch
	79	! acc-update clauses added for "ar" so that ffts other than cufft can also be
	80	! used (although they are not ported and will give a poor performance)
	81	!
[1112]	82	! 1111 2013-03-08 23:54:10Z raasch
[1111]	83	! further openACC porting of non-parallel (MPI) branch:
	84	! tridiagonal routines split into extermal subroutines (instead using CONTAINS),
	85	! no distinction between parallel/non-parallel in poisfft and tridia any more,
[1112]	86	! tridia routines moved to end of file because of probable bug in PGI compiler 12.5
[1111]	87	! (otherwise "invalid device function" is indicated during runtime),
	88	! optimization of tridia routines: constant elements and coefficients of tri are
	89	! stored in seperate arrays ddzuw and tric, last dimension of tri reduced from 5
	90	! to 2,
	91	! poisfft_init is now called internally from poisfft, maketri is called from
	92	! poisfft_init,
	93	! ibc_p_b = 2 removed
[1]	94	!
[1107]	95	! 1106 2013-03-04 05:31:38Z raasch
	96	! routines fftx, ffty, fftxp, fftyp removed, calls replaced by fft_x, fft_y,
	97	! in the 1D-decomposition routines fft_x, ffty are replaced by fft_x_1d,
	98	! fft_y_1d
	99	!
[1104]	100	! 1103 2013-02-20 02:15:53Z raasch
	101	! tri, ar, and ar1 arguments in tridia-routines (2d) are removed because they
	102	! sometimes cause segmentation faults with intel 12.1 compiler
	103	!
[1093]	104	! 1092 2013-02-02 11:24:22Z raasch
	105	! unused variables removed
	106	!
[1037]	107	! 1036 2012-10-22 13:43:42Z raasch
	108	! code put under GPL (PALM 3.9)
	109	!
[1014]	110	! 2012-09-21 07:03:55Z raasch
	111	! FLOAT type conversion replaced by REAL
	112	!
[1004]	113	! 1003 2012-09-14 14:35:53Z raasch
	114	! indices nxa, nya, etc. replaced by nx, ny, etc.
	115	!
[941]	116	! 940 2012-07-09 14:31:00Z raasch
	117	! special handling of tri-array as an argument in tridia_1dd routines switched
	118	! off because it caused segmentation faults with intel 12.1 compiler
	119	!
[878]	120	! 877 2012-04-03 11:21:44Z suehring
	121	! Bugfix: Avoid divisions by zero in case of using a 'neumann' bc for the
	122	! pressure at the top of the model domain.
	123	!
[810]	124	! 809 2012-01-30 13:32:58Z maronga
	125	! Bugfix: replaced .AND. and .NOT. with && and ! in the preprocessor directives
	126	!
[808]	127	! 807 2012-01-25 11:53:51Z maronga
	128	! New cpp directive "__check" implemented which is used by check_namelist_files
	129	! (most of the code is unneeded by check_namelist_files).
	130	!
[1]	131	! Revision 1.1 1997/07/24 11:24:14 raasch
	132	! Initial revision
	133	!
	134	!
	135	! Description:
	136	! ------------
[1682]	137	!> Solves the Poisson equation with a 2D spectral method
	138	!> d^2 p / dx^2 + d^2 p / dy^2 + d^2 p / dz^2 = s
	139	!>
	140	!> Input:
	141	!> real ar contains (nnz,nny,nnx) elements of the velocity divergence,
	142	!> starting from (1,nys,nxl)
	143	!>
	144	!> Output:
	145	!> real ar contains the solution for perturbation pressure p
[1]	146	!------------------------------------------------------------------------------!
[1682]	147	MODULE poisfft_mod
	148
[1]	149
[1320]	150	USE fft_xy, &
	151	ONLY: fft_init, fft_y, fft_y_1d, fft_y_m, fft_x, fft_x_1d, fft_x_m
[1]	152
[1320]	153	USE indices, &
	154	ONLY: nnx, nny, nx, nxl, nxr, ny, nys, nyn, nz
	155
	156	USE transpose_indices, &
	157	ONLY: nxl_y, nxl_z, nxr_y, nxr_z, nys_x, nys_z, nyn_x, nyn_z, nzb_x, &
	158	nzb_y, nzt_x, nzt_y
	159
	160	USE tridia_solver, &
	161	ONLY: tridia_1dd, tridia_init, tridia_substi, tridia_substi_overlap
	162
[1]	163	IMPLICIT NONE
	164
[1111]	165	LOGICAL, SAVE :: poisfft_initialized = .FALSE.
	166
[1]	167	PRIVATE
[807]	168
[1]	169	PUBLIC poisfft, poisfft_init
	170
	171	INTERFACE poisfft
	172	MODULE PROCEDURE poisfft
	173	END INTERFACE poisfft
	174
	175	INTERFACE poisfft_init
	176	MODULE PROCEDURE poisfft_init
	177	END INTERFACE poisfft_init
	178
[807]	179
[1]	180	CONTAINS
	181
[1682]	182	!------------------------------------------------------------------------------!
	183	! Description:
	184	! ------------
	185	!> @todo Missing subroutine description.
	186	!------------------------------------------------------------------------------!
[1]	187	SUBROUTINE poisfft_init
	188
[1320]	189	USE arrays_3d, &
	190	ONLY: ddzu_pres, ddzw
[1111]	191
[1320]	192	USE kinds
	193
[1111]	194	IMPLICIT NONE
	195
[1682]	196	INTEGER(iwp) :: k !<
[1111]	197
	198
[1]	199	CALL fft_init
	200
[1212]	201	CALL tridia_init
[1111]	202
	203	poisfft_initialized = .TRUE.
	204
[1]	205	END SUBROUTINE poisfft_init
	206
[1111]	207
[1804]	208
[1682]	209	!------------------------------------------------------------------------------!
	210	! Description:
	211	! ------------
	212	!> Two-dimensional Fourier Transformation in x- and y-direction.
	213	!------------------------------------------------------------------------------!
[1306]	214	SUBROUTINE poisfft( ar )
[1]	215
[1320]	216	USE control_parameters, &
	217	ONLY: fft_method, transpose_compute_overlap
	218
	219	USE cpulog, &
	220	ONLY: cpu_log, cpu_log_nowait, log_point_s
	221
	222	USE kinds
	223
[1]	224	USE pegrid
	225
	226	IMPLICIT NONE
	227
[1682]	228	INTEGER(iwp) :: ii !<
	229	INTEGER(iwp) :: iind !<
	230	INTEGER(iwp) :: inew !<
	231	INTEGER(iwp) :: jj !<
	232	INTEGER(iwp) :: jind !<
	233	INTEGER(iwp) :: jnew !<
	234	INTEGER(iwp) :: ki !<
	235	INTEGER(iwp) :: kk !<
	236	INTEGER(iwp) :: knew !<
	237	INTEGER(iwp) :: n !<
	238	INTEGER(iwp) :: nblk !<
	239	INTEGER(iwp) :: nnx_y !<
	240	INTEGER(iwp) :: nny_z !<
	241	INTEGER(iwp) :: nnz_t !<
	242	INTEGER(iwp) :: nnz_x !<
	243	INTEGER(iwp) :: nxl_y_bound !<
	244	INTEGER(iwp) :: nxr_y_bound !<
[1]	245
[1682]	246	INTEGER(iwp), DIMENSION(4) :: isave !<
[1320]	247
[1682]	248	REAL(wp), DIMENSION(1:nz,nys:nyn,nxl:nxr) :: ar !<
	249	REAL(wp), DIMENSION(nys:nyn,nxl:nxr,1:nz) :: ar_inv !<
[1]	250
[1682]	251	REAL(wp), DIMENSION(:,:,:), ALLOCATABLE :: ar1 !<
	252	REAL(wp), DIMENSION(:,:,:), ALLOCATABLE :: f_in !<
	253	REAL(wp), DIMENSION(:,:,:), ALLOCATABLE :: f_inv !<
	254	REAL(wp), DIMENSION(:,:,:), ALLOCATABLE :: f_out_y !<
	255	REAL(wp), DIMENSION(:,:,:), ALLOCATABLE :: f_out_z !<
[1216]	256
	257
[1]	258	CALL cpu_log( log_point_s(3), 'poisfft', 'start' )
	259
[1111]	260	IF ( .NOT. poisfft_initialized ) CALL poisfft_init
	261
[1]	262	!
	263	!-- Two-dimensional Fourier Transformation in x- and y-direction.
[2118]	264	IF ( pdims(2) == 1 .AND. pdims(1) > 1 ) THEN
[1]	265
	266	!
	267	!-- 1d-domain-decomposition along x:
	268	!-- FFT along y and transposition y --> x
[1216]	269	CALL ffty_tr_yx( ar, ar )
[1]	270
	271	!
	272	!-- FFT along x, solving the tridiagonal system and backward FFT
	273	CALL fftx_tri_fftx( ar )
	274
	275	!
	276	!-- Transposition x --> y and backward FFT along y
[1216]	277	CALL tr_xy_ffty( ar, ar )
[1]	278
[2118]	279	ELSEIF ( pdims(1) == 1 .AND. pdims(2) > 1 ) THEN
[1]	280
	281	!
	282	!-- 1d-domain-decomposition along y:
	283	!-- FFT along x and transposition x --> y
[1216]	284	CALL fftx_tr_xy( ar, ar )
[1]	285
	286	!
	287	!-- FFT along y, solving the tridiagonal system and backward FFT
	288	CALL ffty_tri_ffty( ar )
	289
	290	!
	291	!-- Transposition y --> x and backward FFT along x
[1216]	292	CALL tr_yx_fftx( ar, ar )
[1]	293
[1216]	294	ELSEIF ( .NOT. transpose_compute_overlap ) THEN
[1]	295
	296	!
[1111]	297	!-- 2d-domain-decomposition or no decomposition (1 PE run)
[1]	298	!-- Transposition z --> x
	299	CALL cpu_log( log_point_s(5), 'transpo forward', 'start' )
[1216]	300	CALL resort_for_zx( ar, ar_inv )
	301	CALL transpose_zx( ar_inv, ar )
[1]	302	CALL cpu_log( log_point_s(5), 'transpo forward', 'pause' )
	303
	304	CALL cpu_log( log_point_s(4), 'fft_x', 'start' )
[1106]	305	CALL fft_x( ar, 'forward' )
[1]	306	CALL cpu_log( log_point_s(4), 'fft_x', 'pause' )
	307
	308	!
	309	!-- Transposition x --> y
	310	CALL cpu_log( log_point_s(5), 'transpo forward', 'continue' )
[1216]	311	CALL resort_for_xy( ar, ar_inv )
	312	CALL transpose_xy( ar_inv, ar )
[1]	313	CALL cpu_log( log_point_s(5), 'transpo forward', 'pause' )
	314
	315	CALL cpu_log( log_point_s(7), 'fft_y', 'start' )
[1216]	316	CALL fft_y( ar, 'forward', ar_tr = ar, &
	317	nxl_y_bound = nxl_y, nxr_y_bound = nxr_y, &
	318	nxl_y_l = nxl_y, nxr_y_l = nxr_y )
[1]	319	CALL cpu_log( log_point_s(7), 'fft_y', 'pause' )
	320
	321	!
	322	!-- Transposition y --> z
	323	CALL cpu_log( log_point_s(5), 'transpo forward', 'continue' )
[1216]	324	CALL resort_for_yz( ar, ar_inv )
	325	CALL transpose_yz( ar_inv, ar )
[1]	326	CALL cpu_log( log_point_s(5), 'transpo forward', 'stop' )
	327
	328	!
[1106]	329	!-- Solve the tridiagonal equation system along z
[1]	330	CALL cpu_log( log_point_s(6), 'tridia', 'start' )
[1212]	331	CALL tridia_substi( ar )
[1]	332	CALL cpu_log( log_point_s(6), 'tridia', 'stop' )
	333
	334	!
	335	!-- Inverse Fourier Transformation
	336	!-- Transposition z --> y
	337	CALL cpu_log( log_point_s(8), 'transpo invers', 'start' )
[1216]	338	CALL transpose_zy( ar, ar_inv )
	339	CALL resort_for_zy( ar_inv, ar )
[1]	340	CALL cpu_log( log_point_s(8), 'transpo invers', 'pause' )
	341
	342	CALL cpu_log( log_point_s(7), 'fft_y', 'continue' )
[1216]	343	CALL fft_y( ar, 'backward', ar_tr = ar, &
	344	nxl_y_bound = nxl_y, nxr_y_bound = nxr_y, &
	345	nxl_y_l = nxl_y, nxr_y_l = nxr_y )
[1]	346	CALL cpu_log( log_point_s(7), 'fft_y', 'stop' )
	347
	348	!
	349	!-- Transposition y --> x
	350	CALL cpu_log( log_point_s(8), 'transpo invers', 'continue' )
[1216]	351	CALL transpose_yx( ar, ar_inv )
	352	CALL resort_for_yx( ar_inv, ar )
[1]	353	CALL cpu_log( log_point_s(8), 'transpo invers', 'pause' )
	354
	355	CALL cpu_log( log_point_s(4), 'fft_x', 'continue' )
[1106]	356	CALL fft_x( ar, 'backward' )
[1]	357	CALL cpu_log( log_point_s(4), 'fft_x', 'stop' )
	358
	359	!
	360	!-- Transposition x --> z
	361	CALL cpu_log( log_point_s(8), 'transpo invers', 'continue' )
[1216]	362	CALL transpose_xz( ar, ar_inv )
	363	CALL resort_for_xz( ar_inv, ar )
[1]	364	CALL cpu_log( log_point_s(8), 'transpo invers', 'stop' )
	365
[1216]	366	ELSE
	367
	368	!
	369	!-- 2d-domain-decomposition or no decomposition (1 PE run) with
	370	!-- overlapping transposition / fft
[1318]	371	!-- cputime logging must not use barriers, which would prevent overlapping
[1216]	372	ALLOCATE( f_out_y(0:ny,nxl_y:nxr_y,nzb_y:nzt_y), &
	373	f_out_z(0:nx,nys_x:nyn_x,nzb_x:nzt_x) )
	374	!
	375	!-- Transposition z --> x + subsequent fft along x
	376	ALLOCATE( f_inv(nys:nyn,nxl:nxr,1:nz) )
	377	CALL resort_for_zx( ar, f_inv )
	378	!
	379	!-- Save original indices and gridpoint counter
	380	isave(1) = nz
	381	isave(2) = nzb_x
	382	isave(3) = nzt_x
	383	isave(4) = sendrecvcount_zx
	384	!
	385	!-- Set new indices for transformation
	386	nblk = nz / pdims(1)
	387	nz = pdims(1)
	388	nnz_x = 1
	389	nzb_x = 1 + myidx * nnz_x
	390	nzt_x = ( myidx + 1 ) * nnz_x
	391	sendrecvcount_zx = nnx * nny * nnz_x
	392
[1306]	393	ALLOCATE( ar1(0:nx,nys_x:nyn_x,nzb_x:nzt_x) )
[1216]	394	ALLOCATE( f_in(nys:nyn,nxl:nxr,1:nz) )
	395
[1306]	396	DO kk = 1, nblk
[1216]	397
[1306]	398	IF ( kk == 1 ) THEN
[1318]	399	CALL cpu_log( log_point_s(5), 'transpo forward', 'start', cpu_log_nowait )
[1306]	400	ELSE
[1318]	401	CALL cpu_log( log_point_s(5), 'transpo forward', 'continue', cpu_log_nowait )
[1306]	402	ENDIF
[1216]	403
[1306]	404	DO knew = 1, nz
	405	ki = kk + nblk * ( knew - 1 )
	406	f_in(:,:,knew) = f_inv(:,:,ki)
	407	ENDDO
[1216]	408
[1306]	409	CALL transpose_zx( f_in, ar1(:,:,:))
	410	CALL cpu_log( log_point_s(5), 'transpo forward', 'pause' )
[1216]	411
[1306]	412	IF ( kk == 1 ) THEN
[1318]	413	CALL cpu_log( log_point_s(4), 'fft_x', 'start', cpu_log_nowait )
[1306]	414	ELSE
[1318]	415	CALL cpu_log( log_point_s(4), 'fft_x', 'continue', cpu_log_nowait )
[1216]	416	ENDIF
	417
[1306]	418	n = isave(2) + kk - 1
	419	CALL fft_x( ar1(:,:,:), 'forward', ar_2d = f_out_z(:,:,n))
	420	CALL cpu_log( log_point_s(4), 'fft_x', 'pause' )
[1216]	421
	422	ENDDO
	423	!
	424	!-- Restore original indices/counters
	425	nz = isave(1)
	426	nzb_x = isave(2)
	427	nzt_x = isave(3)
	428	sendrecvcount_zx = isave(4)
	429
	430	DEALLOCATE( ar1, f_in, f_inv )
	431
	432	!
	433	!-- Transposition x --> y + subsequent fft along y
	434	ALLOCATE( f_inv(nys_x:nyn_x,nzb_x:nzt_x,0:nx) )
	435	CALL resort_for_xy( f_out_z, f_inv )
	436	!
	437	!-- Save original indices and gridpoint counter
	438	isave(1) = nx
	439	isave(2) = nxl_y
	440	isave(3) = nxr_y
	441	isave(4) = sendrecvcount_xy
	442	!
	443	!-- Set new indices for transformation
	444	nblk = ( ( nx+1 ) / pdims(2) ) - 1
	445	nx = pdims(2)
	446	nnx_y = 1
	447	nxl_y = myidy * nnx_y
	448	nxr_y = ( myidy + 1 ) * nnx_y - 1
	449	sendrecvcount_xy = nnx_y * ( nyn_x-nys_x+1 ) * ( nzt_x-nzb_x+1 )
	450
[1306]	451	ALLOCATE( ar1(0:ny,nxl_y:nxr_y,nzb_y:nzt_y) )
[1216]	452	ALLOCATE( f_in(nys_x:nyn_x,nzb_x:nzt_x,0:nx) )
	453
[1306]	454	DO ii = 0, nblk
[1216]	455
[1318]	456	CALL cpu_log( log_point_s(5), 'transpo forward', 'continue', cpu_log_nowait )
[1216]	457
[1306]	458	DO inew = 0, nx-1
	459	iind = ii + ( nblk + 1 ) * inew
	460	f_in(:,:,inew) = f_inv(:,:,iind)
	461	ENDDO
[1216]	462
[1306]	463	CALL transpose_xy( f_in, ar1(:,:,:) )
[1216]	464
[1306]	465	CALL cpu_log( log_point_s(5), 'transpo forward', 'pause' )
[1216]	466
[1306]	467	IF ( ii == 1 ) THEN
[1318]	468	CALL cpu_log( log_point_s(7), 'fft_y', 'start', cpu_log_nowait )
[1306]	469	ELSE
[1318]	470	CALL cpu_log( log_point_s(7), 'fft_y', 'continue', cpu_log_nowait )
[1216]	471	ENDIF
	472
[1306]	473	nxl_y_bound = isave(2)
	474	nxr_y_bound = isave(3)
	475	n = isave(2) + ii
	476	CALL fft_y( ar1(:,:,:), 'forward', ar_tr = f_out_y, &
	477	nxl_y_bound = nxl_y_bound, nxr_y_bound = nxr_y_bound, &
	478	nxl_y_l = n, nxr_y_l = n )
[1216]	479
[1306]	480	CALL cpu_log( log_point_s(7), 'fft_y', 'pause' )
[1216]	481
	482	ENDDO
	483	!
	484	!-- Restore original indices/counters
	485	nx = isave(1)
	486	nxl_y = isave(2)
	487	nxr_y = isave(3)
	488	sendrecvcount_xy = isave(4)
	489
	490	DEALLOCATE( ar1, f_in, f_inv )
	491
	492	!
	493	!-- Transposition y --> z + subsequent tridia + resort for z --> y
	494	ALLOCATE( f_inv(nxl_y:nxr_y,nzb_y:nzt_y,0:ny) )
	495	CALL resort_for_yz( f_out_y, f_inv )
	496	!
	497	!-- Save original indices and gridpoint counter
	498	isave(1) = ny
	499	isave(2) = nys_z
	500	isave(3) = nyn_z
	501	isave(4) = sendrecvcount_yz
	502	!
	503	!-- Set new indices for transformation
	504	nblk = ( ( ny+1 ) / pdims(1) ) - 1
	505	ny = pdims(1)
	506	nny_z = 1
	507	nys_z = myidx * nny_z
	508	nyn_z = ( myidx + 1 ) * nny_z - 1
	509	sendrecvcount_yz = ( nxr_y-nxl_y+1 ) * nny_z * ( nzt_y-nzb_y+1 )
	510
[1306]	511	ALLOCATE( ar1(nxl_z:nxr_z,nys_z:nyn_z,1:nz) )
[1216]	512	ALLOCATE( f_in(nxl_y:nxr_y,nzb_y:nzt_y,0:ny) )
	513
[1306]	514	DO jj = 0, nblk
[1216]	515	!
[1306]	516	!-- Forward Fourier Transformation
	517	!-- Transposition y --> z
[1318]	518	CALL cpu_log( log_point_s(5), 'transpo forward', 'continue', cpu_log_nowait )
[1216]	519
[1306]	520	DO jnew = 0, ny-1
	521	jind = jj + ( nblk + 1 ) * jnew
	522	f_in(:,:,jnew) = f_inv(:,:,jind)
	523	ENDDO
[1216]	524
[1306]	525	CALL transpose_yz( f_in, ar1(:,:,:) )
[1216]	526
[1306]	527	IF ( jj == nblk ) THEN
	528	CALL cpu_log( log_point_s(5), 'transpo forward', 'stop' )
	529	ELSE
	530	CALL cpu_log( log_point_s(5), 'transpo forward', 'pause' )
[1216]	531	ENDIF
	532
	533	!
[1306]	534	!-- Solve the tridiagonal equation system along z
[1318]	535	CALL cpu_log( log_point_s(6), 'tridia', 'start', cpu_log_nowait )
[1216]	536
[1306]	537	n = isave(2) + jj
	538	CALL tridia_substi_overlap( ar1(:,:,:), n )
[1216]	539
[1306]	540	CALL cpu_log( log_point_s(6), 'tridia', 'stop' )
[1216]	541
[1306]	542	!
	543	!-- Inverse Fourier Transformation
	544	!-- Transposition z --> y
	545	!-- Only one thread should call MPI routines, therefore forward and
	546	!-- backward tranpose are in the same section
	547	IF ( jj == 0 ) THEN
[1318]	548	CALL cpu_log( log_point_s(8), 'transpo invers', 'start', cpu_log_nowait )
[1306]	549	ELSE
[1318]	550	CALL cpu_log( log_point_s(8), 'transpo invers', 'continue', cpu_log_nowait )
[1216]	551	ENDIF
	552
[1306]	553	CALL transpose_zy( ar1(:,:,:), f_in )
[1216]	554
[1306]	555	DO jnew = 0, ny-1
	556	jind = jj + ( nblk + 1 ) * jnew
	557	f_inv(:,:,jind) = f_in(:,:,jnew)
	558	ENDDO
[1216]	559
[1306]	560	CALL cpu_log( log_point_s(8), 'transpo invers', 'pause' )
[1216]	561
	562	ENDDO
	563	!
	564	!-- Restore original indices/counters
	565	ny = isave(1)
	566	nys_z = isave(2)
	567	nyn_z = isave(3)
	568	sendrecvcount_yz = isave(4)
	569
	570	CALL resort_for_zy( f_inv, f_out_y )
	571
	572	DEALLOCATE( ar1, f_in, f_inv )
	573
	574	!
	575	!-- fft along y backward + subsequent transposition y --> x
	576	ALLOCATE( f_inv(nys_x:nyn_x,nzb_x:nzt_x,0:nx) )
	577	!
	578	!-- Save original indices and gridpoint counter
	579	isave(1) = nx
	580	isave(2) = nxl_y
	581	isave(3) = nxr_y
	582	isave(4) = sendrecvcount_xy
	583	!
	584	!-- Set new indices for transformation
	585	nblk = (( nx+1 ) / pdims(2) ) - 1
	586	nx = pdims(2)
	587	nnx_y = 1
	588	nxl_y = myidy * nnx_y
	589	nxr_y = ( myidy + 1 ) * nnx_y - 1
	590	sendrecvcount_xy = nnx_y * ( nyn_x-nys_x+1 ) * ( nzt_x-nzb_x+1 )
	591
[1306]	592	ALLOCATE( ar1(0:ny,nxl_y:nxr_y,nzb_y:nzt_y) )
[1216]	593	ALLOCATE( f_in(nys_x:nyn_x,nzb_x:nzt_x,0:nx) )
	594
[1306]	595	DO ii = 0, nblk
[1216]	596
[1318]	597	CALL cpu_log( log_point_s(7), 'fft_y', 'continue', cpu_log_nowait )
[1216]	598
[1306]	599	n = isave(2) + ii
	600	nxl_y_bound = isave(2)
	601	nxr_y_bound = isave(3)
[1216]	602
[1306]	603	CALL fft_y( ar1(:,:,:), 'backward', ar_tr = f_out_y, &
	604	nxl_y_bound = nxl_y_bound, nxr_y_bound = nxr_y_bound, &
	605	nxl_y_l = n, nxr_y_l = n )
[1216]	606
[1306]	607	IF ( ii == nblk ) THEN
	608	CALL cpu_log( log_point_s(7), 'fft_y', 'stop' )
	609	ELSE
	610	CALL cpu_log( log_point_s(7), 'fft_y', 'pause' )
[1216]	611	ENDIF
	612
[1318]	613	CALL cpu_log( log_point_s(8), 'transpo invers', 'continue', cpu_log_nowait )
[1216]	614
[1306]	615	CALL transpose_yx( ar1(:,:,:), f_in )
[1216]	616
[1306]	617	DO inew = 0, nx-1
	618	iind = ii + (nblk+1) * inew
	619	f_inv(:,:,iind) = f_in(:,:,inew)
	620	ENDDO
[1216]	621
[1306]	622	CALL cpu_log( log_point_s(8), 'transpo invers', 'pause' )
[1216]	623
	624	ENDDO
	625	!
	626	!-- Restore original indices/counters
	627	nx = isave(1)
	628	nxl_y = isave(2)
	629	nxr_y = isave(3)
	630	sendrecvcount_xy = isave(4)
	631
	632	CALL resort_for_yx( f_inv, f_out_z )
	633
	634	DEALLOCATE( ar1, f_in, f_inv )
	635
	636	!
	637	!-- fft along x backward + subsequent final transposition x --> z
	638	ALLOCATE( f_inv(nys:nyn,nxl:nxr,1:nz) )
	639	!
	640	!-- Save original indices and gridpoint counter
	641	isave(1) = nz
	642	isave(2) = nzb_x
	643	isave(3) = nzt_x
	644	isave(4) = sendrecvcount_zx
	645	!
	646	!-- Set new indices for transformation
	647	nblk = nz / pdims(1)
	648	nz = pdims(1)
	649	nnz_x = 1
	650	nzb_x = 1 + myidx * nnz_x
	651	nzt_x = ( myidx + 1 ) * nnz_x
	652	sendrecvcount_zx = nnx * nny * nnz_x
	653
[1306]	654	ALLOCATE( ar1(0:nx,nys_x:nyn_x,nzb_x:nzt_x) )
[1216]	655	ALLOCATE( f_in(nys:nyn,nxl:nxr,1:nz) )
	656
[1306]	657	DO kk = 1, nblk
[1216]	658
[1318]	659	CALL cpu_log( log_point_s(4), 'fft_x', 'continue', cpu_log_nowait )
[1216]	660
[1306]	661	n = isave(2) + kk - 1
	662	CALL fft_x( ar1(:,:,:), 'backward', f_out_z(:,:,n))
[1216]	663
[1306]	664	IF ( kk == nblk ) THEN
	665	CALL cpu_log( log_point_s(4), 'fft_x', 'stop' )
	666	ELSE
	667	CALL cpu_log( log_point_s(4), 'fft_x', 'pause' )
[1216]	668	ENDIF
	669
[1318]	670	CALL cpu_log( log_point_s(8), 'transpo invers', 'continue', cpu_log_nowait )
[1216]	671
[1306]	672	CALL transpose_xz( ar1(:,:,:), f_in )
[1216]	673
[1306]	674	DO knew = 1, nz
	675	ki = kk + nblk * (knew-1)
	676	f_inv(:,:,ki) = f_in(:,:,knew)
	677	ENDDO
[1216]	678
[1306]	679	IF ( kk == nblk ) THEN
	680	CALL cpu_log( log_point_s(8), 'transpo invers', 'stop' )
	681	ELSE
	682	CALL cpu_log( log_point_s(8), 'transpo invers', 'pause' )
[1216]	683	ENDIF
	684
	685	ENDDO
	686	!
	687	!-- Restore original indices/counters
	688	nz = isave(1)
	689	nzb_x = isave(2)
	690	nzt_x = isave(3)
	691	sendrecvcount_zx = isave(4)
	692
	693	CALL resort_for_xz( f_inv, ar )
	694
	695	DEALLOCATE( ar1, f_in, f_inv )
	696
[1]	697	ENDIF
	698
	699	CALL cpu_log( log_point_s(3), 'poisfft', 'stop' )
	700
	701	END SUBROUTINE poisfft
	702
	703
	704	!------------------------------------------------------------------------------!
[1682]	705	! Description:
	706	! ------------
	707	!> Fourier-transformation along y with subsequent transposition y --> x for
	708	!> a 1d-decomposition along x.
	709	!>
	710	!> @attention The performance of this routine is much faster on the NEC-SX6,
	711	!> if the first index of work_ffty_vec is odd. Otherwise
	712	!> memory bank conflicts may occur (especially if the index is a
	713	!> multiple of 128). That's why work_ffty_vec is dimensioned as
	714	!> 0:ny+1.
	715	!> Of course, this will not work if users are using an odd number
	716	!> of gridpoints along y.
[1]	717	!------------------------------------------------------------------------------!
[1682]	718	SUBROUTINE ffty_tr_yx( f_in, f_out )
[1]	719
[1320]	720	USE control_parameters, &
[2300]	721	ONLY: loop_optimization
[1320]	722
	723	USE cpulog, &
	724	ONLY: cpu_log, log_point_s
	725
	726	USE kinds
	727
[1]	728	USE pegrid
	729
	730	IMPLICIT NONE
	731
[1682]	732	INTEGER(iwp) :: i !<
	733	INTEGER(iwp) :: iend !<
	734	INTEGER(iwp) :: iouter !<
	735	INTEGER(iwp) :: ir !<
	736	INTEGER(iwp) :: j !<
	737	INTEGER(iwp) :: k !<
[1]	738
[1682]	739	INTEGER(iwp), PARAMETER :: stridex = 4 !<
[1320]	740
[1682]	741	REAL(wp), DIMENSION(1:nz,0:ny,nxl:nxr) :: f_in !<
	742	REAL(wp), DIMENSION(nnx,1:nz,nys_x:nyn_x,pdims(1)) :: f_out !<
	743	REAL(wp), DIMENSION(nxl:nxr,1:nz,0:ny) :: work !<
[1]	744
[2300]	745	REAL(wp), DIMENSION(:,:), ALLOCATABLE :: work_ffty !<
	746	REAL(wp), DIMENSION(:,:,:), ALLOCATABLE :: work_ffty_vec !<
	747
[1]	748	!
	749	!-- Carry out the FFT along y, where all data are present due to the
	750	!-- 1d-decomposition along x. Resort the data in a way that x becomes
	751	!-- the first index.
[1106]	752	CALL cpu_log( log_point_s(7), 'fft_y_1d', 'start' )
[1]	753
[2300]	754	IF ( loop_optimization == 'vector' ) THEN
	755
	756	ALLOCATE( work_ffty_vec(0:ny+1,1:nz,nxl:nxr) )
[1]	757	!
	758	!-- Code optimized for vector processors
[2300]	759	!$OMP PARALLEL PRIVATE ( i, j, k )
	760	!$OMP DO
[1]	761	DO i = nxl, nxr
	762
	763	DO j = 0, ny
	764	DO k = 1, nz
	765	work_ffty_vec(j,k,i) = f_in(k,j,i)
	766	ENDDO
	767	ENDDO
	768
	769	CALL fft_y_m( work_ffty_vec(:,:,i), ny+1, 'forward' )
	770
	771	ENDDO
	772
[2300]	773	!$OMP DO
[1]	774	DO k = 1, nz
	775	DO j = 0, ny
	776	DO i = nxl, nxr
	777	work(i,k,j) = work_ffty_vec(j,k,i)
	778	ENDDO
	779	ENDDO
	780	ENDDO
[2300]	781	!$OMP END PARALLEL
[1]	782
[2300]	783	DEALLOCATE( work_ffty_vec )
	784
[1]	785	ELSE
	786	!
	787	!-- Cache optimized code.
[2300]	788	ALLOCATE( work_ffty(0:ny,stridex) )
	789	!
[1]	790	!-- The i-(x-)direction is split into a strided outer loop and an inner
	791	!-- loop for better cache performance
[2300]	792	!$OMP PARALLEL PRIVATE (i,iend,iouter,ir,j,k,work_ffty)
	793	!$OMP DO
[1]	794	DO iouter = nxl, nxr, stridex
	795
	796	iend = MIN( iouter+stridex-1, nxr ) ! Upper bound for inner i loop
	797
	798	DO k = 1, nz
	799
	800	DO i = iouter, iend
	801
	802	ir = i-iouter+1 ! counter within a stride
	803	DO j = 0, ny
	804	work_ffty(j,ir) = f_in(k,j,i)
	805	ENDDO
	806	!
	807	!-- FFT along y
[1106]	808	CALL fft_y_1d( work_ffty(:,ir), 'forward' )
[1]	809
	810	ENDDO
	811
	812	!
	813	!-- Resort
	814	DO j = 0, ny
	815	DO i = iouter, iend
	816	work(i,k,j) = work_ffty(j,i-iouter+1)
	817	ENDDO
	818	ENDDO
	819
	820	ENDDO
	821
	822	ENDDO
[2300]	823	!$OMP END PARALLEL
[1]	824
[2300]	825	DEALLOCATE( work_ffty )
	826
[1]	827	ENDIF
[2300]	828
[1106]	829	CALL cpu_log( log_point_s(7), 'fft_y_1d', 'pause' )
[1]	830
	831	!
	832	!-- Transpose array
[1111]	833	#if defined( __parallel )
[1]	834	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
[622]	835	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
[1]	836	CALL MPI_ALLTOALL( work(nxl,1,0), sendrecvcount_xy, MPI_REAL, &
	837	f_out(1,1,nys_x,1), sendrecvcount_xy, MPI_REAL, &
	838	comm1dx, ierr )
	839	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
[1111]	840	#endif
[1]	841
	842	END SUBROUTINE ffty_tr_yx
	843
	844
	845	!------------------------------------------------------------------------------!
[1682]	846	! Description:
	847	! ------------
	848	!> Transposition x --> y with a subsequent backward Fourier transformation for
	849	!> a 1d-decomposition along x
[1]	850	!------------------------------------------------------------------------------!
[1682]	851	SUBROUTINE tr_xy_ffty( f_in, f_out )
[1]	852
[1320]	853	USE control_parameters, &
[2300]	854	ONLY: loop_optimization
[1320]	855
	856	USE cpulog, &
	857	ONLY: cpu_log, log_point_s
	858
	859	USE kinds
	860
[1]	861	USE pegrid
	862
	863	IMPLICIT NONE
	864
[1682]	865	INTEGER(iwp) :: i !<
	866	INTEGER(iwp) :: iend !<
	867	INTEGER(iwp) :: iouter !<
	868	INTEGER(iwp) :: ir !<
	869	INTEGER(iwp) :: j !<
	870	INTEGER(iwp) :: k !<
[1]	871
[1682]	872	INTEGER(iwp), PARAMETER :: stridex = 4 !<
[1320]	873
[1682]	874	REAL(wp), DIMENSION(nnx,1:nz,nys_x:nyn_x,pdims(1)) :: f_in !<
	875	REAL(wp), DIMENSION(1:nz,0:ny,nxl:nxr) :: f_out !<
	876	REAL(wp), DIMENSION(nxl:nxr,1:nz,0:ny) :: work !<
[1]	877
[2300]	878	REAL(wp), DIMENSION(:,:), ALLOCATABLE :: work_ffty !<
	879	REAL(wp), DIMENSION(:,:,:), ALLOCATABLE :: work_ffty_vec !<
	880
[1]	881	!
	882	!-- Transpose array
[1111]	883	#if defined( __parallel )
[1]	884	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
[622]	885	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
[1]	886	CALL MPI_ALLTOALL( f_in(1,1,nys_x,1), sendrecvcount_xy, MPI_REAL, &
	887	work(nxl,1,0), sendrecvcount_xy, MPI_REAL, &
	888	comm1dx, ierr )
	889	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
[1111]	890	#endif
[1]	891
	892	!
	893	!-- Resort the data in a way that y becomes the first index and carry out the
	894	!-- backward fft along y.
[1106]	895	CALL cpu_log( log_point_s(7), 'fft_y_1d', 'continue' )
[1]	896
[2300]	897	IF ( loop_optimization == 'vector' ) THEN
	898
	899	ALLOCATE( work_ffty_vec(0:ny+1,1:nz,nxl:nxr) )
[1]	900	!
	901	!-- Code optimized for vector processors
[2300]	902	!$OMP PARALLEL PRIVATE ( i, j, k )
	903	!$OMP DO
[1]	904	DO k = 1, nz
	905	DO j = 0, ny
	906	DO i = nxl, nxr
	907	work_ffty_vec(j,k,i) = work(i,k,j)
	908	ENDDO
	909	ENDDO
	910	ENDDO
	911
[2300]	912	!$OMP DO
[1]	913	DO i = nxl, nxr
	914
	915	CALL fft_y_m( work_ffty_vec(:,:,i), ny+1, 'backward' )
	916
	917	DO j = 0, ny
	918	DO k = 1, nz
	919	f_out(k,j,i) = work_ffty_vec(j,k,i)
	920	ENDDO
	921	ENDDO
	922
	923	ENDDO
[2300]	924	!$OMP END PARALLEL
[1]	925
[2300]	926	DEALLOCATE( work_ffty_vec )
	927
[1]	928	ELSE
	929	!
	930	!-- Cache optimized code.
[2300]	931	ALLOCATE( work_ffty(0:ny,stridex) )
	932	!
[1]	933	!-- The i-(x-)direction is split into a strided outer loop and an inner
	934	!-- loop for better cache performance
[2300]	935	!$OMP PARALLEL PRIVATE ( i, iend, iouter, ir, j, k, work_ffty )
	936	!$OMP DO
[1]	937	DO iouter = nxl, nxr, stridex
	938
	939	iend = MIN( iouter+stridex-1, nxr ) ! Upper bound for inner i loop
	940
	941	DO k = 1, nz
	942	!
	943	!-- Resort
	944	DO j = 0, ny
	945	DO i = iouter, iend
	946	work_ffty(j,i-iouter+1) = work(i,k,j)
	947	ENDDO
	948	ENDDO
	949
	950	DO i = iouter, iend
	951
	952	!
	953	!-- FFT along y
	954	ir = i-iouter+1 ! counter within a stride
[1106]	955	CALL fft_y_1d( work_ffty(:,ir), 'backward' )
[1]	956
	957	DO j = 0, ny
	958	f_out(k,j,i) = work_ffty(j,ir)
	959	ENDDO
	960	ENDDO
	961
	962	ENDDO
	963
	964	ENDDO
[2300]	965	!$OMP END PARALLEL
[1]	966
[2300]	967	DEALLOCATE( work_ffty )
	968
[1]	969	ENDIF
	970
[1106]	971	CALL cpu_log( log_point_s(7), 'fft_y_1d', 'stop' )
[1]	972
	973	END SUBROUTINE tr_xy_ffty
	974
	975
	976	!------------------------------------------------------------------------------!
[1682]	977	! Description:
	978	! ------------
	979	!> FFT along x, solution of the tridiagonal system and backward FFT for
	980	!> a 1d-decomposition along x
	981	!>
	982	!> @warning this subroutine may still not work for hybrid parallelization
	983	!> with OpenMP (for possible necessary changes see the original
	984	!> routine poisfft_hybrid, developed by Klaus Ketelsen, May 2002)
[1]	985	!------------------------------------------------------------------------------!
[1682]	986	SUBROUTINE fftx_tri_fftx( ar )
[1]	987
[1320]	988	USE control_parameters, &
[2300]	989	ONLY: loop_optimization
[1320]	990
	991	USE cpulog, &
	992	ONLY: cpu_log, log_point_s
	993
	994	USE grid_variables, &
	995	ONLY: ddx2, ddy2
	996
	997	USE kinds
	998
[1]	999	USE pegrid
	1000
	1001	IMPLICIT NONE
	1002
[1682]	1003	INTEGER(iwp) :: i !<
	1004	INTEGER(iwp) :: j !<
	1005	INTEGER(iwp) :: k !<
	1006	INTEGER(iwp) :: m !<
	1007	INTEGER(iwp) :: n !<
	1008	INTEGER(iwp) :: omp_get_thread_num !<
	1009	INTEGER(iwp) :: tn !<
[1]	1010
[1682]	1011	REAL(wp), DIMENSION(0:nx) :: work_fftx !<
	1012	REAL(wp), DIMENSION(0:nx,1:nz) :: work_trix !<
	1013	REAL(wp), DIMENSION(nnx,1:nz,nys_x:nyn_x,pdims(1)) :: ar !<
	1014	REAL(wp), DIMENSION(:,:,:,:), ALLOCATABLE :: tri !<
[1]	1015
	1016
[1106]	1017	CALL cpu_log( log_point_s(33), 'fft_x_1d + tridia', 'start' )
[1]	1018
	1019	ALLOCATE( tri(5,0:nx,0:nz-1,0:threads_per_task-1) )
	1020
	1021	tn = 0 ! Default thread number in case of one thread
	1022	!$OMP PARALLEL DO PRIVATE ( i, j, k, m, n, tn, work_fftx, work_trix )
	1023	DO j = nys_x, nyn_x
	1024
	1025	!$ tn = omp_get_thread_num()
	1026
[2300]	1027	IF ( loop_optimization == 'vector' ) THEN
[1]	1028	!
	1029	!-- Code optimized for vector processors
	1030	DO k = 1, nz
	1031
	1032	m = 0
	1033	DO n = 1, pdims(1)
[1003]	1034	DO i = 1, nnx
[1]	1035	work_trix(m,k) = ar(i,k,j,n)
	1036	m = m + 1
	1037	ENDDO
	1038	ENDDO
	1039
	1040	ENDDO
	1041
	1042	CALL fft_x_m( work_trix, 'forward' )
	1043
	1044	ELSE
	1045	!
	1046	!-- Cache optimized code
	1047	DO k = 1, nz
	1048
	1049	m = 0
	1050	DO n = 1, pdims(1)
[1003]	1051	DO i = 1, nnx
[1]	1052	work_fftx(m) = ar(i,k,j,n)
	1053	m = m + 1
	1054	ENDDO
	1055	ENDDO
	1056
[1106]	1057	CALL fft_x_1d( work_fftx, 'forward' )
[1]	1058
	1059	DO i = 0, nx
	1060	work_trix(i,k) = work_fftx(i)
	1061	ENDDO
	1062
	1063	ENDDO
	1064
	1065	ENDIF
	1066
	1067	!
	1068	!-- Solve the linear equation system
	1069	CALL tridia_1dd( ddx2, ddy2, nx, ny, j, work_trix, tri(:,:,:,tn) )
	1070
[2300]	1071	IF ( loop_optimization == 'vector' ) THEN
[1]	1072	!
	1073	!-- Code optimized for vector processors
	1074	CALL fft_x_m( work_trix, 'backward' )
	1075
	1076	DO k = 1, nz
	1077
	1078	m = 0
	1079	DO n = 1, pdims(1)
[1003]	1080	DO i = 1, nnx
[1]	1081	ar(i,k,j,n) = work_trix(m,k)
	1082	m = m + 1
	1083	ENDDO
	1084	ENDDO
	1085
	1086	ENDDO
	1087
	1088	ELSE
	1089	!
	1090	!-- Cache optimized code
	1091	DO k = 1, nz
	1092
	1093	DO i = 0, nx
	1094	work_fftx(i) = work_trix(i,k)
	1095	ENDDO
	1096
[1106]	1097	CALL fft_x_1d( work_fftx, 'backward' )
[1]	1098
	1099	m = 0
	1100	DO n = 1, pdims(1)
[1003]	1101	DO i = 1, nnx
[1]	1102	ar(i,k,j,n) = work_fftx(m)
	1103	m = m + 1
	1104	ENDDO
	1105	ENDDO
	1106
	1107	ENDDO
	1108
	1109	ENDIF
	1110
	1111	ENDDO
	1112
	1113	DEALLOCATE( tri )
	1114
[1106]	1115	CALL cpu_log( log_point_s(33), 'fft_x_1d + tridia', 'stop' )
[1]	1116
	1117	END SUBROUTINE fftx_tri_fftx
	1118
	1119
	1120	!------------------------------------------------------------------------------!
[1682]	1121	! Description:
	1122	! ------------
	1123	!> Fourier-transformation along x with subsequent transposition x --> y for
	1124	!> a 1d-decomposition along y.
	1125	!>
	1126	!> @attention NEC-branch of this routine may significantly profit from
	1127	!> further optimizations. So far, performance is much worse than
	1128	!> for routine ffty_tr_yx (more than three times slower).
[1]	1129	!------------------------------------------------------------------------------!
[1682]	1130	SUBROUTINE fftx_tr_xy( f_in, f_out )
[1]	1131
[1682]	1132
[1320]	1133	USE control_parameters, &
[2300]	1134	ONLY: loop_optimization
[1320]	1135
	1136	USE cpulog, &
	1137	ONLY: cpu_log, log_point_s
	1138
	1139	USE kinds
	1140
[1]	1141	USE pegrid
	1142
	1143	IMPLICIT NONE
	1144
[1682]	1145	INTEGER(iwp) :: i !<
	1146	INTEGER(iwp) :: j !<
	1147	INTEGER(iwp) :: k !<
[1]	1148
[1682]	1149	REAL(wp), DIMENSION(0:nx,1:nz,nys:nyn) :: work_fftx !<
	1150	REAL(wp), DIMENSION(1:nz,nys:nyn,0:nx) :: f_in !<
	1151	REAL(wp), DIMENSION(nny,1:nz,nxl_y:nxr_y,pdims(2)) :: f_out !<
	1152	REAL(wp), DIMENSION(nys:nyn,1:nz,0:nx) :: work !<
[1]	1153
	1154	!
	1155	!-- Carry out the FFT along x, where all data are present due to the
	1156	!-- 1d-decomposition along y. Resort the data in a way that y becomes
	1157	!-- the first index.
[1106]	1158	CALL cpu_log( log_point_s(4), 'fft_x_1d', 'start' )
[1]	1159
[2300]	1160	IF ( loop_optimization == 'vector' ) THEN
[1]	1161	!
	1162	!-- Code for vector processors
[85]	1163	!$OMP PARALLEL PRIVATE ( i, j, k )
[1]	1164	!$OMP DO
	1165	DO i = 0, nx
	1166
	1167	DO j = nys, nyn
	1168	DO k = 1, nz
	1169	work_fftx(i,k,j) = f_in(k,j,i)
	1170	ENDDO
	1171	ENDDO
	1172
	1173	ENDDO
	1174
	1175	!$OMP DO
	1176	DO j = nys, nyn
	1177
	1178	CALL fft_x_m( work_fftx(:,:,j), 'forward' )
	1179
	1180	DO k = 1, nz
	1181	DO i = 0, nx
	1182	work(j,k,i) = work_fftx(i,k,j)
	1183	ENDDO
	1184	ENDDO
	1185
	1186	ENDDO
	1187	!$OMP END PARALLEL
	1188
	1189	ELSE
	1190
	1191	!
	1192	!-- Cache optimized code (there might be still a potential for better
	1193	!-- optimization).
[696]	1194	!$OMP PARALLEL PRIVATE (i,j,k)
[1]	1195	!$OMP DO
	1196	DO i = 0, nx
	1197
	1198	DO j = nys, nyn
	1199	DO k = 1, nz
	1200	work_fftx(i,k,j) = f_in(k,j,i)
	1201	ENDDO
	1202	ENDDO
	1203
	1204	ENDDO
	1205
	1206	!$OMP DO
	1207	DO j = nys, nyn
	1208	DO k = 1, nz
	1209
[1106]	1210	CALL fft_x_1d( work_fftx(0:nx,k,j), 'forward' )
[1]	1211
	1212	DO i = 0, nx
	1213	work(j,k,i) = work_fftx(i,k,j)
	1214	ENDDO
	1215	ENDDO
	1216
	1217	ENDDO
	1218	!$OMP END PARALLEL
	1219
	1220	ENDIF
[1106]	1221	CALL cpu_log( log_point_s(4), 'fft_x_1d', 'pause' )
[1]	1222
	1223	!
	1224	!-- Transpose array
[1111]	1225	#if defined( __parallel )
[1]	1226	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
[622]	1227	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
[1]	1228	CALL MPI_ALLTOALL( work(nys,1,0), sendrecvcount_xy, MPI_REAL, &
	1229	f_out(1,1,nxl_y,1), sendrecvcount_xy, MPI_REAL, &
	1230	comm1dy, ierr )
	1231	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
[1111]	1232	#endif
[1]	1233
	1234	END SUBROUTINE fftx_tr_xy
	1235
	1236
	1237	!------------------------------------------------------------------------------!
[1682]	1238	! Description:
	1239	! ------------
	1240	!> Transposition y --> x with a subsequent backward Fourier transformation for
	1241	!> a 1d-decomposition along x.
[1]	1242	!------------------------------------------------------------------------------!
[1682]	1243	SUBROUTINE tr_yx_fftx( f_in, f_out )
[1]	1244
[1682]	1245
[1320]	1246	USE control_parameters, &
[2300]	1247	ONLY: loop_optimization
[1320]	1248
	1249	USE cpulog, &
	1250	ONLY: cpu_log, log_point_s
	1251
	1252	USE kinds
	1253
[1]	1254	USE pegrid
	1255
	1256	IMPLICIT NONE
	1257
[1682]	1258	INTEGER(iwp) :: i !<
	1259	INTEGER(iwp) :: j !<
	1260	INTEGER(iwp) :: k !<
[1]	1261
[1682]	1262	REAL(wp), DIMENSION(0:nx,1:nz,nys:nyn) :: work_fftx !<
	1263	REAL(wp), DIMENSION(nny,1:nz,nxl_y:nxr_y,pdims(2)) :: f_in !<
	1264	REAL(wp), DIMENSION(1:nz,nys:nyn,0:nx) :: f_out !<
	1265	REAL(wp), DIMENSION(nys:nyn,1:nz,0:nx) :: work !<
[1]	1266
	1267	!
	1268	!-- Transpose array
[1111]	1269	#if defined( __parallel )
[1]	1270	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
[622]	1271	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
[1]	1272	CALL MPI_ALLTOALL( f_in(1,1,nxl_y,1), sendrecvcount_xy, MPI_REAL, &
	1273	work(nys,1,0), sendrecvcount_xy, MPI_REAL, &
	1274	comm1dy, ierr )
	1275	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
[1111]	1276	#endif
[1]	1277
	1278	!
	1279	!-- Carry out the FFT along x, where all data are present due to the
	1280	!-- 1d-decomposition along y. Resort the data in a way that y becomes
	1281	!-- the first index.
[1106]	1282	CALL cpu_log( log_point_s(4), 'fft_x_1d', 'continue' )
[1]	1283
[2300]	1284	IF ( loop_optimization == 'vector' ) THEN
[1]	1285	!
	1286	!-- Code optimized for vector processors
[85]	1287	!$OMP PARALLEL PRIVATE ( i, j, k )
[1]	1288	!$OMP DO
	1289	DO j = nys, nyn
	1290
	1291	DO k = 1, nz
	1292	DO i = 0, nx
	1293	work_fftx(i,k,j) = work(j,k,i)
	1294	ENDDO
	1295	ENDDO
	1296
	1297	CALL fft_x_m( work_fftx(:,:,j), 'backward' )
	1298
	1299	ENDDO
	1300
	1301	!$OMP DO
	1302	DO i = 0, nx
	1303	DO j = nys, nyn
	1304	DO k = 1, nz
	1305	f_out(k,j,i) = work_fftx(i,k,j)
	1306	ENDDO
	1307	ENDDO
	1308	ENDDO
	1309	!$OMP END PARALLEL
	1310
	1311	ELSE
	1312
	1313	!
	1314	!-- Cache optimized code (there might be still a potential for better
	1315	!-- optimization).
[696]	1316	!$OMP PARALLEL PRIVATE (i,j,k)
[1]	1317	!$OMP DO
	1318	DO j = nys, nyn
	1319	DO k = 1, nz
	1320
	1321	DO i = 0, nx
	1322	work_fftx(i,k,j) = work(j,k,i)
	1323	ENDDO
	1324
[1106]	1325	CALL fft_x_1d( work_fftx(0:nx,k,j), 'backward' )
[1]	1326
	1327	ENDDO
	1328	ENDDO
	1329
	1330	!$OMP DO
	1331	DO i = 0, nx
	1332	DO j = nys, nyn
	1333	DO k = 1, nz
	1334	f_out(k,j,i) = work_fftx(i,k,j)
	1335	ENDDO
	1336	ENDDO
	1337	ENDDO
	1338	!$OMP END PARALLEL
	1339
	1340	ENDIF
[1106]	1341	CALL cpu_log( log_point_s(4), 'fft_x_1d', 'stop' )
[1]	1342
	1343	END SUBROUTINE tr_yx_fftx
	1344
	1345
	1346	!------------------------------------------------------------------------------!
[1682]	1347	! Description:
	1348	! ------------
	1349	!> FFT along y, solution of the tridiagonal system and backward FFT for
	1350	!> a 1d-decomposition along y.
	1351	!>
	1352	!> @warning this subroutine may still not work for hybrid parallelization
	1353	!> with OpenMP (for possible necessary changes see the original
	1354	!> routine poisfft_hybrid, developed by Klaus Ketelsen, May 2002)
[1]	1355	!------------------------------------------------------------------------------!
[1682]	1356	SUBROUTINE ffty_tri_ffty( ar )
[1]	1357
[1682]	1358
[1320]	1359	USE control_parameters, &
[2300]	1360	ONLY: loop_optimization
[1320]	1361
	1362	USE cpulog, &
	1363	ONLY: cpu_log, log_point_s
	1364
	1365	USE grid_variables, &
	1366	ONLY: ddx2, ddy2
	1367
	1368	USE kinds
	1369
[1]	1370	USE pegrid
	1371
	1372	IMPLICIT NONE
	1373
[1682]	1374	INTEGER(iwp) :: i !<
	1375	INTEGER(iwp) :: j !<
	1376	INTEGER(iwp) :: k !<
	1377	INTEGER(iwp) :: m !<
	1378	INTEGER(iwp) :: n !<
	1379	INTEGER(iwp) :: omp_get_thread_num !<
	1380	INTEGER(iwp) :: tn !<
[1]	1381
[1682]	1382	REAL(wp), DIMENSION(0:ny) :: work_ffty !<
	1383	REAL(wp), DIMENSION(0:ny,1:nz) :: work_triy !<
	1384	REAL(wp), DIMENSION(nny,1:nz,nxl_y:nxr_y,pdims(2)) :: ar !<
	1385	REAL(wp), DIMENSION(:,:,:,:), ALLOCATABLE :: tri !<
[1]	1386
	1387
[1106]	1388	CALL cpu_log( log_point_s(39), 'fft_y_1d + tridia', 'start' )
[1]	1389
	1390	ALLOCATE( tri(5,0:ny,0:nz-1,0:threads_per_task-1) )
	1391
	1392	tn = 0 ! Default thread number in case of one thread
[696]	1393	!$OMP PARALLEL DO PRIVATE ( i, j, k, m, n, tn, work_ffty, work_triy )
[1]	1394	DO i = nxl_y, nxr_y
	1395
	1396	!$ tn = omp_get_thread_num()
	1397
[2300]	1398	IF ( loop_optimization == 'vector' ) THEN
[1]	1399	!
	1400	!-- Code optimized for vector processors
	1401	DO k = 1, nz
	1402
	1403	m = 0
	1404	DO n = 1, pdims(2)
[1003]	1405	DO j = 1, nny
[1]	1406	work_triy(m,k) = ar(j,k,i,n)
	1407	m = m + 1
	1408	ENDDO
	1409	ENDDO
	1410
	1411	ENDDO
	1412
	1413	CALL fft_y_m( work_triy, ny, 'forward' )
	1414
	1415	ELSE
	1416	!
	1417	!-- Cache optimized code
	1418	DO k = 1, nz
	1419
	1420	m = 0
	1421	DO n = 1, pdims(2)
[1003]	1422	DO j = 1, nny
[1]	1423	work_ffty(m) = ar(j,k,i,n)
	1424	m = m + 1
	1425	ENDDO
	1426	ENDDO
	1427
[1106]	1428	CALL fft_y_1d( work_ffty, 'forward' )
[1]	1429
	1430	DO j = 0, ny
	1431	work_triy(j,k) = work_ffty(j)
	1432	ENDDO
	1433
	1434	ENDDO
	1435
	1436	ENDIF
	1437
	1438	!
	1439	!-- Solve the linear equation system
	1440	CALL tridia_1dd( ddy2, ddx2, ny, nx, i, work_triy, tri(:,:,:,tn) )
	1441
[2300]	1442	IF ( loop_optimization == 'vector' ) THEN
[1]	1443	!
	1444	!-- Code optimized for vector processors
	1445	CALL fft_y_m( work_triy, ny, 'backward' )
	1446
	1447	DO k = 1, nz
	1448
	1449	m = 0
	1450	DO n = 1, pdims(2)
[1003]	1451	DO j = 1, nny
[1]	1452	ar(j,k,i,n) = work_triy(m,k)
	1453	m = m + 1
	1454	ENDDO
	1455	ENDDO
	1456
	1457	ENDDO
	1458
	1459	ELSE
	1460	!
	1461	!-- Cache optimized code
	1462	DO k = 1, nz
	1463
	1464	DO j = 0, ny
	1465	work_ffty(j) = work_triy(j,k)
	1466	ENDDO
	1467
[1106]	1468	CALL fft_y_1d( work_ffty, 'backward' )
[1]	1469
	1470	m = 0
	1471	DO n = 1, pdims(2)
[1003]	1472	DO j = 1, nny
[1]	1473	ar(j,k,i,n) = work_ffty(m)
	1474	m = m + 1
	1475	ENDDO
	1476	ENDDO
	1477
	1478	ENDDO
	1479
	1480	ENDIF
	1481
	1482	ENDDO
	1483
	1484	DEALLOCATE( tri )
	1485
[1106]	1486	CALL cpu_log( log_point_s(39), 'fft_y_1d + tridia', 'stop' )
[1]	1487
	1488	END SUBROUTINE ffty_tri_ffty
	1489
	1490	END MODULE poisfft_mod

Note: See TracBrowser for help on using the repository browser.

Download in other formats:

| Impressum | ©Leibniz Universität Hannover |