Home

Context Navigation

source: palm/trunk/SOURCE/poisfft_mod.f90 @ 3210

Last change on this file since 3210 was 2718, checked in by maronga, 7 years ago
deleting of deprecated files; headers updated where needed
Property svn:keywords set to `Id`
File size: 45.6 KB

Rev	Line
[1850]	1	!> @file poisfft_mod.f90
[2000]	2	!------------------------------------------------------------------------------!
[2696]	3	! This file is part of the PALM model system.
[1036]	4	!
[2000]	5	! PALM is free software: you can redistribute it and/or modify it under the
	6	! terms of the GNU General Public License as published by the Free Software
	7	! Foundation, either version 3 of the License, or (at your option) any later
	8	! version.
[1036]	9	!
	10	! PALM is distributed in the hope that it will be useful, but WITHOUT ANY
	11	! WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
	12	! A PARTICULAR PURPOSE. See the GNU General Public License for more details.
	13	!
	14	! You should have received a copy of the GNU General Public License along with
	15	! PALM. If not, see <http://www.gnu.org/licenses/>.
	16	!
[2718]	17	! Copyright 1997-2018 Leibniz Universitaet Hannover
[2000]	18	!------------------------------------------------------------------------------!
[1036]	19	!
[484]	20	! Current revisions:
[1]	21	! -----------------
[1683]	22	!
[2119]	23	!
[1321]	24	! Former revisions:
	25	! -----------------
	26	! $Id: poisfft_mod.f90 2718 2018-01-02 08:49:38Z sward $
[2716]	27	! Corrected "Former revisions" section
	28	!
	29	! 2696 2017-12-14 17:12:51Z kanani
	30	! Change in file header (GPL part)
	31	!
	32	! 2300 2017-06-29 13:31:14Z raasch
[2300]	33	! settings depending on host variable removed or replaced by loop_optimization
	34	!
	35	! 2119 2017-01-17 16:51:50Z raasch
[1321]	36	!
[2119]	37	! 2118 2017-01-17 16:38:49Z raasch
	38	! OpenACC directives and related code removed
	39	!
[2001]	40	! 2000 2016-08-20 18:09:15Z knoop
	41	! Forced header and separation lines into 80 columns
	42	!
[1851]	43	! 1850 2016-04-08 13:29:27Z maronga
	44	! Module renamed
	45	!
	46	!
[1805]	47	! 1804 2016-04-05 16:30:18Z maronga
	48	! Removed code for parameter file check (__check)
	49	!
[1683]	50	! 1682 2015-10-07 23:56:08Z knoop
	51	! Code annotations made doxygen readable
	52	!
[1483]	53	! 1482 2014-10-18 12:34:45Z raasch
	54	! use 2d-decomposition, if accelerator boards are used
	55	!
[1407]	56	! 1406 2014-05-16 13:47:01Z raasch
	57	! bugfix for pgi 14.4: declare create moved after array declaration
	58	!
[1321]	59	! 1320 2014-03-20 08:40:49Z raasch
[1320]	60	! ONLY-attribute added to USE-statements,
	61	! kind-parameters added to all INTEGER and REAL declaration statements,
	62	! kinds are defined in new module kinds,
	63	! old module precision_kind is removed,
	64	! revision history before 2012 removed,
	65	! comment fields (!:) to be used for variable explanations added to
	66	! all variable declaration statements
[1217]	67	!
[1319]	68	! 1318 2014-03-17 13:35:16Z raasch
	69	! module interfaces removed
	70	!
[1307]	71	! 1306 2014-03-13 14:30:59Z raasch
	72	! openmp sections removed from the overlap branch,
	73	! second argument removed from parameter list
	74	!
[1217]	75	! 1216 2013-08-26 09:31:42Z raasch
[1216]	76	! resorting of arrays moved to separate routines resort_for_...,
	77	! one argument, used as temporary work array, removed from all transpose
	78	! routines
	79	! overlapping fft / transposition implemented
[1112]	80	!
[1213]	81	! 1212 2013-08-15 08:46:27Z raasch
	82	! tridia routines moved to seperate module tridia_solver
	83	!
[1209]	84	! 1208 2013-08-13 06:41:49Z raasch
	85	! acc-update clauses added for "ar" so that ffts other than cufft can also be
	86	! used (although they are not ported and will give a poor performance)
	87	!
[1112]	88	! 1111 2013-03-08 23:54:10Z raasch
[1111]	89	! further openACC porting of non-parallel (MPI) branch:
	90	! tridiagonal routines split into extermal subroutines (instead using CONTAINS),
	91	! no distinction between parallel/non-parallel in poisfft and tridia any more,
[1112]	92	! tridia routines moved to end of file because of probable bug in PGI compiler 12.5
[1111]	93	! (otherwise "invalid device function" is indicated during runtime),
	94	! optimization of tridia routines: constant elements and coefficients of tri are
	95	! stored in seperate arrays ddzuw and tric, last dimension of tri reduced from 5
	96	! to 2,
	97	! poisfft_init is now called internally from poisfft, maketri is called from
	98	! poisfft_init,
	99	! ibc_p_b = 2 removed
[1]	100	!
[1107]	101	! 1106 2013-03-04 05:31:38Z raasch
	102	! routines fftx, ffty, fftxp, fftyp removed, calls replaced by fft_x, fft_y,
	103	! in the 1D-decomposition routines fft_x, ffty are replaced by fft_x_1d,
	104	! fft_y_1d
	105	!
[1104]	106	! 1103 2013-02-20 02:15:53Z raasch
	107	! tri, ar, and ar1 arguments in tridia-routines (2d) are removed because they
	108	! sometimes cause segmentation faults with intel 12.1 compiler
	109	!
[1093]	110	! 1092 2013-02-02 11:24:22Z raasch
	111	! unused variables removed
	112	!
[1037]	113	! 1036 2012-10-22 13:43:42Z raasch
	114	! code put under GPL (PALM 3.9)
	115	!
[1014]	116	! 2012-09-21 07:03:55Z raasch
	117	! FLOAT type conversion replaced by REAL
	118	!
[1004]	119	! 1003 2012-09-14 14:35:53Z raasch
	120	! indices nxa, nya, etc. replaced by nx, ny, etc.
	121	!
[941]	122	! 940 2012-07-09 14:31:00Z raasch
	123	! special handling of tri-array as an argument in tridia_1dd routines switched
	124	! off because it caused segmentation faults with intel 12.1 compiler
	125	!
[878]	126	! 877 2012-04-03 11:21:44Z suehring
	127	! Bugfix: Avoid divisions by zero in case of using a 'neumann' bc for the
	128	! pressure at the top of the model domain.
	129	!
[810]	130	! 809 2012-01-30 13:32:58Z maronga
	131	! Bugfix: replaced .AND. and .NOT. with && and ! in the preprocessor directives
	132	!
[808]	133	! 807 2012-01-25 11:53:51Z maronga
	134	! New cpp directive "__check" implemented which is used by check_namelist_files
	135	! (most of the code is unneeded by check_namelist_files).
	136	!
[1]	137	! Revision 1.1 1997/07/24 11:24:14 raasch
	138	! Initial revision
	139	!
	140	!
	141	! Description:
	142	! ------------
[1682]	143	!> Solves the Poisson equation with a 2D spectral method
	144	!> d^2 p / dx^2 + d^2 p / dy^2 + d^2 p / dz^2 = s
	145	!>
	146	!> Input:
	147	!> real ar contains (nnz,nny,nnx) elements of the velocity divergence,
	148	!> starting from (1,nys,nxl)
	149	!>
	150	!> Output:
	151	!> real ar contains the solution for perturbation pressure p
[1]	152	!------------------------------------------------------------------------------!
[1682]	153	MODULE poisfft_mod
	154
[1]	155
[1320]	156	USE fft_xy, &
	157	ONLY: fft_init, fft_y, fft_y_1d, fft_y_m, fft_x, fft_x_1d, fft_x_m
[1]	158
[1320]	159	USE indices, &
	160	ONLY: nnx, nny, nx, nxl, nxr, ny, nys, nyn, nz
	161
	162	USE transpose_indices, &
	163	ONLY: nxl_y, nxl_z, nxr_y, nxr_z, nys_x, nys_z, nyn_x, nyn_z, nzb_x, &
	164	nzb_y, nzt_x, nzt_y
	165
	166	USE tridia_solver, &
	167	ONLY: tridia_1dd, tridia_init, tridia_substi, tridia_substi_overlap
	168
[1]	169	IMPLICIT NONE
	170
[1111]	171	LOGICAL, SAVE :: poisfft_initialized = .FALSE.
	172
[1]	173	PRIVATE
[807]	174
[1]	175	PUBLIC poisfft, poisfft_init
	176
	177	INTERFACE poisfft
	178	MODULE PROCEDURE poisfft
	179	END INTERFACE poisfft
	180
	181	INTERFACE poisfft_init
	182	MODULE PROCEDURE poisfft_init
	183	END INTERFACE poisfft_init
	184
[807]	185
[1]	186	CONTAINS
	187
[1682]	188	!------------------------------------------------------------------------------!
	189	! Description:
	190	! ------------
	191	!> @todo Missing subroutine description.
	192	!------------------------------------------------------------------------------!
[1]	193	SUBROUTINE poisfft_init
	194
[1320]	195	USE arrays_3d, &
	196	ONLY: ddzu_pres, ddzw
[1111]	197
[1320]	198	USE kinds
	199
[1111]	200	IMPLICIT NONE
	201
[1682]	202	INTEGER(iwp) :: k !<
[1111]	203
	204
[1]	205	CALL fft_init
	206
[1212]	207	CALL tridia_init
[1111]	208
	209	poisfft_initialized = .TRUE.
	210
[1]	211	END SUBROUTINE poisfft_init
	212
[1111]	213
[1804]	214
[1682]	215	!------------------------------------------------------------------------------!
	216	! Description:
	217	! ------------
	218	!> Two-dimensional Fourier Transformation in x- and y-direction.
	219	!------------------------------------------------------------------------------!
[1306]	220	SUBROUTINE poisfft( ar )
[1]	221
[1320]	222	USE control_parameters, &
	223	ONLY: fft_method, transpose_compute_overlap
	224
	225	USE cpulog, &
	226	ONLY: cpu_log, cpu_log_nowait, log_point_s
	227
	228	USE kinds
	229
[1]	230	USE pegrid
	231
	232	IMPLICIT NONE
	233
[1682]	234	INTEGER(iwp) :: ii !<
	235	INTEGER(iwp) :: iind !<
	236	INTEGER(iwp) :: inew !<
	237	INTEGER(iwp) :: jj !<
	238	INTEGER(iwp) :: jind !<
	239	INTEGER(iwp) :: jnew !<
	240	INTEGER(iwp) :: ki !<
	241	INTEGER(iwp) :: kk !<
	242	INTEGER(iwp) :: knew !<
	243	INTEGER(iwp) :: n !<
	244	INTEGER(iwp) :: nblk !<
	245	INTEGER(iwp) :: nnx_y !<
	246	INTEGER(iwp) :: nny_z !<
	247	INTEGER(iwp) :: nnz_t !<
	248	INTEGER(iwp) :: nnz_x !<
	249	INTEGER(iwp) :: nxl_y_bound !<
	250	INTEGER(iwp) :: nxr_y_bound !<
[1]	251
[1682]	252	INTEGER(iwp), DIMENSION(4) :: isave !<
[1320]	253
[1682]	254	REAL(wp), DIMENSION(1:nz,nys:nyn,nxl:nxr) :: ar !<
	255	REAL(wp), DIMENSION(nys:nyn,nxl:nxr,1:nz) :: ar_inv !<
[1]	256
[1682]	257	REAL(wp), DIMENSION(:,:,:), ALLOCATABLE :: ar1 !<
	258	REAL(wp), DIMENSION(:,:,:), ALLOCATABLE :: f_in !<
	259	REAL(wp), DIMENSION(:,:,:), ALLOCATABLE :: f_inv !<
	260	REAL(wp), DIMENSION(:,:,:), ALLOCATABLE :: f_out_y !<
	261	REAL(wp), DIMENSION(:,:,:), ALLOCATABLE :: f_out_z !<
[1216]	262
	263
[1]	264	CALL cpu_log( log_point_s(3), 'poisfft', 'start' )
	265
[1111]	266	IF ( .NOT. poisfft_initialized ) CALL poisfft_init
	267
[1]	268	!
	269	!-- Two-dimensional Fourier Transformation in x- and y-direction.
[2118]	270	IF ( pdims(2) == 1 .AND. pdims(1) > 1 ) THEN
[1]	271
	272	!
	273	!-- 1d-domain-decomposition along x:
	274	!-- FFT along y and transposition y --> x
[1216]	275	CALL ffty_tr_yx( ar, ar )
[1]	276
	277	!
	278	!-- FFT along x, solving the tridiagonal system and backward FFT
	279	CALL fftx_tri_fftx( ar )
	280
	281	!
	282	!-- Transposition x --> y and backward FFT along y
[1216]	283	CALL tr_xy_ffty( ar, ar )
[1]	284
[2118]	285	ELSEIF ( pdims(1) == 1 .AND. pdims(2) > 1 ) THEN
[1]	286
	287	!
	288	!-- 1d-domain-decomposition along y:
	289	!-- FFT along x and transposition x --> y
[1216]	290	CALL fftx_tr_xy( ar, ar )
[1]	291
	292	!
	293	!-- FFT along y, solving the tridiagonal system and backward FFT
	294	CALL ffty_tri_ffty( ar )
	295
	296	!
	297	!-- Transposition y --> x and backward FFT along x
[1216]	298	CALL tr_yx_fftx( ar, ar )
[1]	299
[1216]	300	ELSEIF ( .NOT. transpose_compute_overlap ) THEN
[1]	301
	302	!
[1111]	303	!-- 2d-domain-decomposition or no decomposition (1 PE run)
[1]	304	!-- Transposition z --> x
	305	CALL cpu_log( log_point_s(5), 'transpo forward', 'start' )
[1216]	306	CALL resort_for_zx( ar, ar_inv )
	307	CALL transpose_zx( ar_inv, ar )
[1]	308	CALL cpu_log( log_point_s(5), 'transpo forward', 'pause' )
	309
	310	CALL cpu_log( log_point_s(4), 'fft_x', 'start' )
[1106]	311	CALL fft_x( ar, 'forward' )
[1]	312	CALL cpu_log( log_point_s(4), 'fft_x', 'pause' )
	313
	314	!
	315	!-- Transposition x --> y
	316	CALL cpu_log( log_point_s(5), 'transpo forward', 'continue' )
[1216]	317	CALL resort_for_xy( ar, ar_inv )
	318	CALL transpose_xy( ar_inv, ar )
[1]	319	CALL cpu_log( log_point_s(5), 'transpo forward', 'pause' )
	320
	321	CALL cpu_log( log_point_s(7), 'fft_y', 'start' )
[1216]	322	CALL fft_y( ar, 'forward', ar_tr = ar, &
	323	nxl_y_bound = nxl_y, nxr_y_bound = nxr_y, &
	324	nxl_y_l = nxl_y, nxr_y_l = nxr_y )
[1]	325	CALL cpu_log( log_point_s(7), 'fft_y', 'pause' )
	326
	327	!
	328	!-- Transposition y --> z
	329	CALL cpu_log( log_point_s(5), 'transpo forward', 'continue' )
[1216]	330	CALL resort_for_yz( ar, ar_inv )
	331	CALL transpose_yz( ar_inv, ar )
[1]	332	CALL cpu_log( log_point_s(5), 'transpo forward', 'stop' )
	333
	334	!
[1106]	335	!-- Solve the tridiagonal equation system along z
[1]	336	CALL cpu_log( log_point_s(6), 'tridia', 'start' )
[1212]	337	CALL tridia_substi( ar )
[1]	338	CALL cpu_log( log_point_s(6), 'tridia', 'stop' )
	339
	340	!
	341	!-- Inverse Fourier Transformation
	342	!-- Transposition z --> y
	343	CALL cpu_log( log_point_s(8), 'transpo invers', 'start' )
[1216]	344	CALL transpose_zy( ar, ar_inv )
	345	CALL resort_for_zy( ar_inv, ar )
[1]	346	CALL cpu_log( log_point_s(8), 'transpo invers', 'pause' )
	347
	348	CALL cpu_log( log_point_s(7), 'fft_y', 'continue' )
[1216]	349	CALL fft_y( ar, 'backward', ar_tr = ar, &
	350	nxl_y_bound = nxl_y, nxr_y_bound = nxr_y, &
	351	nxl_y_l = nxl_y, nxr_y_l = nxr_y )
[1]	352	CALL cpu_log( log_point_s(7), 'fft_y', 'stop' )
	353
	354	!
	355	!-- Transposition y --> x
	356	CALL cpu_log( log_point_s(8), 'transpo invers', 'continue' )
[1216]	357	CALL transpose_yx( ar, ar_inv )
	358	CALL resort_for_yx( ar_inv, ar )
[1]	359	CALL cpu_log( log_point_s(8), 'transpo invers', 'pause' )
	360
	361	CALL cpu_log( log_point_s(4), 'fft_x', 'continue' )
[1106]	362	CALL fft_x( ar, 'backward' )
[1]	363	CALL cpu_log( log_point_s(4), 'fft_x', 'stop' )
	364
	365	!
	366	!-- Transposition x --> z
	367	CALL cpu_log( log_point_s(8), 'transpo invers', 'continue' )
[1216]	368	CALL transpose_xz( ar, ar_inv )
	369	CALL resort_for_xz( ar_inv, ar )
[1]	370	CALL cpu_log( log_point_s(8), 'transpo invers', 'stop' )
	371
[1216]	372	ELSE
	373
	374	!
	375	!-- 2d-domain-decomposition or no decomposition (1 PE run) with
	376	!-- overlapping transposition / fft
[1318]	377	!-- cputime logging must not use barriers, which would prevent overlapping
[1216]	378	ALLOCATE( f_out_y(0:ny,nxl_y:nxr_y,nzb_y:nzt_y), &
	379	f_out_z(0:nx,nys_x:nyn_x,nzb_x:nzt_x) )
	380	!
	381	!-- Transposition z --> x + subsequent fft along x
	382	ALLOCATE( f_inv(nys:nyn,nxl:nxr,1:nz) )
	383	CALL resort_for_zx( ar, f_inv )
	384	!
	385	!-- Save original indices and gridpoint counter
	386	isave(1) = nz
	387	isave(2) = nzb_x
	388	isave(3) = nzt_x
	389	isave(4) = sendrecvcount_zx
	390	!
	391	!-- Set new indices for transformation
	392	nblk = nz / pdims(1)
	393	nz = pdims(1)
	394	nnz_x = 1
	395	nzb_x = 1 + myidx * nnz_x
	396	nzt_x = ( myidx + 1 ) * nnz_x
	397	sendrecvcount_zx = nnx * nny * nnz_x
	398
[1306]	399	ALLOCATE( ar1(0:nx,nys_x:nyn_x,nzb_x:nzt_x) )
[1216]	400	ALLOCATE( f_in(nys:nyn,nxl:nxr,1:nz) )
	401
[1306]	402	DO kk = 1, nblk
[1216]	403
[1306]	404	IF ( kk == 1 ) THEN
[1318]	405	CALL cpu_log( log_point_s(5), 'transpo forward', 'start', cpu_log_nowait )
[1306]	406	ELSE
[1318]	407	CALL cpu_log( log_point_s(5), 'transpo forward', 'continue', cpu_log_nowait )
[1306]	408	ENDIF
[1216]	409
[1306]	410	DO knew = 1, nz
	411	ki = kk + nblk * ( knew - 1 )
	412	f_in(:,:,knew) = f_inv(:,:,ki)
	413	ENDDO
[1216]	414
[1306]	415	CALL transpose_zx( f_in, ar1(:,:,:))
	416	CALL cpu_log( log_point_s(5), 'transpo forward', 'pause' )
[1216]	417
[1306]	418	IF ( kk == 1 ) THEN
[1318]	419	CALL cpu_log( log_point_s(4), 'fft_x', 'start', cpu_log_nowait )
[1306]	420	ELSE
[1318]	421	CALL cpu_log( log_point_s(4), 'fft_x', 'continue', cpu_log_nowait )
[1216]	422	ENDIF
	423
[1306]	424	n = isave(2) + kk - 1
	425	CALL fft_x( ar1(:,:,:), 'forward', ar_2d = f_out_z(:,:,n))
	426	CALL cpu_log( log_point_s(4), 'fft_x', 'pause' )
[1216]	427
	428	ENDDO
	429	!
	430	!-- Restore original indices/counters
	431	nz = isave(1)
	432	nzb_x = isave(2)
	433	nzt_x = isave(3)
	434	sendrecvcount_zx = isave(4)
	435
	436	DEALLOCATE( ar1, f_in, f_inv )
	437
	438	!
	439	!-- Transposition x --> y + subsequent fft along y
	440	ALLOCATE( f_inv(nys_x:nyn_x,nzb_x:nzt_x,0:nx) )
	441	CALL resort_for_xy( f_out_z, f_inv )
	442	!
	443	!-- Save original indices and gridpoint counter
	444	isave(1) = nx
	445	isave(2) = nxl_y
	446	isave(3) = nxr_y
	447	isave(4) = sendrecvcount_xy
	448	!
	449	!-- Set new indices for transformation
	450	nblk = ( ( nx+1 ) / pdims(2) ) - 1
	451	nx = pdims(2)
	452	nnx_y = 1
	453	nxl_y = myidy * nnx_y
	454	nxr_y = ( myidy + 1 ) * nnx_y - 1
	455	sendrecvcount_xy = nnx_y * ( nyn_x-nys_x+1 ) * ( nzt_x-nzb_x+1 )
	456
[1306]	457	ALLOCATE( ar1(0:ny,nxl_y:nxr_y,nzb_y:nzt_y) )
[1216]	458	ALLOCATE( f_in(nys_x:nyn_x,nzb_x:nzt_x,0:nx) )
	459
[1306]	460	DO ii = 0, nblk
[1216]	461
[1318]	462	CALL cpu_log( log_point_s(5), 'transpo forward', 'continue', cpu_log_nowait )
[1216]	463
[1306]	464	DO inew = 0, nx-1
	465	iind = ii + ( nblk + 1 ) * inew
	466	f_in(:,:,inew) = f_inv(:,:,iind)
	467	ENDDO
[1216]	468
[1306]	469	CALL transpose_xy( f_in, ar1(:,:,:) )
[1216]	470
[1306]	471	CALL cpu_log( log_point_s(5), 'transpo forward', 'pause' )
[1216]	472
[1306]	473	IF ( ii == 1 ) THEN
[1318]	474	CALL cpu_log( log_point_s(7), 'fft_y', 'start', cpu_log_nowait )
[1306]	475	ELSE
[1318]	476	CALL cpu_log( log_point_s(7), 'fft_y', 'continue', cpu_log_nowait )
[1216]	477	ENDIF
	478
[1306]	479	nxl_y_bound = isave(2)
	480	nxr_y_bound = isave(3)
	481	n = isave(2) + ii
	482	CALL fft_y( ar1(:,:,:), 'forward', ar_tr = f_out_y, &
	483	nxl_y_bound = nxl_y_bound, nxr_y_bound = nxr_y_bound, &
	484	nxl_y_l = n, nxr_y_l = n )
[1216]	485
[1306]	486	CALL cpu_log( log_point_s(7), 'fft_y', 'pause' )
[1216]	487
	488	ENDDO
	489	!
	490	!-- Restore original indices/counters
	491	nx = isave(1)
	492	nxl_y = isave(2)
	493	nxr_y = isave(3)
	494	sendrecvcount_xy = isave(4)
	495
	496	DEALLOCATE( ar1, f_in, f_inv )
	497
	498	!
	499	!-- Transposition y --> z + subsequent tridia + resort for z --> y
	500	ALLOCATE( f_inv(nxl_y:nxr_y,nzb_y:nzt_y,0:ny) )
	501	CALL resort_for_yz( f_out_y, f_inv )
	502	!
	503	!-- Save original indices and gridpoint counter
	504	isave(1) = ny
	505	isave(2) = nys_z
	506	isave(3) = nyn_z
	507	isave(4) = sendrecvcount_yz
	508	!
	509	!-- Set new indices for transformation
	510	nblk = ( ( ny+1 ) / pdims(1) ) - 1
	511	ny = pdims(1)
	512	nny_z = 1
	513	nys_z = myidx * nny_z
	514	nyn_z = ( myidx + 1 ) * nny_z - 1
	515	sendrecvcount_yz = ( nxr_y-nxl_y+1 ) * nny_z * ( nzt_y-nzb_y+1 )
	516
[1306]	517	ALLOCATE( ar1(nxl_z:nxr_z,nys_z:nyn_z,1:nz) )
[1216]	518	ALLOCATE( f_in(nxl_y:nxr_y,nzb_y:nzt_y,0:ny) )
	519
[1306]	520	DO jj = 0, nblk
[1216]	521	!
[1306]	522	!-- Forward Fourier Transformation
	523	!-- Transposition y --> z
[1318]	524	CALL cpu_log( log_point_s(5), 'transpo forward', 'continue', cpu_log_nowait )
[1216]	525
[1306]	526	DO jnew = 0, ny-1
	527	jind = jj + ( nblk + 1 ) * jnew
	528	f_in(:,:,jnew) = f_inv(:,:,jind)
	529	ENDDO
[1216]	530
[1306]	531	CALL transpose_yz( f_in, ar1(:,:,:) )
[1216]	532
[1306]	533	IF ( jj == nblk ) THEN
	534	CALL cpu_log( log_point_s(5), 'transpo forward', 'stop' )
	535	ELSE
	536	CALL cpu_log( log_point_s(5), 'transpo forward', 'pause' )
[1216]	537	ENDIF
	538
	539	!
[1306]	540	!-- Solve the tridiagonal equation system along z
[1318]	541	CALL cpu_log( log_point_s(6), 'tridia', 'start', cpu_log_nowait )
[1216]	542
[1306]	543	n = isave(2) + jj
	544	CALL tridia_substi_overlap( ar1(:,:,:), n )
[1216]	545
[1306]	546	CALL cpu_log( log_point_s(6), 'tridia', 'stop' )
[1216]	547
[1306]	548	!
	549	!-- Inverse Fourier Transformation
	550	!-- Transposition z --> y
	551	!-- Only one thread should call MPI routines, therefore forward and
	552	!-- backward tranpose are in the same section
	553	IF ( jj == 0 ) THEN
[1318]	554	CALL cpu_log( log_point_s(8), 'transpo invers', 'start', cpu_log_nowait )
[1306]	555	ELSE
[1318]	556	CALL cpu_log( log_point_s(8), 'transpo invers', 'continue', cpu_log_nowait )
[1216]	557	ENDIF
	558
[1306]	559	CALL transpose_zy( ar1(:,:,:), f_in )
[1216]	560
[1306]	561	DO jnew = 0, ny-1
	562	jind = jj + ( nblk + 1 ) * jnew
	563	f_inv(:,:,jind) = f_in(:,:,jnew)
	564	ENDDO
[1216]	565
[1306]	566	CALL cpu_log( log_point_s(8), 'transpo invers', 'pause' )
[1216]	567
	568	ENDDO
	569	!
	570	!-- Restore original indices/counters
	571	ny = isave(1)
	572	nys_z = isave(2)
	573	nyn_z = isave(3)
	574	sendrecvcount_yz = isave(4)
	575
	576	CALL resort_for_zy( f_inv, f_out_y )
	577
	578	DEALLOCATE( ar1, f_in, f_inv )
	579
	580	!
	581	!-- fft along y backward + subsequent transposition y --> x
	582	ALLOCATE( f_inv(nys_x:nyn_x,nzb_x:nzt_x,0:nx) )
	583	!
	584	!-- Save original indices and gridpoint counter
	585	isave(1) = nx
	586	isave(2) = nxl_y
	587	isave(3) = nxr_y
	588	isave(4) = sendrecvcount_xy
	589	!
	590	!-- Set new indices for transformation
	591	nblk = (( nx+1 ) / pdims(2) ) - 1
	592	nx = pdims(2)
	593	nnx_y = 1
	594	nxl_y = myidy * nnx_y
	595	nxr_y = ( myidy + 1 ) * nnx_y - 1
	596	sendrecvcount_xy = nnx_y * ( nyn_x-nys_x+1 ) * ( nzt_x-nzb_x+1 )
	597
[1306]	598	ALLOCATE( ar1(0:ny,nxl_y:nxr_y,nzb_y:nzt_y) )
[1216]	599	ALLOCATE( f_in(nys_x:nyn_x,nzb_x:nzt_x,0:nx) )
	600
[1306]	601	DO ii = 0, nblk
[1216]	602
[1318]	603	CALL cpu_log( log_point_s(7), 'fft_y', 'continue', cpu_log_nowait )
[1216]	604
[1306]	605	n = isave(2) + ii
	606	nxl_y_bound = isave(2)
	607	nxr_y_bound = isave(3)
[1216]	608
[1306]	609	CALL fft_y( ar1(:,:,:), 'backward', ar_tr = f_out_y, &
	610	nxl_y_bound = nxl_y_bound, nxr_y_bound = nxr_y_bound, &
	611	nxl_y_l = n, nxr_y_l = n )
[1216]	612
[1306]	613	IF ( ii == nblk ) THEN
	614	CALL cpu_log( log_point_s(7), 'fft_y', 'stop' )
	615	ELSE
	616	CALL cpu_log( log_point_s(7), 'fft_y', 'pause' )
[1216]	617	ENDIF
	618
[1318]	619	CALL cpu_log( log_point_s(8), 'transpo invers', 'continue', cpu_log_nowait )
[1216]	620
[1306]	621	CALL transpose_yx( ar1(:,:,:), f_in )
[1216]	622
[1306]	623	DO inew = 0, nx-1
	624	iind = ii + (nblk+1) * inew
	625	f_inv(:,:,iind) = f_in(:,:,inew)
	626	ENDDO
[1216]	627
[1306]	628	CALL cpu_log( log_point_s(8), 'transpo invers', 'pause' )
[1216]	629
	630	ENDDO
	631	!
	632	!-- Restore original indices/counters
	633	nx = isave(1)
	634	nxl_y = isave(2)
	635	nxr_y = isave(3)
	636	sendrecvcount_xy = isave(4)
	637
	638	CALL resort_for_yx( f_inv, f_out_z )
	639
	640	DEALLOCATE( ar1, f_in, f_inv )
	641
	642	!
	643	!-- fft along x backward + subsequent final transposition x --> z
	644	ALLOCATE( f_inv(nys:nyn,nxl:nxr,1:nz) )
	645	!
	646	!-- Save original indices and gridpoint counter
	647	isave(1) = nz
	648	isave(2) = nzb_x
	649	isave(3) = nzt_x
	650	isave(4) = sendrecvcount_zx
	651	!
	652	!-- Set new indices for transformation
	653	nblk = nz / pdims(1)
	654	nz = pdims(1)
	655	nnz_x = 1
	656	nzb_x = 1 + myidx * nnz_x
	657	nzt_x = ( myidx + 1 ) * nnz_x
	658	sendrecvcount_zx = nnx * nny * nnz_x
	659
[1306]	660	ALLOCATE( ar1(0:nx,nys_x:nyn_x,nzb_x:nzt_x) )
[1216]	661	ALLOCATE( f_in(nys:nyn,nxl:nxr,1:nz) )
	662
[1306]	663	DO kk = 1, nblk
[1216]	664
[1318]	665	CALL cpu_log( log_point_s(4), 'fft_x', 'continue', cpu_log_nowait )
[1216]	666
[1306]	667	n = isave(2) + kk - 1
	668	CALL fft_x( ar1(:,:,:), 'backward', f_out_z(:,:,n))
[1216]	669
[1306]	670	IF ( kk == nblk ) THEN
	671	CALL cpu_log( log_point_s(4), 'fft_x', 'stop' )
	672	ELSE
	673	CALL cpu_log( log_point_s(4), 'fft_x', 'pause' )
[1216]	674	ENDIF
	675
[1318]	676	CALL cpu_log( log_point_s(8), 'transpo invers', 'continue', cpu_log_nowait )
[1216]	677
[1306]	678	CALL transpose_xz( ar1(:,:,:), f_in )
[1216]	679
[1306]	680	DO knew = 1, nz
	681	ki = kk + nblk * (knew-1)
	682	f_inv(:,:,ki) = f_in(:,:,knew)
	683	ENDDO
[1216]	684
[1306]	685	IF ( kk == nblk ) THEN
	686	CALL cpu_log( log_point_s(8), 'transpo invers', 'stop' )
	687	ELSE
	688	CALL cpu_log( log_point_s(8), 'transpo invers', 'pause' )
[1216]	689	ENDIF
	690
	691	ENDDO
	692	!
	693	!-- Restore original indices/counters
	694	nz = isave(1)
	695	nzb_x = isave(2)
	696	nzt_x = isave(3)
	697	sendrecvcount_zx = isave(4)
	698
	699	CALL resort_for_xz( f_inv, ar )
	700
	701	DEALLOCATE( ar1, f_in, f_inv )
	702
[1]	703	ENDIF
	704
	705	CALL cpu_log( log_point_s(3), 'poisfft', 'stop' )
	706
	707	END SUBROUTINE poisfft
	708
	709
	710	!------------------------------------------------------------------------------!
[1682]	711	! Description:
	712	! ------------
	713	!> Fourier-transformation along y with subsequent transposition y --> x for
	714	!> a 1d-decomposition along x.
	715	!>
	716	!> @attention The performance of this routine is much faster on the NEC-SX6,
	717	!> if the first index of work_ffty_vec is odd. Otherwise
	718	!> memory bank conflicts may occur (especially if the index is a
	719	!> multiple of 128). That's why work_ffty_vec is dimensioned as
	720	!> 0:ny+1.
	721	!> Of course, this will not work if users are using an odd number
	722	!> of gridpoints along y.
[1]	723	!------------------------------------------------------------------------------!
[1682]	724	SUBROUTINE ffty_tr_yx( f_in, f_out )
[1]	725
[1320]	726	USE control_parameters, &
[2300]	727	ONLY: loop_optimization
[1320]	728
	729	USE cpulog, &
	730	ONLY: cpu_log, log_point_s
	731
	732	USE kinds
	733
[1]	734	USE pegrid
	735
	736	IMPLICIT NONE
	737
[1682]	738	INTEGER(iwp) :: i !<
	739	INTEGER(iwp) :: iend !<
	740	INTEGER(iwp) :: iouter !<
	741	INTEGER(iwp) :: ir !<
	742	INTEGER(iwp) :: j !<
	743	INTEGER(iwp) :: k !<
[1]	744
[1682]	745	INTEGER(iwp), PARAMETER :: stridex = 4 !<
[1320]	746
[1682]	747	REAL(wp), DIMENSION(1:nz,0:ny,nxl:nxr) :: f_in !<
	748	REAL(wp), DIMENSION(nnx,1:nz,nys_x:nyn_x,pdims(1)) :: f_out !<
	749	REAL(wp), DIMENSION(nxl:nxr,1:nz,0:ny) :: work !<
[1]	750
[2300]	751	REAL(wp), DIMENSION(:,:), ALLOCATABLE :: work_ffty !<
	752	REAL(wp), DIMENSION(:,:,:), ALLOCATABLE :: work_ffty_vec !<
	753
[1]	754	!
	755	!-- Carry out the FFT along y, where all data are present due to the
	756	!-- 1d-decomposition along x. Resort the data in a way that x becomes
	757	!-- the first index.
[1106]	758	CALL cpu_log( log_point_s(7), 'fft_y_1d', 'start' )
[1]	759
[2300]	760	IF ( loop_optimization == 'vector' ) THEN
	761
	762	ALLOCATE( work_ffty_vec(0:ny+1,1:nz,nxl:nxr) )
[1]	763	!
	764	!-- Code optimized for vector processors
[2300]	765	!$OMP PARALLEL PRIVATE ( i, j, k )
	766	!$OMP DO
[1]	767	DO i = nxl, nxr
	768
	769	DO j = 0, ny
	770	DO k = 1, nz
	771	work_ffty_vec(j,k,i) = f_in(k,j,i)
	772	ENDDO
	773	ENDDO
	774
	775	CALL fft_y_m( work_ffty_vec(:,:,i), ny+1, 'forward' )
	776
	777	ENDDO
	778
[2300]	779	!$OMP DO
[1]	780	DO k = 1, nz
	781	DO j = 0, ny
	782	DO i = nxl, nxr
	783	work(i,k,j) = work_ffty_vec(j,k,i)
	784	ENDDO
	785	ENDDO
	786	ENDDO
[2300]	787	!$OMP END PARALLEL
[1]	788
[2300]	789	DEALLOCATE( work_ffty_vec )
	790
[1]	791	ELSE
	792	!
	793	!-- Cache optimized code.
[2300]	794	ALLOCATE( work_ffty(0:ny,stridex) )
	795	!
[1]	796	!-- The i-(x-)direction is split into a strided outer loop and an inner
	797	!-- loop for better cache performance
[2300]	798	!$OMP PARALLEL PRIVATE (i,iend,iouter,ir,j,k,work_ffty)
	799	!$OMP DO
[1]	800	DO iouter = nxl, nxr, stridex
	801
	802	iend = MIN( iouter+stridex-1, nxr ) ! Upper bound for inner i loop
	803
	804	DO k = 1, nz
	805
	806	DO i = iouter, iend
	807
	808	ir = i-iouter+1 ! counter within a stride
	809	DO j = 0, ny
	810	work_ffty(j,ir) = f_in(k,j,i)
	811	ENDDO
	812	!
	813	!-- FFT along y
[1106]	814	CALL fft_y_1d( work_ffty(:,ir), 'forward' )
[1]	815
	816	ENDDO
	817
	818	!
	819	!-- Resort
	820	DO j = 0, ny
	821	DO i = iouter, iend
	822	work(i,k,j) = work_ffty(j,i-iouter+1)
	823	ENDDO
	824	ENDDO
	825
	826	ENDDO
	827
	828	ENDDO
[2300]	829	!$OMP END PARALLEL
[1]	830
[2300]	831	DEALLOCATE( work_ffty )
	832
[1]	833	ENDIF
[2300]	834
[1106]	835	CALL cpu_log( log_point_s(7), 'fft_y_1d', 'pause' )
[1]	836
	837	!
	838	!-- Transpose array
[1111]	839	#if defined( __parallel )
[1]	840	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
[622]	841	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
[1]	842	CALL MPI_ALLTOALL( work(nxl,1,0), sendrecvcount_xy, MPI_REAL, &
	843	f_out(1,1,nys_x,1), sendrecvcount_xy, MPI_REAL, &
	844	comm1dx, ierr )
	845	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
[1111]	846	#endif
[1]	847
	848	END SUBROUTINE ffty_tr_yx
	849
	850
	851	!------------------------------------------------------------------------------!
[1682]	852	! Description:
	853	! ------------
	854	!> Transposition x --> y with a subsequent backward Fourier transformation for
	855	!> a 1d-decomposition along x
[1]	856	!------------------------------------------------------------------------------!
[1682]	857	SUBROUTINE tr_xy_ffty( f_in, f_out )
[1]	858
[1320]	859	USE control_parameters, &
[2300]	860	ONLY: loop_optimization
[1320]	861
	862	USE cpulog, &
	863	ONLY: cpu_log, log_point_s
	864
	865	USE kinds
	866
[1]	867	USE pegrid
	868
	869	IMPLICIT NONE
	870
[1682]	871	INTEGER(iwp) :: i !<
	872	INTEGER(iwp) :: iend !<
	873	INTEGER(iwp) :: iouter !<
	874	INTEGER(iwp) :: ir !<
	875	INTEGER(iwp) :: j !<
	876	INTEGER(iwp) :: k !<
[1]	877
[1682]	878	INTEGER(iwp), PARAMETER :: stridex = 4 !<
[1320]	879
[1682]	880	REAL(wp), DIMENSION(nnx,1:nz,nys_x:nyn_x,pdims(1)) :: f_in !<
	881	REAL(wp), DIMENSION(1:nz,0:ny,nxl:nxr) :: f_out !<
	882	REAL(wp), DIMENSION(nxl:nxr,1:nz,0:ny) :: work !<
[1]	883
[2300]	884	REAL(wp), DIMENSION(:,:), ALLOCATABLE :: work_ffty !<
	885	REAL(wp), DIMENSION(:,:,:), ALLOCATABLE :: work_ffty_vec !<
	886
[1]	887	!
	888	!-- Transpose array
[1111]	889	#if defined( __parallel )
[1]	890	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
[622]	891	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
[1]	892	CALL MPI_ALLTOALL( f_in(1,1,nys_x,1), sendrecvcount_xy, MPI_REAL, &
	893	work(nxl,1,0), sendrecvcount_xy, MPI_REAL, &
	894	comm1dx, ierr )
	895	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
[1111]	896	#endif
[1]	897
	898	!
	899	!-- Resort the data in a way that y becomes the first index and carry out the
	900	!-- backward fft along y.
[1106]	901	CALL cpu_log( log_point_s(7), 'fft_y_1d', 'continue' )
[1]	902
[2300]	903	IF ( loop_optimization == 'vector' ) THEN
	904
	905	ALLOCATE( work_ffty_vec(0:ny+1,1:nz,nxl:nxr) )
[1]	906	!
	907	!-- Code optimized for vector processors
[2300]	908	!$OMP PARALLEL PRIVATE ( i, j, k )
	909	!$OMP DO
[1]	910	DO k = 1, nz
	911	DO j = 0, ny
	912	DO i = nxl, nxr
	913	work_ffty_vec(j,k,i) = work(i,k,j)
	914	ENDDO
	915	ENDDO
	916	ENDDO
	917
[2300]	918	!$OMP DO
[1]	919	DO i = nxl, nxr
	920
	921	CALL fft_y_m( work_ffty_vec(:,:,i), ny+1, 'backward' )
	922
	923	DO j = 0, ny
	924	DO k = 1, nz
	925	f_out(k,j,i) = work_ffty_vec(j,k,i)
	926	ENDDO
	927	ENDDO
	928
	929	ENDDO
[2300]	930	!$OMP END PARALLEL
[1]	931
[2300]	932	DEALLOCATE( work_ffty_vec )
	933
[1]	934	ELSE
	935	!
	936	!-- Cache optimized code.
[2300]	937	ALLOCATE( work_ffty(0:ny,stridex) )
	938	!
[1]	939	!-- The i-(x-)direction is split into a strided outer loop and an inner
	940	!-- loop for better cache performance
[2300]	941	!$OMP PARALLEL PRIVATE ( i, iend, iouter, ir, j, k, work_ffty )
	942	!$OMP DO
[1]	943	DO iouter = nxl, nxr, stridex
	944
	945	iend = MIN( iouter+stridex-1, nxr ) ! Upper bound for inner i loop
	946
	947	DO k = 1, nz
	948	!
	949	!-- Resort
	950	DO j = 0, ny
	951	DO i = iouter, iend
	952	work_ffty(j,i-iouter+1) = work(i,k,j)
	953	ENDDO
	954	ENDDO
	955
	956	DO i = iouter, iend
	957
	958	!
	959	!-- FFT along y
	960	ir = i-iouter+1 ! counter within a stride
[1106]	961	CALL fft_y_1d( work_ffty(:,ir), 'backward' )
[1]	962
	963	DO j = 0, ny
	964	f_out(k,j,i) = work_ffty(j,ir)
	965	ENDDO
	966	ENDDO
	967
	968	ENDDO
	969
	970	ENDDO
[2300]	971	!$OMP END PARALLEL
[1]	972
[2300]	973	DEALLOCATE( work_ffty )
	974
[1]	975	ENDIF
	976
[1106]	977	CALL cpu_log( log_point_s(7), 'fft_y_1d', 'stop' )
[1]	978
	979	END SUBROUTINE tr_xy_ffty
	980
	981
	982	!------------------------------------------------------------------------------!
[1682]	983	! Description:
	984	! ------------
	985	!> FFT along x, solution of the tridiagonal system and backward FFT for
	986	!> a 1d-decomposition along x
	987	!>
	988	!> @warning this subroutine may still not work for hybrid parallelization
	989	!> with OpenMP (for possible necessary changes see the original
	990	!> routine poisfft_hybrid, developed by Klaus Ketelsen, May 2002)
[1]	991	!------------------------------------------------------------------------------!
[1682]	992	SUBROUTINE fftx_tri_fftx( ar )
[1]	993
[1320]	994	USE control_parameters, &
[2300]	995	ONLY: loop_optimization
[1320]	996
	997	USE cpulog, &
	998	ONLY: cpu_log, log_point_s
	999
	1000	USE grid_variables, &
	1001	ONLY: ddx2, ddy2
	1002
	1003	USE kinds
	1004
[1]	1005	USE pegrid
	1006
	1007	IMPLICIT NONE
	1008
[1682]	1009	INTEGER(iwp) :: i !<
	1010	INTEGER(iwp) :: j !<
	1011	INTEGER(iwp) :: k !<
	1012	INTEGER(iwp) :: m !<
	1013	INTEGER(iwp) :: n !<
	1014	INTEGER(iwp) :: omp_get_thread_num !<
	1015	INTEGER(iwp) :: tn !<
[1]	1016
[1682]	1017	REAL(wp), DIMENSION(0:nx) :: work_fftx !<
	1018	REAL(wp), DIMENSION(0:nx,1:nz) :: work_trix !<
	1019	REAL(wp), DIMENSION(nnx,1:nz,nys_x:nyn_x,pdims(1)) :: ar !<
	1020	REAL(wp), DIMENSION(:,:,:,:), ALLOCATABLE :: tri !<
[1]	1021
	1022
[1106]	1023	CALL cpu_log( log_point_s(33), 'fft_x_1d + tridia', 'start' )
[1]	1024
	1025	ALLOCATE( tri(5,0:nx,0:nz-1,0:threads_per_task-1) )
	1026
	1027	tn = 0 ! Default thread number in case of one thread
	1028	!$OMP PARALLEL DO PRIVATE ( i, j, k, m, n, tn, work_fftx, work_trix )
	1029	DO j = nys_x, nyn_x
	1030
	1031	!$ tn = omp_get_thread_num()
	1032
[2300]	1033	IF ( loop_optimization == 'vector' ) THEN
[1]	1034	!
	1035	!-- Code optimized for vector processors
	1036	DO k = 1, nz
	1037
	1038	m = 0
	1039	DO n = 1, pdims(1)
[1003]	1040	DO i = 1, nnx
[1]	1041	work_trix(m,k) = ar(i,k,j,n)
	1042	m = m + 1
	1043	ENDDO
	1044	ENDDO
	1045
	1046	ENDDO
	1047
	1048	CALL fft_x_m( work_trix, 'forward' )
	1049
	1050	ELSE
	1051	!
	1052	!-- Cache optimized code
	1053	DO k = 1, nz
	1054
	1055	m = 0
	1056	DO n = 1, pdims(1)
[1003]	1057	DO i = 1, nnx
[1]	1058	work_fftx(m) = ar(i,k,j,n)
	1059	m = m + 1
	1060	ENDDO
	1061	ENDDO
	1062
[1106]	1063	CALL fft_x_1d( work_fftx, 'forward' )
[1]	1064
	1065	DO i = 0, nx
	1066	work_trix(i,k) = work_fftx(i)
	1067	ENDDO
	1068
	1069	ENDDO
	1070
	1071	ENDIF
	1072
	1073	!
	1074	!-- Solve the linear equation system
	1075	CALL tridia_1dd( ddx2, ddy2, nx, ny, j, work_trix, tri(:,:,:,tn) )
	1076
[2300]	1077	IF ( loop_optimization == 'vector' ) THEN
[1]	1078	!
	1079	!-- Code optimized for vector processors
	1080	CALL fft_x_m( work_trix, 'backward' )
	1081
	1082	DO k = 1, nz
	1083
	1084	m = 0
	1085	DO n = 1, pdims(1)
[1003]	1086	DO i = 1, nnx
[1]	1087	ar(i,k,j,n) = work_trix(m,k)
	1088	m = m + 1
	1089	ENDDO
	1090	ENDDO
	1091
	1092	ENDDO
	1093
	1094	ELSE
	1095	!
	1096	!-- Cache optimized code
	1097	DO k = 1, nz
	1098
	1099	DO i = 0, nx
	1100	work_fftx(i) = work_trix(i,k)
	1101	ENDDO
	1102
[1106]	1103	CALL fft_x_1d( work_fftx, 'backward' )
[1]	1104
	1105	m = 0
	1106	DO n = 1, pdims(1)
[1003]	1107	DO i = 1, nnx
[1]	1108	ar(i,k,j,n) = work_fftx(m)
	1109	m = m + 1
	1110	ENDDO
	1111	ENDDO
	1112
	1113	ENDDO
	1114
	1115	ENDIF
	1116
	1117	ENDDO
	1118
	1119	DEALLOCATE( tri )
	1120
[1106]	1121	CALL cpu_log( log_point_s(33), 'fft_x_1d + tridia', 'stop' )
[1]	1122
	1123	END SUBROUTINE fftx_tri_fftx
	1124
	1125
	1126	!------------------------------------------------------------------------------!
[1682]	1127	! Description:
	1128	! ------------
	1129	!> Fourier-transformation along x with subsequent transposition x --> y for
	1130	!> a 1d-decomposition along y.
	1131	!>
	1132	!> @attention NEC-branch of this routine may significantly profit from
	1133	!> further optimizations. So far, performance is much worse than
	1134	!> for routine ffty_tr_yx (more than three times slower).
[1]	1135	!------------------------------------------------------------------------------!
[1682]	1136	SUBROUTINE fftx_tr_xy( f_in, f_out )
[1]	1137
[1682]	1138
[1320]	1139	USE control_parameters, &
[2300]	1140	ONLY: loop_optimization
[1320]	1141
	1142	USE cpulog, &
	1143	ONLY: cpu_log, log_point_s
	1144
	1145	USE kinds
	1146
[1]	1147	USE pegrid
	1148
	1149	IMPLICIT NONE
	1150
[1682]	1151	INTEGER(iwp) :: i !<
	1152	INTEGER(iwp) :: j !<
	1153	INTEGER(iwp) :: k !<
[1]	1154
[1682]	1155	REAL(wp), DIMENSION(0:nx,1:nz,nys:nyn) :: work_fftx !<
	1156	REAL(wp), DIMENSION(1:nz,nys:nyn,0:nx) :: f_in !<
	1157	REAL(wp), DIMENSION(nny,1:nz,nxl_y:nxr_y,pdims(2)) :: f_out !<
	1158	REAL(wp), DIMENSION(nys:nyn,1:nz,0:nx) :: work !<
[1]	1159
	1160	!
	1161	!-- Carry out the FFT along x, where all data are present due to the
	1162	!-- 1d-decomposition along y. Resort the data in a way that y becomes
	1163	!-- the first index.
[1106]	1164	CALL cpu_log( log_point_s(4), 'fft_x_1d', 'start' )
[1]	1165
[2300]	1166	IF ( loop_optimization == 'vector' ) THEN
[1]	1167	!
	1168	!-- Code for vector processors
[85]	1169	!$OMP PARALLEL PRIVATE ( i, j, k )
[1]	1170	!$OMP DO
	1171	DO i = 0, nx
	1172
	1173	DO j = nys, nyn
	1174	DO k = 1, nz
	1175	work_fftx(i,k,j) = f_in(k,j,i)
	1176	ENDDO
	1177	ENDDO
	1178
	1179	ENDDO
	1180
	1181	!$OMP DO
	1182	DO j = nys, nyn
	1183
	1184	CALL fft_x_m( work_fftx(:,:,j), 'forward' )
	1185
	1186	DO k = 1, nz
	1187	DO i = 0, nx
	1188	work(j,k,i) = work_fftx(i,k,j)
	1189	ENDDO
	1190	ENDDO
	1191
	1192	ENDDO
	1193	!$OMP END PARALLEL
	1194
	1195	ELSE
	1196
	1197	!
	1198	!-- Cache optimized code (there might be still a potential for better
	1199	!-- optimization).
[696]	1200	!$OMP PARALLEL PRIVATE (i,j,k)
[1]	1201	!$OMP DO
	1202	DO i = 0, nx
	1203
	1204	DO j = nys, nyn
	1205	DO k = 1, nz
	1206	work_fftx(i,k,j) = f_in(k,j,i)
	1207	ENDDO
	1208	ENDDO
	1209
	1210	ENDDO
	1211
	1212	!$OMP DO
	1213	DO j = nys, nyn
	1214	DO k = 1, nz
	1215
[1106]	1216	CALL fft_x_1d( work_fftx(0:nx,k,j), 'forward' )
[1]	1217
	1218	DO i = 0, nx
	1219	work(j,k,i) = work_fftx(i,k,j)
	1220	ENDDO
	1221	ENDDO
	1222
	1223	ENDDO
	1224	!$OMP END PARALLEL
	1225
	1226	ENDIF
[1106]	1227	CALL cpu_log( log_point_s(4), 'fft_x_1d', 'pause' )
[1]	1228
	1229	!
	1230	!-- Transpose array
[1111]	1231	#if defined( __parallel )
[1]	1232	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
[622]	1233	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
[1]	1234	CALL MPI_ALLTOALL( work(nys,1,0), sendrecvcount_xy, MPI_REAL, &
	1235	f_out(1,1,nxl_y,1), sendrecvcount_xy, MPI_REAL, &
	1236	comm1dy, ierr )
	1237	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
[1111]	1238	#endif
[1]	1239
	1240	END SUBROUTINE fftx_tr_xy
	1241
	1242
	1243	!------------------------------------------------------------------------------!
[1682]	1244	! Description:
	1245	! ------------
	1246	!> Transposition y --> x with a subsequent backward Fourier transformation for
	1247	!> a 1d-decomposition along x.
[1]	1248	!------------------------------------------------------------------------------!
[1682]	1249	SUBROUTINE tr_yx_fftx( f_in, f_out )
[1]	1250
[1682]	1251
[1320]	1252	USE control_parameters, &
[2300]	1253	ONLY: loop_optimization
[1320]	1254
	1255	USE cpulog, &
	1256	ONLY: cpu_log, log_point_s
	1257
	1258	USE kinds
	1259
[1]	1260	USE pegrid
	1261
	1262	IMPLICIT NONE
	1263
[1682]	1264	INTEGER(iwp) :: i !<
	1265	INTEGER(iwp) :: j !<
	1266	INTEGER(iwp) :: k !<
[1]	1267
[1682]	1268	REAL(wp), DIMENSION(0:nx,1:nz,nys:nyn) :: work_fftx !<
	1269	REAL(wp), DIMENSION(nny,1:nz,nxl_y:nxr_y,pdims(2)) :: f_in !<
	1270	REAL(wp), DIMENSION(1:nz,nys:nyn,0:nx) :: f_out !<
	1271	REAL(wp), DIMENSION(nys:nyn,1:nz,0:nx) :: work !<
[1]	1272
	1273	!
	1274	!-- Transpose array
[1111]	1275	#if defined( __parallel )
[1]	1276	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
[622]	1277	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
[1]	1278	CALL MPI_ALLTOALL( f_in(1,1,nxl_y,1), sendrecvcount_xy, MPI_REAL, &
	1279	work(nys,1,0), sendrecvcount_xy, MPI_REAL, &
	1280	comm1dy, ierr )
	1281	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
[1111]	1282	#endif
[1]	1283
	1284	!
	1285	!-- Carry out the FFT along x, where all data are present due to the
	1286	!-- 1d-decomposition along y. Resort the data in a way that y becomes
	1287	!-- the first index.
[1106]	1288	CALL cpu_log( log_point_s(4), 'fft_x_1d', 'continue' )
[1]	1289
[2300]	1290	IF ( loop_optimization == 'vector' ) THEN
[1]	1291	!
	1292	!-- Code optimized for vector processors
[85]	1293	!$OMP PARALLEL PRIVATE ( i, j, k )
[1]	1294	!$OMP DO
	1295	DO j = nys, nyn
	1296
	1297	DO k = 1, nz
	1298	DO i = 0, nx
	1299	work_fftx(i,k,j) = work(j,k,i)
	1300	ENDDO
	1301	ENDDO
	1302
	1303	CALL fft_x_m( work_fftx(:,:,j), 'backward' )
	1304
	1305	ENDDO
	1306
	1307	!$OMP DO
	1308	DO i = 0, nx
	1309	DO j = nys, nyn
	1310	DO k = 1, nz
	1311	f_out(k,j,i) = work_fftx(i,k,j)
	1312	ENDDO
	1313	ENDDO
	1314	ENDDO
	1315	!$OMP END PARALLEL
	1316
	1317	ELSE
	1318
	1319	!
	1320	!-- Cache optimized code (there might be still a potential for better
	1321	!-- optimization).
[696]	1322	!$OMP PARALLEL PRIVATE (i,j,k)
[1]	1323	!$OMP DO
	1324	DO j = nys, nyn
	1325	DO k = 1, nz
	1326
	1327	DO i = 0, nx
	1328	work_fftx(i,k,j) = work(j,k,i)
	1329	ENDDO
	1330
[1106]	1331	CALL fft_x_1d( work_fftx(0:nx,k,j), 'backward' )
[1]	1332
	1333	ENDDO
	1334	ENDDO
	1335
	1336	!$OMP DO
	1337	DO i = 0, nx
	1338	DO j = nys, nyn
	1339	DO k = 1, nz
	1340	f_out(k,j,i) = work_fftx(i,k,j)
	1341	ENDDO
	1342	ENDDO
	1343	ENDDO
	1344	!$OMP END PARALLEL
	1345
	1346	ENDIF
[1106]	1347	CALL cpu_log( log_point_s(4), 'fft_x_1d', 'stop' )
[1]	1348
	1349	END SUBROUTINE tr_yx_fftx
	1350
	1351
	1352	!------------------------------------------------------------------------------!
[1682]	1353	! Description:
	1354	! ------------
	1355	!> FFT along y, solution of the tridiagonal system and backward FFT for
	1356	!> a 1d-decomposition along y.
	1357	!>
	1358	!> @warning this subroutine may still not work for hybrid parallelization
	1359	!> with OpenMP (for possible necessary changes see the original
	1360	!> routine poisfft_hybrid, developed by Klaus Ketelsen, May 2002)
[1]	1361	!------------------------------------------------------------------------------!
[1682]	1362	SUBROUTINE ffty_tri_ffty( ar )
[1]	1363
[1682]	1364
[1320]	1365	USE control_parameters, &
[2300]	1366	ONLY: loop_optimization
[1320]	1367
	1368	USE cpulog, &
	1369	ONLY: cpu_log, log_point_s
	1370
	1371	USE grid_variables, &
	1372	ONLY: ddx2, ddy2
	1373
	1374	USE kinds
	1375
[1]	1376	USE pegrid
	1377
	1378	IMPLICIT NONE
	1379
[1682]	1380	INTEGER(iwp) :: i !<
	1381	INTEGER(iwp) :: j !<
	1382	INTEGER(iwp) :: k !<
	1383	INTEGER(iwp) :: m !<
	1384	INTEGER(iwp) :: n !<
	1385	INTEGER(iwp) :: omp_get_thread_num !<
	1386	INTEGER(iwp) :: tn !<
[1]	1387
[1682]	1388	REAL(wp), DIMENSION(0:ny) :: work_ffty !<
	1389	REAL(wp), DIMENSION(0:ny,1:nz) :: work_triy !<
	1390	REAL(wp), DIMENSION(nny,1:nz,nxl_y:nxr_y,pdims(2)) :: ar !<
	1391	REAL(wp), DIMENSION(:,:,:,:), ALLOCATABLE :: tri !<
[1]	1392
	1393
[1106]	1394	CALL cpu_log( log_point_s(39), 'fft_y_1d + tridia', 'start' )
[1]	1395
	1396	ALLOCATE( tri(5,0:ny,0:nz-1,0:threads_per_task-1) )
	1397
	1398	tn = 0 ! Default thread number in case of one thread
[696]	1399	!$OMP PARALLEL DO PRIVATE ( i, j, k, m, n, tn, work_ffty, work_triy )
[1]	1400	DO i = nxl_y, nxr_y
	1401
	1402	!$ tn = omp_get_thread_num()
	1403
[2300]	1404	IF ( loop_optimization == 'vector' ) THEN
[1]	1405	!
	1406	!-- Code optimized for vector processors
	1407	DO k = 1, nz
	1408
	1409	m = 0
	1410	DO n = 1, pdims(2)
[1003]	1411	DO j = 1, nny
[1]	1412	work_triy(m,k) = ar(j,k,i,n)
	1413	m = m + 1
	1414	ENDDO
	1415	ENDDO
	1416
	1417	ENDDO
	1418
	1419	CALL fft_y_m( work_triy, ny, 'forward' )
	1420
	1421	ELSE
	1422	!
	1423	!-- Cache optimized code
	1424	DO k = 1, nz
	1425
	1426	m = 0
	1427	DO n = 1, pdims(2)
[1003]	1428	DO j = 1, nny
[1]	1429	work_ffty(m) = ar(j,k,i,n)
	1430	m = m + 1
	1431	ENDDO
	1432	ENDDO
	1433
[1106]	1434	CALL fft_y_1d( work_ffty, 'forward' )
[1]	1435
	1436	DO j = 0, ny
	1437	work_triy(j,k) = work_ffty(j)
	1438	ENDDO
	1439
	1440	ENDDO
	1441
	1442	ENDIF
	1443
	1444	!
	1445	!-- Solve the linear equation system
	1446	CALL tridia_1dd( ddy2, ddx2, ny, nx, i, work_triy, tri(:,:,:,tn) )
	1447
[2300]	1448	IF ( loop_optimization == 'vector' ) THEN
[1]	1449	!
	1450	!-- Code optimized for vector processors
	1451	CALL fft_y_m( work_triy, ny, 'backward' )
	1452
	1453	DO k = 1, nz
	1454
	1455	m = 0
	1456	DO n = 1, pdims(2)
[1003]	1457	DO j = 1, nny
[1]	1458	ar(j,k,i,n) = work_triy(m,k)
	1459	m = m + 1
	1460	ENDDO
	1461	ENDDO
	1462
	1463	ENDDO
	1464
	1465	ELSE
	1466	!
	1467	!-- Cache optimized code
	1468	DO k = 1, nz
	1469
	1470	DO j = 0, ny
	1471	work_ffty(j) = work_triy(j,k)
	1472	ENDDO
	1473
[1106]	1474	CALL fft_y_1d( work_ffty, 'backward' )
[1]	1475
	1476	m = 0
	1477	DO n = 1, pdims(2)
[1003]	1478	DO j = 1, nny
[1]	1479	ar(j,k,i,n) = work_ffty(m)
	1480	m = m + 1
	1481	ENDDO
	1482	ENDDO
	1483
	1484	ENDDO
	1485
	1486	ENDIF
	1487
	1488	ENDDO
	1489
	1490	DEALLOCATE( tri )
	1491
[1106]	1492	CALL cpu_log( log_point_s(39), 'fft_y_1d + tridia', 'stop' )
[1]	1493
	1494	END SUBROUTINE ffty_tri_ffty
	1495
	1496	END MODULE poisfft_mod

Note: See TracBrowser for help on using the repository browser.

Download in other formats:

| Impressum | ©Leibniz Universität Hannover |