Home

Context Navigation

source: palm/trunk/SOURCE/poisfft_mod.f90 @ 2169

Last change on this file since 2169 was 2119, checked in by raasch, 8 years ago
last commit documented
Property svn:keywords set to `Id`
File size: 44.8 KB

Rev	Line
[1850]	1	!> @file poisfft_mod.f90
[2000]	2	!------------------------------------------------------------------------------!
[1036]	3	! This file is part of PALM.
	4	!
[2000]	5	! PALM is free software: you can redistribute it and/or modify it under the
	6	! terms of the GNU General Public License as published by the Free Software
	7	! Foundation, either version 3 of the License, or (at your option) any later
	8	! version.
[1036]	9	!
	10	! PALM is distributed in the hope that it will be useful, but WITHOUT ANY
	11	! WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
	12	! A PARTICULAR PURPOSE. See the GNU General Public License for more details.
	13	!
	14	! You should have received a copy of the GNU General Public License along with
	15	! PALM. If not, see <http://www.gnu.org/licenses/>.
	16	!
[2101]	17	! Copyright 1997-2017 Leibniz Universitaet Hannover
[2000]	18	!------------------------------------------------------------------------------!
[1036]	19	!
[484]	20	! Current revisions:
[1]	21	! -----------------
[1683]	22	!
[2119]	23	!
[1321]	24	! Former revisions:
	25	! -----------------
	26	! $Id: poisfft_mod.f90 2119 2017-01-17 16:51:50Z suehring $
	27	!
[2119]	28	! 2118 2017-01-17 16:38:49Z raasch
	29	! OpenACC directives and related code removed
	30	!
[2001]	31	! 2000 2016-08-20 18:09:15Z knoop
	32	! Forced header and separation lines into 80 columns
	33	!
[1851]	34	! 1850 2016-04-08 13:29:27Z maronga
	35	! Module renamed
	36	!
	37	!
[1805]	38	! 1804 2016-04-05 16:30:18Z maronga
	39	! Removed code for parameter file check (__check)
	40	!
[1683]	41	! 1682 2015-10-07 23:56:08Z knoop
	42	! Code annotations made doxygen readable
	43	!
[1483]	44	! 1482 2014-10-18 12:34:45Z raasch
	45	! use 2d-decomposition, if accelerator boards are used
	46	!
[1407]	47	! 1406 2014-05-16 13:47:01Z raasch
	48	! bugfix for pgi 14.4: declare create moved after array declaration
	49	!
[1321]	50	! 1320 2014-03-20 08:40:49Z raasch
[1320]	51	! ONLY-attribute added to USE-statements,
	52	! kind-parameters added to all INTEGER and REAL declaration statements,
	53	! kinds are defined in new module kinds,
	54	! old module precision_kind is removed,
	55	! revision history before 2012 removed,
	56	! comment fields (!:) to be used for variable explanations added to
	57	! all variable declaration statements
[1217]	58	!
[1319]	59	! 1318 2014-03-17 13:35:16Z raasch
	60	! module interfaces removed
	61	!
[1307]	62	! 1306 2014-03-13 14:30:59Z raasch
	63	! openmp sections removed from the overlap branch,
	64	! second argument removed from parameter list
	65	!
[1217]	66	! 1216 2013-08-26 09:31:42Z raasch
[1216]	67	! resorting of arrays moved to separate routines resort_for_...,
	68	! one argument, used as temporary work array, removed from all transpose
	69	! routines
	70	! overlapping fft / transposition implemented
[1112]	71	!
[1213]	72	! 1212 2013-08-15 08:46:27Z raasch
	73	! tridia routines moved to seperate module tridia_solver
	74	!
[1209]	75	! 1208 2013-08-13 06:41:49Z raasch
	76	! acc-update clauses added for "ar" so that ffts other than cufft can also be
	77	! used (although they are not ported and will give a poor performance)
	78	!
[1112]	79	! 1111 2013-03-08 23:54:10Z raasch
[1111]	80	! further openACC porting of non-parallel (MPI) branch:
	81	! tridiagonal routines split into extermal subroutines (instead using CONTAINS),
	82	! no distinction between parallel/non-parallel in poisfft and tridia any more,
[1112]	83	! tridia routines moved to end of file because of probable bug in PGI compiler 12.5
[1111]	84	! (otherwise "invalid device function" is indicated during runtime),
	85	! optimization of tridia routines: constant elements and coefficients of tri are
	86	! stored in seperate arrays ddzuw and tric, last dimension of tri reduced from 5
	87	! to 2,
	88	! poisfft_init is now called internally from poisfft, maketri is called from
	89	! poisfft_init,
	90	! ibc_p_b = 2 removed
[1]	91	!
[1107]	92	! 1106 2013-03-04 05:31:38Z raasch
	93	! routines fftx, ffty, fftxp, fftyp removed, calls replaced by fft_x, fft_y,
	94	! in the 1D-decomposition routines fft_x, ffty are replaced by fft_x_1d,
	95	! fft_y_1d
	96	!
[1104]	97	! 1103 2013-02-20 02:15:53Z raasch
	98	! tri, ar, and ar1 arguments in tridia-routines (2d) are removed because they
	99	! sometimes cause segmentation faults with intel 12.1 compiler
	100	!
[1093]	101	! 1092 2013-02-02 11:24:22Z raasch
	102	! unused variables removed
	103	!
[1037]	104	! 1036 2012-10-22 13:43:42Z raasch
	105	! code put under GPL (PALM 3.9)
	106	!
[1014]	107	! 2012-09-21 07:03:55Z raasch
	108	! FLOAT type conversion replaced by REAL
	109	!
[1004]	110	! 1003 2012-09-14 14:35:53Z raasch
	111	! indices nxa, nya, etc. replaced by nx, ny, etc.
	112	!
[941]	113	! 940 2012-07-09 14:31:00Z raasch
	114	! special handling of tri-array as an argument in tridia_1dd routines switched
	115	! off because it caused segmentation faults with intel 12.1 compiler
	116	!
[878]	117	! 877 2012-04-03 11:21:44Z suehring
	118	! Bugfix: Avoid divisions by zero in case of using a 'neumann' bc for the
	119	! pressure at the top of the model domain.
	120	!
[810]	121	! 809 2012-01-30 13:32:58Z maronga
	122	! Bugfix: replaced .AND. and .NOT. with && and ! in the preprocessor directives
	123	!
[808]	124	! 807 2012-01-25 11:53:51Z maronga
	125	! New cpp directive "__check" implemented which is used by check_namelist_files
	126	! (most of the code is unneeded by check_namelist_files).
	127	!
[1]	128	! Revision 1.1 1997/07/24 11:24:14 raasch
	129	! Initial revision
	130	!
	131	!
	132	! Description:
	133	! ------------
[1682]	134	!> Solves the Poisson equation with a 2D spectral method
	135	!> d^2 p / dx^2 + d^2 p / dy^2 + d^2 p / dz^2 = s
	136	!>
	137	!> Input:
	138	!> real ar contains (nnz,nny,nnx) elements of the velocity divergence,
	139	!> starting from (1,nys,nxl)
	140	!>
	141	!> Output:
	142	!> real ar contains the solution for perturbation pressure p
[1]	143	!------------------------------------------------------------------------------!
[1682]	144	MODULE poisfft_mod
	145
[1]	146
[1320]	147	USE fft_xy, &
	148	ONLY: fft_init, fft_y, fft_y_1d, fft_y_m, fft_x, fft_x_1d, fft_x_m
[1]	149
[1320]	150	USE indices, &
	151	ONLY: nnx, nny, nx, nxl, nxr, ny, nys, nyn, nz
	152
	153	USE transpose_indices, &
	154	ONLY: nxl_y, nxl_z, nxr_y, nxr_z, nys_x, nys_z, nyn_x, nyn_z, nzb_x, &
	155	nzb_y, nzt_x, nzt_y
	156
	157	USE tridia_solver, &
	158	ONLY: tridia_1dd, tridia_init, tridia_substi, tridia_substi_overlap
	159
[1]	160	IMPLICIT NONE
	161
[1111]	162	LOGICAL, SAVE :: poisfft_initialized = .FALSE.
	163
[1]	164	PRIVATE
[807]	165
[1]	166	PUBLIC poisfft, poisfft_init
	167
	168	INTERFACE poisfft
	169	MODULE PROCEDURE poisfft
	170	END INTERFACE poisfft
	171
	172	INTERFACE poisfft_init
	173	MODULE PROCEDURE poisfft_init
	174	END INTERFACE poisfft_init
	175
[807]	176
[1]	177	CONTAINS
	178
[1682]	179	!------------------------------------------------------------------------------!
	180	! Description:
	181	! ------------
	182	!> @todo Missing subroutine description.
	183	!------------------------------------------------------------------------------!
[1]	184	SUBROUTINE poisfft_init
	185
[1320]	186	USE arrays_3d, &
	187	ONLY: ddzu_pres, ddzw
[1111]	188
[1320]	189	USE kinds
	190
[1111]	191	IMPLICIT NONE
	192
[1682]	193	INTEGER(iwp) :: k !<
[1111]	194
	195
[1]	196	CALL fft_init
	197
[1212]	198	CALL tridia_init
[1111]	199
	200	poisfft_initialized = .TRUE.
	201
[1]	202	END SUBROUTINE poisfft_init
	203
[1111]	204
[1804]	205
[1682]	206	!------------------------------------------------------------------------------!
	207	! Description:
	208	! ------------
	209	!> Two-dimensional Fourier Transformation in x- and y-direction.
	210	!------------------------------------------------------------------------------!
[1306]	211	SUBROUTINE poisfft( ar )
[1]	212
[1320]	213	USE control_parameters, &
	214	ONLY: fft_method, transpose_compute_overlap
	215
	216	USE cpulog, &
	217	ONLY: cpu_log, cpu_log_nowait, log_point_s
	218
	219	USE kinds
	220
[1]	221	USE pegrid
	222
	223	IMPLICIT NONE
	224
[1682]	225	INTEGER(iwp) :: ii !<
	226	INTEGER(iwp) :: iind !<
	227	INTEGER(iwp) :: inew !<
	228	INTEGER(iwp) :: jj !<
	229	INTEGER(iwp) :: jind !<
	230	INTEGER(iwp) :: jnew !<
	231	INTEGER(iwp) :: ki !<
	232	INTEGER(iwp) :: kk !<
	233	INTEGER(iwp) :: knew !<
	234	INTEGER(iwp) :: n !<
	235	INTEGER(iwp) :: nblk !<
	236	INTEGER(iwp) :: nnx_y !<
	237	INTEGER(iwp) :: nny_z !<
	238	INTEGER(iwp) :: nnz_t !<
	239	INTEGER(iwp) :: nnz_x !<
	240	INTEGER(iwp) :: nxl_y_bound !<
	241	INTEGER(iwp) :: nxr_y_bound !<
[1]	242
[1682]	243	INTEGER(iwp), DIMENSION(4) :: isave !<
[1320]	244
[1682]	245	REAL(wp), DIMENSION(1:nz,nys:nyn,nxl:nxr) :: ar !<
	246	REAL(wp), DIMENSION(nys:nyn,nxl:nxr,1:nz) :: ar_inv !<
[1]	247
[1682]	248	REAL(wp), DIMENSION(:,:,:), ALLOCATABLE :: ar1 !<
	249	REAL(wp), DIMENSION(:,:,:), ALLOCATABLE :: f_in !<
	250	REAL(wp), DIMENSION(:,:,:), ALLOCATABLE :: f_inv !<
	251	REAL(wp), DIMENSION(:,:,:), ALLOCATABLE :: f_out_y !<
	252	REAL(wp), DIMENSION(:,:,:), ALLOCATABLE :: f_out_z !<
[1216]	253
	254
[1]	255	CALL cpu_log( log_point_s(3), 'poisfft', 'start' )
	256
[1111]	257	IF ( .NOT. poisfft_initialized ) CALL poisfft_init
	258
[1]	259	!
	260	!-- Two-dimensional Fourier Transformation in x- and y-direction.
[2118]	261	IF ( pdims(2) == 1 .AND. pdims(1) > 1 ) THEN
[1]	262
	263	!
	264	!-- 1d-domain-decomposition along x:
	265	!-- FFT along y and transposition y --> x
[1216]	266	CALL ffty_tr_yx( ar, ar )
[1]	267
	268	!
	269	!-- FFT along x, solving the tridiagonal system and backward FFT
	270	CALL fftx_tri_fftx( ar )
	271
	272	!
	273	!-- Transposition x --> y and backward FFT along y
[1216]	274	CALL tr_xy_ffty( ar, ar )
[1]	275
[2118]	276	ELSEIF ( pdims(1) == 1 .AND. pdims(2) > 1 ) THEN
[1]	277
	278	!
	279	!-- 1d-domain-decomposition along y:
	280	!-- FFT along x and transposition x --> y
[1216]	281	CALL fftx_tr_xy( ar, ar )
[1]	282
	283	!
	284	!-- FFT along y, solving the tridiagonal system and backward FFT
	285	CALL ffty_tri_ffty( ar )
	286
	287	!
	288	!-- Transposition y --> x and backward FFT along x
[1216]	289	CALL tr_yx_fftx( ar, ar )
[1]	290
[1216]	291	ELSEIF ( .NOT. transpose_compute_overlap ) THEN
[1]	292
	293	!
[1111]	294	!-- 2d-domain-decomposition or no decomposition (1 PE run)
[1]	295	!-- Transposition z --> x
	296	CALL cpu_log( log_point_s(5), 'transpo forward', 'start' )
[1216]	297	CALL resort_for_zx( ar, ar_inv )
	298	CALL transpose_zx( ar_inv, ar )
[1]	299	CALL cpu_log( log_point_s(5), 'transpo forward', 'pause' )
	300
	301	CALL cpu_log( log_point_s(4), 'fft_x', 'start' )
[1106]	302	CALL fft_x( ar, 'forward' )
[1]	303	CALL cpu_log( log_point_s(4), 'fft_x', 'pause' )
	304
	305	!
	306	!-- Transposition x --> y
	307	CALL cpu_log( log_point_s(5), 'transpo forward', 'continue' )
[1216]	308	CALL resort_for_xy( ar, ar_inv )
	309	CALL transpose_xy( ar_inv, ar )
[1]	310	CALL cpu_log( log_point_s(5), 'transpo forward', 'pause' )
	311
	312	CALL cpu_log( log_point_s(7), 'fft_y', 'start' )
[1216]	313	CALL fft_y( ar, 'forward', ar_tr = ar, &
	314	nxl_y_bound = nxl_y, nxr_y_bound = nxr_y, &
	315	nxl_y_l = nxl_y, nxr_y_l = nxr_y )
[1]	316	CALL cpu_log( log_point_s(7), 'fft_y', 'pause' )
	317
	318	!
	319	!-- Transposition y --> z
	320	CALL cpu_log( log_point_s(5), 'transpo forward', 'continue' )
[1216]	321	CALL resort_for_yz( ar, ar_inv )
	322	CALL transpose_yz( ar_inv, ar )
[1]	323	CALL cpu_log( log_point_s(5), 'transpo forward', 'stop' )
	324
	325	!
[1106]	326	!-- Solve the tridiagonal equation system along z
[1]	327	CALL cpu_log( log_point_s(6), 'tridia', 'start' )
[1212]	328	CALL tridia_substi( ar )
[1]	329	CALL cpu_log( log_point_s(6), 'tridia', 'stop' )
	330
	331	!
	332	!-- Inverse Fourier Transformation
	333	!-- Transposition z --> y
	334	CALL cpu_log( log_point_s(8), 'transpo invers', 'start' )
[1216]	335	CALL transpose_zy( ar, ar_inv )
	336	CALL resort_for_zy( ar_inv, ar )
[1]	337	CALL cpu_log( log_point_s(8), 'transpo invers', 'pause' )
	338
	339	CALL cpu_log( log_point_s(7), 'fft_y', 'continue' )
[1216]	340	CALL fft_y( ar, 'backward', ar_tr = ar, &
	341	nxl_y_bound = nxl_y, nxr_y_bound = nxr_y, &
	342	nxl_y_l = nxl_y, nxr_y_l = nxr_y )
[1]	343	CALL cpu_log( log_point_s(7), 'fft_y', 'stop' )
	344
	345	!
	346	!-- Transposition y --> x
	347	CALL cpu_log( log_point_s(8), 'transpo invers', 'continue' )
[1216]	348	CALL transpose_yx( ar, ar_inv )
	349	CALL resort_for_yx( ar_inv, ar )
[1]	350	CALL cpu_log( log_point_s(8), 'transpo invers', 'pause' )
	351
	352	CALL cpu_log( log_point_s(4), 'fft_x', 'continue' )
[1106]	353	CALL fft_x( ar, 'backward' )
[1]	354	CALL cpu_log( log_point_s(4), 'fft_x', 'stop' )
	355
	356	!
	357	!-- Transposition x --> z
	358	CALL cpu_log( log_point_s(8), 'transpo invers', 'continue' )
[1216]	359	CALL transpose_xz( ar, ar_inv )
	360	CALL resort_for_xz( ar_inv, ar )
[1]	361	CALL cpu_log( log_point_s(8), 'transpo invers', 'stop' )
	362
[1216]	363	ELSE
	364
	365	!
	366	!-- 2d-domain-decomposition or no decomposition (1 PE run) with
	367	!-- overlapping transposition / fft
[1318]	368	!-- cputime logging must not use barriers, which would prevent overlapping
[1216]	369	ALLOCATE( f_out_y(0:ny,nxl_y:nxr_y,nzb_y:nzt_y), &
	370	f_out_z(0:nx,nys_x:nyn_x,nzb_x:nzt_x) )
	371	!
	372	!-- Transposition z --> x + subsequent fft along x
	373	ALLOCATE( f_inv(nys:nyn,nxl:nxr,1:nz) )
	374	CALL resort_for_zx( ar, f_inv )
	375	!
	376	!-- Save original indices and gridpoint counter
	377	isave(1) = nz
	378	isave(2) = nzb_x
	379	isave(3) = nzt_x
	380	isave(4) = sendrecvcount_zx
	381	!
	382	!-- Set new indices for transformation
	383	nblk = nz / pdims(1)
	384	nz = pdims(1)
	385	nnz_x = 1
	386	nzb_x = 1 + myidx * nnz_x
	387	nzt_x = ( myidx + 1 ) * nnz_x
	388	sendrecvcount_zx = nnx * nny * nnz_x
	389
[1306]	390	ALLOCATE( ar1(0:nx,nys_x:nyn_x,nzb_x:nzt_x) )
[1216]	391	ALLOCATE( f_in(nys:nyn,nxl:nxr,1:nz) )
	392
[1306]	393	DO kk = 1, nblk
[1216]	394
[1306]	395	IF ( kk == 1 ) THEN
[1318]	396	CALL cpu_log( log_point_s(5), 'transpo forward', 'start', cpu_log_nowait )
[1306]	397	ELSE
[1318]	398	CALL cpu_log( log_point_s(5), 'transpo forward', 'continue', cpu_log_nowait )
[1306]	399	ENDIF
[1216]	400
[1306]	401	DO knew = 1, nz
	402	ki = kk + nblk * ( knew - 1 )
	403	f_in(:,:,knew) = f_inv(:,:,ki)
	404	ENDDO
[1216]	405
[1306]	406	CALL transpose_zx( f_in, ar1(:,:,:))
	407	CALL cpu_log( log_point_s(5), 'transpo forward', 'pause' )
[1216]	408
[1306]	409	IF ( kk == 1 ) THEN
[1318]	410	CALL cpu_log( log_point_s(4), 'fft_x', 'start', cpu_log_nowait )
[1306]	411	ELSE
[1318]	412	CALL cpu_log( log_point_s(4), 'fft_x', 'continue', cpu_log_nowait )
[1216]	413	ENDIF
	414
[1306]	415	n = isave(2) + kk - 1
	416	CALL fft_x( ar1(:,:,:), 'forward', ar_2d = f_out_z(:,:,n))
	417	CALL cpu_log( log_point_s(4), 'fft_x', 'pause' )
[1216]	418
	419	ENDDO
	420	!
	421	!-- Restore original indices/counters
	422	nz = isave(1)
	423	nzb_x = isave(2)
	424	nzt_x = isave(3)
	425	sendrecvcount_zx = isave(4)
	426
	427	DEALLOCATE( ar1, f_in, f_inv )
	428
	429	!
	430	!-- Transposition x --> y + subsequent fft along y
	431	ALLOCATE( f_inv(nys_x:nyn_x,nzb_x:nzt_x,0:nx) )
	432	CALL resort_for_xy( f_out_z, f_inv )
	433	!
	434	!-- Save original indices and gridpoint counter
	435	isave(1) = nx
	436	isave(2) = nxl_y
	437	isave(3) = nxr_y
	438	isave(4) = sendrecvcount_xy
	439	!
	440	!-- Set new indices for transformation
	441	nblk = ( ( nx+1 ) / pdims(2) ) - 1
	442	nx = pdims(2)
	443	nnx_y = 1
	444	nxl_y = myidy * nnx_y
	445	nxr_y = ( myidy + 1 ) * nnx_y - 1
	446	sendrecvcount_xy = nnx_y * ( nyn_x-nys_x+1 ) * ( nzt_x-nzb_x+1 )
	447
[1306]	448	ALLOCATE( ar1(0:ny,nxl_y:nxr_y,nzb_y:nzt_y) )
[1216]	449	ALLOCATE( f_in(nys_x:nyn_x,nzb_x:nzt_x,0:nx) )
	450
[1306]	451	DO ii = 0, nblk
[1216]	452
[1318]	453	CALL cpu_log( log_point_s(5), 'transpo forward', 'continue', cpu_log_nowait )
[1216]	454
[1306]	455	DO inew = 0, nx-1
	456	iind = ii + ( nblk + 1 ) * inew
	457	f_in(:,:,inew) = f_inv(:,:,iind)
	458	ENDDO
[1216]	459
[1306]	460	CALL transpose_xy( f_in, ar1(:,:,:) )
[1216]	461
[1306]	462	CALL cpu_log( log_point_s(5), 'transpo forward', 'pause' )
[1216]	463
[1306]	464	IF ( ii == 1 ) THEN
[1318]	465	CALL cpu_log( log_point_s(7), 'fft_y', 'start', cpu_log_nowait )
[1306]	466	ELSE
[1318]	467	CALL cpu_log( log_point_s(7), 'fft_y', 'continue', cpu_log_nowait )
[1216]	468	ENDIF
	469
[1306]	470	nxl_y_bound = isave(2)
	471	nxr_y_bound = isave(3)
	472	n = isave(2) + ii
	473	CALL fft_y( ar1(:,:,:), 'forward', ar_tr = f_out_y, &
	474	nxl_y_bound = nxl_y_bound, nxr_y_bound = nxr_y_bound, &
	475	nxl_y_l = n, nxr_y_l = n )
[1216]	476
[1306]	477	CALL cpu_log( log_point_s(7), 'fft_y', 'pause' )
[1216]	478
	479	ENDDO
	480	!
	481	!-- Restore original indices/counters
	482	nx = isave(1)
	483	nxl_y = isave(2)
	484	nxr_y = isave(3)
	485	sendrecvcount_xy = isave(4)
	486
	487	DEALLOCATE( ar1, f_in, f_inv )
	488
	489	!
	490	!-- Transposition y --> z + subsequent tridia + resort for z --> y
	491	ALLOCATE( f_inv(nxl_y:nxr_y,nzb_y:nzt_y,0:ny) )
	492	CALL resort_for_yz( f_out_y, f_inv )
	493	!
	494	!-- Save original indices and gridpoint counter
	495	isave(1) = ny
	496	isave(2) = nys_z
	497	isave(3) = nyn_z
	498	isave(4) = sendrecvcount_yz
	499	!
	500	!-- Set new indices for transformation
	501	nblk = ( ( ny+1 ) / pdims(1) ) - 1
	502	ny = pdims(1)
	503	nny_z = 1
	504	nys_z = myidx * nny_z
	505	nyn_z = ( myidx + 1 ) * nny_z - 1
	506	sendrecvcount_yz = ( nxr_y-nxl_y+1 ) * nny_z * ( nzt_y-nzb_y+1 )
	507
[1306]	508	ALLOCATE( ar1(nxl_z:nxr_z,nys_z:nyn_z,1:nz) )
[1216]	509	ALLOCATE( f_in(nxl_y:nxr_y,nzb_y:nzt_y,0:ny) )
	510
[1306]	511	DO jj = 0, nblk
[1216]	512	!
[1306]	513	!-- Forward Fourier Transformation
	514	!-- Transposition y --> z
[1318]	515	CALL cpu_log( log_point_s(5), 'transpo forward', 'continue', cpu_log_nowait )
[1216]	516
[1306]	517	DO jnew = 0, ny-1
	518	jind = jj + ( nblk + 1 ) * jnew
	519	f_in(:,:,jnew) = f_inv(:,:,jind)
	520	ENDDO
[1216]	521
[1306]	522	CALL transpose_yz( f_in, ar1(:,:,:) )
[1216]	523
[1306]	524	IF ( jj == nblk ) THEN
	525	CALL cpu_log( log_point_s(5), 'transpo forward', 'stop' )
	526	ELSE
	527	CALL cpu_log( log_point_s(5), 'transpo forward', 'pause' )
[1216]	528	ENDIF
	529
	530	!
[1306]	531	!-- Solve the tridiagonal equation system along z
[1318]	532	CALL cpu_log( log_point_s(6), 'tridia', 'start', cpu_log_nowait )
[1216]	533
[1306]	534	n = isave(2) + jj
	535	CALL tridia_substi_overlap( ar1(:,:,:), n )
[1216]	536
[1306]	537	CALL cpu_log( log_point_s(6), 'tridia', 'stop' )
[1216]	538
[1306]	539	!
	540	!-- Inverse Fourier Transformation
	541	!-- Transposition z --> y
	542	!-- Only one thread should call MPI routines, therefore forward and
	543	!-- backward tranpose are in the same section
	544	IF ( jj == 0 ) THEN
[1318]	545	CALL cpu_log( log_point_s(8), 'transpo invers', 'start', cpu_log_nowait )
[1306]	546	ELSE
[1318]	547	CALL cpu_log( log_point_s(8), 'transpo invers', 'continue', cpu_log_nowait )
[1216]	548	ENDIF
	549
[1306]	550	CALL transpose_zy( ar1(:,:,:), f_in )
[1216]	551
[1306]	552	DO jnew = 0, ny-1
	553	jind = jj + ( nblk + 1 ) * jnew
	554	f_inv(:,:,jind) = f_in(:,:,jnew)
	555	ENDDO
[1216]	556
[1306]	557	CALL cpu_log( log_point_s(8), 'transpo invers', 'pause' )
[1216]	558
	559	ENDDO
	560	!
	561	!-- Restore original indices/counters
	562	ny = isave(1)
	563	nys_z = isave(2)
	564	nyn_z = isave(3)
	565	sendrecvcount_yz = isave(4)
	566
	567	CALL resort_for_zy( f_inv, f_out_y )
	568
	569	DEALLOCATE( ar1, f_in, f_inv )
	570
	571	!
	572	!-- fft along y backward + subsequent transposition y --> x
	573	ALLOCATE( f_inv(nys_x:nyn_x,nzb_x:nzt_x,0:nx) )
	574	!
	575	!-- Save original indices and gridpoint counter
	576	isave(1) = nx
	577	isave(2) = nxl_y
	578	isave(3) = nxr_y
	579	isave(4) = sendrecvcount_xy
	580	!
	581	!-- Set new indices for transformation
	582	nblk = (( nx+1 ) / pdims(2) ) - 1
	583	nx = pdims(2)
	584	nnx_y = 1
	585	nxl_y = myidy * nnx_y
	586	nxr_y = ( myidy + 1 ) * nnx_y - 1
	587	sendrecvcount_xy = nnx_y * ( nyn_x-nys_x+1 ) * ( nzt_x-nzb_x+1 )
	588
[1306]	589	ALLOCATE( ar1(0:ny,nxl_y:nxr_y,nzb_y:nzt_y) )
[1216]	590	ALLOCATE( f_in(nys_x:nyn_x,nzb_x:nzt_x,0:nx) )
	591
[1306]	592	DO ii = 0, nblk
[1216]	593
[1318]	594	CALL cpu_log( log_point_s(7), 'fft_y', 'continue', cpu_log_nowait )
[1216]	595
[1306]	596	n = isave(2) + ii
	597	nxl_y_bound = isave(2)
	598	nxr_y_bound = isave(3)
[1216]	599
[1306]	600	CALL fft_y( ar1(:,:,:), 'backward', ar_tr = f_out_y, &
	601	nxl_y_bound = nxl_y_bound, nxr_y_bound = nxr_y_bound, &
	602	nxl_y_l = n, nxr_y_l = n )
[1216]	603
[1306]	604	IF ( ii == nblk ) THEN
	605	CALL cpu_log( log_point_s(7), 'fft_y', 'stop' )
	606	ELSE
	607	CALL cpu_log( log_point_s(7), 'fft_y', 'pause' )
[1216]	608	ENDIF
	609
[1318]	610	CALL cpu_log( log_point_s(8), 'transpo invers', 'continue', cpu_log_nowait )
[1216]	611
[1306]	612	CALL transpose_yx( ar1(:,:,:), f_in )
[1216]	613
[1306]	614	DO inew = 0, nx-1
	615	iind = ii + (nblk+1) * inew
	616	f_inv(:,:,iind) = f_in(:,:,inew)
	617	ENDDO
[1216]	618
[1306]	619	CALL cpu_log( log_point_s(8), 'transpo invers', 'pause' )
[1216]	620
	621	ENDDO
	622	!
	623	!-- Restore original indices/counters
	624	nx = isave(1)
	625	nxl_y = isave(2)
	626	nxr_y = isave(3)
	627	sendrecvcount_xy = isave(4)
	628
	629	CALL resort_for_yx( f_inv, f_out_z )
	630
	631	DEALLOCATE( ar1, f_in, f_inv )
	632
	633	!
	634	!-- fft along x backward + subsequent final transposition x --> z
	635	ALLOCATE( f_inv(nys:nyn,nxl:nxr,1:nz) )
	636	!
	637	!-- Save original indices and gridpoint counter
	638	isave(1) = nz
	639	isave(2) = nzb_x
	640	isave(3) = nzt_x
	641	isave(4) = sendrecvcount_zx
	642	!
	643	!-- Set new indices for transformation
	644	nblk = nz / pdims(1)
	645	nz = pdims(1)
	646	nnz_x = 1
	647	nzb_x = 1 + myidx * nnz_x
	648	nzt_x = ( myidx + 1 ) * nnz_x
	649	sendrecvcount_zx = nnx * nny * nnz_x
	650
[1306]	651	ALLOCATE( ar1(0:nx,nys_x:nyn_x,nzb_x:nzt_x) )
[1216]	652	ALLOCATE( f_in(nys:nyn,nxl:nxr,1:nz) )
	653
[1306]	654	DO kk = 1, nblk
[1216]	655
[1318]	656	CALL cpu_log( log_point_s(4), 'fft_x', 'continue', cpu_log_nowait )
[1216]	657
[1306]	658	n = isave(2) + kk - 1
	659	CALL fft_x( ar1(:,:,:), 'backward', f_out_z(:,:,n))
[1216]	660
[1306]	661	IF ( kk == nblk ) THEN
	662	CALL cpu_log( log_point_s(4), 'fft_x', 'stop' )
	663	ELSE
	664	CALL cpu_log( log_point_s(4), 'fft_x', 'pause' )
[1216]	665	ENDIF
	666
[1318]	667	CALL cpu_log( log_point_s(8), 'transpo invers', 'continue', cpu_log_nowait )
[1216]	668
[1306]	669	CALL transpose_xz( ar1(:,:,:), f_in )
[1216]	670
[1306]	671	DO knew = 1, nz
	672	ki = kk + nblk * (knew-1)
	673	f_inv(:,:,ki) = f_in(:,:,knew)
	674	ENDDO
[1216]	675
[1306]	676	IF ( kk == nblk ) THEN
	677	CALL cpu_log( log_point_s(8), 'transpo invers', 'stop' )
	678	ELSE
	679	CALL cpu_log( log_point_s(8), 'transpo invers', 'pause' )
[1216]	680	ENDIF
	681
	682	ENDDO
	683	!
	684	!-- Restore original indices/counters
	685	nz = isave(1)
	686	nzb_x = isave(2)
	687	nzt_x = isave(3)
	688	sendrecvcount_zx = isave(4)
	689
	690	CALL resort_for_xz( f_inv, ar )
	691
	692	DEALLOCATE( ar1, f_in, f_inv )
	693
[1]	694	ENDIF
	695
	696	CALL cpu_log( log_point_s(3), 'poisfft', 'stop' )
	697
	698	END SUBROUTINE poisfft
	699
	700
	701	!------------------------------------------------------------------------------!
[1682]	702	! Description:
	703	! ------------
	704	!> Fourier-transformation along y with subsequent transposition y --> x for
	705	!> a 1d-decomposition along x.
	706	!>
	707	!> @attention The performance of this routine is much faster on the NEC-SX6,
	708	!> if the first index of work_ffty_vec is odd. Otherwise
	709	!> memory bank conflicts may occur (especially if the index is a
	710	!> multiple of 128). That's why work_ffty_vec is dimensioned as
	711	!> 0:ny+1.
	712	!> Of course, this will not work if users are using an odd number
	713	!> of gridpoints along y.
[1]	714	!------------------------------------------------------------------------------!
[1682]	715	SUBROUTINE ffty_tr_yx( f_in, f_out )
[1]	716
[1682]	717
[1320]	718	USE control_parameters, &
	719	ONLY: host
	720
	721	USE cpulog, &
	722	ONLY: cpu_log, log_point_s
	723
	724	USE kinds
	725
[1]	726	USE pegrid
	727
	728	IMPLICIT NONE
	729
[1682]	730	INTEGER(iwp) :: i !<
	731	INTEGER(iwp) :: iend !<
	732	INTEGER(iwp) :: iouter !<
	733	INTEGER(iwp) :: ir !<
	734	INTEGER(iwp) :: j !<
	735	INTEGER(iwp) :: k !<
[1]	736
[1682]	737	INTEGER(iwp), PARAMETER :: stridex = 4 !<
[1320]	738
[1682]	739	REAL(wp), DIMENSION(0:ny,stridex) :: work_ffty !<
[1]	740	#if defined( __nec )
[1682]	741	REAL(wp), DIMENSION(0:ny+1,1:nz,nxl:nxr) :: work_ffty_vec !<
[1]	742	#endif
[1682]	743	REAL(wp), DIMENSION(1:nz,0:ny,nxl:nxr) :: f_in !<
	744	REAL(wp), DIMENSION(nnx,1:nz,nys_x:nyn_x,pdims(1)) :: f_out !<
	745	REAL(wp), DIMENSION(nxl:nxr,1:nz,0:ny) :: work !<
[1]	746
	747	!
	748	!-- Carry out the FFT along y, where all data are present due to the
	749	!-- 1d-decomposition along x. Resort the data in a way that x becomes
	750	!-- the first index.
[1106]	751	CALL cpu_log( log_point_s(7), 'fft_y_1d', 'start' )
[1]	752
	753	IF ( host(1:3) == 'nec' ) THEN
	754	#if defined( __nec )
	755	!
	756	!-- Code optimized for vector processors
[85]	757	!$OMP PARALLEL PRIVATE ( i, j, k )
[1]	758	!$OMP DO
	759	DO i = nxl, nxr
	760
	761	DO j = 0, ny
	762	DO k = 1, nz
	763	work_ffty_vec(j,k,i) = f_in(k,j,i)
	764	ENDDO
	765	ENDDO
	766
	767	CALL fft_y_m( work_ffty_vec(:,:,i), ny+1, 'forward' )
	768
	769	ENDDO
	770
	771	!$OMP DO
	772	DO k = 1, nz
	773	DO j = 0, ny
	774	DO i = nxl, nxr
	775	work(i,k,j) = work_ffty_vec(j,k,i)
	776	ENDDO
	777	ENDDO
	778	ENDDO
	779	!$OMP END PARALLEL
	780	#endif
	781
	782	ELSE
	783
	784	!
	785	!-- Cache optimized code.
	786	!-- The i-(x-)direction is split into a strided outer loop and an inner
	787	!-- loop for better cache performance
	788	!$OMP PARALLEL PRIVATE (i,iend,iouter,ir,j,k,work_ffty)
	789	!$OMP DO
	790	DO iouter = nxl, nxr, stridex
	791
	792	iend = MIN( iouter+stridex-1, nxr ) ! Upper bound for inner i loop
	793
	794	DO k = 1, nz
	795
	796	DO i = iouter, iend
	797
	798	ir = i-iouter+1 ! counter within a stride
	799	DO j = 0, ny
	800	work_ffty(j,ir) = f_in(k,j,i)
	801	ENDDO
	802	!
	803	!-- FFT along y
[1106]	804	CALL fft_y_1d( work_ffty(:,ir), 'forward' )
[1]	805
	806	ENDDO
	807
	808	!
	809	!-- Resort
	810	DO j = 0, ny
	811	DO i = iouter, iend
	812	work(i,k,j) = work_ffty(j,i-iouter+1)
	813	ENDDO
	814	ENDDO
	815
	816	ENDDO
	817
	818	ENDDO
	819	!$OMP END PARALLEL
	820
	821	ENDIF
[1106]	822	CALL cpu_log( log_point_s(7), 'fft_y_1d', 'pause' )
[1]	823
	824	!
	825	!-- Transpose array
[1111]	826	#if defined( __parallel )
[1]	827	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
[622]	828	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
[1]	829	CALL MPI_ALLTOALL( work(nxl,1,0), sendrecvcount_xy, MPI_REAL, &
	830	f_out(1,1,nys_x,1), sendrecvcount_xy, MPI_REAL, &
	831	comm1dx, ierr )
	832	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
[1111]	833	#endif
[1]	834
	835	END SUBROUTINE ffty_tr_yx
	836
	837
	838	!------------------------------------------------------------------------------!
[1682]	839	! Description:
	840	! ------------
	841	!> Transposition x --> y with a subsequent backward Fourier transformation for
	842	!> a 1d-decomposition along x
[1]	843	!------------------------------------------------------------------------------!
[1682]	844	SUBROUTINE tr_xy_ffty( f_in, f_out )
[1]	845
[1682]	846
[1320]	847	USE control_parameters, &
	848	ONLY: host
	849
	850	USE cpulog, &
	851	ONLY: cpu_log, log_point_s
	852
	853	USE kinds
	854
[1]	855	USE pegrid
	856
	857	IMPLICIT NONE
	858
[1682]	859	INTEGER(iwp) :: i !<
	860	INTEGER(iwp) :: iend !<
	861	INTEGER(iwp) :: iouter !<
	862	INTEGER(iwp) :: ir !<
	863	INTEGER(iwp) :: j !<
	864	INTEGER(iwp) :: k !<
[1]	865
[1682]	866	INTEGER(iwp), PARAMETER :: stridex = 4 !<
[1320]	867
[1682]	868	REAL(wp), DIMENSION(0:ny,stridex) :: work_ffty !<
[1]	869	#if defined( __nec )
[1682]	870	REAL(wp), DIMENSION(0:ny+1,1:nz,nxl:nxr) :: work_ffty_vec !<
[1]	871	#endif
[1682]	872	REAL(wp), DIMENSION(nnx,1:nz,nys_x:nyn_x,pdims(1)) :: f_in !<
	873	REAL(wp), DIMENSION(1:nz,0:ny,nxl:nxr) :: f_out !<
	874	REAL(wp), DIMENSION(nxl:nxr,1:nz,0:ny) :: work !<
[1]	875
	876	!
	877	!-- Transpose array
[1111]	878	#if defined( __parallel )
[1]	879	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
[622]	880	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
[1]	881	CALL MPI_ALLTOALL( f_in(1,1,nys_x,1), sendrecvcount_xy, MPI_REAL, &
	882	work(nxl,1,0), sendrecvcount_xy, MPI_REAL, &
	883	comm1dx, ierr )
	884	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
[1111]	885	#endif
[1]	886
	887	!
	888	!-- Resort the data in a way that y becomes the first index and carry out the
	889	!-- backward fft along y.
[1106]	890	CALL cpu_log( log_point_s(7), 'fft_y_1d', 'continue' )
[1]	891
	892	IF ( host(1:3) == 'nec' ) THEN
	893	#if defined( __nec )
	894	!
	895	!-- Code optimized for vector processors
[85]	896	!$OMP PARALLEL PRIVATE ( i, j, k )
[1]	897	!$OMP DO
	898	DO k = 1, nz
	899	DO j = 0, ny
	900	DO i = nxl, nxr
	901	work_ffty_vec(j,k,i) = work(i,k,j)
	902	ENDDO
	903	ENDDO
	904	ENDDO
	905
	906	!$OMP DO
	907	DO i = nxl, nxr
	908
	909	CALL fft_y_m( work_ffty_vec(:,:,i), ny+1, 'backward' )
	910
	911	DO j = 0, ny
	912	DO k = 1, nz
	913	f_out(k,j,i) = work_ffty_vec(j,k,i)
	914	ENDDO
	915	ENDDO
	916
	917	ENDDO
	918	!$OMP END PARALLEL
	919	#endif
	920
	921	ELSE
	922
	923	!
	924	!-- Cache optimized code.
	925	!-- The i-(x-)direction is split into a strided outer loop and an inner
	926	!-- loop for better cache performance
	927	!$OMP PARALLEL PRIVATE ( i, iend, iouter, ir, j, k, work_ffty )
	928	!$OMP DO
	929	DO iouter = nxl, nxr, stridex
	930
	931	iend = MIN( iouter+stridex-1, nxr ) ! Upper bound for inner i loop
	932
	933	DO k = 1, nz
	934	!
	935	!-- Resort
	936	DO j = 0, ny
	937	DO i = iouter, iend
	938	work_ffty(j,i-iouter+1) = work(i,k,j)
	939	ENDDO
	940	ENDDO
	941
	942	DO i = iouter, iend
	943
	944	!
	945	!-- FFT along y
	946	ir = i-iouter+1 ! counter within a stride
[1106]	947	CALL fft_y_1d( work_ffty(:,ir), 'backward' )
[1]	948
	949	DO j = 0, ny
	950	f_out(k,j,i) = work_ffty(j,ir)
	951	ENDDO
	952	ENDDO
	953
	954	ENDDO
	955
	956	ENDDO
	957	!$OMP END PARALLEL
	958
	959	ENDIF
	960
[1106]	961	CALL cpu_log( log_point_s(7), 'fft_y_1d', 'stop' )
[1]	962
	963	END SUBROUTINE tr_xy_ffty
	964
	965
	966	!------------------------------------------------------------------------------!
[1682]	967	! Description:
	968	! ------------
	969	!> FFT along x, solution of the tridiagonal system and backward FFT for
	970	!> a 1d-decomposition along x
	971	!>
	972	!> @warning this subroutine may still not work for hybrid parallelization
	973	!> with OpenMP (for possible necessary changes see the original
	974	!> routine poisfft_hybrid, developed by Klaus Ketelsen, May 2002)
[1]	975	!------------------------------------------------------------------------------!
[1682]	976	SUBROUTINE fftx_tri_fftx( ar )
[1]	977
[1682]	978
[1320]	979	USE control_parameters, &
	980	ONLY: host
	981
	982	USE cpulog, &
	983	ONLY: cpu_log, log_point_s
	984
	985	USE grid_variables, &
	986	ONLY: ddx2, ddy2
	987
	988	USE kinds
	989
[1]	990	USE pegrid
	991
	992	IMPLICIT NONE
	993
[1682]	994	INTEGER(iwp) :: i !<
	995	INTEGER(iwp) :: j !<
	996	INTEGER(iwp) :: k !<
	997	INTEGER(iwp) :: m !<
	998	INTEGER(iwp) :: n !<
	999	INTEGER(iwp) :: omp_get_thread_num !<
	1000	INTEGER(iwp) :: tn !<
[1]	1001
[1682]	1002	REAL(wp), DIMENSION(0:nx) :: work_fftx !<
	1003	REAL(wp), DIMENSION(0:nx,1:nz) :: work_trix !<
	1004	REAL(wp), DIMENSION(nnx,1:nz,nys_x:nyn_x,pdims(1)) :: ar !<
	1005	REAL(wp), DIMENSION(:,:,:,:), ALLOCATABLE :: tri !<
[1]	1006
	1007
[1106]	1008	CALL cpu_log( log_point_s(33), 'fft_x_1d + tridia', 'start' )
[1]	1009
	1010	ALLOCATE( tri(5,0:nx,0:nz-1,0:threads_per_task-1) )
	1011
	1012	tn = 0 ! Default thread number in case of one thread
	1013	!$OMP PARALLEL DO PRIVATE ( i, j, k, m, n, tn, work_fftx, work_trix )
	1014	DO j = nys_x, nyn_x
	1015
	1016	!$ tn = omp_get_thread_num()
	1017
	1018	IF ( host(1:3) == 'nec' ) THEN
	1019	!
	1020	!-- Code optimized for vector processors
	1021	DO k = 1, nz
	1022
	1023	m = 0
	1024	DO n = 1, pdims(1)
[1003]	1025	DO i = 1, nnx
[1]	1026	work_trix(m,k) = ar(i,k,j,n)
	1027	m = m + 1
	1028	ENDDO
	1029	ENDDO
	1030
	1031	ENDDO
	1032
	1033	CALL fft_x_m( work_trix, 'forward' )
	1034
	1035	ELSE
	1036	!
	1037	!-- Cache optimized code
	1038	DO k = 1, nz
	1039
	1040	m = 0
	1041	DO n = 1, pdims(1)
[1003]	1042	DO i = 1, nnx
[1]	1043	work_fftx(m) = ar(i,k,j,n)
	1044	m = m + 1
	1045	ENDDO
	1046	ENDDO
	1047
[1106]	1048	CALL fft_x_1d( work_fftx, 'forward' )
[1]	1049
	1050	DO i = 0, nx
	1051	work_trix(i,k) = work_fftx(i)
	1052	ENDDO
	1053
	1054	ENDDO
	1055
	1056	ENDIF
	1057
	1058	!
	1059	!-- Solve the linear equation system
	1060	CALL tridia_1dd( ddx2, ddy2, nx, ny, j, work_trix, tri(:,:,:,tn) )
	1061
	1062	IF ( host(1:3) == 'nec' ) THEN
	1063	!
	1064	!-- Code optimized for vector processors
	1065	CALL fft_x_m( work_trix, 'backward' )
	1066
	1067	DO k = 1, nz
	1068
	1069	m = 0
	1070	DO n = 1, pdims(1)
[1003]	1071	DO i = 1, nnx
[1]	1072	ar(i,k,j,n) = work_trix(m,k)
	1073	m = m + 1
	1074	ENDDO
	1075	ENDDO
	1076
	1077	ENDDO
	1078
	1079	ELSE
	1080	!
	1081	!-- Cache optimized code
	1082	DO k = 1, nz
	1083
	1084	DO i = 0, nx
	1085	work_fftx(i) = work_trix(i,k)
	1086	ENDDO
	1087
[1106]	1088	CALL fft_x_1d( work_fftx, 'backward' )
[1]	1089
	1090	m = 0
	1091	DO n = 1, pdims(1)
[1003]	1092	DO i = 1, nnx
[1]	1093	ar(i,k,j,n) = work_fftx(m)
	1094	m = m + 1
	1095	ENDDO
	1096	ENDDO
	1097
	1098	ENDDO
	1099
	1100	ENDIF
	1101
	1102	ENDDO
	1103
	1104	DEALLOCATE( tri )
	1105
[1106]	1106	CALL cpu_log( log_point_s(33), 'fft_x_1d + tridia', 'stop' )
[1]	1107
	1108	END SUBROUTINE fftx_tri_fftx
	1109
	1110
	1111	!------------------------------------------------------------------------------!
[1682]	1112	! Description:
	1113	! ------------
	1114	!> Fourier-transformation along x with subsequent transposition x --> y for
	1115	!> a 1d-decomposition along y.
	1116	!>
	1117	!> @attention NEC-branch of this routine may significantly profit from
	1118	!> further optimizations. So far, performance is much worse than
	1119	!> for routine ffty_tr_yx (more than three times slower).
[1]	1120	!------------------------------------------------------------------------------!
[1682]	1121	SUBROUTINE fftx_tr_xy( f_in, f_out )
[1]	1122
[1682]	1123
[1320]	1124	USE control_parameters, &
	1125	ONLY: host
	1126
	1127	USE cpulog, &
	1128	ONLY: cpu_log, log_point_s
	1129
	1130	USE kinds
	1131
[1]	1132	USE pegrid
	1133
	1134	IMPLICIT NONE
	1135
[1682]	1136	INTEGER(iwp) :: i !<
	1137	INTEGER(iwp) :: j !<
	1138	INTEGER(iwp) :: k !<
[1]	1139
[1682]	1140	REAL(wp), DIMENSION(0:nx,1:nz,nys:nyn) :: work_fftx !<
	1141	REAL(wp), DIMENSION(1:nz,nys:nyn,0:nx) :: f_in !<
	1142	REAL(wp), DIMENSION(nny,1:nz,nxl_y:nxr_y,pdims(2)) :: f_out !<
	1143	REAL(wp), DIMENSION(nys:nyn,1:nz,0:nx) :: work !<
[1]	1144
	1145	!
	1146	!-- Carry out the FFT along x, where all data are present due to the
	1147	!-- 1d-decomposition along y. Resort the data in a way that y becomes
	1148	!-- the first index.
[1106]	1149	CALL cpu_log( log_point_s(4), 'fft_x_1d', 'start' )
[1]	1150
	1151	IF ( host(1:3) == 'nec' ) THEN
	1152	!
	1153	!-- Code for vector processors
[85]	1154	!$OMP PARALLEL PRIVATE ( i, j, k )
[1]	1155	!$OMP DO
	1156	DO i = 0, nx
	1157
	1158	DO j = nys, nyn
	1159	DO k = 1, nz
	1160	work_fftx(i,k,j) = f_in(k,j,i)
	1161	ENDDO
	1162	ENDDO
	1163
	1164	ENDDO
	1165
	1166	!$OMP DO
	1167	DO j = nys, nyn
	1168
	1169	CALL fft_x_m( work_fftx(:,:,j), 'forward' )
	1170
	1171	DO k = 1, nz
	1172	DO i = 0, nx
	1173	work(j,k,i) = work_fftx(i,k,j)
	1174	ENDDO
	1175	ENDDO
	1176
	1177	ENDDO
	1178	!$OMP END PARALLEL
	1179
	1180	ELSE
	1181
	1182	!
	1183	!-- Cache optimized code (there might be still a potential for better
	1184	!-- optimization).
[696]	1185	!$OMP PARALLEL PRIVATE (i,j,k)
[1]	1186	!$OMP DO
	1187	DO i = 0, nx
	1188
	1189	DO j = nys, nyn
	1190	DO k = 1, nz
	1191	work_fftx(i,k,j) = f_in(k,j,i)
	1192	ENDDO
	1193	ENDDO
	1194
	1195	ENDDO
	1196
	1197	!$OMP DO
	1198	DO j = nys, nyn
	1199	DO k = 1, nz
	1200
[1106]	1201	CALL fft_x_1d( work_fftx(0:nx,k,j), 'forward' )
[1]	1202
	1203	DO i = 0, nx
	1204	work(j,k,i) = work_fftx(i,k,j)
	1205	ENDDO
	1206	ENDDO
	1207
	1208	ENDDO
	1209	!$OMP END PARALLEL
	1210
	1211	ENDIF
[1106]	1212	CALL cpu_log( log_point_s(4), 'fft_x_1d', 'pause' )
[1]	1213
	1214	!
	1215	!-- Transpose array
[1111]	1216	#if defined( __parallel )
[1]	1217	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
[622]	1218	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
[1]	1219	CALL MPI_ALLTOALL( work(nys,1,0), sendrecvcount_xy, MPI_REAL, &
	1220	f_out(1,1,nxl_y,1), sendrecvcount_xy, MPI_REAL, &
	1221	comm1dy, ierr )
	1222	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
[1111]	1223	#endif
[1]	1224
	1225	END SUBROUTINE fftx_tr_xy
	1226
	1227
	1228	!------------------------------------------------------------------------------!
[1682]	1229	! Description:
	1230	! ------------
	1231	!> Transposition y --> x with a subsequent backward Fourier transformation for
	1232	!> a 1d-decomposition along x.
[1]	1233	!------------------------------------------------------------------------------!
[1682]	1234	SUBROUTINE tr_yx_fftx( f_in, f_out )
[1]	1235
[1682]	1236
[1320]	1237	USE control_parameters, &
	1238	ONLY: host
	1239
	1240	USE cpulog, &
	1241	ONLY: cpu_log, log_point_s
	1242
	1243	USE kinds
	1244
[1]	1245	USE pegrid
	1246
	1247	IMPLICIT NONE
	1248
[1682]	1249	INTEGER(iwp) :: i !<
	1250	INTEGER(iwp) :: j !<
	1251	INTEGER(iwp) :: k !<
[1]	1252
[1682]	1253	REAL(wp), DIMENSION(0:nx,1:nz,nys:nyn) :: work_fftx !<
	1254	REAL(wp), DIMENSION(nny,1:nz,nxl_y:nxr_y,pdims(2)) :: f_in !<
	1255	REAL(wp), DIMENSION(1:nz,nys:nyn,0:nx) :: f_out !<
	1256	REAL(wp), DIMENSION(nys:nyn,1:nz,0:nx) :: work !<
[1]	1257
	1258	!
	1259	!-- Transpose array
[1111]	1260	#if defined( __parallel )
[1]	1261	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
[622]	1262	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
[1]	1263	CALL MPI_ALLTOALL( f_in(1,1,nxl_y,1), sendrecvcount_xy, MPI_REAL, &
	1264	work(nys,1,0), sendrecvcount_xy, MPI_REAL, &
	1265	comm1dy, ierr )
	1266	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
[1111]	1267	#endif
[1]	1268
	1269	!
	1270	!-- Carry out the FFT along x, where all data are present due to the
	1271	!-- 1d-decomposition along y. Resort the data in a way that y becomes
	1272	!-- the first index.
[1106]	1273	CALL cpu_log( log_point_s(4), 'fft_x_1d', 'continue' )
[1]	1274
	1275	IF ( host(1:3) == 'nec' ) THEN
	1276	!
	1277	!-- Code optimized for vector processors
[85]	1278	!$OMP PARALLEL PRIVATE ( i, j, k )
[1]	1279	!$OMP DO
	1280	DO j = nys, nyn
	1281
	1282	DO k = 1, nz
	1283	DO i = 0, nx
	1284	work_fftx(i,k,j) = work(j,k,i)
	1285	ENDDO
	1286	ENDDO
	1287
	1288	CALL fft_x_m( work_fftx(:,:,j), 'backward' )
	1289
	1290	ENDDO
	1291
	1292	!$OMP DO
	1293	DO i = 0, nx
	1294	DO j = nys, nyn
	1295	DO k = 1, nz
	1296	f_out(k,j,i) = work_fftx(i,k,j)
	1297	ENDDO
	1298	ENDDO
	1299	ENDDO
	1300	!$OMP END PARALLEL
	1301
	1302	ELSE
	1303
	1304	!
	1305	!-- Cache optimized code (there might be still a potential for better
	1306	!-- optimization).
[696]	1307	!$OMP PARALLEL PRIVATE (i,j,k)
[1]	1308	!$OMP DO
	1309	DO j = nys, nyn
	1310	DO k = 1, nz
	1311
	1312	DO i = 0, nx
	1313	work_fftx(i,k,j) = work(j,k,i)
	1314	ENDDO
	1315
[1106]	1316	CALL fft_x_1d( work_fftx(0:nx,k,j), 'backward' )
[1]	1317
	1318	ENDDO
	1319	ENDDO
	1320
	1321	!$OMP DO
	1322	DO i = 0, nx
	1323	DO j = nys, nyn
	1324	DO k = 1, nz
	1325	f_out(k,j,i) = work_fftx(i,k,j)
	1326	ENDDO
	1327	ENDDO
	1328	ENDDO
	1329	!$OMP END PARALLEL
	1330
	1331	ENDIF
[1106]	1332	CALL cpu_log( log_point_s(4), 'fft_x_1d', 'stop' )
[1]	1333
	1334	END SUBROUTINE tr_yx_fftx
	1335
	1336
	1337	!------------------------------------------------------------------------------!
[1682]	1338	! Description:
	1339	! ------------
	1340	!> FFT along y, solution of the tridiagonal system and backward FFT for
	1341	!> a 1d-decomposition along y.
	1342	!>
	1343	!> @warning this subroutine may still not work for hybrid parallelization
	1344	!> with OpenMP (for possible necessary changes see the original
	1345	!> routine poisfft_hybrid, developed by Klaus Ketelsen, May 2002)
[1]	1346	!------------------------------------------------------------------------------!
[1682]	1347	SUBROUTINE ffty_tri_ffty( ar )
[1]	1348
[1682]	1349
[1320]	1350	USE control_parameters, &
	1351	ONLY: host
	1352
	1353	USE cpulog, &
	1354	ONLY: cpu_log, log_point_s
	1355
	1356	USE grid_variables, &
	1357	ONLY: ddx2, ddy2
	1358
	1359	USE kinds
	1360
[1]	1361	USE pegrid
	1362
	1363	IMPLICIT NONE
	1364
[1682]	1365	INTEGER(iwp) :: i !<
	1366	INTEGER(iwp) :: j !<
	1367	INTEGER(iwp) :: k !<
	1368	INTEGER(iwp) :: m !<
	1369	INTEGER(iwp) :: n !<
	1370	INTEGER(iwp) :: omp_get_thread_num !<
	1371	INTEGER(iwp) :: tn !<
[1]	1372
[1682]	1373	REAL(wp), DIMENSION(0:ny) :: work_ffty !<
	1374	REAL(wp), DIMENSION(0:ny,1:nz) :: work_triy !<
	1375	REAL(wp), DIMENSION(nny,1:nz,nxl_y:nxr_y,pdims(2)) :: ar !<
	1376	REAL(wp), DIMENSION(:,:,:,:), ALLOCATABLE :: tri !<
[1]	1377
	1378
[1106]	1379	CALL cpu_log( log_point_s(39), 'fft_y_1d + tridia', 'start' )
[1]	1380
	1381	ALLOCATE( tri(5,0:ny,0:nz-1,0:threads_per_task-1) )
	1382
	1383	tn = 0 ! Default thread number in case of one thread
[696]	1384	!$OMP PARALLEL DO PRIVATE ( i, j, k, m, n, tn, work_ffty, work_triy )
[1]	1385	DO i = nxl_y, nxr_y
	1386
	1387	!$ tn = omp_get_thread_num()
	1388
	1389	IF ( host(1:3) == 'nec' ) THEN
	1390	!
	1391	!-- Code optimized for vector processors
	1392	DO k = 1, nz
	1393
	1394	m = 0
	1395	DO n = 1, pdims(2)
[1003]	1396	DO j = 1, nny
[1]	1397	work_triy(m,k) = ar(j,k,i,n)
	1398	m = m + 1
	1399	ENDDO
	1400	ENDDO
	1401
	1402	ENDDO
	1403
	1404	CALL fft_y_m( work_triy, ny, 'forward' )
	1405
	1406	ELSE
	1407	!
	1408	!-- Cache optimized code
	1409	DO k = 1, nz
	1410
	1411	m = 0
	1412	DO n = 1, pdims(2)
[1003]	1413	DO j = 1, nny
[1]	1414	work_ffty(m) = ar(j,k,i,n)
	1415	m = m + 1
	1416	ENDDO
	1417	ENDDO
	1418
[1106]	1419	CALL fft_y_1d( work_ffty, 'forward' )
[1]	1420
	1421	DO j = 0, ny
	1422	work_triy(j,k) = work_ffty(j)
	1423	ENDDO
	1424
	1425	ENDDO
	1426
	1427	ENDIF
	1428
	1429	!
	1430	!-- Solve the linear equation system
	1431	CALL tridia_1dd( ddy2, ddx2, ny, nx, i, work_triy, tri(:,:,:,tn) )
	1432
	1433	IF ( host(1:3) == 'nec' ) THEN
	1434	!
	1435	!-- Code optimized for vector processors
	1436	CALL fft_y_m( work_triy, ny, 'backward' )
	1437
	1438	DO k = 1, nz
	1439
	1440	m = 0
	1441	DO n = 1, pdims(2)
[1003]	1442	DO j = 1, nny
[1]	1443	ar(j,k,i,n) = work_triy(m,k)
	1444	m = m + 1
	1445	ENDDO
	1446	ENDDO
	1447
	1448	ENDDO
	1449
	1450	ELSE
	1451	!
	1452	!-- Cache optimized code
	1453	DO k = 1, nz
	1454
	1455	DO j = 0, ny
	1456	work_ffty(j) = work_triy(j,k)
	1457	ENDDO
	1458
[1106]	1459	CALL fft_y_1d( work_ffty, 'backward' )
[1]	1460
	1461	m = 0
	1462	DO n = 1, pdims(2)
[1003]	1463	DO j = 1, nny
[1]	1464	ar(j,k,i,n) = work_ffty(m)
	1465	m = m + 1
	1466	ENDDO
	1467	ENDDO
	1468
	1469	ENDDO
	1470
	1471	ENDIF
	1472
	1473	ENDDO
	1474
	1475	DEALLOCATE( tri )
	1476
[1106]	1477	CALL cpu_log( log_point_s(39), 'fft_y_1d + tridia', 'stop' )
[1]	1478
	1479	END SUBROUTINE ffty_tri_ffty
	1480
	1481	END MODULE poisfft_mod

Note: See TracBrowser for help on using the repository browser.

Download in other formats:

| Impressum | ©Leibniz Universität Hannover |