Home

Context Navigation

source: palm/trunk/SOURCE/poisfft_mod.f90 @ 2118

Last change on this file since 2118 was 2118, checked in by raasch, 7 years ago
all OpenACC directives and related parts removed from the code
Property svn:keywords set to `Id`
File size: 44.8 KB

Rev	Line
[1850]	1	!> @file poisfft_mod.f90
[2000]	2	!------------------------------------------------------------------------------!
[1036]	3	! This file is part of PALM.
	4	!
[2000]	5	! PALM is free software: you can redistribute it and/or modify it under the
	6	! terms of the GNU General Public License as published by the Free Software
	7	! Foundation, either version 3 of the License, or (at your option) any later
	8	! version.
[1036]	9	!
	10	! PALM is distributed in the hope that it will be useful, but WITHOUT ANY
	11	! WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
	12	! A PARTICULAR PURPOSE. See the GNU General Public License for more details.
	13	!
	14	! You should have received a copy of the GNU General Public License along with
	15	! PALM. If not, see <http://www.gnu.org/licenses/>.
	16	!
[2101]	17	! Copyright 1997-2017 Leibniz Universitaet Hannover
[2000]	18	!------------------------------------------------------------------------------!
[1036]	19	!
[484]	20	! Current revisions:
[1]	21	! -----------------
[2118]	22	! OpenACC directives and related code removed
[1683]	23	!
[1321]	24	! Former revisions:
	25	! -----------------
	26	! $Id: poisfft_mod.f90 2118 2017-01-17 16:38:49Z raasch $
	27	!
[2001]	28	! 2000 2016-08-20 18:09:15Z knoop
	29	! Forced header and separation lines into 80 columns
	30	!
[1851]	31	! 1850 2016-04-08 13:29:27Z maronga
	32	! Module renamed
	33	!
	34	!
[1805]	35	! 1804 2016-04-05 16:30:18Z maronga
	36	! Removed code for parameter file check (__check)
	37	!
[1683]	38	! 1682 2015-10-07 23:56:08Z knoop
	39	! Code annotations made doxygen readable
	40	!
[1483]	41	! 1482 2014-10-18 12:34:45Z raasch
	42	! use 2d-decomposition, if accelerator boards are used
	43	!
[1407]	44	! 1406 2014-05-16 13:47:01Z raasch
	45	! bugfix for pgi 14.4: declare create moved after array declaration
	46	!
[1321]	47	! 1320 2014-03-20 08:40:49Z raasch
[1320]	48	! ONLY-attribute added to USE-statements,
	49	! kind-parameters added to all INTEGER and REAL declaration statements,
	50	! kinds are defined in new module kinds,
	51	! old module precision_kind is removed,
	52	! revision history before 2012 removed,
	53	! comment fields (!:) to be used for variable explanations added to
	54	! all variable declaration statements
[1217]	55	!
[1319]	56	! 1318 2014-03-17 13:35:16Z raasch
	57	! module interfaces removed
	58	!
[1307]	59	! 1306 2014-03-13 14:30:59Z raasch
	60	! openmp sections removed from the overlap branch,
	61	! second argument removed from parameter list
	62	!
[1217]	63	! 1216 2013-08-26 09:31:42Z raasch
[1216]	64	! resorting of arrays moved to separate routines resort_for_...,
	65	! one argument, used as temporary work array, removed from all transpose
	66	! routines
	67	! overlapping fft / transposition implemented
[1112]	68	!
[1213]	69	! 1212 2013-08-15 08:46:27Z raasch
	70	! tridia routines moved to seperate module tridia_solver
	71	!
[1209]	72	! 1208 2013-08-13 06:41:49Z raasch
	73	! acc-update clauses added for "ar" so that ffts other than cufft can also be
	74	! used (although they are not ported and will give a poor performance)
	75	!
[1112]	76	! 1111 2013-03-08 23:54:10Z raasch
[1111]	77	! further openACC porting of non-parallel (MPI) branch:
	78	! tridiagonal routines split into extermal subroutines (instead using CONTAINS),
	79	! no distinction between parallel/non-parallel in poisfft and tridia any more,
[1112]	80	! tridia routines moved to end of file because of probable bug in PGI compiler 12.5
[1111]	81	! (otherwise "invalid device function" is indicated during runtime),
	82	! optimization of tridia routines: constant elements and coefficients of tri are
	83	! stored in seperate arrays ddzuw and tric, last dimension of tri reduced from 5
	84	! to 2,
	85	! poisfft_init is now called internally from poisfft, maketri is called from
	86	! poisfft_init,
	87	! ibc_p_b = 2 removed
[1]	88	!
[1107]	89	! 1106 2013-03-04 05:31:38Z raasch
	90	! routines fftx, ffty, fftxp, fftyp removed, calls replaced by fft_x, fft_y,
	91	! in the 1D-decomposition routines fft_x, ffty are replaced by fft_x_1d,
	92	! fft_y_1d
	93	!
[1104]	94	! 1103 2013-02-20 02:15:53Z raasch
	95	! tri, ar, and ar1 arguments in tridia-routines (2d) are removed because they
	96	! sometimes cause segmentation faults with intel 12.1 compiler
	97	!
[1093]	98	! 1092 2013-02-02 11:24:22Z raasch
	99	! unused variables removed
	100	!
[1037]	101	! 1036 2012-10-22 13:43:42Z raasch
	102	! code put under GPL (PALM 3.9)
	103	!
[1014]	104	! 2012-09-21 07:03:55Z raasch
	105	! FLOAT type conversion replaced by REAL
	106	!
[1004]	107	! 1003 2012-09-14 14:35:53Z raasch
	108	! indices nxa, nya, etc. replaced by nx, ny, etc.
	109	!
[941]	110	! 940 2012-07-09 14:31:00Z raasch
	111	! special handling of tri-array as an argument in tridia_1dd routines switched
	112	! off because it caused segmentation faults with intel 12.1 compiler
	113	!
[878]	114	! 877 2012-04-03 11:21:44Z suehring
	115	! Bugfix: Avoid divisions by zero in case of using a 'neumann' bc for the
	116	! pressure at the top of the model domain.
	117	!
[810]	118	! 809 2012-01-30 13:32:58Z maronga
	119	! Bugfix: replaced .AND. and .NOT. with && and ! in the preprocessor directives
	120	!
[808]	121	! 807 2012-01-25 11:53:51Z maronga
	122	! New cpp directive "__check" implemented which is used by check_namelist_files
	123	! (most of the code is unneeded by check_namelist_files).
	124	!
[1]	125	! Revision 1.1 1997/07/24 11:24:14 raasch
	126	! Initial revision
	127	!
	128	!
	129	! Description:
	130	! ------------
[1682]	131	!> Solves the Poisson equation with a 2D spectral method
	132	!> d^2 p / dx^2 + d^2 p / dy^2 + d^2 p / dz^2 = s
	133	!>
	134	!> Input:
	135	!> real ar contains (nnz,nny,nnx) elements of the velocity divergence,
	136	!> starting from (1,nys,nxl)
	137	!>
	138	!> Output:
	139	!> real ar contains the solution for perturbation pressure p
[1]	140	!------------------------------------------------------------------------------!
[1682]	141	MODULE poisfft_mod
	142
[1]	143
[1320]	144	USE fft_xy, &
	145	ONLY: fft_init, fft_y, fft_y_1d, fft_y_m, fft_x, fft_x_1d, fft_x_m
[1]	146
[1320]	147	USE indices, &
	148	ONLY: nnx, nny, nx, nxl, nxr, ny, nys, nyn, nz
	149
	150	USE transpose_indices, &
	151	ONLY: nxl_y, nxl_z, nxr_y, nxr_z, nys_x, nys_z, nyn_x, nyn_z, nzb_x, &
	152	nzb_y, nzt_x, nzt_y
	153
	154	USE tridia_solver, &
	155	ONLY: tridia_1dd, tridia_init, tridia_substi, tridia_substi_overlap
	156
[1]	157	IMPLICIT NONE
	158
[1111]	159	LOGICAL, SAVE :: poisfft_initialized = .FALSE.
	160
[1]	161	PRIVATE
[807]	162
[1]	163	PUBLIC poisfft, poisfft_init
	164
	165	INTERFACE poisfft
	166	MODULE PROCEDURE poisfft
	167	END INTERFACE poisfft
	168
	169	INTERFACE poisfft_init
	170	MODULE PROCEDURE poisfft_init
	171	END INTERFACE poisfft_init
	172
[807]	173
[1]	174	CONTAINS
	175
[1682]	176	!------------------------------------------------------------------------------!
	177	! Description:
	178	! ------------
	179	!> @todo Missing subroutine description.
	180	!------------------------------------------------------------------------------!
[1]	181	SUBROUTINE poisfft_init
	182
[1320]	183	USE arrays_3d, &
	184	ONLY: ddzu_pres, ddzw
[1111]	185
[1320]	186	USE kinds
	187
[1111]	188	IMPLICIT NONE
	189
[1682]	190	INTEGER(iwp) :: k !<
[1111]	191
	192
[1]	193	CALL fft_init
	194
[1212]	195	CALL tridia_init
[1111]	196
	197	poisfft_initialized = .TRUE.
	198
[1]	199	END SUBROUTINE poisfft_init
	200
[1111]	201
[1804]	202
[1682]	203	!------------------------------------------------------------------------------!
	204	! Description:
	205	! ------------
	206	!> Two-dimensional Fourier Transformation in x- and y-direction.
	207	!------------------------------------------------------------------------------!
[1306]	208	SUBROUTINE poisfft( ar )
[1]	209
[1320]	210	USE control_parameters, &
	211	ONLY: fft_method, transpose_compute_overlap
	212
	213	USE cpulog, &
	214	ONLY: cpu_log, cpu_log_nowait, log_point_s
	215
	216	USE kinds
	217
[1]	218	USE pegrid
	219
	220	IMPLICIT NONE
	221
[1682]	222	INTEGER(iwp) :: ii !<
	223	INTEGER(iwp) :: iind !<
	224	INTEGER(iwp) :: inew !<
	225	INTEGER(iwp) :: jj !<
	226	INTEGER(iwp) :: jind !<
	227	INTEGER(iwp) :: jnew !<
	228	INTEGER(iwp) :: ki !<
	229	INTEGER(iwp) :: kk !<
	230	INTEGER(iwp) :: knew !<
	231	INTEGER(iwp) :: n !<
	232	INTEGER(iwp) :: nblk !<
	233	INTEGER(iwp) :: nnx_y !<
	234	INTEGER(iwp) :: nny_z !<
	235	INTEGER(iwp) :: nnz_t !<
	236	INTEGER(iwp) :: nnz_x !<
	237	INTEGER(iwp) :: nxl_y_bound !<
	238	INTEGER(iwp) :: nxr_y_bound !<
[1]	239
[1682]	240	INTEGER(iwp), DIMENSION(4) :: isave !<
[1320]	241
[1682]	242	REAL(wp), DIMENSION(1:nz,nys:nyn,nxl:nxr) :: ar !<
	243	REAL(wp), DIMENSION(nys:nyn,nxl:nxr,1:nz) :: ar_inv !<
[1]	244
[1682]	245	REAL(wp), DIMENSION(:,:,:), ALLOCATABLE :: ar1 !<
	246	REAL(wp), DIMENSION(:,:,:), ALLOCATABLE :: f_in !<
	247	REAL(wp), DIMENSION(:,:,:), ALLOCATABLE :: f_inv !<
	248	REAL(wp), DIMENSION(:,:,:), ALLOCATABLE :: f_out_y !<
	249	REAL(wp), DIMENSION(:,:,:), ALLOCATABLE :: f_out_z !<
[1216]	250
	251
[1]	252	CALL cpu_log( log_point_s(3), 'poisfft', 'start' )
	253
[1111]	254	IF ( .NOT. poisfft_initialized ) CALL poisfft_init
	255
[1]	256	!
	257	!-- Two-dimensional Fourier Transformation in x- and y-direction.
[2118]	258	IF ( pdims(2) == 1 .AND. pdims(1) > 1 ) THEN
[1]	259
	260	!
	261	!-- 1d-domain-decomposition along x:
	262	!-- FFT along y and transposition y --> x
[1216]	263	CALL ffty_tr_yx( ar, ar )
[1]	264
	265	!
	266	!-- FFT along x, solving the tridiagonal system and backward FFT
	267	CALL fftx_tri_fftx( ar )
	268
	269	!
	270	!-- Transposition x --> y and backward FFT along y
[1216]	271	CALL tr_xy_ffty( ar, ar )
[1]	272
[2118]	273	ELSEIF ( pdims(1) == 1 .AND. pdims(2) > 1 ) THEN
[1]	274
	275	!
	276	!-- 1d-domain-decomposition along y:
	277	!-- FFT along x and transposition x --> y
[1216]	278	CALL fftx_tr_xy( ar, ar )
[1]	279
	280	!
	281	!-- FFT along y, solving the tridiagonal system and backward FFT
	282	CALL ffty_tri_ffty( ar )
	283
	284	!
	285	!-- Transposition y --> x and backward FFT along x
[1216]	286	CALL tr_yx_fftx( ar, ar )
[1]	287
[1216]	288	ELSEIF ( .NOT. transpose_compute_overlap ) THEN
[1]	289
	290	!
[1111]	291	!-- 2d-domain-decomposition or no decomposition (1 PE run)
[1]	292	!-- Transposition z --> x
	293	CALL cpu_log( log_point_s(5), 'transpo forward', 'start' )
[1216]	294	CALL resort_for_zx( ar, ar_inv )
	295	CALL transpose_zx( ar_inv, ar )
[1]	296	CALL cpu_log( log_point_s(5), 'transpo forward', 'pause' )
	297
	298	CALL cpu_log( log_point_s(4), 'fft_x', 'start' )
[1106]	299	CALL fft_x( ar, 'forward' )
[1]	300	CALL cpu_log( log_point_s(4), 'fft_x', 'pause' )
	301
	302	!
	303	!-- Transposition x --> y
	304	CALL cpu_log( log_point_s(5), 'transpo forward', 'continue' )
[1216]	305	CALL resort_for_xy( ar, ar_inv )
	306	CALL transpose_xy( ar_inv, ar )
[1]	307	CALL cpu_log( log_point_s(5), 'transpo forward', 'pause' )
	308
	309	CALL cpu_log( log_point_s(7), 'fft_y', 'start' )
[1216]	310	CALL fft_y( ar, 'forward', ar_tr = ar, &
	311	nxl_y_bound = nxl_y, nxr_y_bound = nxr_y, &
	312	nxl_y_l = nxl_y, nxr_y_l = nxr_y )
[1]	313	CALL cpu_log( log_point_s(7), 'fft_y', 'pause' )
	314
	315	!
	316	!-- Transposition y --> z
	317	CALL cpu_log( log_point_s(5), 'transpo forward', 'continue' )
[1216]	318	CALL resort_for_yz( ar, ar_inv )
	319	CALL transpose_yz( ar_inv, ar )
[1]	320	CALL cpu_log( log_point_s(5), 'transpo forward', 'stop' )
	321
	322	!
[1106]	323	!-- Solve the tridiagonal equation system along z
[1]	324	CALL cpu_log( log_point_s(6), 'tridia', 'start' )
[1212]	325	CALL tridia_substi( ar )
[1]	326	CALL cpu_log( log_point_s(6), 'tridia', 'stop' )
	327
	328	!
	329	!-- Inverse Fourier Transformation
	330	!-- Transposition z --> y
	331	CALL cpu_log( log_point_s(8), 'transpo invers', 'start' )
[1216]	332	CALL transpose_zy( ar, ar_inv )
	333	CALL resort_for_zy( ar_inv, ar )
[1]	334	CALL cpu_log( log_point_s(8), 'transpo invers', 'pause' )
	335
	336	CALL cpu_log( log_point_s(7), 'fft_y', 'continue' )
[1216]	337	CALL fft_y( ar, 'backward', ar_tr = ar, &
	338	nxl_y_bound = nxl_y, nxr_y_bound = nxr_y, &
	339	nxl_y_l = nxl_y, nxr_y_l = nxr_y )
[1]	340	CALL cpu_log( log_point_s(7), 'fft_y', 'stop' )
	341
	342	!
	343	!-- Transposition y --> x
	344	CALL cpu_log( log_point_s(8), 'transpo invers', 'continue' )
[1216]	345	CALL transpose_yx( ar, ar_inv )
	346	CALL resort_for_yx( ar_inv, ar )
[1]	347	CALL cpu_log( log_point_s(8), 'transpo invers', 'pause' )
	348
	349	CALL cpu_log( log_point_s(4), 'fft_x', 'continue' )
[1106]	350	CALL fft_x( ar, 'backward' )
[1]	351	CALL cpu_log( log_point_s(4), 'fft_x', 'stop' )
	352
	353	!
	354	!-- Transposition x --> z
	355	CALL cpu_log( log_point_s(8), 'transpo invers', 'continue' )
[1216]	356	CALL transpose_xz( ar, ar_inv )
	357	CALL resort_for_xz( ar_inv, ar )
[1]	358	CALL cpu_log( log_point_s(8), 'transpo invers', 'stop' )
	359
[1216]	360	ELSE
	361
	362	!
	363	!-- 2d-domain-decomposition or no decomposition (1 PE run) with
	364	!-- overlapping transposition / fft
[1318]	365	!-- cputime logging must not use barriers, which would prevent overlapping
[1216]	366	ALLOCATE( f_out_y(0:ny,nxl_y:nxr_y,nzb_y:nzt_y), &
	367	f_out_z(0:nx,nys_x:nyn_x,nzb_x:nzt_x) )
	368	!
	369	!-- Transposition z --> x + subsequent fft along x
	370	ALLOCATE( f_inv(nys:nyn,nxl:nxr,1:nz) )
	371	CALL resort_for_zx( ar, f_inv )
	372	!
	373	!-- Save original indices and gridpoint counter
	374	isave(1) = nz
	375	isave(2) = nzb_x
	376	isave(3) = nzt_x
	377	isave(4) = sendrecvcount_zx
	378	!
	379	!-- Set new indices for transformation
	380	nblk = nz / pdims(1)
	381	nz = pdims(1)
	382	nnz_x = 1
	383	nzb_x = 1 + myidx * nnz_x
	384	nzt_x = ( myidx + 1 ) * nnz_x
	385	sendrecvcount_zx = nnx * nny * nnz_x
	386
[1306]	387	ALLOCATE( ar1(0:nx,nys_x:nyn_x,nzb_x:nzt_x) )
[1216]	388	ALLOCATE( f_in(nys:nyn,nxl:nxr,1:nz) )
	389
[1306]	390	DO kk = 1, nblk
[1216]	391
[1306]	392	IF ( kk == 1 ) THEN
[1318]	393	CALL cpu_log( log_point_s(5), 'transpo forward', 'start', cpu_log_nowait )
[1306]	394	ELSE
[1318]	395	CALL cpu_log( log_point_s(5), 'transpo forward', 'continue', cpu_log_nowait )
[1306]	396	ENDIF
[1216]	397
[1306]	398	DO knew = 1, nz
	399	ki = kk + nblk * ( knew - 1 )
	400	f_in(:,:,knew) = f_inv(:,:,ki)
	401	ENDDO
[1216]	402
[1306]	403	CALL transpose_zx( f_in, ar1(:,:,:))
	404	CALL cpu_log( log_point_s(5), 'transpo forward', 'pause' )
[1216]	405
[1306]	406	IF ( kk == 1 ) THEN
[1318]	407	CALL cpu_log( log_point_s(4), 'fft_x', 'start', cpu_log_nowait )
[1306]	408	ELSE
[1318]	409	CALL cpu_log( log_point_s(4), 'fft_x', 'continue', cpu_log_nowait )
[1216]	410	ENDIF
	411
[1306]	412	n = isave(2) + kk - 1
	413	CALL fft_x( ar1(:,:,:), 'forward', ar_2d = f_out_z(:,:,n))
	414	CALL cpu_log( log_point_s(4), 'fft_x', 'pause' )
[1216]	415
	416	ENDDO
	417	!
	418	!-- Restore original indices/counters
	419	nz = isave(1)
	420	nzb_x = isave(2)
	421	nzt_x = isave(3)
	422	sendrecvcount_zx = isave(4)
	423
	424	DEALLOCATE( ar1, f_in, f_inv )
	425
	426	!
	427	!-- Transposition x --> y + subsequent fft along y
	428	ALLOCATE( f_inv(nys_x:nyn_x,nzb_x:nzt_x,0:nx) )
	429	CALL resort_for_xy( f_out_z, f_inv )
	430	!
	431	!-- Save original indices and gridpoint counter
	432	isave(1) = nx
	433	isave(2) = nxl_y
	434	isave(3) = nxr_y
	435	isave(4) = sendrecvcount_xy
	436	!
	437	!-- Set new indices for transformation
	438	nblk = ( ( nx+1 ) / pdims(2) ) - 1
	439	nx = pdims(2)
	440	nnx_y = 1
	441	nxl_y = myidy * nnx_y
	442	nxr_y = ( myidy + 1 ) * nnx_y - 1
	443	sendrecvcount_xy = nnx_y * ( nyn_x-nys_x+1 ) * ( nzt_x-nzb_x+1 )
	444
[1306]	445	ALLOCATE( ar1(0:ny,nxl_y:nxr_y,nzb_y:nzt_y) )
[1216]	446	ALLOCATE( f_in(nys_x:nyn_x,nzb_x:nzt_x,0:nx) )
	447
[1306]	448	DO ii = 0, nblk
[1216]	449
[1318]	450	CALL cpu_log( log_point_s(5), 'transpo forward', 'continue', cpu_log_nowait )
[1216]	451
[1306]	452	DO inew = 0, nx-1
	453	iind = ii + ( nblk + 1 ) * inew
	454	f_in(:,:,inew) = f_inv(:,:,iind)
	455	ENDDO
[1216]	456
[1306]	457	CALL transpose_xy( f_in, ar1(:,:,:) )
[1216]	458
[1306]	459	CALL cpu_log( log_point_s(5), 'transpo forward', 'pause' )
[1216]	460
[1306]	461	IF ( ii == 1 ) THEN
[1318]	462	CALL cpu_log( log_point_s(7), 'fft_y', 'start', cpu_log_nowait )
[1306]	463	ELSE
[1318]	464	CALL cpu_log( log_point_s(7), 'fft_y', 'continue', cpu_log_nowait )
[1216]	465	ENDIF
	466
[1306]	467	nxl_y_bound = isave(2)
	468	nxr_y_bound = isave(3)
	469	n = isave(2) + ii
	470	CALL fft_y( ar1(:,:,:), 'forward', ar_tr = f_out_y, &
	471	nxl_y_bound = nxl_y_bound, nxr_y_bound = nxr_y_bound, &
	472	nxl_y_l = n, nxr_y_l = n )
[1216]	473
[1306]	474	CALL cpu_log( log_point_s(7), 'fft_y', 'pause' )
[1216]	475
	476	ENDDO
	477	!
	478	!-- Restore original indices/counters
	479	nx = isave(1)
	480	nxl_y = isave(2)
	481	nxr_y = isave(3)
	482	sendrecvcount_xy = isave(4)
	483
	484	DEALLOCATE( ar1, f_in, f_inv )
	485
	486	!
	487	!-- Transposition y --> z + subsequent tridia + resort for z --> y
	488	ALLOCATE( f_inv(nxl_y:nxr_y,nzb_y:nzt_y,0:ny) )
	489	CALL resort_for_yz( f_out_y, f_inv )
	490	!
	491	!-- Save original indices and gridpoint counter
	492	isave(1) = ny
	493	isave(2) = nys_z
	494	isave(3) = nyn_z
	495	isave(4) = sendrecvcount_yz
	496	!
	497	!-- Set new indices for transformation
	498	nblk = ( ( ny+1 ) / pdims(1) ) - 1
	499	ny = pdims(1)
	500	nny_z = 1
	501	nys_z = myidx * nny_z
	502	nyn_z = ( myidx + 1 ) * nny_z - 1
	503	sendrecvcount_yz = ( nxr_y-nxl_y+1 ) * nny_z * ( nzt_y-nzb_y+1 )
	504
[1306]	505	ALLOCATE( ar1(nxl_z:nxr_z,nys_z:nyn_z,1:nz) )
[1216]	506	ALLOCATE( f_in(nxl_y:nxr_y,nzb_y:nzt_y,0:ny) )
	507
[1306]	508	DO jj = 0, nblk
[1216]	509	!
[1306]	510	!-- Forward Fourier Transformation
	511	!-- Transposition y --> z
[1318]	512	CALL cpu_log( log_point_s(5), 'transpo forward', 'continue', cpu_log_nowait )
[1216]	513
[1306]	514	DO jnew = 0, ny-1
	515	jind = jj + ( nblk + 1 ) * jnew
	516	f_in(:,:,jnew) = f_inv(:,:,jind)
	517	ENDDO
[1216]	518
[1306]	519	CALL transpose_yz( f_in, ar1(:,:,:) )
[1216]	520
[1306]	521	IF ( jj == nblk ) THEN
	522	CALL cpu_log( log_point_s(5), 'transpo forward', 'stop' )
	523	ELSE
	524	CALL cpu_log( log_point_s(5), 'transpo forward', 'pause' )
[1216]	525	ENDIF
	526
	527	!
[1306]	528	!-- Solve the tridiagonal equation system along z
[1318]	529	CALL cpu_log( log_point_s(6), 'tridia', 'start', cpu_log_nowait )
[1216]	530
[1306]	531	n = isave(2) + jj
	532	CALL tridia_substi_overlap( ar1(:,:,:), n )
[1216]	533
[1306]	534	CALL cpu_log( log_point_s(6), 'tridia', 'stop' )
[1216]	535
[1306]	536	!
	537	!-- Inverse Fourier Transformation
	538	!-- Transposition z --> y
	539	!-- Only one thread should call MPI routines, therefore forward and
	540	!-- backward tranpose are in the same section
	541	IF ( jj == 0 ) THEN
[1318]	542	CALL cpu_log( log_point_s(8), 'transpo invers', 'start', cpu_log_nowait )
[1306]	543	ELSE
[1318]	544	CALL cpu_log( log_point_s(8), 'transpo invers', 'continue', cpu_log_nowait )
[1216]	545	ENDIF
	546
[1306]	547	CALL transpose_zy( ar1(:,:,:), f_in )
[1216]	548
[1306]	549	DO jnew = 0, ny-1
	550	jind = jj + ( nblk + 1 ) * jnew
	551	f_inv(:,:,jind) = f_in(:,:,jnew)
	552	ENDDO
[1216]	553
[1306]	554	CALL cpu_log( log_point_s(8), 'transpo invers', 'pause' )
[1216]	555
	556	ENDDO
	557	!
	558	!-- Restore original indices/counters
	559	ny = isave(1)
	560	nys_z = isave(2)
	561	nyn_z = isave(3)
	562	sendrecvcount_yz = isave(4)
	563
	564	CALL resort_for_zy( f_inv, f_out_y )
	565
	566	DEALLOCATE( ar1, f_in, f_inv )
	567
	568	!
	569	!-- fft along y backward + subsequent transposition y --> x
	570	ALLOCATE( f_inv(nys_x:nyn_x,nzb_x:nzt_x,0:nx) )
	571	!
	572	!-- Save original indices and gridpoint counter
	573	isave(1) = nx
	574	isave(2) = nxl_y
	575	isave(3) = nxr_y
	576	isave(4) = sendrecvcount_xy
	577	!
	578	!-- Set new indices for transformation
	579	nblk = (( nx+1 ) / pdims(2) ) - 1
	580	nx = pdims(2)
	581	nnx_y = 1
	582	nxl_y = myidy * nnx_y
	583	nxr_y = ( myidy + 1 ) * nnx_y - 1
	584	sendrecvcount_xy = nnx_y * ( nyn_x-nys_x+1 ) * ( nzt_x-nzb_x+1 )
	585
[1306]	586	ALLOCATE( ar1(0:ny,nxl_y:nxr_y,nzb_y:nzt_y) )
[1216]	587	ALLOCATE( f_in(nys_x:nyn_x,nzb_x:nzt_x,0:nx) )
	588
[1306]	589	DO ii = 0, nblk
[1216]	590
[1318]	591	CALL cpu_log( log_point_s(7), 'fft_y', 'continue', cpu_log_nowait )
[1216]	592
[1306]	593	n = isave(2) + ii
	594	nxl_y_bound = isave(2)
	595	nxr_y_bound = isave(3)
[1216]	596
[1306]	597	CALL fft_y( ar1(:,:,:), 'backward', ar_tr = f_out_y, &
	598	nxl_y_bound = nxl_y_bound, nxr_y_bound = nxr_y_bound, &
	599	nxl_y_l = n, nxr_y_l = n )
[1216]	600
[1306]	601	IF ( ii == nblk ) THEN
	602	CALL cpu_log( log_point_s(7), 'fft_y', 'stop' )
	603	ELSE
	604	CALL cpu_log( log_point_s(7), 'fft_y', 'pause' )
[1216]	605	ENDIF
	606
[1318]	607	CALL cpu_log( log_point_s(8), 'transpo invers', 'continue', cpu_log_nowait )
[1216]	608
[1306]	609	CALL transpose_yx( ar1(:,:,:), f_in )
[1216]	610
[1306]	611	DO inew = 0, nx-1
	612	iind = ii + (nblk+1) * inew
	613	f_inv(:,:,iind) = f_in(:,:,inew)
	614	ENDDO
[1216]	615
[1306]	616	CALL cpu_log( log_point_s(8), 'transpo invers', 'pause' )
[1216]	617
	618	ENDDO
	619	!
	620	!-- Restore original indices/counters
	621	nx = isave(1)
	622	nxl_y = isave(2)
	623	nxr_y = isave(3)
	624	sendrecvcount_xy = isave(4)
	625
	626	CALL resort_for_yx( f_inv, f_out_z )
	627
	628	DEALLOCATE( ar1, f_in, f_inv )
	629
	630	!
	631	!-- fft along x backward + subsequent final transposition x --> z
	632	ALLOCATE( f_inv(nys:nyn,nxl:nxr,1:nz) )
	633	!
	634	!-- Save original indices and gridpoint counter
	635	isave(1) = nz
	636	isave(2) = nzb_x
	637	isave(3) = nzt_x
	638	isave(4) = sendrecvcount_zx
	639	!
	640	!-- Set new indices for transformation
	641	nblk = nz / pdims(1)
	642	nz = pdims(1)
	643	nnz_x = 1
	644	nzb_x = 1 + myidx * nnz_x
	645	nzt_x = ( myidx + 1 ) * nnz_x
	646	sendrecvcount_zx = nnx * nny * nnz_x
	647
[1306]	648	ALLOCATE( ar1(0:nx,nys_x:nyn_x,nzb_x:nzt_x) )
[1216]	649	ALLOCATE( f_in(nys:nyn,nxl:nxr,1:nz) )
	650
[1306]	651	DO kk = 1, nblk
[1216]	652
[1318]	653	CALL cpu_log( log_point_s(4), 'fft_x', 'continue', cpu_log_nowait )
[1216]	654
[1306]	655	n = isave(2) + kk - 1
	656	CALL fft_x( ar1(:,:,:), 'backward', f_out_z(:,:,n))
[1216]	657
[1306]	658	IF ( kk == nblk ) THEN
	659	CALL cpu_log( log_point_s(4), 'fft_x', 'stop' )
	660	ELSE
	661	CALL cpu_log( log_point_s(4), 'fft_x', 'pause' )
[1216]	662	ENDIF
	663
[1318]	664	CALL cpu_log( log_point_s(8), 'transpo invers', 'continue', cpu_log_nowait )
[1216]	665
[1306]	666	CALL transpose_xz( ar1(:,:,:), f_in )
[1216]	667
[1306]	668	DO knew = 1, nz
	669	ki = kk + nblk * (knew-1)
	670	f_inv(:,:,ki) = f_in(:,:,knew)
	671	ENDDO
[1216]	672
[1306]	673	IF ( kk == nblk ) THEN
	674	CALL cpu_log( log_point_s(8), 'transpo invers', 'stop' )
	675	ELSE
	676	CALL cpu_log( log_point_s(8), 'transpo invers', 'pause' )
[1216]	677	ENDIF
	678
	679	ENDDO
	680	!
	681	!-- Restore original indices/counters
	682	nz = isave(1)
	683	nzb_x = isave(2)
	684	nzt_x = isave(3)
	685	sendrecvcount_zx = isave(4)
	686
	687	CALL resort_for_xz( f_inv, ar )
	688
	689	DEALLOCATE( ar1, f_in, f_inv )
	690
[1]	691	ENDIF
	692
	693	CALL cpu_log( log_point_s(3), 'poisfft', 'stop' )
	694
	695	END SUBROUTINE poisfft
	696
	697
	698	!------------------------------------------------------------------------------!
[1682]	699	! Description:
	700	! ------------
	701	!> Fourier-transformation along y with subsequent transposition y --> x for
	702	!> a 1d-decomposition along x.
	703	!>
	704	!> @attention The performance of this routine is much faster on the NEC-SX6,
	705	!> if the first index of work_ffty_vec is odd. Otherwise
	706	!> memory bank conflicts may occur (especially if the index is a
	707	!> multiple of 128). That's why work_ffty_vec is dimensioned as
	708	!> 0:ny+1.
	709	!> Of course, this will not work if users are using an odd number
	710	!> of gridpoints along y.
[1]	711	!------------------------------------------------------------------------------!
[1682]	712	SUBROUTINE ffty_tr_yx( f_in, f_out )
[1]	713
[1682]	714
[1320]	715	USE control_parameters, &
	716	ONLY: host
	717
	718	USE cpulog, &
	719	ONLY: cpu_log, log_point_s
	720
	721	USE kinds
	722
[1]	723	USE pegrid
	724
	725	IMPLICIT NONE
	726
[1682]	727	INTEGER(iwp) :: i !<
	728	INTEGER(iwp) :: iend !<
	729	INTEGER(iwp) :: iouter !<
	730	INTEGER(iwp) :: ir !<
	731	INTEGER(iwp) :: j !<
	732	INTEGER(iwp) :: k !<
[1]	733
[1682]	734	INTEGER(iwp), PARAMETER :: stridex = 4 !<
[1320]	735
[1682]	736	REAL(wp), DIMENSION(0:ny,stridex) :: work_ffty !<
[1]	737	#if defined( __nec )
[1682]	738	REAL(wp), DIMENSION(0:ny+1,1:nz,nxl:nxr) :: work_ffty_vec !<
[1]	739	#endif
[1682]	740	REAL(wp), DIMENSION(1:nz,0:ny,nxl:nxr) :: f_in !<
	741	REAL(wp), DIMENSION(nnx,1:nz,nys_x:nyn_x,pdims(1)) :: f_out !<
	742	REAL(wp), DIMENSION(nxl:nxr,1:nz,0:ny) :: work !<
[1]	743
	744	!
	745	!-- Carry out the FFT along y, where all data are present due to the
	746	!-- 1d-decomposition along x. Resort the data in a way that x becomes
	747	!-- the first index.
[1106]	748	CALL cpu_log( log_point_s(7), 'fft_y_1d', 'start' )
[1]	749
	750	IF ( host(1:3) == 'nec' ) THEN
	751	#if defined( __nec )
	752	!
	753	!-- Code optimized for vector processors
[85]	754	!$OMP PARALLEL PRIVATE ( i, j, k )
[1]	755	!$OMP DO
	756	DO i = nxl, nxr
	757
	758	DO j = 0, ny
	759	DO k = 1, nz
	760	work_ffty_vec(j,k,i) = f_in(k,j,i)
	761	ENDDO
	762	ENDDO
	763
	764	CALL fft_y_m( work_ffty_vec(:,:,i), ny+1, 'forward' )
	765
	766	ENDDO
	767
	768	!$OMP DO
	769	DO k = 1, nz
	770	DO j = 0, ny
	771	DO i = nxl, nxr
	772	work(i,k,j) = work_ffty_vec(j,k,i)
	773	ENDDO
	774	ENDDO
	775	ENDDO
	776	!$OMP END PARALLEL
	777	#endif
	778
	779	ELSE
	780
	781	!
	782	!-- Cache optimized code.
	783	!-- The i-(x-)direction is split into a strided outer loop and an inner
	784	!-- loop for better cache performance
	785	!$OMP PARALLEL PRIVATE (i,iend,iouter,ir,j,k,work_ffty)
	786	!$OMP DO
	787	DO iouter = nxl, nxr, stridex
	788
	789	iend = MIN( iouter+stridex-1, nxr ) ! Upper bound for inner i loop
	790
	791	DO k = 1, nz
	792
	793	DO i = iouter, iend
	794
	795	ir = i-iouter+1 ! counter within a stride
	796	DO j = 0, ny
	797	work_ffty(j,ir) = f_in(k,j,i)
	798	ENDDO
	799	!
	800	!-- FFT along y
[1106]	801	CALL fft_y_1d( work_ffty(:,ir), 'forward' )
[1]	802
	803	ENDDO
	804
	805	!
	806	!-- Resort
	807	DO j = 0, ny
	808	DO i = iouter, iend
	809	work(i,k,j) = work_ffty(j,i-iouter+1)
	810	ENDDO
	811	ENDDO
	812
	813	ENDDO
	814
	815	ENDDO
	816	!$OMP END PARALLEL
	817
	818	ENDIF
[1106]	819	CALL cpu_log( log_point_s(7), 'fft_y_1d', 'pause' )
[1]	820
	821	!
	822	!-- Transpose array
[1111]	823	#if defined( __parallel )
[1]	824	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
[622]	825	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
[1]	826	CALL MPI_ALLTOALL( work(nxl,1,0), sendrecvcount_xy, MPI_REAL, &
	827	f_out(1,1,nys_x,1), sendrecvcount_xy, MPI_REAL, &
	828	comm1dx, ierr )
	829	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
[1111]	830	#endif
[1]	831
	832	END SUBROUTINE ffty_tr_yx
	833
	834
	835	!------------------------------------------------------------------------------!
[1682]	836	! Description:
	837	! ------------
	838	!> Transposition x --> y with a subsequent backward Fourier transformation for
	839	!> a 1d-decomposition along x
[1]	840	!------------------------------------------------------------------------------!
[1682]	841	SUBROUTINE tr_xy_ffty( f_in, f_out )
[1]	842
[1682]	843
[1320]	844	USE control_parameters, &
	845	ONLY: host
	846
	847	USE cpulog, &
	848	ONLY: cpu_log, log_point_s
	849
	850	USE kinds
	851
[1]	852	USE pegrid
	853
	854	IMPLICIT NONE
	855
[1682]	856	INTEGER(iwp) :: i !<
	857	INTEGER(iwp) :: iend !<
	858	INTEGER(iwp) :: iouter !<
	859	INTEGER(iwp) :: ir !<
	860	INTEGER(iwp) :: j !<
	861	INTEGER(iwp) :: k !<
[1]	862
[1682]	863	INTEGER(iwp), PARAMETER :: stridex = 4 !<
[1320]	864
[1682]	865	REAL(wp), DIMENSION(0:ny,stridex) :: work_ffty !<
[1]	866	#if defined( __nec )
[1682]	867	REAL(wp), DIMENSION(0:ny+1,1:nz,nxl:nxr) :: work_ffty_vec !<
[1]	868	#endif
[1682]	869	REAL(wp), DIMENSION(nnx,1:nz,nys_x:nyn_x,pdims(1)) :: f_in !<
	870	REAL(wp), DIMENSION(1:nz,0:ny,nxl:nxr) :: f_out !<
	871	REAL(wp), DIMENSION(nxl:nxr,1:nz,0:ny) :: work !<
[1]	872
	873	!
	874	!-- Transpose array
[1111]	875	#if defined( __parallel )
[1]	876	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
[622]	877	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
[1]	878	CALL MPI_ALLTOALL( f_in(1,1,nys_x,1), sendrecvcount_xy, MPI_REAL, &
	879	work(nxl,1,0), sendrecvcount_xy, MPI_REAL, &
	880	comm1dx, ierr )
	881	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
[1111]	882	#endif
[1]	883
	884	!
	885	!-- Resort the data in a way that y becomes the first index and carry out the
	886	!-- backward fft along y.
[1106]	887	CALL cpu_log( log_point_s(7), 'fft_y_1d', 'continue' )
[1]	888
	889	IF ( host(1:3) == 'nec' ) THEN
	890	#if defined( __nec )
	891	!
	892	!-- Code optimized for vector processors
[85]	893	!$OMP PARALLEL PRIVATE ( i, j, k )
[1]	894	!$OMP DO
	895	DO k = 1, nz
	896	DO j = 0, ny
	897	DO i = nxl, nxr
	898	work_ffty_vec(j,k,i) = work(i,k,j)
	899	ENDDO
	900	ENDDO
	901	ENDDO
	902
	903	!$OMP DO
	904	DO i = nxl, nxr
	905
	906	CALL fft_y_m( work_ffty_vec(:,:,i), ny+1, 'backward' )
	907
	908	DO j = 0, ny
	909	DO k = 1, nz
	910	f_out(k,j,i) = work_ffty_vec(j,k,i)
	911	ENDDO
	912	ENDDO
	913
	914	ENDDO
	915	!$OMP END PARALLEL
	916	#endif
	917
	918	ELSE
	919
	920	!
	921	!-- Cache optimized code.
	922	!-- The i-(x-)direction is split into a strided outer loop and an inner
	923	!-- loop for better cache performance
	924	!$OMP PARALLEL PRIVATE ( i, iend, iouter, ir, j, k, work_ffty )
	925	!$OMP DO
	926	DO iouter = nxl, nxr, stridex
	927
	928	iend = MIN( iouter+stridex-1, nxr ) ! Upper bound for inner i loop
	929
	930	DO k = 1, nz
	931	!
	932	!-- Resort
	933	DO j = 0, ny
	934	DO i = iouter, iend
	935	work_ffty(j,i-iouter+1) = work(i,k,j)
	936	ENDDO
	937	ENDDO
	938
	939	DO i = iouter, iend
	940
	941	!
	942	!-- FFT along y
	943	ir = i-iouter+1 ! counter within a stride
[1106]	944	CALL fft_y_1d( work_ffty(:,ir), 'backward' )
[1]	945
	946	DO j = 0, ny
	947	f_out(k,j,i) = work_ffty(j,ir)
	948	ENDDO
	949	ENDDO
	950
	951	ENDDO
	952
	953	ENDDO
	954	!$OMP END PARALLEL
	955
	956	ENDIF
	957
[1106]	958	CALL cpu_log( log_point_s(7), 'fft_y_1d', 'stop' )
[1]	959
	960	END SUBROUTINE tr_xy_ffty
	961
	962
	963	!------------------------------------------------------------------------------!
[1682]	964	! Description:
	965	! ------------
	966	!> FFT along x, solution of the tridiagonal system and backward FFT for
	967	!> a 1d-decomposition along x
	968	!>
	969	!> @warning this subroutine may still not work for hybrid parallelization
	970	!> with OpenMP (for possible necessary changes see the original
	971	!> routine poisfft_hybrid, developed by Klaus Ketelsen, May 2002)
[1]	972	!------------------------------------------------------------------------------!
[1682]	973	SUBROUTINE fftx_tri_fftx( ar )
[1]	974
[1682]	975
[1320]	976	USE control_parameters, &
	977	ONLY: host
	978
	979	USE cpulog, &
	980	ONLY: cpu_log, log_point_s
	981
	982	USE grid_variables, &
	983	ONLY: ddx2, ddy2
	984
	985	USE kinds
	986
[1]	987	USE pegrid
	988
	989	IMPLICIT NONE
	990
[1682]	991	INTEGER(iwp) :: i !<
	992	INTEGER(iwp) :: j !<
	993	INTEGER(iwp) :: k !<
	994	INTEGER(iwp) :: m !<
	995	INTEGER(iwp) :: n !<
	996	INTEGER(iwp) :: omp_get_thread_num !<
	997	INTEGER(iwp) :: tn !<
[1]	998
[1682]	999	REAL(wp), DIMENSION(0:nx) :: work_fftx !<
	1000	REAL(wp), DIMENSION(0:nx,1:nz) :: work_trix !<
	1001	REAL(wp), DIMENSION(nnx,1:nz,nys_x:nyn_x,pdims(1)) :: ar !<
	1002	REAL(wp), DIMENSION(:,:,:,:), ALLOCATABLE :: tri !<
[1]	1003
	1004
[1106]	1005	CALL cpu_log( log_point_s(33), 'fft_x_1d + tridia', 'start' )
[1]	1006
	1007	ALLOCATE( tri(5,0:nx,0:nz-1,0:threads_per_task-1) )
	1008
	1009	tn = 0 ! Default thread number in case of one thread
	1010	!$OMP PARALLEL DO PRIVATE ( i, j, k, m, n, tn, work_fftx, work_trix )
	1011	DO j = nys_x, nyn_x
	1012
	1013	!$ tn = omp_get_thread_num()
	1014
	1015	IF ( host(1:3) == 'nec' ) THEN
	1016	!
	1017	!-- Code optimized for vector processors
	1018	DO k = 1, nz
	1019
	1020	m = 0
	1021	DO n = 1, pdims(1)
[1003]	1022	DO i = 1, nnx
[1]	1023	work_trix(m,k) = ar(i,k,j,n)
	1024	m = m + 1
	1025	ENDDO
	1026	ENDDO
	1027
	1028	ENDDO
	1029
	1030	CALL fft_x_m( work_trix, 'forward' )
	1031
	1032	ELSE
	1033	!
	1034	!-- Cache optimized code
	1035	DO k = 1, nz
	1036
	1037	m = 0
	1038	DO n = 1, pdims(1)
[1003]	1039	DO i = 1, nnx
[1]	1040	work_fftx(m) = ar(i,k,j,n)
	1041	m = m + 1
	1042	ENDDO
	1043	ENDDO
	1044
[1106]	1045	CALL fft_x_1d( work_fftx, 'forward' )
[1]	1046
	1047	DO i = 0, nx
	1048	work_trix(i,k) = work_fftx(i)
	1049	ENDDO
	1050
	1051	ENDDO
	1052
	1053	ENDIF
	1054
	1055	!
	1056	!-- Solve the linear equation system
	1057	CALL tridia_1dd( ddx2, ddy2, nx, ny, j, work_trix, tri(:,:,:,tn) )
	1058
	1059	IF ( host(1:3) == 'nec' ) THEN
	1060	!
	1061	!-- Code optimized for vector processors
	1062	CALL fft_x_m( work_trix, 'backward' )
	1063
	1064	DO k = 1, nz
	1065
	1066	m = 0
	1067	DO n = 1, pdims(1)
[1003]	1068	DO i = 1, nnx
[1]	1069	ar(i,k,j,n) = work_trix(m,k)
	1070	m = m + 1
	1071	ENDDO
	1072	ENDDO
	1073
	1074	ENDDO
	1075
	1076	ELSE
	1077	!
	1078	!-- Cache optimized code
	1079	DO k = 1, nz
	1080
	1081	DO i = 0, nx
	1082	work_fftx(i) = work_trix(i,k)
	1083	ENDDO
	1084
[1106]	1085	CALL fft_x_1d( work_fftx, 'backward' )
[1]	1086
	1087	m = 0
	1088	DO n = 1, pdims(1)
[1003]	1089	DO i = 1, nnx
[1]	1090	ar(i,k,j,n) = work_fftx(m)
	1091	m = m + 1
	1092	ENDDO
	1093	ENDDO
	1094
	1095	ENDDO
	1096
	1097	ENDIF
	1098
	1099	ENDDO
	1100
	1101	DEALLOCATE( tri )
	1102
[1106]	1103	CALL cpu_log( log_point_s(33), 'fft_x_1d + tridia', 'stop' )
[1]	1104
	1105	END SUBROUTINE fftx_tri_fftx
	1106
	1107
	1108	!------------------------------------------------------------------------------!
[1682]	1109	! Description:
	1110	! ------------
	1111	!> Fourier-transformation along x with subsequent transposition x --> y for
	1112	!> a 1d-decomposition along y.
	1113	!>
	1114	!> @attention NEC-branch of this routine may significantly profit from
	1115	!> further optimizations. So far, performance is much worse than
	1116	!> for routine ffty_tr_yx (more than three times slower).
[1]	1117	!------------------------------------------------------------------------------!
[1682]	1118	SUBROUTINE fftx_tr_xy( f_in, f_out )
[1]	1119
[1682]	1120
[1320]	1121	USE control_parameters, &
	1122	ONLY: host
	1123
	1124	USE cpulog, &
	1125	ONLY: cpu_log, log_point_s
	1126
	1127	USE kinds
	1128
[1]	1129	USE pegrid
	1130
	1131	IMPLICIT NONE
	1132
[1682]	1133	INTEGER(iwp) :: i !<
	1134	INTEGER(iwp) :: j !<
	1135	INTEGER(iwp) :: k !<
[1]	1136
[1682]	1137	REAL(wp), DIMENSION(0:nx,1:nz,nys:nyn) :: work_fftx !<
	1138	REAL(wp), DIMENSION(1:nz,nys:nyn,0:nx) :: f_in !<
	1139	REAL(wp), DIMENSION(nny,1:nz,nxl_y:nxr_y,pdims(2)) :: f_out !<
	1140	REAL(wp), DIMENSION(nys:nyn,1:nz,0:nx) :: work !<
[1]	1141
	1142	!
	1143	!-- Carry out the FFT along x, where all data are present due to the
	1144	!-- 1d-decomposition along y. Resort the data in a way that y becomes
	1145	!-- the first index.
[1106]	1146	CALL cpu_log( log_point_s(4), 'fft_x_1d', 'start' )
[1]	1147
	1148	IF ( host(1:3) == 'nec' ) THEN
	1149	!
	1150	!-- Code for vector processors
[85]	1151	!$OMP PARALLEL PRIVATE ( i, j, k )
[1]	1152	!$OMP DO
	1153	DO i = 0, nx
	1154
	1155	DO j = nys, nyn
	1156	DO k = 1, nz
	1157	work_fftx(i,k,j) = f_in(k,j,i)
	1158	ENDDO
	1159	ENDDO
	1160
	1161	ENDDO
	1162
	1163	!$OMP DO
	1164	DO j = nys, nyn
	1165
	1166	CALL fft_x_m( work_fftx(:,:,j), 'forward' )
	1167
	1168	DO k = 1, nz
	1169	DO i = 0, nx
	1170	work(j,k,i) = work_fftx(i,k,j)
	1171	ENDDO
	1172	ENDDO
	1173
	1174	ENDDO
	1175	!$OMP END PARALLEL
	1176
	1177	ELSE
	1178
	1179	!
	1180	!-- Cache optimized code (there might be still a potential for better
	1181	!-- optimization).
[696]	1182	!$OMP PARALLEL PRIVATE (i,j,k)
[1]	1183	!$OMP DO
	1184	DO i = 0, nx
	1185
	1186	DO j = nys, nyn
	1187	DO k = 1, nz
	1188	work_fftx(i,k,j) = f_in(k,j,i)
	1189	ENDDO
	1190	ENDDO
	1191
	1192	ENDDO
	1193
	1194	!$OMP DO
	1195	DO j = nys, nyn
	1196	DO k = 1, nz
	1197
[1106]	1198	CALL fft_x_1d( work_fftx(0:nx,k,j), 'forward' )
[1]	1199
	1200	DO i = 0, nx
	1201	work(j,k,i) = work_fftx(i,k,j)
	1202	ENDDO
	1203	ENDDO
	1204
	1205	ENDDO
	1206	!$OMP END PARALLEL
	1207
	1208	ENDIF
[1106]	1209	CALL cpu_log( log_point_s(4), 'fft_x_1d', 'pause' )
[1]	1210
	1211	!
	1212	!-- Transpose array
[1111]	1213	#if defined( __parallel )
[1]	1214	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
[622]	1215	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
[1]	1216	CALL MPI_ALLTOALL( work(nys,1,0), sendrecvcount_xy, MPI_REAL, &
	1217	f_out(1,1,nxl_y,1), sendrecvcount_xy, MPI_REAL, &
	1218	comm1dy, ierr )
	1219	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
[1111]	1220	#endif
[1]	1221
	1222	END SUBROUTINE fftx_tr_xy
	1223
	1224
	1225	!------------------------------------------------------------------------------!
[1682]	1226	! Description:
	1227	! ------------
	1228	!> Transposition y --> x with a subsequent backward Fourier transformation for
	1229	!> a 1d-decomposition along x.
[1]	1230	!------------------------------------------------------------------------------!
[1682]	1231	SUBROUTINE tr_yx_fftx( f_in, f_out )
[1]	1232
[1682]	1233
[1320]	1234	USE control_parameters, &
	1235	ONLY: host
	1236
	1237	USE cpulog, &
	1238	ONLY: cpu_log, log_point_s
	1239
	1240	USE kinds
	1241
[1]	1242	USE pegrid
	1243
	1244	IMPLICIT NONE
	1245
[1682]	1246	INTEGER(iwp) :: i !<
	1247	INTEGER(iwp) :: j !<
	1248	INTEGER(iwp) :: k !<
[1]	1249
[1682]	1250	REAL(wp), DIMENSION(0:nx,1:nz,nys:nyn) :: work_fftx !<
	1251	REAL(wp), DIMENSION(nny,1:nz,nxl_y:nxr_y,pdims(2)) :: f_in !<
	1252	REAL(wp), DIMENSION(1:nz,nys:nyn,0:nx) :: f_out !<
	1253	REAL(wp), DIMENSION(nys:nyn,1:nz,0:nx) :: work !<
[1]	1254
	1255	!
	1256	!-- Transpose array
[1111]	1257	#if defined( __parallel )
[1]	1258	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
[622]	1259	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
[1]	1260	CALL MPI_ALLTOALL( f_in(1,1,nxl_y,1), sendrecvcount_xy, MPI_REAL, &
	1261	work(nys,1,0), sendrecvcount_xy, MPI_REAL, &
	1262	comm1dy, ierr )
	1263	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
[1111]	1264	#endif
[1]	1265
	1266	!
	1267	!-- Carry out the FFT along x, where all data are present due to the
	1268	!-- 1d-decomposition along y. Resort the data in a way that y becomes
	1269	!-- the first index.
[1106]	1270	CALL cpu_log( log_point_s(4), 'fft_x_1d', 'continue' )
[1]	1271
	1272	IF ( host(1:3) == 'nec' ) THEN
	1273	!
	1274	!-- Code optimized for vector processors
[85]	1275	!$OMP PARALLEL PRIVATE ( i, j, k )
[1]	1276	!$OMP DO
	1277	DO j = nys, nyn
	1278
	1279	DO k = 1, nz
	1280	DO i = 0, nx
	1281	work_fftx(i,k,j) = work(j,k,i)
	1282	ENDDO
	1283	ENDDO
	1284
	1285	CALL fft_x_m( work_fftx(:,:,j), 'backward' )
	1286
	1287	ENDDO
	1288
	1289	!$OMP DO
	1290	DO i = 0, nx
	1291	DO j = nys, nyn
	1292	DO k = 1, nz
	1293	f_out(k,j,i) = work_fftx(i,k,j)
	1294	ENDDO
	1295	ENDDO
	1296	ENDDO
	1297	!$OMP END PARALLEL
	1298
	1299	ELSE
	1300
	1301	!
	1302	!-- Cache optimized code (there might be still a potential for better
	1303	!-- optimization).
[696]	1304	!$OMP PARALLEL PRIVATE (i,j,k)
[1]	1305	!$OMP DO
	1306	DO j = nys, nyn
	1307	DO k = 1, nz
	1308
	1309	DO i = 0, nx
	1310	work_fftx(i,k,j) = work(j,k,i)
	1311	ENDDO
	1312
[1106]	1313	CALL fft_x_1d( work_fftx(0:nx,k,j), 'backward' )
[1]	1314
	1315	ENDDO
	1316	ENDDO
	1317
	1318	!$OMP DO
	1319	DO i = 0, nx
	1320	DO j = nys, nyn
	1321	DO k = 1, nz
	1322	f_out(k,j,i) = work_fftx(i,k,j)
	1323	ENDDO
	1324	ENDDO
	1325	ENDDO
	1326	!$OMP END PARALLEL
	1327
	1328	ENDIF
[1106]	1329	CALL cpu_log( log_point_s(4), 'fft_x_1d', 'stop' )
[1]	1330
	1331	END SUBROUTINE tr_yx_fftx
	1332
	1333
	1334	!------------------------------------------------------------------------------!
[1682]	1335	! Description:
	1336	! ------------
	1337	!> FFT along y, solution of the tridiagonal system and backward FFT for
	1338	!> a 1d-decomposition along y.
	1339	!>
	1340	!> @warning this subroutine may still not work for hybrid parallelization
	1341	!> with OpenMP (for possible necessary changes see the original
	1342	!> routine poisfft_hybrid, developed by Klaus Ketelsen, May 2002)
[1]	1343	!------------------------------------------------------------------------------!
[1682]	1344	SUBROUTINE ffty_tri_ffty( ar )
[1]	1345
[1682]	1346
[1320]	1347	USE control_parameters, &
	1348	ONLY: host
	1349
	1350	USE cpulog, &
	1351	ONLY: cpu_log, log_point_s
	1352
	1353	USE grid_variables, &
	1354	ONLY: ddx2, ddy2
	1355
	1356	USE kinds
	1357
[1]	1358	USE pegrid
	1359
	1360	IMPLICIT NONE
	1361
[1682]	1362	INTEGER(iwp) :: i !<
	1363	INTEGER(iwp) :: j !<
	1364	INTEGER(iwp) :: k !<
	1365	INTEGER(iwp) :: m !<
	1366	INTEGER(iwp) :: n !<
	1367	INTEGER(iwp) :: omp_get_thread_num !<
	1368	INTEGER(iwp) :: tn !<
[1]	1369
[1682]	1370	REAL(wp), DIMENSION(0:ny) :: work_ffty !<
	1371	REAL(wp), DIMENSION(0:ny,1:nz) :: work_triy !<
	1372	REAL(wp), DIMENSION(nny,1:nz,nxl_y:nxr_y,pdims(2)) :: ar !<
	1373	REAL(wp), DIMENSION(:,:,:,:), ALLOCATABLE :: tri !<
[1]	1374
	1375
[1106]	1376	CALL cpu_log( log_point_s(39), 'fft_y_1d + tridia', 'start' )
[1]	1377
	1378	ALLOCATE( tri(5,0:ny,0:nz-1,0:threads_per_task-1) )
	1379
	1380	tn = 0 ! Default thread number in case of one thread
[696]	1381	!$OMP PARALLEL DO PRIVATE ( i, j, k, m, n, tn, work_ffty, work_triy )
[1]	1382	DO i = nxl_y, nxr_y
	1383
	1384	!$ tn = omp_get_thread_num()
	1385
	1386	IF ( host(1:3) == 'nec' ) THEN
	1387	!
	1388	!-- Code optimized for vector processors
	1389	DO k = 1, nz
	1390
	1391	m = 0
	1392	DO n = 1, pdims(2)
[1003]	1393	DO j = 1, nny
[1]	1394	work_triy(m,k) = ar(j,k,i,n)
	1395	m = m + 1
	1396	ENDDO
	1397	ENDDO
	1398
	1399	ENDDO
	1400
	1401	CALL fft_y_m( work_triy, ny, 'forward' )
	1402
	1403	ELSE
	1404	!
	1405	!-- Cache optimized code
	1406	DO k = 1, nz
	1407
	1408	m = 0
	1409	DO n = 1, pdims(2)
[1003]	1410	DO j = 1, nny
[1]	1411	work_ffty(m) = ar(j,k,i,n)
	1412	m = m + 1
	1413	ENDDO
	1414	ENDDO
	1415
[1106]	1416	CALL fft_y_1d( work_ffty, 'forward' )
[1]	1417
	1418	DO j = 0, ny
	1419	work_triy(j,k) = work_ffty(j)
	1420	ENDDO
	1421
	1422	ENDDO
	1423
	1424	ENDIF
	1425
	1426	!
	1427	!-- Solve the linear equation system
	1428	CALL tridia_1dd( ddy2, ddx2, ny, nx, i, work_triy, tri(:,:,:,tn) )
	1429
	1430	IF ( host(1:3) == 'nec' ) THEN
	1431	!
	1432	!-- Code optimized for vector processors
	1433	CALL fft_y_m( work_triy, ny, 'backward' )
	1434
	1435	DO k = 1, nz
	1436
	1437	m = 0
	1438	DO n = 1, pdims(2)
[1003]	1439	DO j = 1, nny
[1]	1440	ar(j,k,i,n) = work_triy(m,k)
	1441	m = m + 1
	1442	ENDDO
	1443	ENDDO
	1444
	1445	ENDDO
	1446
	1447	ELSE
	1448	!
	1449	!-- Cache optimized code
	1450	DO k = 1, nz
	1451
	1452	DO j = 0, ny
	1453	work_ffty(j) = work_triy(j,k)
	1454	ENDDO
	1455
[1106]	1456	CALL fft_y_1d( work_ffty, 'backward' )
[1]	1457
	1458	m = 0
	1459	DO n = 1, pdims(2)
[1003]	1460	DO j = 1, nny
[1]	1461	ar(j,k,i,n) = work_ffty(m)
	1462	m = m + 1
	1463	ENDDO
	1464	ENDDO
	1465
	1466	ENDDO
	1467
	1468	ENDIF
	1469
	1470	ENDDO
	1471
	1472	DEALLOCATE( tri )
	1473
[1106]	1474	CALL cpu_log( log_point_s(39), 'fft_y_1d + tridia', 'stop' )
[1]	1475
	1476	END SUBROUTINE ffty_tri_ffty
	1477
	1478	END MODULE poisfft_mod

Note: See TracBrowser for help on using the repository browser.

Download in other formats:

| Impressum | ©Leibniz Universität Hannover |