Home

Context Navigation

source: palm/trunk/SOURCE/poisfft_mod.f90 @ 1850

Last change on this file since 1850 was 1850, checked in by maronga, 8 years ago
added _mod string to several filenames to meet the naming convection for modules
Property svn:keywords set to `Id`
File size: 45.6 KB

Rev	Line
[1850]	1	!> @file poisfft_mod.f90
[1036]	2	!--------------------------------------------------------------------------------!
	3	! This file is part of PALM.
	4	!
	5	! PALM is free software: you can redistribute it and/or modify it under the terms
	6	! of the GNU General Public License as published by the Free Software Foundation,
	7	! either version 3 of the License, or (at your option) any later version.
	8	!
	9	! PALM is distributed in the hope that it will be useful, but WITHOUT ANY
	10	! WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
	11	! A PARTICULAR PURPOSE. See the GNU General Public License for more details.
	12	!
	13	! You should have received a copy of the GNU General Public License along with
	14	! PALM. If not, see <http://www.gnu.org/licenses/>.
	15	!
[1310]	16	! Copyright 1997-2014 Leibniz Universitaet Hannover
[1036]	17	!--------------------------------------------------------------------------------!
	18	!
[484]	19	! Current revisions:
[1]	20	! -----------------
[1850]	21	! Module renamed
[1683]	22	!
[1805]	23	!
[1321]	24	! Former revisions:
	25	! -----------------
	26	! $Id: poisfft_mod.f90 1850 2016-04-08 13:29:27Z maronga $
	27	!
[1805]	28	! 1804 2016-04-05 16:30:18Z maronga
	29	! Removed code for parameter file check (__check)
	30	!
[1683]	31	! 1682 2015-10-07 23:56:08Z knoop
	32	! Code annotations made doxygen readable
	33	!
[1483]	34	! 1482 2014-10-18 12:34:45Z raasch
	35	! use 2d-decomposition, if accelerator boards are used
	36	!
[1407]	37	! 1406 2014-05-16 13:47:01Z raasch
	38	! bugfix for pgi 14.4: declare create moved after array declaration
	39	!
[1321]	40	! 1320 2014-03-20 08:40:49Z raasch
[1320]	41	! ONLY-attribute added to USE-statements,
	42	! kind-parameters added to all INTEGER and REAL declaration statements,
	43	! kinds are defined in new module kinds,
	44	! old module precision_kind is removed,
	45	! revision history before 2012 removed,
	46	! comment fields (!:) to be used for variable explanations added to
	47	! all variable declaration statements
[1217]	48	!
[1319]	49	! 1318 2014-03-17 13:35:16Z raasch
	50	! module interfaces removed
	51	!
[1307]	52	! 1306 2014-03-13 14:30:59Z raasch
	53	! openmp sections removed from the overlap branch,
	54	! second argument removed from parameter list
	55	!
[1217]	56	! 1216 2013-08-26 09:31:42Z raasch
[1216]	57	! resorting of arrays moved to separate routines resort_for_...,
	58	! one argument, used as temporary work array, removed from all transpose
	59	! routines
	60	! overlapping fft / transposition implemented
[1112]	61	!
[1213]	62	! 1212 2013-08-15 08:46:27Z raasch
	63	! tridia routines moved to seperate module tridia_solver
	64	!
[1209]	65	! 1208 2013-08-13 06:41:49Z raasch
	66	! acc-update clauses added for "ar" so that ffts other than cufft can also be
	67	! used (although they are not ported and will give a poor performance)
	68	!
[1112]	69	! 1111 2013-03-08 23:54:10Z raasch
[1111]	70	! further openACC porting of non-parallel (MPI) branch:
	71	! tridiagonal routines split into extermal subroutines (instead using CONTAINS),
	72	! no distinction between parallel/non-parallel in poisfft and tridia any more,
[1112]	73	! tridia routines moved to end of file because of probable bug in PGI compiler 12.5
[1111]	74	! (otherwise "invalid device function" is indicated during runtime),
	75	! optimization of tridia routines: constant elements and coefficients of tri are
	76	! stored in seperate arrays ddzuw and tric, last dimension of tri reduced from 5
	77	! to 2,
	78	! poisfft_init is now called internally from poisfft, maketri is called from
	79	! poisfft_init,
	80	! ibc_p_b = 2 removed
[1]	81	!
[1107]	82	! 1106 2013-03-04 05:31:38Z raasch
	83	! routines fftx, ffty, fftxp, fftyp removed, calls replaced by fft_x, fft_y,
	84	! in the 1D-decomposition routines fft_x, ffty are replaced by fft_x_1d,
	85	! fft_y_1d
	86	!
[1104]	87	! 1103 2013-02-20 02:15:53Z raasch
	88	! tri, ar, and ar1 arguments in tridia-routines (2d) are removed because they
	89	! sometimes cause segmentation faults with intel 12.1 compiler
	90	!
[1093]	91	! 1092 2013-02-02 11:24:22Z raasch
	92	! unused variables removed
	93	!
[1037]	94	! 1036 2012-10-22 13:43:42Z raasch
	95	! code put under GPL (PALM 3.9)
	96	!
[1014]	97	! 2012-09-21 07:03:55Z raasch
	98	! FLOAT type conversion replaced by REAL
	99	!
[1004]	100	! 1003 2012-09-14 14:35:53Z raasch
	101	! indices nxa, nya, etc. replaced by nx, ny, etc.
	102	!
[941]	103	! 940 2012-07-09 14:31:00Z raasch
	104	! special handling of tri-array as an argument in tridia_1dd routines switched
	105	! off because it caused segmentation faults with intel 12.1 compiler
	106	!
[878]	107	! 877 2012-04-03 11:21:44Z suehring
	108	! Bugfix: Avoid divisions by zero in case of using a 'neumann' bc for the
	109	! pressure at the top of the model domain.
	110	!
[810]	111	! 809 2012-01-30 13:32:58Z maronga
	112	! Bugfix: replaced .AND. and .NOT. with && and ! in the preprocessor directives
	113	!
[808]	114	! 807 2012-01-25 11:53:51Z maronga
	115	! New cpp directive "__check" implemented which is used by check_namelist_files
	116	! (most of the code is unneeded by check_namelist_files).
	117	!
[1]	118	! Revision 1.1 1997/07/24 11:24:14 raasch
	119	! Initial revision
	120	!
	121	!
	122	! Description:
	123	! ------------
[1682]	124	!> Solves the Poisson equation with a 2D spectral method
	125	!> d^2 p / dx^2 + d^2 p / dy^2 + d^2 p / dz^2 = s
	126	!>
	127	!> Input:
	128	!> real ar contains (nnz,nny,nnx) elements of the velocity divergence,
	129	!> starting from (1,nys,nxl)
	130	!>
	131	!> Output:
	132	!> real ar contains the solution for perturbation pressure p
[1]	133	!------------------------------------------------------------------------------!
[1682]	134	MODULE poisfft_mod
	135
[1]	136
[1320]	137	USE fft_xy, &
	138	ONLY: fft_init, fft_y, fft_y_1d, fft_y_m, fft_x, fft_x_1d, fft_x_m
[1]	139
[1320]	140	USE indices, &
	141	ONLY: nnx, nny, nx, nxl, nxr, ny, nys, nyn, nz
	142
	143	USE transpose_indices, &
	144	ONLY: nxl_y, nxl_z, nxr_y, nxr_z, nys_x, nys_z, nyn_x, nyn_z, nzb_x, &
	145	nzb_y, nzt_x, nzt_y
	146
	147	USE tridia_solver, &
	148	ONLY: tridia_1dd, tridia_init, tridia_substi, tridia_substi_overlap
	149
[1]	150	IMPLICIT NONE
	151
[1111]	152	LOGICAL, SAVE :: poisfft_initialized = .FALSE.
	153
[1]	154	PRIVATE
[807]	155
[1]	156	PUBLIC poisfft, poisfft_init
	157
	158	INTERFACE poisfft
	159	MODULE PROCEDURE poisfft
	160	END INTERFACE poisfft
	161
	162	INTERFACE poisfft_init
	163	MODULE PROCEDURE poisfft_init
	164	END INTERFACE poisfft_init
	165
[807]	166
[1]	167	CONTAINS
	168
[1682]	169	!------------------------------------------------------------------------------!
	170	! Description:
	171	! ------------
	172	!> @todo Missing subroutine description.
	173	!------------------------------------------------------------------------------!
[1]	174	SUBROUTINE poisfft_init
	175
[1320]	176	USE arrays_3d, &
	177	ONLY: ddzu_pres, ddzw
[1111]	178
[1320]	179	USE kinds
	180
[1111]	181	IMPLICIT NONE
	182
[1682]	183	INTEGER(iwp) :: k !<
[1111]	184
	185
[1]	186	CALL fft_init
	187
[1212]	188	CALL tridia_init
[1111]	189
	190	poisfft_initialized = .TRUE.
	191
[1]	192	END SUBROUTINE poisfft_init
	193
[1111]	194
[1804]	195
[1682]	196	!------------------------------------------------------------------------------!
	197	! Description:
	198	! ------------
	199	!> Two-dimensional Fourier Transformation in x- and y-direction.
	200	!------------------------------------------------------------------------------!
[1306]	201	SUBROUTINE poisfft( ar )
[1]	202
[1320]	203	USE control_parameters, &
	204	ONLY: fft_method, transpose_compute_overlap
	205
	206	USE cpulog, &
	207	ONLY: cpu_log, cpu_log_nowait, log_point_s
	208
	209	USE kinds
	210
[1]	211	USE pegrid
	212
	213	IMPLICIT NONE
	214
[1682]	215	INTEGER(iwp) :: ii !<
	216	INTEGER(iwp) :: iind !<
	217	INTEGER(iwp) :: inew !<
	218	INTEGER(iwp) :: jj !<
	219	INTEGER(iwp) :: jind !<
	220	INTEGER(iwp) :: jnew !<
	221	INTEGER(iwp) :: ki !<
	222	INTEGER(iwp) :: kk !<
	223	INTEGER(iwp) :: knew !<
	224	INTEGER(iwp) :: n !<
	225	INTEGER(iwp) :: nblk !<
	226	INTEGER(iwp) :: nnx_y !<
	227	INTEGER(iwp) :: nny_z !<
	228	INTEGER(iwp) :: nnz_t !<
	229	INTEGER(iwp) :: nnz_x !<
	230	INTEGER(iwp) :: nxl_y_bound !<
	231	INTEGER(iwp) :: nxr_y_bound !<
[1]	232
[1682]	233	INTEGER(iwp), DIMENSION(4) :: isave !<
[1320]	234
[1682]	235	REAL(wp), DIMENSION(1:nz,nys:nyn,nxl:nxr) :: ar !<
	236	REAL(wp), DIMENSION(nys:nyn,nxl:nxr,1:nz) :: ar_inv !<
[1216]	237	!$acc declare create( ar_inv )
[1]	238
[1682]	239	REAL(wp), DIMENSION(:,:,:), ALLOCATABLE :: ar1 !<
	240	REAL(wp), DIMENSION(:,:,:), ALLOCATABLE :: f_in !<
	241	REAL(wp), DIMENSION(:,:,:), ALLOCATABLE :: f_inv !<
	242	REAL(wp), DIMENSION(:,:,:), ALLOCATABLE :: f_out_y !<
	243	REAL(wp), DIMENSION(:,:,:), ALLOCATABLE :: f_out_z !<
[1216]	244
	245
[1]	246	CALL cpu_log( log_point_s(3), 'poisfft', 'start' )
	247
[1111]	248	IF ( .NOT. poisfft_initialized ) CALL poisfft_init
	249
[1]	250	!
	251	!-- Two-dimensional Fourier Transformation in x- and y-direction.
[1482]	252	IF ( pdims(2) == 1 .AND. pdims(1) > 1 .AND. num_acc_per_node == 0 ) &
	253	THEN
[1]	254
	255	!
	256	!-- 1d-domain-decomposition along x:
	257	!-- FFT along y and transposition y --> x
[1216]	258	CALL ffty_tr_yx( ar, ar )
[1]	259
	260	!
	261	!-- FFT along x, solving the tridiagonal system and backward FFT
	262	CALL fftx_tri_fftx( ar )
	263
	264	!
	265	!-- Transposition x --> y and backward FFT along y
[1216]	266	CALL tr_xy_ffty( ar, ar )
[1]	267
[1482]	268	ELSEIF ( pdims(1) == 1 .AND. pdims(2) > 1 .AND. num_acc_per_node == 0 ) &
	269	THEN
[1]	270
	271	!
	272	!-- 1d-domain-decomposition along y:
	273	!-- FFT along x and transposition x --> y
[1216]	274	CALL fftx_tr_xy( ar, ar )
[1]	275
	276	!
	277	!-- FFT along y, solving the tridiagonal system and backward FFT
	278	CALL ffty_tri_ffty( ar )
	279
	280	!
	281	!-- Transposition y --> x and backward FFT along x
[1216]	282	CALL tr_yx_fftx( ar, ar )
[1]	283
[1216]	284	ELSEIF ( .NOT. transpose_compute_overlap ) THEN
[1]	285
	286	!
[1111]	287	!-- 2d-domain-decomposition or no decomposition (1 PE run)
[1]	288	!-- Transposition z --> x
	289	CALL cpu_log( log_point_s(5), 'transpo forward', 'start' )
[1216]	290	CALL resort_for_zx( ar, ar_inv )
	291	CALL transpose_zx( ar_inv, ar )
[1]	292	CALL cpu_log( log_point_s(5), 'transpo forward', 'pause' )
	293
	294	CALL cpu_log( log_point_s(4), 'fft_x', 'start' )
[1208]	295	IF ( fft_method /= 'system-specific' ) THEN
	296	!$acc update host( ar )
	297	ENDIF
[1106]	298	CALL fft_x( ar, 'forward' )
[1208]	299	IF ( fft_method /= 'system-specific' ) THEN
	300	!$acc update device( ar )
	301	ENDIF
[1]	302	CALL cpu_log( log_point_s(4), 'fft_x', 'pause' )
	303
	304	!
	305	!-- Transposition x --> y
	306	CALL cpu_log( log_point_s(5), 'transpo forward', 'continue' )
[1216]	307	CALL resort_for_xy( ar, ar_inv )
	308	CALL transpose_xy( ar_inv, ar )
[1]	309	CALL cpu_log( log_point_s(5), 'transpo forward', 'pause' )
	310
	311	CALL cpu_log( log_point_s(7), 'fft_y', 'start' )
[1208]	312	IF ( fft_method /= 'system-specific' ) THEN
	313	!$acc update host( ar )
	314	ENDIF
[1216]	315	CALL fft_y( ar, 'forward', ar_tr = ar, &
	316	nxl_y_bound = nxl_y, nxr_y_bound = nxr_y, &
	317	nxl_y_l = nxl_y, nxr_y_l = nxr_y )
[1208]	318	IF ( fft_method /= 'system-specific' ) THEN
	319	!$acc update device( ar )
	320	ENDIF
[1]	321	CALL cpu_log( log_point_s(7), 'fft_y', 'pause' )
	322
	323	!
	324	!-- Transposition y --> z
	325	CALL cpu_log( log_point_s(5), 'transpo forward', 'continue' )
[1216]	326	CALL resort_for_yz( ar, ar_inv )
	327	CALL transpose_yz( ar_inv, ar )
[1]	328	CALL cpu_log( log_point_s(5), 'transpo forward', 'stop' )
	329
	330	!
[1106]	331	!-- Solve the tridiagonal equation system along z
[1]	332	CALL cpu_log( log_point_s(6), 'tridia', 'start' )
[1212]	333	CALL tridia_substi( ar )
[1]	334	CALL cpu_log( log_point_s(6), 'tridia', 'stop' )
	335
	336	!
	337	!-- Inverse Fourier Transformation
	338	!-- Transposition z --> y
	339	CALL cpu_log( log_point_s(8), 'transpo invers', 'start' )
[1216]	340	CALL transpose_zy( ar, ar_inv )
	341	CALL resort_for_zy( ar_inv, ar )
[1]	342	CALL cpu_log( log_point_s(8), 'transpo invers', 'pause' )
	343
	344	CALL cpu_log( log_point_s(7), 'fft_y', 'continue' )
[1208]	345	IF ( fft_method /= 'system-specific' ) THEN
	346	!$acc update host( ar )
	347	ENDIF
[1216]	348	CALL fft_y( ar, 'backward', ar_tr = ar, &
	349	nxl_y_bound = nxl_y, nxr_y_bound = nxr_y, &
	350	nxl_y_l = nxl_y, nxr_y_l = nxr_y )
[1208]	351	IF ( fft_method /= 'system-specific' ) THEN
	352	!$acc update device( ar )
	353	ENDIF
[1]	354	CALL cpu_log( log_point_s(7), 'fft_y', 'stop' )
	355
	356	!
	357	!-- Transposition y --> x
	358	CALL cpu_log( log_point_s(8), 'transpo invers', 'continue' )
[1216]	359	CALL transpose_yx( ar, ar_inv )
	360	CALL resort_for_yx( ar_inv, ar )
[1]	361	CALL cpu_log( log_point_s(8), 'transpo invers', 'pause' )
	362
	363	CALL cpu_log( log_point_s(4), 'fft_x', 'continue' )
[1208]	364	IF ( fft_method /= 'system-specific' ) THEN
	365	!$acc update host( ar )
	366	ENDIF
[1106]	367	CALL fft_x( ar, 'backward' )
[1208]	368	IF ( fft_method /= 'system-specific' ) THEN
	369	!$acc update device( ar )
	370	ENDIF
[1]	371	CALL cpu_log( log_point_s(4), 'fft_x', 'stop' )
	372
	373	!
	374	!-- Transposition x --> z
	375	CALL cpu_log( log_point_s(8), 'transpo invers', 'continue' )
[1216]	376	CALL transpose_xz( ar, ar_inv )
	377	CALL resort_for_xz( ar_inv, ar )
[1]	378	CALL cpu_log( log_point_s(8), 'transpo invers', 'stop' )
	379
[1216]	380	ELSE
	381
	382	!
	383	!-- 2d-domain-decomposition or no decomposition (1 PE run) with
	384	!-- overlapping transposition / fft
[1318]	385	!-- cputime logging must not use barriers, which would prevent overlapping
[1216]	386	ALLOCATE( f_out_y(0:ny,nxl_y:nxr_y,nzb_y:nzt_y), &
	387	f_out_z(0:nx,nys_x:nyn_x,nzb_x:nzt_x) )
	388	!
	389	!-- Transposition z --> x + subsequent fft along x
	390	ALLOCATE( f_inv(nys:nyn,nxl:nxr,1:nz) )
	391	CALL resort_for_zx( ar, f_inv )
	392	!
	393	!-- Save original indices and gridpoint counter
	394	isave(1) = nz
	395	isave(2) = nzb_x
	396	isave(3) = nzt_x
	397	isave(4) = sendrecvcount_zx
	398	!
	399	!-- Set new indices for transformation
	400	nblk = nz / pdims(1)
	401	nz = pdims(1)
	402	nnz_x = 1
	403	nzb_x = 1 + myidx * nnz_x
	404	nzt_x = ( myidx + 1 ) * nnz_x
	405	sendrecvcount_zx = nnx * nny * nnz_x
	406
[1306]	407	ALLOCATE( ar1(0:nx,nys_x:nyn_x,nzb_x:nzt_x) )
[1216]	408	ALLOCATE( f_in(nys:nyn,nxl:nxr,1:nz) )
	409
[1306]	410	DO kk = 1, nblk
[1216]	411
[1306]	412	IF ( kk == 1 ) THEN
[1318]	413	CALL cpu_log( log_point_s(5), 'transpo forward', 'start', cpu_log_nowait )
[1306]	414	ELSE
[1318]	415	CALL cpu_log( log_point_s(5), 'transpo forward', 'continue', cpu_log_nowait )
[1306]	416	ENDIF
[1216]	417
[1306]	418	DO knew = 1, nz
	419	ki = kk + nblk * ( knew - 1 )
	420	f_in(:,:,knew) = f_inv(:,:,ki)
	421	ENDDO
[1216]	422
[1306]	423	CALL transpose_zx( f_in, ar1(:,:,:))
	424	CALL cpu_log( log_point_s(5), 'transpo forward', 'pause' )
[1216]	425
[1306]	426	IF ( kk == 1 ) THEN
[1318]	427	CALL cpu_log( log_point_s(4), 'fft_x', 'start', cpu_log_nowait )
[1306]	428	ELSE
[1318]	429	CALL cpu_log( log_point_s(4), 'fft_x', 'continue', cpu_log_nowait )
[1216]	430	ENDIF
	431
[1306]	432	n = isave(2) + kk - 1
	433	CALL fft_x( ar1(:,:,:), 'forward', ar_2d = f_out_z(:,:,n))
	434	CALL cpu_log( log_point_s(4), 'fft_x', 'pause' )
[1216]	435
	436	ENDDO
	437	!
	438	!-- Restore original indices/counters
	439	nz = isave(1)
	440	nzb_x = isave(2)
	441	nzt_x = isave(3)
	442	sendrecvcount_zx = isave(4)
	443
	444	DEALLOCATE( ar1, f_in, f_inv )
	445
	446	!
	447	!-- Transposition x --> y + subsequent fft along y
	448	ALLOCATE( f_inv(nys_x:nyn_x,nzb_x:nzt_x,0:nx) )
	449	CALL resort_for_xy( f_out_z, f_inv )
	450	!
	451	!-- Save original indices and gridpoint counter
	452	isave(1) = nx
	453	isave(2) = nxl_y
	454	isave(3) = nxr_y
	455	isave(4) = sendrecvcount_xy
	456	!
	457	!-- Set new indices for transformation
	458	nblk = ( ( nx+1 ) / pdims(2) ) - 1
	459	nx = pdims(2)
	460	nnx_y = 1
	461	nxl_y = myidy * nnx_y
	462	nxr_y = ( myidy + 1 ) * nnx_y - 1
	463	sendrecvcount_xy = nnx_y * ( nyn_x-nys_x+1 ) * ( nzt_x-nzb_x+1 )
	464
[1306]	465	ALLOCATE( ar1(0:ny,nxl_y:nxr_y,nzb_y:nzt_y) )
[1216]	466	ALLOCATE( f_in(nys_x:nyn_x,nzb_x:nzt_x,0:nx) )
	467
[1306]	468	DO ii = 0, nblk
[1216]	469
[1318]	470	CALL cpu_log( log_point_s(5), 'transpo forward', 'continue', cpu_log_nowait )
[1216]	471
[1306]	472	DO inew = 0, nx-1
	473	iind = ii + ( nblk + 1 ) * inew
	474	f_in(:,:,inew) = f_inv(:,:,iind)
	475	ENDDO
[1216]	476
[1306]	477	CALL transpose_xy( f_in, ar1(:,:,:) )
[1216]	478
[1306]	479	CALL cpu_log( log_point_s(5), 'transpo forward', 'pause' )
[1216]	480
[1306]	481	IF ( ii == 1 ) THEN
[1318]	482	CALL cpu_log( log_point_s(7), 'fft_y', 'start', cpu_log_nowait )
[1306]	483	ELSE
[1318]	484	CALL cpu_log( log_point_s(7), 'fft_y', 'continue', cpu_log_nowait )
[1216]	485	ENDIF
	486
[1306]	487	nxl_y_bound = isave(2)
	488	nxr_y_bound = isave(3)
	489	n = isave(2) + ii
	490	CALL fft_y( ar1(:,:,:), 'forward', ar_tr = f_out_y, &
	491	nxl_y_bound = nxl_y_bound, nxr_y_bound = nxr_y_bound, &
	492	nxl_y_l = n, nxr_y_l = n )
[1216]	493
[1306]	494	CALL cpu_log( log_point_s(7), 'fft_y', 'pause' )
[1216]	495
	496	ENDDO
	497	!
	498	!-- Restore original indices/counters
	499	nx = isave(1)
	500	nxl_y = isave(2)
	501	nxr_y = isave(3)
	502	sendrecvcount_xy = isave(4)
	503
	504	DEALLOCATE( ar1, f_in, f_inv )
	505
	506	!
	507	!-- Transposition y --> z + subsequent tridia + resort for z --> y
	508	ALLOCATE( f_inv(nxl_y:nxr_y,nzb_y:nzt_y,0:ny) )
	509	CALL resort_for_yz( f_out_y, f_inv )
	510	!
	511	!-- Save original indices and gridpoint counter
	512	isave(1) = ny
	513	isave(2) = nys_z
	514	isave(3) = nyn_z
	515	isave(4) = sendrecvcount_yz
	516	!
	517	!-- Set new indices for transformation
	518	nblk = ( ( ny+1 ) / pdims(1) ) - 1
	519	ny = pdims(1)
	520	nny_z = 1
	521	nys_z = myidx * nny_z
	522	nyn_z = ( myidx + 1 ) * nny_z - 1
	523	sendrecvcount_yz = ( nxr_y-nxl_y+1 ) * nny_z * ( nzt_y-nzb_y+1 )
	524
[1306]	525	ALLOCATE( ar1(nxl_z:nxr_z,nys_z:nyn_z,1:nz) )
[1216]	526	ALLOCATE( f_in(nxl_y:nxr_y,nzb_y:nzt_y,0:ny) )
	527
[1306]	528	DO jj = 0, nblk
[1216]	529	!
[1306]	530	!-- Forward Fourier Transformation
	531	!-- Transposition y --> z
[1318]	532	CALL cpu_log( log_point_s(5), 'transpo forward', 'continue', cpu_log_nowait )
[1216]	533
[1306]	534	DO jnew = 0, ny-1
	535	jind = jj + ( nblk + 1 ) * jnew
	536	f_in(:,:,jnew) = f_inv(:,:,jind)
	537	ENDDO
[1216]	538
[1306]	539	CALL transpose_yz( f_in, ar1(:,:,:) )
[1216]	540
[1306]	541	IF ( jj == nblk ) THEN
	542	CALL cpu_log( log_point_s(5), 'transpo forward', 'stop' )
	543	ELSE
	544	CALL cpu_log( log_point_s(5), 'transpo forward', 'pause' )
[1216]	545	ENDIF
	546
	547	!
[1306]	548	!-- Solve the tridiagonal equation system along z
[1318]	549	CALL cpu_log( log_point_s(6), 'tridia', 'start', cpu_log_nowait )
[1216]	550
[1306]	551	n = isave(2) + jj
	552	CALL tridia_substi_overlap( ar1(:,:,:), n )
[1216]	553
[1306]	554	CALL cpu_log( log_point_s(6), 'tridia', 'stop' )
[1216]	555
[1306]	556	!
	557	!-- Inverse Fourier Transformation
	558	!-- Transposition z --> y
	559	!-- Only one thread should call MPI routines, therefore forward and
	560	!-- backward tranpose are in the same section
	561	IF ( jj == 0 ) THEN
[1318]	562	CALL cpu_log( log_point_s(8), 'transpo invers', 'start', cpu_log_nowait )
[1306]	563	ELSE
[1318]	564	CALL cpu_log( log_point_s(8), 'transpo invers', 'continue', cpu_log_nowait )
[1216]	565	ENDIF
	566
[1306]	567	CALL transpose_zy( ar1(:,:,:), f_in )
[1216]	568
[1306]	569	DO jnew = 0, ny-1
	570	jind = jj + ( nblk + 1 ) * jnew
	571	f_inv(:,:,jind) = f_in(:,:,jnew)
	572	ENDDO
[1216]	573
[1306]	574	CALL cpu_log( log_point_s(8), 'transpo invers', 'pause' )
[1216]	575
	576	ENDDO
	577	!
	578	!-- Restore original indices/counters
	579	ny = isave(1)
	580	nys_z = isave(2)
	581	nyn_z = isave(3)
	582	sendrecvcount_yz = isave(4)
	583
	584	CALL resort_for_zy( f_inv, f_out_y )
	585
	586	DEALLOCATE( ar1, f_in, f_inv )
	587
	588	!
	589	!-- fft along y backward + subsequent transposition y --> x
	590	ALLOCATE( f_inv(nys_x:nyn_x,nzb_x:nzt_x,0:nx) )
	591	!
	592	!-- Save original indices and gridpoint counter
	593	isave(1) = nx
	594	isave(2) = nxl_y
	595	isave(3) = nxr_y
	596	isave(4) = sendrecvcount_xy
	597	!
	598	!-- Set new indices for transformation
	599	nblk = (( nx+1 ) / pdims(2) ) - 1
	600	nx = pdims(2)
	601	nnx_y = 1
	602	nxl_y = myidy * nnx_y
	603	nxr_y = ( myidy + 1 ) * nnx_y - 1
	604	sendrecvcount_xy = nnx_y * ( nyn_x-nys_x+1 ) * ( nzt_x-nzb_x+1 )
	605
[1306]	606	ALLOCATE( ar1(0:ny,nxl_y:nxr_y,nzb_y:nzt_y) )
[1216]	607	ALLOCATE( f_in(nys_x:nyn_x,nzb_x:nzt_x,0:nx) )
	608
[1306]	609	DO ii = 0, nblk
[1216]	610
[1318]	611	CALL cpu_log( log_point_s(7), 'fft_y', 'continue', cpu_log_nowait )
[1216]	612
[1306]	613	n = isave(2) + ii
	614	nxl_y_bound = isave(2)
	615	nxr_y_bound = isave(3)
[1216]	616
[1306]	617	CALL fft_y( ar1(:,:,:), 'backward', ar_tr = f_out_y, &
	618	nxl_y_bound = nxl_y_bound, nxr_y_bound = nxr_y_bound, &
	619	nxl_y_l = n, nxr_y_l = n )
[1216]	620
[1306]	621	IF ( ii == nblk ) THEN
	622	CALL cpu_log( log_point_s(7), 'fft_y', 'stop' )
	623	ELSE
	624	CALL cpu_log( log_point_s(7), 'fft_y', 'pause' )
[1216]	625	ENDIF
	626
[1318]	627	CALL cpu_log( log_point_s(8), 'transpo invers', 'continue', cpu_log_nowait )
[1216]	628
[1306]	629	CALL transpose_yx( ar1(:,:,:), f_in )
[1216]	630
[1306]	631	DO inew = 0, nx-1
	632	iind = ii + (nblk+1) * inew
	633	f_inv(:,:,iind) = f_in(:,:,inew)
	634	ENDDO
[1216]	635
[1306]	636	CALL cpu_log( log_point_s(8), 'transpo invers', 'pause' )
[1216]	637
	638	ENDDO
	639	!
	640	!-- Restore original indices/counters
	641	nx = isave(1)
	642	nxl_y = isave(2)
	643	nxr_y = isave(3)
	644	sendrecvcount_xy = isave(4)
	645
	646	CALL resort_for_yx( f_inv, f_out_z )
	647
	648	DEALLOCATE( ar1, f_in, f_inv )
	649
	650	!
	651	!-- fft along x backward + subsequent final transposition x --> z
	652	ALLOCATE( f_inv(nys:nyn,nxl:nxr,1:nz) )
	653	!
	654	!-- Save original indices and gridpoint counter
	655	isave(1) = nz
	656	isave(2) = nzb_x
	657	isave(3) = nzt_x
	658	isave(4) = sendrecvcount_zx
	659	!
	660	!-- Set new indices for transformation
	661	nblk = nz / pdims(1)
	662	nz = pdims(1)
	663	nnz_x = 1
	664	nzb_x = 1 + myidx * nnz_x
	665	nzt_x = ( myidx + 1 ) * nnz_x
	666	sendrecvcount_zx = nnx * nny * nnz_x
	667
[1306]	668	ALLOCATE( ar1(0:nx,nys_x:nyn_x,nzb_x:nzt_x) )
[1216]	669	ALLOCATE( f_in(nys:nyn,nxl:nxr,1:nz) )
	670
[1306]	671	DO kk = 1, nblk
[1216]	672
[1318]	673	CALL cpu_log( log_point_s(4), 'fft_x', 'continue', cpu_log_nowait )
[1216]	674
[1306]	675	n = isave(2) + kk - 1
	676	CALL fft_x( ar1(:,:,:), 'backward', f_out_z(:,:,n))
[1216]	677
[1306]	678	IF ( kk == nblk ) THEN
	679	CALL cpu_log( log_point_s(4), 'fft_x', 'stop' )
	680	ELSE
	681	CALL cpu_log( log_point_s(4), 'fft_x', 'pause' )
[1216]	682	ENDIF
	683
[1318]	684	CALL cpu_log( log_point_s(8), 'transpo invers', 'continue', cpu_log_nowait )
[1216]	685
[1306]	686	CALL transpose_xz( ar1(:,:,:), f_in )
[1216]	687
[1306]	688	DO knew = 1, nz
	689	ki = kk + nblk * (knew-1)
	690	f_inv(:,:,ki) = f_in(:,:,knew)
	691	ENDDO
[1216]	692
[1306]	693	IF ( kk == nblk ) THEN
	694	CALL cpu_log( log_point_s(8), 'transpo invers', 'stop' )
	695	ELSE
	696	CALL cpu_log( log_point_s(8), 'transpo invers', 'pause' )
[1216]	697	ENDIF
	698
	699	ENDDO
	700	!
	701	!-- Restore original indices/counters
	702	nz = isave(1)
	703	nzb_x = isave(2)
	704	nzt_x = isave(3)
	705	sendrecvcount_zx = isave(4)
	706
	707	CALL resort_for_xz( f_inv, ar )
	708
	709	DEALLOCATE( ar1, f_in, f_inv )
	710
[1]	711	ENDIF
	712
	713	CALL cpu_log( log_point_s(3), 'poisfft', 'stop' )
	714
	715	END SUBROUTINE poisfft
	716
	717
	718	!------------------------------------------------------------------------------!
[1682]	719	! Description:
	720	! ------------
	721	!> Fourier-transformation along y with subsequent transposition y --> x for
	722	!> a 1d-decomposition along x.
	723	!>
	724	!> @attention The performance of this routine is much faster on the NEC-SX6,
	725	!> if the first index of work_ffty_vec is odd. Otherwise
	726	!> memory bank conflicts may occur (especially if the index is a
	727	!> multiple of 128). That's why work_ffty_vec is dimensioned as
	728	!> 0:ny+1.
	729	!> Of course, this will not work if users are using an odd number
	730	!> of gridpoints along y.
[1]	731	!------------------------------------------------------------------------------!
[1682]	732	SUBROUTINE ffty_tr_yx( f_in, f_out )
[1]	733
[1682]	734
[1320]	735	USE control_parameters, &
	736	ONLY: host
	737
	738	USE cpulog, &
	739	ONLY: cpu_log, log_point_s
	740
	741	USE kinds
	742
[1]	743	USE pegrid
	744
	745	IMPLICIT NONE
	746
[1682]	747	INTEGER(iwp) :: i !<
	748	INTEGER(iwp) :: iend !<
	749	INTEGER(iwp) :: iouter !<
	750	INTEGER(iwp) :: ir !<
	751	INTEGER(iwp) :: j !<
	752	INTEGER(iwp) :: k !<
[1]	753
[1682]	754	INTEGER(iwp), PARAMETER :: stridex = 4 !<
[1320]	755
[1682]	756	REAL(wp), DIMENSION(0:ny,stridex) :: work_ffty !<
[1]	757	#if defined( __nec )
[1682]	758	REAL(wp), DIMENSION(0:ny+1,1:nz,nxl:nxr) :: work_ffty_vec !<
[1]	759	#endif
[1682]	760	REAL(wp), DIMENSION(1:nz,0:ny,nxl:nxr) :: f_in !<
	761	REAL(wp), DIMENSION(nnx,1:nz,nys_x:nyn_x,pdims(1)) :: f_out !<
	762	REAL(wp), DIMENSION(nxl:nxr,1:nz,0:ny) :: work !<
[1]	763
	764	!
	765	!-- Carry out the FFT along y, where all data are present due to the
	766	!-- 1d-decomposition along x. Resort the data in a way that x becomes
	767	!-- the first index.
[1106]	768	CALL cpu_log( log_point_s(7), 'fft_y_1d', 'start' )
[1]	769
	770	IF ( host(1:3) == 'nec' ) THEN
	771	#if defined( __nec )
	772	!
	773	!-- Code optimized for vector processors
[85]	774	!$OMP PARALLEL PRIVATE ( i, j, k )
[1]	775	!$OMP DO
	776	DO i = nxl, nxr
	777
	778	DO j = 0, ny
	779	DO k = 1, nz
	780	work_ffty_vec(j,k,i) = f_in(k,j,i)
	781	ENDDO
	782	ENDDO
	783
	784	CALL fft_y_m( work_ffty_vec(:,:,i), ny+1, 'forward' )
	785
	786	ENDDO
	787
	788	!$OMP DO
	789	DO k = 1, nz
	790	DO j = 0, ny
	791	DO i = nxl, nxr
	792	work(i,k,j) = work_ffty_vec(j,k,i)
	793	ENDDO
	794	ENDDO
	795	ENDDO
	796	!$OMP END PARALLEL
	797	#endif
	798
	799	ELSE
	800
	801	!
	802	!-- Cache optimized code.
	803	!-- The i-(x-)direction is split into a strided outer loop and an inner
	804	!-- loop for better cache performance
	805	!$OMP PARALLEL PRIVATE (i,iend,iouter,ir,j,k,work_ffty)
	806	!$OMP DO
	807	DO iouter = nxl, nxr, stridex
	808
	809	iend = MIN( iouter+stridex-1, nxr ) ! Upper bound for inner i loop
	810
	811	DO k = 1, nz
	812
	813	DO i = iouter, iend
	814
	815	ir = i-iouter+1 ! counter within a stride
	816	DO j = 0, ny
	817	work_ffty(j,ir) = f_in(k,j,i)
	818	ENDDO
	819	!
	820	!-- FFT along y
[1106]	821	CALL fft_y_1d( work_ffty(:,ir), 'forward' )
[1]	822
	823	ENDDO
	824
	825	!
	826	!-- Resort
	827	DO j = 0, ny
	828	DO i = iouter, iend
	829	work(i,k,j) = work_ffty(j,i-iouter+1)
	830	ENDDO
	831	ENDDO
	832
	833	ENDDO
	834
	835	ENDDO
	836	!$OMP END PARALLEL
	837
	838	ENDIF
[1106]	839	CALL cpu_log( log_point_s(7), 'fft_y_1d', 'pause' )
[1]	840
	841	!
	842	!-- Transpose array
[1111]	843	#if defined( __parallel )
[1]	844	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
[622]	845	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
[1]	846	CALL MPI_ALLTOALL( work(nxl,1,0), sendrecvcount_xy, MPI_REAL, &
	847	f_out(1,1,nys_x,1), sendrecvcount_xy, MPI_REAL, &
	848	comm1dx, ierr )
	849	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
[1111]	850	#endif
[1]	851
	852	END SUBROUTINE ffty_tr_yx
	853
	854
	855	!------------------------------------------------------------------------------!
[1682]	856	! Description:
	857	! ------------
	858	!> Transposition x --> y with a subsequent backward Fourier transformation for
	859	!> a 1d-decomposition along x
[1]	860	!------------------------------------------------------------------------------!
[1682]	861	SUBROUTINE tr_xy_ffty( f_in, f_out )
[1]	862
[1682]	863
[1320]	864	USE control_parameters, &
	865	ONLY: host
	866
	867	USE cpulog, &
	868	ONLY: cpu_log, log_point_s
	869
	870	USE kinds
	871
[1]	872	USE pegrid
	873
	874	IMPLICIT NONE
	875
[1682]	876	INTEGER(iwp) :: i !<
	877	INTEGER(iwp) :: iend !<
	878	INTEGER(iwp) :: iouter !<
	879	INTEGER(iwp) :: ir !<
	880	INTEGER(iwp) :: j !<
	881	INTEGER(iwp) :: k !<
[1]	882
[1682]	883	INTEGER(iwp), PARAMETER :: stridex = 4 !<
[1320]	884
[1682]	885	REAL(wp), DIMENSION(0:ny,stridex) :: work_ffty !<
[1]	886	#if defined( __nec )
[1682]	887	REAL(wp), DIMENSION(0:ny+1,1:nz,nxl:nxr) :: work_ffty_vec !<
[1]	888	#endif
[1682]	889	REAL(wp), DIMENSION(nnx,1:nz,nys_x:nyn_x,pdims(1)) :: f_in !<
	890	REAL(wp), DIMENSION(1:nz,0:ny,nxl:nxr) :: f_out !<
	891	REAL(wp), DIMENSION(nxl:nxr,1:nz,0:ny) :: work !<
[1]	892
	893	!
	894	!-- Transpose array
[1111]	895	#if defined( __parallel )
[1]	896	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
[622]	897	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
[1]	898	CALL MPI_ALLTOALL( f_in(1,1,nys_x,1), sendrecvcount_xy, MPI_REAL, &
	899	work(nxl,1,0), sendrecvcount_xy, MPI_REAL, &
	900	comm1dx, ierr )
	901	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
[1111]	902	#endif
[1]	903
	904	!
	905	!-- Resort the data in a way that y becomes the first index and carry out the
	906	!-- backward fft along y.
[1106]	907	CALL cpu_log( log_point_s(7), 'fft_y_1d', 'continue' )
[1]	908
	909	IF ( host(1:3) == 'nec' ) THEN
	910	#if defined( __nec )
	911	!
	912	!-- Code optimized for vector processors
[85]	913	!$OMP PARALLEL PRIVATE ( i, j, k )
[1]	914	!$OMP DO
	915	DO k = 1, nz
	916	DO j = 0, ny
	917	DO i = nxl, nxr
	918	work_ffty_vec(j,k,i) = work(i,k,j)
	919	ENDDO
	920	ENDDO
	921	ENDDO
	922
	923	!$OMP DO
	924	DO i = nxl, nxr
	925
	926	CALL fft_y_m( work_ffty_vec(:,:,i), ny+1, 'backward' )
	927
	928	DO j = 0, ny
	929	DO k = 1, nz
	930	f_out(k,j,i) = work_ffty_vec(j,k,i)
	931	ENDDO
	932	ENDDO
	933
	934	ENDDO
	935	!$OMP END PARALLEL
	936	#endif
	937
	938	ELSE
	939
	940	!
	941	!-- Cache optimized code.
	942	!-- The i-(x-)direction is split into a strided outer loop and an inner
	943	!-- loop for better cache performance
	944	!$OMP PARALLEL PRIVATE ( i, iend, iouter, ir, j, k, work_ffty )
	945	!$OMP DO
	946	DO iouter = nxl, nxr, stridex
	947
	948	iend = MIN( iouter+stridex-1, nxr ) ! Upper bound for inner i loop
	949
	950	DO k = 1, nz
	951	!
	952	!-- Resort
	953	DO j = 0, ny
	954	DO i = iouter, iend
	955	work_ffty(j,i-iouter+1) = work(i,k,j)
	956	ENDDO
	957	ENDDO
	958
	959	DO i = iouter, iend
	960
	961	!
	962	!-- FFT along y
	963	ir = i-iouter+1 ! counter within a stride
[1106]	964	CALL fft_y_1d( work_ffty(:,ir), 'backward' )
[1]	965
	966	DO j = 0, ny
	967	f_out(k,j,i) = work_ffty(j,ir)
	968	ENDDO
	969	ENDDO
	970
	971	ENDDO
	972
	973	ENDDO
	974	!$OMP END PARALLEL
	975
	976	ENDIF
	977
[1106]	978	CALL cpu_log( log_point_s(7), 'fft_y_1d', 'stop' )
[1]	979
	980	END SUBROUTINE tr_xy_ffty
	981
	982
	983	!------------------------------------------------------------------------------!
[1682]	984	! Description:
	985	! ------------
	986	!> FFT along x, solution of the tridiagonal system and backward FFT for
	987	!> a 1d-decomposition along x
	988	!>
	989	!> @warning this subroutine may still not work for hybrid parallelization
	990	!> with OpenMP (for possible necessary changes see the original
	991	!> routine poisfft_hybrid, developed by Klaus Ketelsen, May 2002)
[1]	992	!------------------------------------------------------------------------------!
[1682]	993	SUBROUTINE fftx_tri_fftx( ar )
[1]	994
[1682]	995
[1320]	996	USE control_parameters, &
	997	ONLY: host
	998
	999	USE cpulog, &
	1000	ONLY: cpu_log, log_point_s
	1001
	1002	USE grid_variables, &
	1003	ONLY: ddx2, ddy2
	1004
	1005	USE kinds
	1006
[1]	1007	USE pegrid
	1008
	1009	IMPLICIT NONE
	1010
[1682]	1011	INTEGER(iwp) :: i !<
	1012	INTEGER(iwp) :: j !<
	1013	INTEGER(iwp) :: k !<
	1014	INTEGER(iwp) :: m !<
	1015	INTEGER(iwp) :: n !<
	1016	INTEGER(iwp) :: omp_get_thread_num !<
	1017	INTEGER(iwp) :: tn !<
[1]	1018
[1682]	1019	REAL(wp), DIMENSION(0:nx) :: work_fftx !<
	1020	REAL(wp), DIMENSION(0:nx,1:nz) :: work_trix !<
	1021	REAL(wp), DIMENSION(nnx,1:nz,nys_x:nyn_x,pdims(1)) :: ar !<
	1022	REAL(wp), DIMENSION(:,:,:,:), ALLOCATABLE :: tri !<
[1]	1023
	1024
[1106]	1025	CALL cpu_log( log_point_s(33), 'fft_x_1d + tridia', 'start' )
[1]	1026
	1027	ALLOCATE( tri(5,0:nx,0:nz-1,0:threads_per_task-1) )
	1028
	1029	tn = 0 ! Default thread number in case of one thread
	1030	!$OMP PARALLEL DO PRIVATE ( i, j, k, m, n, tn, work_fftx, work_trix )
	1031	DO j = nys_x, nyn_x
	1032
	1033	!$ tn = omp_get_thread_num()
	1034
	1035	IF ( host(1:3) == 'nec' ) THEN
	1036	!
	1037	!-- Code optimized for vector processors
	1038	DO k = 1, nz
	1039
	1040	m = 0
	1041	DO n = 1, pdims(1)
[1003]	1042	DO i = 1, nnx
[1]	1043	work_trix(m,k) = ar(i,k,j,n)
	1044	m = m + 1
	1045	ENDDO
	1046	ENDDO
	1047
	1048	ENDDO
	1049
	1050	CALL fft_x_m( work_trix, 'forward' )
	1051
	1052	ELSE
	1053	!
	1054	!-- Cache optimized code
	1055	DO k = 1, nz
	1056
	1057	m = 0
	1058	DO n = 1, pdims(1)
[1003]	1059	DO i = 1, nnx
[1]	1060	work_fftx(m) = ar(i,k,j,n)
	1061	m = m + 1
	1062	ENDDO
	1063	ENDDO
	1064
[1106]	1065	CALL fft_x_1d( work_fftx, 'forward' )
[1]	1066
	1067	DO i = 0, nx
	1068	work_trix(i,k) = work_fftx(i)
	1069	ENDDO
	1070
	1071	ENDDO
	1072
	1073	ENDIF
	1074
	1075	!
	1076	!-- Solve the linear equation system
	1077	CALL tridia_1dd( ddx2, ddy2, nx, ny, j, work_trix, tri(:,:,:,tn) )
	1078
	1079	IF ( host(1:3) == 'nec' ) THEN
	1080	!
	1081	!-- Code optimized for vector processors
	1082	CALL fft_x_m( work_trix, 'backward' )
	1083
	1084	DO k = 1, nz
	1085
	1086	m = 0
	1087	DO n = 1, pdims(1)
[1003]	1088	DO i = 1, nnx
[1]	1089	ar(i,k,j,n) = work_trix(m,k)
	1090	m = m + 1
	1091	ENDDO
	1092	ENDDO
	1093
	1094	ENDDO
	1095
	1096	ELSE
	1097	!
	1098	!-- Cache optimized code
	1099	DO k = 1, nz
	1100
	1101	DO i = 0, nx
	1102	work_fftx(i) = work_trix(i,k)
	1103	ENDDO
	1104
[1106]	1105	CALL fft_x_1d( work_fftx, 'backward' )
[1]	1106
	1107	m = 0
	1108	DO n = 1, pdims(1)
[1003]	1109	DO i = 1, nnx
[1]	1110	ar(i,k,j,n) = work_fftx(m)
	1111	m = m + 1
	1112	ENDDO
	1113	ENDDO
	1114
	1115	ENDDO
	1116
	1117	ENDIF
	1118
	1119	ENDDO
	1120
	1121	DEALLOCATE( tri )
	1122
[1106]	1123	CALL cpu_log( log_point_s(33), 'fft_x_1d + tridia', 'stop' )
[1]	1124
	1125	END SUBROUTINE fftx_tri_fftx
	1126
	1127
	1128	!------------------------------------------------------------------------------!
[1682]	1129	! Description:
	1130	! ------------
	1131	!> Fourier-transformation along x with subsequent transposition x --> y for
	1132	!> a 1d-decomposition along y.
	1133	!>
	1134	!> @attention NEC-branch of this routine may significantly profit from
	1135	!> further optimizations. So far, performance is much worse than
	1136	!> for routine ffty_tr_yx (more than three times slower).
[1]	1137	!------------------------------------------------------------------------------!
[1682]	1138	SUBROUTINE fftx_tr_xy( f_in, f_out )
[1]	1139
[1682]	1140
[1320]	1141	USE control_parameters, &
	1142	ONLY: host
	1143
	1144	USE cpulog, &
	1145	ONLY: cpu_log, log_point_s
	1146
	1147	USE kinds
	1148
[1]	1149	USE pegrid
	1150
	1151	IMPLICIT NONE
	1152
[1682]	1153	INTEGER(iwp) :: i !<
	1154	INTEGER(iwp) :: j !<
	1155	INTEGER(iwp) :: k !<
[1]	1156
[1682]	1157	REAL(wp), DIMENSION(0:nx,1:nz,nys:nyn) :: work_fftx !<
	1158	REAL(wp), DIMENSION(1:nz,nys:nyn,0:nx) :: f_in !<
	1159	REAL(wp), DIMENSION(nny,1:nz,nxl_y:nxr_y,pdims(2)) :: f_out !<
	1160	REAL(wp), DIMENSION(nys:nyn,1:nz,0:nx) :: work !<
[1]	1161
	1162	!
	1163	!-- Carry out the FFT along x, where all data are present due to the
	1164	!-- 1d-decomposition along y. Resort the data in a way that y becomes
	1165	!-- the first index.
[1106]	1166	CALL cpu_log( log_point_s(4), 'fft_x_1d', 'start' )
[1]	1167
	1168	IF ( host(1:3) == 'nec' ) THEN
	1169	!
	1170	!-- Code for vector processors
[85]	1171	!$OMP PARALLEL PRIVATE ( i, j, k )
[1]	1172	!$OMP DO
	1173	DO i = 0, nx
	1174
	1175	DO j = nys, nyn
	1176	DO k = 1, nz
	1177	work_fftx(i,k,j) = f_in(k,j,i)
	1178	ENDDO
	1179	ENDDO
	1180
	1181	ENDDO
	1182
	1183	!$OMP DO
	1184	DO j = nys, nyn
	1185
	1186	CALL fft_x_m( work_fftx(:,:,j), 'forward' )
	1187
	1188	DO k = 1, nz
	1189	DO i = 0, nx
	1190	work(j,k,i) = work_fftx(i,k,j)
	1191	ENDDO
	1192	ENDDO
	1193
	1194	ENDDO
	1195	!$OMP END PARALLEL
	1196
	1197	ELSE
	1198
	1199	!
	1200	!-- Cache optimized code (there might be still a potential for better
	1201	!-- optimization).
[696]	1202	!$OMP PARALLEL PRIVATE (i,j,k)
[1]	1203	!$OMP DO
	1204	DO i = 0, nx
	1205
	1206	DO j = nys, nyn
	1207	DO k = 1, nz
	1208	work_fftx(i,k,j) = f_in(k,j,i)
	1209	ENDDO
	1210	ENDDO
	1211
	1212	ENDDO
	1213
	1214	!$OMP DO
	1215	DO j = nys, nyn
	1216	DO k = 1, nz
	1217
[1106]	1218	CALL fft_x_1d( work_fftx(0:nx,k,j), 'forward' )
[1]	1219
	1220	DO i = 0, nx
	1221	work(j,k,i) = work_fftx(i,k,j)
	1222	ENDDO
	1223	ENDDO
	1224
	1225	ENDDO
	1226	!$OMP END PARALLEL
	1227
	1228	ENDIF
[1106]	1229	CALL cpu_log( log_point_s(4), 'fft_x_1d', 'pause' )
[1]	1230
	1231	!
	1232	!-- Transpose array
[1111]	1233	#if defined( __parallel )
[1]	1234	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
[622]	1235	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
[1]	1236	CALL MPI_ALLTOALL( work(nys,1,0), sendrecvcount_xy, MPI_REAL, &
	1237	f_out(1,1,nxl_y,1), sendrecvcount_xy, MPI_REAL, &
	1238	comm1dy, ierr )
	1239	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
[1111]	1240	#endif
[1]	1241
	1242	END SUBROUTINE fftx_tr_xy
	1243
	1244
	1245	!------------------------------------------------------------------------------!
[1682]	1246	! Description:
	1247	! ------------
	1248	!> Transposition y --> x with a subsequent backward Fourier transformation for
	1249	!> a 1d-decomposition along x.
[1]	1250	!------------------------------------------------------------------------------!
[1682]	1251	SUBROUTINE tr_yx_fftx( f_in, f_out )
[1]	1252
[1682]	1253
[1320]	1254	USE control_parameters, &
	1255	ONLY: host
	1256
	1257	USE cpulog, &
	1258	ONLY: cpu_log, log_point_s
	1259
	1260	USE kinds
	1261
[1]	1262	USE pegrid
	1263
	1264	IMPLICIT NONE
	1265
[1682]	1266	INTEGER(iwp) :: i !<
	1267	INTEGER(iwp) :: j !<
	1268	INTEGER(iwp) :: k !<
[1]	1269
[1682]	1270	REAL(wp), DIMENSION(0:nx,1:nz,nys:nyn) :: work_fftx !<
	1271	REAL(wp), DIMENSION(nny,1:nz,nxl_y:nxr_y,pdims(2)) :: f_in !<
	1272	REAL(wp), DIMENSION(1:nz,nys:nyn,0:nx) :: f_out !<
	1273	REAL(wp), DIMENSION(nys:nyn,1:nz,0:nx) :: work !<
[1]	1274
	1275	!
	1276	!-- Transpose array
[1111]	1277	#if defined( __parallel )
[1]	1278	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
[622]	1279	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
[1]	1280	CALL MPI_ALLTOALL( f_in(1,1,nxl_y,1), sendrecvcount_xy, MPI_REAL, &
	1281	work(nys,1,0), sendrecvcount_xy, MPI_REAL, &
	1282	comm1dy, ierr )
	1283	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
[1111]	1284	#endif
[1]	1285
	1286	!
	1287	!-- Carry out the FFT along x, where all data are present due to the
	1288	!-- 1d-decomposition along y. Resort the data in a way that y becomes
	1289	!-- the first index.
[1106]	1290	CALL cpu_log( log_point_s(4), 'fft_x_1d', 'continue' )
[1]	1291
	1292	IF ( host(1:3) == 'nec' ) THEN
	1293	!
	1294	!-- Code optimized for vector processors
[85]	1295	!$OMP PARALLEL PRIVATE ( i, j, k )
[1]	1296	!$OMP DO
	1297	DO j = nys, nyn
	1298
	1299	DO k = 1, nz
	1300	DO i = 0, nx
	1301	work_fftx(i,k,j) = work(j,k,i)
	1302	ENDDO
	1303	ENDDO
	1304
	1305	CALL fft_x_m( work_fftx(:,:,j), 'backward' )
	1306
	1307	ENDDO
	1308
	1309	!$OMP DO
	1310	DO i = 0, nx
	1311	DO j = nys, nyn
	1312	DO k = 1, nz
	1313	f_out(k,j,i) = work_fftx(i,k,j)
	1314	ENDDO
	1315	ENDDO
	1316	ENDDO
	1317	!$OMP END PARALLEL
	1318
	1319	ELSE
	1320
	1321	!
	1322	!-- Cache optimized code (there might be still a potential for better
	1323	!-- optimization).
[696]	1324	!$OMP PARALLEL PRIVATE (i,j,k)
[1]	1325	!$OMP DO
	1326	DO j = nys, nyn
	1327	DO k = 1, nz
	1328
	1329	DO i = 0, nx
	1330	work_fftx(i,k,j) = work(j,k,i)
	1331	ENDDO
	1332
[1106]	1333	CALL fft_x_1d( work_fftx(0:nx,k,j), 'backward' )
[1]	1334
	1335	ENDDO
	1336	ENDDO
	1337
	1338	!$OMP DO
	1339	DO i = 0, nx
	1340	DO j = nys, nyn
	1341	DO k = 1, nz
	1342	f_out(k,j,i) = work_fftx(i,k,j)
	1343	ENDDO
	1344	ENDDO
	1345	ENDDO
	1346	!$OMP END PARALLEL
	1347
	1348	ENDIF
[1106]	1349	CALL cpu_log( log_point_s(4), 'fft_x_1d', 'stop' )
[1]	1350
	1351	END SUBROUTINE tr_yx_fftx
	1352
	1353
	1354	!------------------------------------------------------------------------------!
[1682]	1355	! Description:
	1356	! ------------
	1357	!> FFT along y, solution of the tridiagonal system and backward FFT for
	1358	!> a 1d-decomposition along y.
	1359	!>
	1360	!> @warning this subroutine may still not work for hybrid parallelization
	1361	!> with OpenMP (for possible necessary changes see the original
	1362	!> routine poisfft_hybrid, developed by Klaus Ketelsen, May 2002)
[1]	1363	!------------------------------------------------------------------------------!
[1682]	1364	SUBROUTINE ffty_tri_ffty( ar )
[1]	1365
[1682]	1366
[1320]	1367	USE control_parameters, &
	1368	ONLY: host
	1369
	1370	USE cpulog, &
	1371	ONLY: cpu_log, log_point_s
	1372
	1373	USE grid_variables, &
	1374	ONLY: ddx2, ddy2
	1375
	1376	USE kinds
	1377
[1]	1378	USE pegrid
	1379
	1380	IMPLICIT NONE
	1381
[1682]	1382	INTEGER(iwp) :: i !<
	1383	INTEGER(iwp) :: j !<
	1384	INTEGER(iwp) :: k !<
	1385	INTEGER(iwp) :: m !<
	1386	INTEGER(iwp) :: n !<
	1387	INTEGER(iwp) :: omp_get_thread_num !<
	1388	INTEGER(iwp) :: tn !<
[1]	1389
[1682]	1390	REAL(wp), DIMENSION(0:ny) :: work_ffty !<
	1391	REAL(wp), DIMENSION(0:ny,1:nz) :: work_triy !<
	1392	REAL(wp), DIMENSION(nny,1:nz,nxl_y:nxr_y,pdims(2)) :: ar !<
	1393	REAL(wp), DIMENSION(:,:,:,:), ALLOCATABLE :: tri !<
[1]	1394
	1395
[1106]	1396	CALL cpu_log( log_point_s(39), 'fft_y_1d + tridia', 'start' )
[1]	1397
	1398	ALLOCATE( tri(5,0:ny,0:nz-1,0:threads_per_task-1) )
	1399
	1400	tn = 0 ! Default thread number in case of one thread
[696]	1401	!$OMP PARALLEL DO PRIVATE ( i, j, k, m, n, tn, work_ffty, work_triy )
[1]	1402	DO i = nxl_y, nxr_y
	1403
	1404	!$ tn = omp_get_thread_num()
	1405
	1406	IF ( host(1:3) == 'nec' ) THEN
	1407	!
	1408	!-- Code optimized for vector processors
	1409	DO k = 1, nz
	1410
	1411	m = 0
	1412	DO n = 1, pdims(2)
[1003]	1413	DO j = 1, nny
[1]	1414	work_triy(m,k) = ar(j,k,i,n)
	1415	m = m + 1
	1416	ENDDO
	1417	ENDDO
	1418
	1419	ENDDO
	1420
	1421	CALL fft_y_m( work_triy, ny, 'forward' )
	1422
	1423	ELSE
	1424	!
	1425	!-- Cache optimized code
	1426	DO k = 1, nz
	1427
	1428	m = 0
	1429	DO n = 1, pdims(2)
[1003]	1430	DO j = 1, nny
[1]	1431	work_ffty(m) = ar(j,k,i,n)
	1432	m = m + 1
	1433	ENDDO
	1434	ENDDO
	1435
[1106]	1436	CALL fft_y_1d( work_ffty, 'forward' )
[1]	1437
	1438	DO j = 0, ny
	1439	work_triy(j,k) = work_ffty(j)
	1440	ENDDO
	1441
	1442	ENDDO
	1443
	1444	ENDIF
	1445
	1446	!
	1447	!-- Solve the linear equation system
	1448	CALL tridia_1dd( ddy2, ddx2, ny, nx, i, work_triy, tri(:,:,:,tn) )
	1449
	1450	IF ( host(1:3) == 'nec' ) THEN
	1451	!
	1452	!-- Code optimized for vector processors
	1453	CALL fft_y_m( work_triy, ny, 'backward' )
	1454
	1455	DO k = 1, nz
	1456
	1457	m = 0
	1458	DO n = 1, pdims(2)
[1003]	1459	DO j = 1, nny
[1]	1460	ar(j,k,i,n) = work_triy(m,k)
	1461	m = m + 1
	1462	ENDDO
	1463	ENDDO
	1464
	1465	ENDDO
	1466
	1467	ELSE
	1468	!
	1469	!-- Cache optimized code
	1470	DO k = 1, nz
	1471
	1472	DO j = 0, ny
	1473	work_ffty(j) = work_triy(j,k)
	1474	ENDDO
	1475
[1106]	1476	CALL fft_y_1d( work_ffty, 'backward' )
[1]	1477
	1478	m = 0
	1479	DO n = 1, pdims(2)
[1003]	1480	DO j = 1, nny
[1]	1481	ar(j,k,i,n) = work_ffty(m)
	1482	m = m + 1
	1483	ENDDO
	1484	ENDDO
	1485
	1486	ENDDO
	1487
	1488	ENDIF
	1489
	1490	ENDDO
	1491
	1492	DEALLOCATE( tri )
	1493
[1106]	1494	CALL cpu_log( log_point_s(39), 'fft_y_1d + tridia', 'stop' )
[1]	1495
	1496	END SUBROUTINE ffty_tri_ffty
	1497
	1498	END MODULE poisfft_mod

Note: See TracBrowser for help on using the repository browser.

Download in other formats:

| Impressum | ©Leibniz Universität Hannover |