Home

Context Navigation

source: palm/trunk/SOURCE/poisfft_mod.f90 @ 1963

Last change on this file since 1963 was 1851, checked in by maronga, 9 years ago
last commit documented
Property svn:keywords set to `Id`
File size: 45.6 KB

Rev	Line
[1850]	1	!> @file poisfft_mod.f90
[1036]	2	!--------------------------------------------------------------------------------!
	3	! This file is part of PALM.
	4	!
	5	! PALM is free software: you can redistribute it and/or modify it under the terms
	6	! of the GNU General Public License as published by the Free Software Foundation,
	7	! either version 3 of the License, or (at your option) any later version.
	8	!
	9	! PALM is distributed in the hope that it will be useful, but WITHOUT ANY
	10	! WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
	11	! A PARTICULAR PURPOSE. See the GNU General Public License for more details.
	12	!
	13	! You should have received a copy of the GNU General Public License along with
	14	! PALM. If not, see <http://www.gnu.org/licenses/>.
	15	!
[1310]	16	! Copyright 1997-2014 Leibniz Universitaet Hannover
[1036]	17	!--------------------------------------------------------------------------------!
	18	!
[484]	19	! Current revisions:
[1]	20	! -----------------
[1683]	21	!
[1805]	22	!
[1321]	23	! Former revisions:
	24	! -----------------
	25	! $Id: poisfft_mod.f90 1851 2016-04-08 13:32:50Z suehring $
	26	!
[1851]	27	! 1850 2016-04-08 13:29:27Z maronga
	28	! Module renamed
	29	!
	30	!
[1805]	31	! 1804 2016-04-05 16:30:18Z maronga
	32	! Removed code for parameter file check (__check)
	33	!
[1683]	34	! 1682 2015-10-07 23:56:08Z knoop
	35	! Code annotations made doxygen readable
	36	!
[1483]	37	! 1482 2014-10-18 12:34:45Z raasch
	38	! use 2d-decomposition, if accelerator boards are used
	39	!
[1407]	40	! 1406 2014-05-16 13:47:01Z raasch
	41	! bugfix for pgi 14.4: declare create moved after array declaration
	42	!
[1321]	43	! 1320 2014-03-20 08:40:49Z raasch
[1320]	44	! ONLY-attribute added to USE-statements,
	45	! kind-parameters added to all INTEGER and REAL declaration statements,
	46	! kinds are defined in new module kinds,
	47	! old module precision_kind is removed,
	48	! revision history before 2012 removed,
	49	! comment fields (!:) to be used for variable explanations added to
	50	! all variable declaration statements
[1217]	51	!
[1319]	52	! 1318 2014-03-17 13:35:16Z raasch
	53	! module interfaces removed
	54	!
[1307]	55	! 1306 2014-03-13 14:30:59Z raasch
	56	! openmp sections removed from the overlap branch,
	57	! second argument removed from parameter list
	58	!
[1217]	59	! 1216 2013-08-26 09:31:42Z raasch
[1216]	60	! resorting of arrays moved to separate routines resort_for_...,
	61	! one argument, used as temporary work array, removed from all transpose
	62	! routines
	63	! overlapping fft / transposition implemented
[1112]	64	!
[1213]	65	! 1212 2013-08-15 08:46:27Z raasch
	66	! tridia routines moved to seperate module tridia_solver
	67	!
[1209]	68	! 1208 2013-08-13 06:41:49Z raasch
	69	! acc-update clauses added for "ar" so that ffts other than cufft can also be
	70	! used (although they are not ported and will give a poor performance)
	71	!
[1112]	72	! 1111 2013-03-08 23:54:10Z raasch
[1111]	73	! further openACC porting of non-parallel (MPI) branch:
	74	! tridiagonal routines split into extermal subroutines (instead using CONTAINS),
	75	! no distinction between parallel/non-parallel in poisfft and tridia any more,
[1112]	76	! tridia routines moved to end of file because of probable bug in PGI compiler 12.5
[1111]	77	! (otherwise "invalid device function" is indicated during runtime),
	78	! optimization of tridia routines: constant elements and coefficients of tri are
	79	! stored in seperate arrays ddzuw and tric, last dimension of tri reduced from 5
	80	! to 2,
	81	! poisfft_init is now called internally from poisfft, maketri is called from
	82	! poisfft_init,
	83	! ibc_p_b = 2 removed
[1]	84	!
[1107]	85	! 1106 2013-03-04 05:31:38Z raasch
	86	! routines fftx, ffty, fftxp, fftyp removed, calls replaced by fft_x, fft_y,
	87	! in the 1D-decomposition routines fft_x, ffty are replaced by fft_x_1d,
	88	! fft_y_1d
	89	!
[1104]	90	! 1103 2013-02-20 02:15:53Z raasch
	91	! tri, ar, and ar1 arguments in tridia-routines (2d) are removed because they
	92	! sometimes cause segmentation faults with intel 12.1 compiler
	93	!
[1093]	94	! 1092 2013-02-02 11:24:22Z raasch
	95	! unused variables removed
	96	!
[1037]	97	! 1036 2012-10-22 13:43:42Z raasch
	98	! code put under GPL (PALM 3.9)
	99	!
[1014]	100	! 2012-09-21 07:03:55Z raasch
	101	! FLOAT type conversion replaced by REAL
	102	!
[1004]	103	! 1003 2012-09-14 14:35:53Z raasch
	104	! indices nxa, nya, etc. replaced by nx, ny, etc.
	105	!
[941]	106	! 940 2012-07-09 14:31:00Z raasch
	107	! special handling of tri-array as an argument in tridia_1dd routines switched
	108	! off because it caused segmentation faults with intel 12.1 compiler
	109	!
[878]	110	! 877 2012-04-03 11:21:44Z suehring
	111	! Bugfix: Avoid divisions by zero in case of using a 'neumann' bc for the
	112	! pressure at the top of the model domain.
	113	!
[810]	114	! 809 2012-01-30 13:32:58Z maronga
	115	! Bugfix: replaced .AND. and .NOT. with && and ! in the preprocessor directives
	116	!
[808]	117	! 807 2012-01-25 11:53:51Z maronga
	118	! New cpp directive "__check" implemented which is used by check_namelist_files
	119	! (most of the code is unneeded by check_namelist_files).
	120	!
[1]	121	! Revision 1.1 1997/07/24 11:24:14 raasch
	122	! Initial revision
	123	!
	124	!
	125	! Description:
	126	! ------------
[1682]	127	!> Solves the Poisson equation with a 2D spectral method
	128	!> d^2 p / dx^2 + d^2 p / dy^2 + d^2 p / dz^2 = s
	129	!>
	130	!> Input:
	131	!> real ar contains (nnz,nny,nnx) elements of the velocity divergence,
	132	!> starting from (1,nys,nxl)
	133	!>
	134	!> Output:
	135	!> real ar contains the solution for perturbation pressure p
[1]	136	!------------------------------------------------------------------------------!
[1682]	137	MODULE poisfft_mod
	138
[1]	139
[1320]	140	USE fft_xy, &
	141	ONLY: fft_init, fft_y, fft_y_1d, fft_y_m, fft_x, fft_x_1d, fft_x_m
[1]	142
[1320]	143	USE indices, &
	144	ONLY: nnx, nny, nx, nxl, nxr, ny, nys, nyn, nz
	145
	146	USE transpose_indices, &
	147	ONLY: nxl_y, nxl_z, nxr_y, nxr_z, nys_x, nys_z, nyn_x, nyn_z, nzb_x, &
	148	nzb_y, nzt_x, nzt_y
	149
	150	USE tridia_solver, &
	151	ONLY: tridia_1dd, tridia_init, tridia_substi, tridia_substi_overlap
	152
[1]	153	IMPLICIT NONE
	154
[1111]	155	LOGICAL, SAVE :: poisfft_initialized = .FALSE.
	156
[1]	157	PRIVATE
[807]	158
[1]	159	PUBLIC poisfft, poisfft_init
	160
	161	INTERFACE poisfft
	162	MODULE PROCEDURE poisfft
	163	END INTERFACE poisfft
	164
	165	INTERFACE poisfft_init
	166	MODULE PROCEDURE poisfft_init
	167	END INTERFACE poisfft_init
	168
[807]	169
[1]	170	CONTAINS
	171
[1682]	172	!------------------------------------------------------------------------------!
	173	! Description:
	174	! ------------
	175	!> @todo Missing subroutine description.
	176	!------------------------------------------------------------------------------!
[1]	177	SUBROUTINE poisfft_init
	178
[1320]	179	USE arrays_3d, &
	180	ONLY: ddzu_pres, ddzw
[1111]	181
[1320]	182	USE kinds
	183
[1111]	184	IMPLICIT NONE
	185
[1682]	186	INTEGER(iwp) :: k !<
[1111]	187
	188
[1]	189	CALL fft_init
	190
[1212]	191	CALL tridia_init
[1111]	192
	193	poisfft_initialized = .TRUE.
	194
[1]	195	END SUBROUTINE poisfft_init
	196
[1111]	197
[1804]	198
[1682]	199	!------------------------------------------------------------------------------!
	200	! Description:
	201	! ------------
	202	!> Two-dimensional Fourier Transformation in x- and y-direction.
	203	!------------------------------------------------------------------------------!
[1306]	204	SUBROUTINE poisfft( ar )
[1]	205
[1320]	206	USE control_parameters, &
	207	ONLY: fft_method, transpose_compute_overlap
	208
	209	USE cpulog, &
	210	ONLY: cpu_log, cpu_log_nowait, log_point_s
	211
	212	USE kinds
	213
[1]	214	USE pegrid
	215
	216	IMPLICIT NONE
	217
[1682]	218	INTEGER(iwp) :: ii !<
	219	INTEGER(iwp) :: iind !<
	220	INTEGER(iwp) :: inew !<
	221	INTEGER(iwp) :: jj !<
	222	INTEGER(iwp) :: jind !<
	223	INTEGER(iwp) :: jnew !<
	224	INTEGER(iwp) :: ki !<
	225	INTEGER(iwp) :: kk !<
	226	INTEGER(iwp) :: knew !<
	227	INTEGER(iwp) :: n !<
	228	INTEGER(iwp) :: nblk !<
	229	INTEGER(iwp) :: nnx_y !<
	230	INTEGER(iwp) :: nny_z !<
	231	INTEGER(iwp) :: nnz_t !<
	232	INTEGER(iwp) :: nnz_x !<
	233	INTEGER(iwp) :: nxl_y_bound !<
	234	INTEGER(iwp) :: nxr_y_bound !<
[1]	235
[1682]	236	INTEGER(iwp), DIMENSION(4) :: isave !<
[1320]	237
[1682]	238	REAL(wp), DIMENSION(1:nz,nys:nyn,nxl:nxr) :: ar !<
	239	REAL(wp), DIMENSION(nys:nyn,nxl:nxr,1:nz) :: ar_inv !<
[1216]	240	!$acc declare create( ar_inv )
[1]	241
[1682]	242	REAL(wp), DIMENSION(:,:,:), ALLOCATABLE :: ar1 !<
	243	REAL(wp), DIMENSION(:,:,:), ALLOCATABLE :: f_in !<
	244	REAL(wp), DIMENSION(:,:,:), ALLOCATABLE :: f_inv !<
	245	REAL(wp), DIMENSION(:,:,:), ALLOCATABLE :: f_out_y !<
	246	REAL(wp), DIMENSION(:,:,:), ALLOCATABLE :: f_out_z !<
[1216]	247
	248
[1]	249	CALL cpu_log( log_point_s(3), 'poisfft', 'start' )
	250
[1111]	251	IF ( .NOT. poisfft_initialized ) CALL poisfft_init
	252
[1]	253	!
	254	!-- Two-dimensional Fourier Transformation in x- and y-direction.
[1482]	255	IF ( pdims(2) == 1 .AND. pdims(1) > 1 .AND. num_acc_per_node == 0 ) &
	256	THEN
[1]	257
	258	!
	259	!-- 1d-domain-decomposition along x:
	260	!-- FFT along y and transposition y --> x
[1216]	261	CALL ffty_tr_yx( ar, ar )
[1]	262
	263	!
	264	!-- FFT along x, solving the tridiagonal system and backward FFT
	265	CALL fftx_tri_fftx( ar )
	266
	267	!
	268	!-- Transposition x --> y and backward FFT along y
[1216]	269	CALL tr_xy_ffty( ar, ar )
[1]	270
[1482]	271	ELSEIF ( pdims(1) == 1 .AND. pdims(2) > 1 .AND. num_acc_per_node == 0 ) &
	272	THEN
[1]	273
	274	!
	275	!-- 1d-domain-decomposition along y:
	276	!-- FFT along x and transposition x --> y
[1216]	277	CALL fftx_tr_xy( ar, ar )
[1]	278
	279	!
	280	!-- FFT along y, solving the tridiagonal system and backward FFT
	281	CALL ffty_tri_ffty( ar )
	282
	283	!
	284	!-- Transposition y --> x and backward FFT along x
[1216]	285	CALL tr_yx_fftx( ar, ar )
[1]	286
[1216]	287	ELSEIF ( .NOT. transpose_compute_overlap ) THEN
[1]	288
	289	!
[1111]	290	!-- 2d-domain-decomposition or no decomposition (1 PE run)
[1]	291	!-- Transposition z --> x
	292	CALL cpu_log( log_point_s(5), 'transpo forward', 'start' )
[1216]	293	CALL resort_for_zx( ar, ar_inv )
	294	CALL transpose_zx( ar_inv, ar )
[1]	295	CALL cpu_log( log_point_s(5), 'transpo forward', 'pause' )
	296
	297	CALL cpu_log( log_point_s(4), 'fft_x', 'start' )
[1208]	298	IF ( fft_method /= 'system-specific' ) THEN
	299	!$acc update host( ar )
	300	ENDIF
[1106]	301	CALL fft_x( ar, 'forward' )
[1208]	302	IF ( fft_method /= 'system-specific' ) THEN
	303	!$acc update device( ar )
	304	ENDIF
[1]	305	CALL cpu_log( log_point_s(4), 'fft_x', 'pause' )
	306
	307	!
	308	!-- Transposition x --> y
	309	CALL cpu_log( log_point_s(5), 'transpo forward', 'continue' )
[1216]	310	CALL resort_for_xy( ar, ar_inv )
	311	CALL transpose_xy( ar_inv, ar )
[1]	312	CALL cpu_log( log_point_s(5), 'transpo forward', 'pause' )
	313
	314	CALL cpu_log( log_point_s(7), 'fft_y', 'start' )
[1208]	315	IF ( fft_method /= 'system-specific' ) THEN
	316	!$acc update host( ar )
	317	ENDIF
[1216]	318	CALL fft_y( ar, 'forward', ar_tr = ar, &
	319	nxl_y_bound = nxl_y, nxr_y_bound = nxr_y, &
	320	nxl_y_l = nxl_y, nxr_y_l = nxr_y )
[1208]	321	IF ( fft_method /= 'system-specific' ) THEN
	322	!$acc update device( ar )
	323	ENDIF
[1]	324	CALL cpu_log( log_point_s(7), 'fft_y', 'pause' )
	325
	326	!
	327	!-- Transposition y --> z
	328	CALL cpu_log( log_point_s(5), 'transpo forward', 'continue' )
[1216]	329	CALL resort_for_yz( ar, ar_inv )
	330	CALL transpose_yz( ar_inv, ar )
[1]	331	CALL cpu_log( log_point_s(5), 'transpo forward', 'stop' )
	332
	333	!
[1106]	334	!-- Solve the tridiagonal equation system along z
[1]	335	CALL cpu_log( log_point_s(6), 'tridia', 'start' )
[1212]	336	CALL tridia_substi( ar )
[1]	337	CALL cpu_log( log_point_s(6), 'tridia', 'stop' )
	338
	339	!
	340	!-- Inverse Fourier Transformation
	341	!-- Transposition z --> y
	342	CALL cpu_log( log_point_s(8), 'transpo invers', 'start' )
[1216]	343	CALL transpose_zy( ar, ar_inv )
	344	CALL resort_for_zy( ar_inv, ar )
[1]	345	CALL cpu_log( log_point_s(8), 'transpo invers', 'pause' )
	346
	347	CALL cpu_log( log_point_s(7), 'fft_y', 'continue' )
[1208]	348	IF ( fft_method /= 'system-specific' ) THEN
	349	!$acc update host( ar )
	350	ENDIF
[1216]	351	CALL fft_y( ar, 'backward', ar_tr = ar, &
	352	nxl_y_bound = nxl_y, nxr_y_bound = nxr_y, &
	353	nxl_y_l = nxl_y, nxr_y_l = nxr_y )
[1208]	354	IF ( fft_method /= 'system-specific' ) THEN
	355	!$acc update device( ar )
	356	ENDIF
[1]	357	CALL cpu_log( log_point_s(7), 'fft_y', 'stop' )
	358
	359	!
	360	!-- Transposition y --> x
	361	CALL cpu_log( log_point_s(8), 'transpo invers', 'continue' )
[1216]	362	CALL transpose_yx( ar, ar_inv )
	363	CALL resort_for_yx( ar_inv, ar )
[1]	364	CALL cpu_log( log_point_s(8), 'transpo invers', 'pause' )
	365
	366	CALL cpu_log( log_point_s(4), 'fft_x', 'continue' )
[1208]	367	IF ( fft_method /= 'system-specific' ) THEN
	368	!$acc update host( ar )
	369	ENDIF
[1106]	370	CALL fft_x( ar, 'backward' )
[1208]	371	IF ( fft_method /= 'system-specific' ) THEN
	372	!$acc update device( ar )
	373	ENDIF
[1]	374	CALL cpu_log( log_point_s(4), 'fft_x', 'stop' )
	375
	376	!
	377	!-- Transposition x --> z
	378	CALL cpu_log( log_point_s(8), 'transpo invers', 'continue' )
[1216]	379	CALL transpose_xz( ar, ar_inv )
	380	CALL resort_for_xz( ar_inv, ar )
[1]	381	CALL cpu_log( log_point_s(8), 'transpo invers', 'stop' )
	382
[1216]	383	ELSE
	384
	385	!
	386	!-- 2d-domain-decomposition or no decomposition (1 PE run) with
	387	!-- overlapping transposition / fft
[1318]	388	!-- cputime logging must not use barriers, which would prevent overlapping
[1216]	389	ALLOCATE( f_out_y(0:ny,nxl_y:nxr_y,nzb_y:nzt_y), &
	390	f_out_z(0:nx,nys_x:nyn_x,nzb_x:nzt_x) )
	391	!
	392	!-- Transposition z --> x + subsequent fft along x
	393	ALLOCATE( f_inv(nys:nyn,nxl:nxr,1:nz) )
	394	CALL resort_for_zx( ar, f_inv )
	395	!
	396	!-- Save original indices and gridpoint counter
	397	isave(1) = nz
	398	isave(2) = nzb_x
	399	isave(3) = nzt_x
	400	isave(4) = sendrecvcount_zx
	401	!
	402	!-- Set new indices for transformation
	403	nblk = nz / pdims(1)
	404	nz = pdims(1)
	405	nnz_x = 1
	406	nzb_x = 1 + myidx * nnz_x
	407	nzt_x = ( myidx + 1 ) * nnz_x
	408	sendrecvcount_zx = nnx * nny * nnz_x
	409
[1306]	410	ALLOCATE( ar1(0:nx,nys_x:nyn_x,nzb_x:nzt_x) )
[1216]	411	ALLOCATE( f_in(nys:nyn,nxl:nxr,1:nz) )
	412
[1306]	413	DO kk = 1, nblk
[1216]	414
[1306]	415	IF ( kk == 1 ) THEN
[1318]	416	CALL cpu_log( log_point_s(5), 'transpo forward', 'start', cpu_log_nowait )
[1306]	417	ELSE
[1318]	418	CALL cpu_log( log_point_s(5), 'transpo forward', 'continue', cpu_log_nowait )
[1306]	419	ENDIF
[1216]	420
[1306]	421	DO knew = 1, nz
	422	ki = kk + nblk * ( knew - 1 )
	423	f_in(:,:,knew) = f_inv(:,:,ki)
	424	ENDDO
[1216]	425
[1306]	426	CALL transpose_zx( f_in, ar1(:,:,:))
	427	CALL cpu_log( log_point_s(5), 'transpo forward', 'pause' )
[1216]	428
[1306]	429	IF ( kk == 1 ) THEN
[1318]	430	CALL cpu_log( log_point_s(4), 'fft_x', 'start', cpu_log_nowait )
[1306]	431	ELSE
[1318]	432	CALL cpu_log( log_point_s(4), 'fft_x', 'continue', cpu_log_nowait )
[1216]	433	ENDIF
	434
[1306]	435	n = isave(2) + kk - 1
	436	CALL fft_x( ar1(:,:,:), 'forward', ar_2d = f_out_z(:,:,n))
	437	CALL cpu_log( log_point_s(4), 'fft_x', 'pause' )
[1216]	438
	439	ENDDO
	440	!
	441	!-- Restore original indices/counters
	442	nz = isave(1)
	443	nzb_x = isave(2)
	444	nzt_x = isave(3)
	445	sendrecvcount_zx = isave(4)
	446
	447	DEALLOCATE( ar1, f_in, f_inv )
	448
	449	!
	450	!-- Transposition x --> y + subsequent fft along y
	451	ALLOCATE( f_inv(nys_x:nyn_x,nzb_x:nzt_x,0:nx) )
	452	CALL resort_for_xy( f_out_z, f_inv )
	453	!
	454	!-- Save original indices and gridpoint counter
	455	isave(1) = nx
	456	isave(2) = nxl_y
	457	isave(3) = nxr_y
	458	isave(4) = sendrecvcount_xy
	459	!
	460	!-- Set new indices for transformation
	461	nblk = ( ( nx+1 ) / pdims(2) ) - 1
	462	nx = pdims(2)
	463	nnx_y = 1
	464	nxl_y = myidy * nnx_y
	465	nxr_y = ( myidy + 1 ) * nnx_y - 1
	466	sendrecvcount_xy = nnx_y * ( nyn_x-nys_x+1 ) * ( nzt_x-nzb_x+1 )
	467
[1306]	468	ALLOCATE( ar1(0:ny,nxl_y:nxr_y,nzb_y:nzt_y) )
[1216]	469	ALLOCATE( f_in(nys_x:nyn_x,nzb_x:nzt_x,0:nx) )
	470
[1306]	471	DO ii = 0, nblk
[1216]	472
[1318]	473	CALL cpu_log( log_point_s(5), 'transpo forward', 'continue', cpu_log_nowait )
[1216]	474
[1306]	475	DO inew = 0, nx-1
	476	iind = ii + ( nblk + 1 ) * inew
	477	f_in(:,:,inew) = f_inv(:,:,iind)
	478	ENDDO
[1216]	479
[1306]	480	CALL transpose_xy( f_in, ar1(:,:,:) )
[1216]	481
[1306]	482	CALL cpu_log( log_point_s(5), 'transpo forward', 'pause' )
[1216]	483
[1306]	484	IF ( ii == 1 ) THEN
[1318]	485	CALL cpu_log( log_point_s(7), 'fft_y', 'start', cpu_log_nowait )
[1306]	486	ELSE
[1318]	487	CALL cpu_log( log_point_s(7), 'fft_y', 'continue', cpu_log_nowait )
[1216]	488	ENDIF
	489
[1306]	490	nxl_y_bound = isave(2)
	491	nxr_y_bound = isave(3)
	492	n = isave(2) + ii
	493	CALL fft_y( ar1(:,:,:), 'forward', ar_tr = f_out_y, &
	494	nxl_y_bound = nxl_y_bound, nxr_y_bound = nxr_y_bound, &
	495	nxl_y_l = n, nxr_y_l = n )
[1216]	496
[1306]	497	CALL cpu_log( log_point_s(7), 'fft_y', 'pause' )
[1216]	498
	499	ENDDO
	500	!
	501	!-- Restore original indices/counters
	502	nx = isave(1)
	503	nxl_y = isave(2)
	504	nxr_y = isave(3)
	505	sendrecvcount_xy = isave(4)
	506
	507	DEALLOCATE( ar1, f_in, f_inv )
	508
	509	!
	510	!-- Transposition y --> z + subsequent tridia + resort for z --> y
	511	ALLOCATE( f_inv(nxl_y:nxr_y,nzb_y:nzt_y,0:ny) )
	512	CALL resort_for_yz( f_out_y, f_inv )
	513	!
	514	!-- Save original indices and gridpoint counter
	515	isave(1) = ny
	516	isave(2) = nys_z
	517	isave(3) = nyn_z
	518	isave(4) = sendrecvcount_yz
	519	!
	520	!-- Set new indices for transformation
	521	nblk = ( ( ny+1 ) / pdims(1) ) - 1
	522	ny = pdims(1)
	523	nny_z = 1
	524	nys_z = myidx * nny_z
	525	nyn_z = ( myidx + 1 ) * nny_z - 1
	526	sendrecvcount_yz = ( nxr_y-nxl_y+1 ) * nny_z * ( nzt_y-nzb_y+1 )
	527
[1306]	528	ALLOCATE( ar1(nxl_z:nxr_z,nys_z:nyn_z,1:nz) )
[1216]	529	ALLOCATE( f_in(nxl_y:nxr_y,nzb_y:nzt_y,0:ny) )
	530
[1306]	531	DO jj = 0, nblk
[1216]	532	!
[1306]	533	!-- Forward Fourier Transformation
	534	!-- Transposition y --> z
[1318]	535	CALL cpu_log( log_point_s(5), 'transpo forward', 'continue', cpu_log_nowait )
[1216]	536
[1306]	537	DO jnew = 0, ny-1
	538	jind = jj + ( nblk + 1 ) * jnew
	539	f_in(:,:,jnew) = f_inv(:,:,jind)
	540	ENDDO
[1216]	541
[1306]	542	CALL transpose_yz( f_in, ar1(:,:,:) )
[1216]	543
[1306]	544	IF ( jj == nblk ) THEN
	545	CALL cpu_log( log_point_s(5), 'transpo forward', 'stop' )
	546	ELSE
	547	CALL cpu_log( log_point_s(5), 'transpo forward', 'pause' )
[1216]	548	ENDIF
	549
	550	!
[1306]	551	!-- Solve the tridiagonal equation system along z
[1318]	552	CALL cpu_log( log_point_s(6), 'tridia', 'start', cpu_log_nowait )
[1216]	553
[1306]	554	n = isave(2) + jj
	555	CALL tridia_substi_overlap( ar1(:,:,:), n )
[1216]	556
[1306]	557	CALL cpu_log( log_point_s(6), 'tridia', 'stop' )
[1216]	558
[1306]	559	!
	560	!-- Inverse Fourier Transformation
	561	!-- Transposition z --> y
	562	!-- Only one thread should call MPI routines, therefore forward and
	563	!-- backward tranpose are in the same section
	564	IF ( jj == 0 ) THEN
[1318]	565	CALL cpu_log( log_point_s(8), 'transpo invers', 'start', cpu_log_nowait )
[1306]	566	ELSE
[1318]	567	CALL cpu_log( log_point_s(8), 'transpo invers', 'continue', cpu_log_nowait )
[1216]	568	ENDIF
	569
[1306]	570	CALL transpose_zy( ar1(:,:,:), f_in )
[1216]	571
[1306]	572	DO jnew = 0, ny-1
	573	jind = jj + ( nblk + 1 ) * jnew
	574	f_inv(:,:,jind) = f_in(:,:,jnew)
	575	ENDDO
[1216]	576
[1306]	577	CALL cpu_log( log_point_s(8), 'transpo invers', 'pause' )
[1216]	578
	579	ENDDO
	580	!
	581	!-- Restore original indices/counters
	582	ny = isave(1)
	583	nys_z = isave(2)
	584	nyn_z = isave(3)
	585	sendrecvcount_yz = isave(4)
	586
	587	CALL resort_for_zy( f_inv, f_out_y )
	588
	589	DEALLOCATE( ar1, f_in, f_inv )
	590
	591	!
	592	!-- fft along y backward + subsequent transposition y --> x
	593	ALLOCATE( f_inv(nys_x:nyn_x,nzb_x:nzt_x,0:nx) )
	594	!
	595	!-- Save original indices and gridpoint counter
	596	isave(1) = nx
	597	isave(2) = nxl_y
	598	isave(3) = nxr_y
	599	isave(4) = sendrecvcount_xy
	600	!
	601	!-- Set new indices for transformation
	602	nblk = (( nx+1 ) / pdims(2) ) - 1
	603	nx = pdims(2)
	604	nnx_y = 1
	605	nxl_y = myidy * nnx_y
	606	nxr_y = ( myidy + 1 ) * nnx_y - 1
	607	sendrecvcount_xy = nnx_y * ( nyn_x-nys_x+1 ) * ( nzt_x-nzb_x+1 )
	608
[1306]	609	ALLOCATE( ar1(0:ny,nxl_y:nxr_y,nzb_y:nzt_y) )
[1216]	610	ALLOCATE( f_in(nys_x:nyn_x,nzb_x:nzt_x,0:nx) )
	611
[1306]	612	DO ii = 0, nblk
[1216]	613
[1318]	614	CALL cpu_log( log_point_s(7), 'fft_y', 'continue', cpu_log_nowait )
[1216]	615
[1306]	616	n = isave(2) + ii
	617	nxl_y_bound = isave(2)
	618	nxr_y_bound = isave(3)
[1216]	619
[1306]	620	CALL fft_y( ar1(:,:,:), 'backward', ar_tr = f_out_y, &
	621	nxl_y_bound = nxl_y_bound, nxr_y_bound = nxr_y_bound, &
	622	nxl_y_l = n, nxr_y_l = n )
[1216]	623
[1306]	624	IF ( ii == nblk ) THEN
	625	CALL cpu_log( log_point_s(7), 'fft_y', 'stop' )
	626	ELSE
	627	CALL cpu_log( log_point_s(7), 'fft_y', 'pause' )
[1216]	628	ENDIF
	629
[1318]	630	CALL cpu_log( log_point_s(8), 'transpo invers', 'continue', cpu_log_nowait )
[1216]	631
[1306]	632	CALL transpose_yx( ar1(:,:,:), f_in )
[1216]	633
[1306]	634	DO inew = 0, nx-1
	635	iind = ii + (nblk+1) * inew
	636	f_inv(:,:,iind) = f_in(:,:,inew)
	637	ENDDO
[1216]	638
[1306]	639	CALL cpu_log( log_point_s(8), 'transpo invers', 'pause' )
[1216]	640
	641	ENDDO
	642	!
	643	!-- Restore original indices/counters
	644	nx = isave(1)
	645	nxl_y = isave(2)
	646	nxr_y = isave(3)
	647	sendrecvcount_xy = isave(4)
	648
	649	CALL resort_for_yx( f_inv, f_out_z )
	650
	651	DEALLOCATE( ar1, f_in, f_inv )
	652
	653	!
	654	!-- fft along x backward + subsequent final transposition x --> z
	655	ALLOCATE( f_inv(nys:nyn,nxl:nxr,1:nz) )
	656	!
	657	!-- Save original indices and gridpoint counter
	658	isave(1) = nz
	659	isave(2) = nzb_x
	660	isave(3) = nzt_x
	661	isave(4) = sendrecvcount_zx
	662	!
	663	!-- Set new indices for transformation
	664	nblk = nz / pdims(1)
	665	nz = pdims(1)
	666	nnz_x = 1
	667	nzb_x = 1 + myidx * nnz_x
	668	nzt_x = ( myidx + 1 ) * nnz_x
	669	sendrecvcount_zx = nnx * nny * nnz_x
	670
[1306]	671	ALLOCATE( ar1(0:nx,nys_x:nyn_x,nzb_x:nzt_x) )
[1216]	672	ALLOCATE( f_in(nys:nyn,nxl:nxr,1:nz) )
	673
[1306]	674	DO kk = 1, nblk
[1216]	675
[1318]	676	CALL cpu_log( log_point_s(4), 'fft_x', 'continue', cpu_log_nowait )
[1216]	677
[1306]	678	n = isave(2) + kk - 1
	679	CALL fft_x( ar1(:,:,:), 'backward', f_out_z(:,:,n))
[1216]	680
[1306]	681	IF ( kk == nblk ) THEN
	682	CALL cpu_log( log_point_s(4), 'fft_x', 'stop' )
	683	ELSE
	684	CALL cpu_log( log_point_s(4), 'fft_x', 'pause' )
[1216]	685	ENDIF
	686
[1318]	687	CALL cpu_log( log_point_s(8), 'transpo invers', 'continue', cpu_log_nowait )
[1216]	688
[1306]	689	CALL transpose_xz( ar1(:,:,:), f_in )
[1216]	690
[1306]	691	DO knew = 1, nz
	692	ki = kk + nblk * (knew-1)
	693	f_inv(:,:,ki) = f_in(:,:,knew)
	694	ENDDO
[1216]	695
[1306]	696	IF ( kk == nblk ) THEN
	697	CALL cpu_log( log_point_s(8), 'transpo invers', 'stop' )
	698	ELSE
	699	CALL cpu_log( log_point_s(8), 'transpo invers', 'pause' )
[1216]	700	ENDIF
	701
	702	ENDDO
	703	!
	704	!-- Restore original indices/counters
	705	nz = isave(1)
	706	nzb_x = isave(2)
	707	nzt_x = isave(3)
	708	sendrecvcount_zx = isave(4)
	709
	710	CALL resort_for_xz( f_inv, ar )
	711
	712	DEALLOCATE( ar1, f_in, f_inv )
	713
[1]	714	ENDIF
	715
	716	CALL cpu_log( log_point_s(3), 'poisfft', 'stop' )
	717
	718	END SUBROUTINE poisfft
	719
	720
	721	!------------------------------------------------------------------------------!
[1682]	722	! Description:
	723	! ------------
	724	!> Fourier-transformation along y with subsequent transposition y --> x for
	725	!> a 1d-decomposition along x.
	726	!>
	727	!> @attention The performance of this routine is much faster on the NEC-SX6,
	728	!> if the first index of work_ffty_vec is odd. Otherwise
	729	!> memory bank conflicts may occur (especially if the index is a
	730	!> multiple of 128). That's why work_ffty_vec is dimensioned as
	731	!> 0:ny+1.
	732	!> Of course, this will not work if users are using an odd number
	733	!> of gridpoints along y.
[1]	734	!------------------------------------------------------------------------------!
[1682]	735	SUBROUTINE ffty_tr_yx( f_in, f_out )
[1]	736
[1682]	737
[1320]	738	USE control_parameters, &
	739	ONLY: host
	740
	741	USE cpulog, &
	742	ONLY: cpu_log, log_point_s
	743
	744	USE kinds
	745
[1]	746	USE pegrid
	747
	748	IMPLICIT NONE
	749
[1682]	750	INTEGER(iwp) :: i !<
	751	INTEGER(iwp) :: iend !<
	752	INTEGER(iwp) :: iouter !<
	753	INTEGER(iwp) :: ir !<
	754	INTEGER(iwp) :: j !<
	755	INTEGER(iwp) :: k !<
[1]	756
[1682]	757	INTEGER(iwp), PARAMETER :: stridex = 4 !<
[1320]	758
[1682]	759	REAL(wp), DIMENSION(0:ny,stridex) :: work_ffty !<
[1]	760	#if defined( __nec )
[1682]	761	REAL(wp), DIMENSION(0:ny+1,1:nz,nxl:nxr) :: work_ffty_vec !<
[1]	762	#endif
[1682]	763	REAL(wp), DIMENSION(1:nz,0:ny,nxl:nxr) :: f_in !<
	764	REAL(wp), DIMENSION(nnx,1:nz,nys_x:nyn_x,pdims(1)) :: f_out !<
	765	REAL(wp), DIMENSION(nxl:nxr,1:nz,0:ny) :: work !<
[1]	766
	767	!
	768	!-- Carry out the FFT along y, where all data are present due to the
	769	!-- 1d-decomposition along x. Resort the data in a way that x becomes
	770	!-- the first index.
[1106]	771	CALL cpu_log( log_point_s(7), 'fft_y_1d', 'start' )
[1]	772
	773	IF ( host(1:3) == 'nec' ) THEN
	774	#if defined( __nec )
	775	!
	776	!-- Code optimized for vector processors
[85]	777	!$OMP PARALLEL PRIVATE ( i, j, k )
[1]	778	!$OMP DO
	779	DO i = nxl, nxr
	780
	781	DO j = 0, ny
	782	DO k = 1, nz
	783	work_ffty_vec(j,k,i) = f_in(k,j,i)
	784	ENDDO
	785	ENDDO
	786
	787	CALL fft_y_m( work_ffty_vec(:,:,i), ny+1, 'forward' )
	788
	789	ENDDO
	790
	791	!$OMP DO
	792	DO k = 1, nz
	793	DO j = 0, ny
	794	DO i = nxl, nxr
	795	work(i,k,j) = work_ffty_vec(j,k,i)
	796	ENDDO
	797	ENDDO
	798	ENDDO
	799	!$OMP END PARALLEL
	800	#endif
	801
	802	ELSE
	803
	804	!
	805	!-- Cache optimized code.
	806	!-- The i-(x-)direction is split into a strided outer loop and an inner
	807	!-- loop for better cache performance
	808	!$OMP PARALLEL PRIVATE (i,iend,iouter,ir,j,k,work_ffty)
	809	!$OMP DO
	810	DO iouter = nxl, nxr, stridex
	811
	812	iend = MIN( iouter+stridex-1, nxr ) ! Upper bound for inner i loop
	813
	814	DO k = 1, nz
	815
	816	DO i = iouter, iend
	817
	818	ir = i-iouter+1 ! counter within a stride
	819	DO j = 0, ny
	820	work_ffty(j,ir) = f_in(k,j,i)
	821	ENDDO
	822	!
	823	!-- FFT along y
[1106]	824	CALL fft_y_1d( work_ffty(:,ir), 'forward' )
[1]	825
	826	ENDDO
	827
	828	!
	829	!-- Resort
	830	DO j = 0, ny
	831	DO i = iouter, iend
	832	work(i,k,j) = work_ffty(j,i-iouter+1)
	833	ENDDO
	834	ENDDO
	835
	836	ENDDO
	837
	838	ENDDO
	839	!$OMP END PARALLEL
	840
	841	ENDIF
[1106]	842	CALL cpu_log( log_point_s(7), 'fft_y_1d', 'pause' )
[1]	843
	844	!
	845	!-- Transpose array
[1111]	846	#if defined( __parallel )
[1]	847	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
[622]	848	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
[1]	849	CALL MPI_ALLTOALL( work(nxl,1,0), sendrecvcount_xy, MPI_REAL, &
	850	f_out(1,1,nys_x,1), sendrecvcount_xy, MPI_REAL, &
	851	comm1dx, ierr )
	852	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
[1111]	853	#endif
[1]	854
	855	END SUBROUTINE ffty_tr_yx
	856
	857
	858	!------------------------------------------------------------------------------!
[1682]	859	! Description:
	860	! ------------
	861	!> Transposition x --> y with a subsequent backward Fourier transformation for
	862	!> a 1d-decomposition along x
[1]	863	!------------------------------------------------------------------------------!
[1682]	864	SUBROUTINE tr_xy_ffty( f_in, f_out )
[1]	865
[1682]	866
[1320]	867	USE control_parameters, &
	868	ONLY: host
	869
	870	USE cpulog, &
	871	ONLY: cpu_log, log_point_s
	872
	873	USE kinds
	874
[1]	875	USE pegrid
	876
	877	IMPLICIT NONE
	878
[1682]	879	INTEGER(iwp) :: i !<
	880	INTEGER(iwp) :: iend !<
	881	INTEGER(iwp) :: iouter !<
	882	INTEGER(iwp) :: ir !<
	883	INTEGER(iwp) :: j !<
	884	INTEGER(iwp) :: k !<
[1]	885
[1682]	886	INTEGER(iwp), PARAMETER :: stridex = 4 !<
[1320]	887
[1682]	888	REAL(wp), DIMENSION(0:ny,stridex) :: work_ffty !<
[1]	889	#if defined( __nec )
[1682]	890	REAL(wp), DIMENSION(0:ny+1,1:nz,nxl:nxr) :: work_ffty_vec !<
[1]	891	#endif
[1682]	892	REAL(wp), DIMENSION(nnx,1:nz,nys_x:nyn_x,pdims(1)) :: f_in !<
	893	REAL(wp), DIMENSION(1:nz,0:ny,nxl:nxr) :: f_out !<
	894	REAL(wp), DIMENSION(nxl:nxr,1:nz,0:ny) :: work !<
[1]	895
	896	!
	897	!-- Transpose array
[1111]	898	#if defined( __parallel )
[1]	899	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
[622]	900	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
[1]	901	CALL MPI_ALLTOALL( f_in(1,1,nys_x,1), sendrecvcount_xy, MPI_REAL, &
	902	work(nxl,1,0), sendrecvcount_xy, MPI_REAL, &
	903	comm1dx, ierr )
	904	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
[1111]	905	#endif
[1]	906
	907	!
	908	!-- Resort the data in a way that y becomes the first index and carry out the
	909	!-- backward fft along y.
[1106]	910	CALL cpu_log( log_point_s(7), 'fft_y_1d', 'continue' )
[1]	911
	912	IF ( host(1:3) == 'nec' ) THEN
	913	#if defined( __nec )
	914	!
	915	!-- Code optimized for vector processors
[85]	916	!$OMP PARALLEL PRIVATE ( i, j, k )
[1]	917	!$OMP DO
	918	DO k = 1, nz
	919	DO j = 0, ny
	920	DO i = nxl, nxr
	921	work_ffty_vec(j,k,i) = work(i,k,j)
	922	ENDDO
	923	ENDDO
	924	ENDDO
	925
	926	!$OMP DO
	927	DO i = nxl, nxr
	928
	929	CALL fft_y_m( work_ffty_vec(:,:,i), ny+1, 'backward' )
	930
	931	DO j = 0, ny
	932	DO k = 1, nz
	933	f_out(k,j,i) = work_ffty_vec(j,k,i)
	934	ENDDO
	935	ENDDO
	936
	937	ENDDO
	938	!$OMP END PARALLEL
	939	#endif
	940
	941	ELSE
	942
	943	!
	944	!-- Cache optimized code.
	945	!-- The i-(x-)direction is split into a strided outer loop and an inner
	946	!-- loop for better cache performance
	947	!$OMP PARALLEL PRIVATE ( i, iend, iouter, ir, j, k, work_ffty )
	948	!$OMP DO
	949	DO iouter = nxl, nxr, stridex
	950
	951	iend = MIN( iouter+stridex-1, nxr ) ! Upper bound for inner i loop
	952
	953	DO k = 1, nz
	954	!
	955	!-- Resort
	956	DO j = 0, ny
	957	DO i = iouter, iend
	958	work_ffty(j,i-iouter+1) = work(i,k,j)
	959	ENDDO
	960	ENDDO
	961
	962	DO i = iouter, iend
	963
	964	!
	965	!-- FFT along y
	966	ir = i-iouter+1 ! counter within a stride
[1106]	967	CALL fft_y_1d( work_ffty(:,ir), 'backward' )
[1]	968
	969	DO j = 0, ny
	970	f_out(k,j,i) = work_ffty(j,ir)
	971	ENDDO
	972	ENDDO
	973
	974	ENDDO
	975
	976	ENDDO
	977	!$OMP END PARALLEL
	978
	979	ENDIF
	980
[1106]	981	CALL cpu_log( log_point_s(7), 'fft_y_1d', 'stop' )
[1]	982
	983	END SUBROUTINE tr_xy_ffty
	984
	985
	986	!------------------------------------------------------------------------------!
[1682]	987	! Description:
	988	! ------------
	989	!> FFT along x, solution of the tridiagonal system and backward FFT for
	990	!> a 1d-decomposition along x
	991	!>
	992	!> @warning this subroutine may still not work for hybrid parallelization
	993	!> with OpenMP (for possible necessary changes see the original
	994	!> routine poisfft_hybrid, developed by Klaus Ketelsen, May 2002)
[1]	995	!------------------------------------------------------------------------------!
[1682]	996	SUBROUTINE fftx_tri_fftx( ar )
[1]	997
[1682]	998
[1320]	999	USE control_parameters, &
	1000	ONLY: host
	1001
	1002	USE cpulog, &
	1003	ONLY: cpu_log, log_point_s
	1004
	1005	USE grid_variables, &
	1006	ONLY: ddx2, ddy2
	1007
	1008	USE kinds
	1009
[1]	1010	USE pegrid
	1011
	1012	IMPLICIT NONE
	1013
[1682]	1014	INTEGER(iwp) :: i !<
	1015	INTEGER(iwp) :: j !<
	1016	INTEGER(iwp) :: k !<
	1017	INTEGER(iwp) :: m !<
	1018	INTEGER(iwp) :: n !<
	1019	INTEGER(iwp) :: omp_get_thread_num !<
	1020	INTEGER(iwp) :: tn !<
[1]	1021
[1682]	1022	REAL(wp), DIMENSION(0:nx) :: work_fftx !<
	1023	REAL(wp), DIMENSION(0:nx,1:nz) :: work_trix !<
	1024	REAL(wp), DIMENSION(nnx,1:nz,nys_x:nyn_x,pdims(1)) :: ar !<
	1025	REAL(wp), DIMENSION(:,:,:,:), ALLOCATABLE :: tri !<
[1]	1026
	1027
[1106]	1028	CALL cpu_log( log_point_s(33), 'fft_x_1d + tridia', 'start' )
[1]	1029
	1030	ALLOCATE( tri(5,0:nx,0:nz-1,0:threads_per_task-1) )
	1031
	1032	tn = 0 ! Default thread number in case of one thread
	1033	!$OMP PARALLEL DO PRIVATE ( i, j, k, m, n, tn, work_fftx, work_trix )
	1034	DO j = nys_x, nyn_x
	1035
	1036	!$ tn = omp_get_thread_num()
	1037
	1038	IF ( host(1:3) == 'nec' ) THEN
	1039	!
	1040	!-- Code optimized for vector processors
	1041	DO k = 1, nz
	1042
	1043	m = 0
	1044	DO n = 1, pdims(1)
[1003]	1045	DO i = 1, nnx
[1]	1046	work_trix(m,k) = ar(i,k,j,n)
	1047	m = m + 1
	1048	ENDDO
	1049	ENDDO
	1050
	1051	ENDDO
	1052
	1053	CALL fft_x_m( work_trix, 'forward' )
	1054
	1055	ELSE
	1056	!
	1057	!-- Cache optimized code
	1058	DO k = 1, nz
	1059
	1060	m = 0
	1061	DO n = 1, pdims(1)
[1003]	1062	DO i = 1, nnx
[1]	1063	work_fftx(m) = ar(i,k,j,n)
	1064	m = m + 1
	1065	ENDDO
	1066	ENDDO
	1067
[1106]	1068	CALL fft_x_1d( work_fftx, 'forward' )
[1]	1069
	1070	DO i = 0, nx
	1071	work_trix(i,k) = work_fftx(i)
	1072	ENDDO
	1073
	1074	ENDDO
	1075
	1076	ENDIF
	1077
	1078	!
	1079	!-- Solve the linear equation system
	1080	CALL tridia_1dd( ddx2, ddy2, nx, ny, j, work_trix, tri(:,:,:,tn) )
	1081
	1082	IF ( host(1:3) == 'nec' ) THEN
	1083	!
	1084	!-- Code optimized for vector processors
	1085	CALL fft_x_m( work_trix, 'backward' )
	1086
	1087	DO k = 1, nz
	1088
	1089	m = 0
	1090	DO n = 1, pdims(1)
[1003]	1091	DO i = 1, nnx
[1]	1092	ar(i,k,j,n) = work_trix(m,k)
	1093	m = m + 1
	1094	ENDDO
	1095	ENDDO
	1096
	1097	ENDDO
	1098
	1099	ELSE
	1100	!
	1101	!-- Cache optimized code
	1102	DO k = 1, nz
	1103
	1104	DO i = 0, nx
	1105	work_fftx(i) = work_trix(i,k)
	1106	ENDDO
	1107
[1106]	1108	CALL fft_x_1d( work_fftx, 'backward' )
[1]	1109
	1110	m = 0
	1111	DO n = 1, pdims(1)
[1003]	1112	DO i = 1, nnx
[1]	1113	ar(i,k,j,n) = work_fftx(m)
	1114	m = m + 1
	1115	ENDDO
	1116	ENDDO
	1117
	1118	ENDDO
	1119
	1120	ENDIF
	1121
	1122	ENDDO
	1123
	1124	DEALLOCATE( tri )
	1125
[1106]	1126	CALL cpu_log( log_point_s(33), 'fft_x_1d + tridia', 'stop' )
[1]	1127
	1128	END SUBROUTINE fftx_tri_fftx
	1129
	1130
	1131	!------------------------------------------------------------------------------!
[1682]	1132	! Description:
	1133	! ------------
	1134	!> Fourier-transformation along x with subsequent transposition x --> y for
	1135	!> a 1d-decomposition along y.
	1136	!>
	1137	!> @attention NEC-branch of this routine may significantly profit from
	1138	!> further optimizations. So far, performance is much worse than
	1139	!> for routine ffty_tr_yx (more than three times slower).
[1]	1140	!------------------------------------------------------------------------------!
[1682]	1141	SUBROUTINE fftx_tr_xy( f_in, f_out )
[1]	1142
[1682]	1143
[1320]	1144	USE control_parameters, &
	1145	ONLY: host
	1146
	1147	USE cpulog, &
	1148	ONLY: cpu_log, log_point_s
	1149
	1150	USE kinds
	1151
[1]	1152	USE pegrid
	1153
	1154	IMPLICIT NONE
	1155
[1682]	1156	INTEGER(iwp) :: i !<
	1157	INTEGER(iwp) :: j !<
	1158	INTEGER(iwp) :: k !<
[1]	1159
[1682]	1160	REAL(wp), DIMENSION(0:nx,1:nz,nys:nyn) :: work_fftx !<
	1161	REAL(wp), DIMENSION(1:nz,nys:nyn,0:nx) :: f_in !<
	1162	REAL(wp), DIMENSION(nny,1:nz,nxl_y:nxr_y,pdims(2)) :: f_out !<
	1163	REAL(wp), DIMENSION(nys:nyn,1:nz,0:nx) :: work !<
[1]	1164
	1165	!
	1166	!-- Carry out the FFT along x, where all data are present due to the
	1167	!-- 1d-decomposition along y. Resort the data in a way that y becomes
	1168	!-- the first index.
[1106]	1169	CALL cpu_log( log_point_s(4), 'fft_x_1d', 'start' )
[1]	1170
	1171	IF ( host(1:3) == 'nec' ) THEN
	1172	!
	1173	!-- Code for vector processors
[85]	1174	!$OMP PARALLEL PRIVATE ( i, j, k )
[1]	1175	!$OMP DO
	1176	DO i = 0, nx
	1177
	1178	DO j = nys, nyn
	1179	DO k = 1, nz
	1180	work_fftx(i,k,j) = f_in(k,j,i)
	1181	ENDDO
	1182	ENDDO
	1183
	1184	ENDDO
	1185
	1186	!$OMP DO
	1187	DO j = nys, nyn
	1188
	1189	CALL fft_x_m( work_fftx(:,:,j), 'forward' )
	1190
	1191	DO k = 1, nz
	1192	DO i = 0, nx
	1193	work(j,k,i) = work_fftx(i,k,j)
	1194	ENDDO
	1195	ENDDO
	1196
	1197	ENDDO
	1198	!$OMP END PARALLEL
	1199
	1200	ELSE
	1201
	1202	!
	1203	!-- Cache optimized code (there might be still a potential for better
	1204	!-- optimization).
[696]	1205	!$OMP PARALLEL PRIVATE (i,j,k)
[1]	1206	!$OMP DO
	1207	DO i = 0, nx
	1208
	1209	DO j = nys, nyn
	1210	DO k = 1, nz
	1211	work_fftx(i,k,j) = f_in(k,j,i)
	1212	ENDDO
	1213	ENDDO
	1214
	1215	ENDDO
	1216
	1217	!$OMP DO
	1218	DO j = nys, nyn
	1219	DO k = 1, nz
	1220
[1106]	1221	CALL fft_x_1d( work_fftx(0:nx,k,j), 'forward' )
[1]	1222
	1223	DO i = 0, nx
	1224	work(j,k,i) = work_fftx(i,k,j)
	1225	ENDDO
	1226	ENDDO
	1227
	1228	ENDDO
	1229	!$OMP END PARALLEL
	1230
	1231	ENDIF
[1106]	1232	CALL cpu_log( log_point_s(4), 'fft_x_1d', 'pause' )
[1]	1233
	1234	!
	1235	!-- Transpose array
[1111]	1236	#if defined( __parallel )
[1]	1237	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
[622]	1238	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
[1]	1239	CALL MPI_ALLTOALL( work(nys,1,0), sendrecvcount_xy, MPI_REAL, &
	1240	f_out(1,1,nxl_y,1), sendrecvcount_xy, MPI_REAL, &
	1241	comm1dy, ierr )
	1242	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
[1111]	1243	#endif
[1]	1244
	1245	END SUBROUTINE fftx_tr_xy
	1246
	1247
	1248	!------------------------------------------------------------------------------!
[1682]	1249	! Description:
	1250	! ------------
	1251	!> Transposition y --> x with a subsequent backward Fourier transformation for
	1252	!> a 1d-decomposition along x.
[1]	1253	!------------------------------------------------------------------------------!
[1682]	1254	SUBROUTINE tr_yx_fftx( f_in, f_out )
[1]	1255
[1682]	1256
[1320]	1257	USE control_parameters, &
	1258	ONLY: host
	1259
	1260	USE cpulog, &
	1261	ONLY: cpu_log, log_point_s
	1262
	1263	USE kinds
	1264
[1]	1265	USE pegrid
	1266
	1267	IMPLICIT NONE
	1268
[1682]	1269	INTEGER(iwp) :: i !<
	1270	INTEGER(iwp) :: j !<
	1271	INTEGER(iwp) :: k !<
[1]	1272
[1682]	1273	REAL(wp), DIMENSION(0:nx,1:nz,nys:nyn) :: work_fftx !<
	1274	REAL(wp), DIMENSION(nny,1:nz,nxl_y:nxr_y,pdims(2)) :: f_in !<
	1275	REAL(wp), DIMENSION(1:nz,nys:nyn,0:nx) :: f_out !<
	1276	REAL(wp), DIMENSION(nys:nyn,1:nz,0:nx) :: work !<
[1]	1277
	1278	!
	1279	!-- Transpose array
[1111]	1280	#if defined( __parallel )
[1]	1281	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
[622]	1282	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
[1]	1283	CALL MPI_ALLTOALL( f_in(1,1,nxl_y,1), sendrecvcount_xy, MPI_REAL, &
	1284	work(nys,1,0), sendrecvcount_xy, MPI_REAL, &
	1285	comm1dy, ierr )
	1286	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
[1111]	1287	#endif
[1]	1288
	1289	!
	1290	!-- Carry out the FFT along x, where all data are present due to the
	1291	!-- 1d-decomposition along y. Resort the data in a way that y becomes
	1292	!-- the first index.
[1106]	1293	CALL cpu_log( log_point_s(4), 'fft_x_1d', 'continue' )
[1]	1294
	1295	IF ( host(1:3) == 'nec' ) THEN
	1296	!
	1297	!-- Code optimized for vector processors
[85]	1298	!$OMP PARALLEL PRIVATE ( i, j, k )
[1]	1299	!$OMP DO
	1300	DO j = nys, nyn
	1301
	1302	DO k = 1, nz
	1303	DO i = 0, nx
	1304	work_fftx(i,k,j) = work(j,k,i)
	1305	ENDDO
	1306	ENDDO
	1307
	1308	CALL fft_x_m( work_fftx(:,:,j), 'backward' )
	1309
	1310	ENDDO
	1311
	1312	!$OMP DO
	1313	DO i = 0, nx
	1314	DO j = nys, nyn
	1315	DO k = 1, nz
	1316	f_out(k,j,i) = work_fftx(i,k,j)
	1317	ENDDO
	1318	ENDDO
	1319	ENDDO
	1320	!$OMP END PARALLEL
	1321
	1322	ELSE
	1323
	1324	!
	1325	!-- Cache optimized code (there might be still a potential for better
	1326	!-- optimization).
[696]	1327	!$OMP PARALLEL PRIVATE (i,j,k)
[1]	1328	!$OMP DO
	1329	DO j = nys, nyn
	1330	DO k = 1, nz
	1331
	1332	DO i = 0, nx
	1333	work_fftx(i,k,j) = work(j,k,i)
	1334	ENDDO
	1335
[1106]	1336	CALL fft_x_1d( work_fftx(0:nx,k,j), 'backward' )
[1]	1337
	1338	ENDDO
	1339	ENDDO
	1340
	1341	!$OMP DO
	1342	DO i = 0, nx
	1343	DO j = nys, nyn
	1344	DO k = 1, nz
	1345	f_out(k,j,i) = work_fftx(i,k,j)
	1346	ENDDO
	1347	ENDDO
	1348	ENDDO
	1349	!$OMP END PARALLEL
	1350
	1351	ENDIF
[1106]	1352	CALL cpu_log( log_point_s(4), 'fft_x_1d', 'stop' )
[1]	1353
	1354	END SUBROUTINE tr_yx_fftx
	1355
	1356
	1357	!------------------------------------------------------------------------------!
[1682]	1358	! Description:
	1359	! ------------
	1360	!> FFT along y, solution of the tridiagonal system and backward FFT for
	1361	!> a 1d-decomposition along y.
	1362	!>
	1363	!> @warning this subroutine may still not work for hybrid parallelization
	1364	!> with OpenMP (for possible necessary changes see the original
	1365	!> routine poisfft_hybrid, developed by Klaus Ketelsen, May 2002)
[1]	1366	!------------------------------------------------------------------------------!
[1682]	1367	SUBROUTINE ffty_tri_ffty( ar )
[1]	1368
[1682]	1369
[1320]	1370	USE control_parameters, &
	1371	ONLY: host
	1372
	1373	USE cpulog, &
	1374	ONLY: cpu_log, log_point_s
	1375
	1376	USE grid_variables, &
	1377	ONLY: ddx2, ddy2
	1378
	1379	USE kinds
	1380
[1]	1381	USE pegrid
	1382
	1383	IMPLICIT NONE
	1384
[1682]	1385	INTEGER(iwp) :: i !<
	1386	INTEGER(iwp) :: j !<
	1387	INTEGER(iwp) :: k !<
	1388	INTEGER(iwp) :: m !<
	1389	INTEGER(iwp) :: n !<
	1390	INTEGER(iwp) :: omp_get_thread_num !<
	1391	INTEGER(iwp) :: tn !<
[1]	1392
[1682]	1393	REAL(wp), DIMENSION(0:ny) :: work_ffty !<
	1394	REAL(wp), DIMENSION(0:ny,1:nz) :: work_triy !<
	1395	REAL(wp), DIMENSION(nny,1:nz,nxl_y:nxr_y,pdims(2)) :: ar !<
	1396	REAL(wp), DIMENSION(:,:,:,:), ALLOCATABLE :: tri !<
[1]	1397
	1398
[1106]	1399	CALL cpu_log( log_point_s(39), 'fft_y_1d + tridia', 'start' )
[1]	1400
	1401	ALLOCATE( tri(5,0:ny,0:nz-1,0:threads_per_task-1) )
	1402
	1403	tn = 0 ! Default thread number in case of one thread
[696]	1404	!$OMP PARALLEL DO PRIVATE ( i, j, k, m, n, tn, work_ffty, work_triy )
[1]	1405	DO i = nxl_y, nxr_y
	1406
	1407	!$ tn = omp_get_thread_num()
	1408
	1409	IF ( host(1:3) == 'nec' ) THEN
	1410	!
	1411	!-- Code optimized for vector processors
	1412	DO k = 1, nz
	1413
	1414	m = 0
	1415	DO n = 1, pdims(2)
[1003]	1416	DO j = 1, nny
[1]	1417	work_triy(m,k) = ar(j,k,i,n)
	1418	m = m + 1
	1419	ENDDO
	1420	ENDDO
	1421
	1422	ENDDO
	1423
	1424	CALL fft_y_m( work_triy, ny, 'forward' )
	1425
	1426	ELSE
	1427	!
	1428	!-- Cache optimized code
	1429	DO k = 1, nz
	1430
	1431	m = 0
	1432	DO n = 1, pdims(2)
[1003]	1433	DO j = 1, nny
[1]	1434	work_ffty(m) = ar(j,k,i,n)
	1435	m = m + 1
	1436	ENDDO
	1437	ENDDO
	1438
[1106]	1439	CALL fft_y_1d( work_ffty, 'forward' )
[1]	1440
	1441	DO j = 0, ny
	1442	work_triy(j,k) = work_ffty(j)
	1443	ENDDO
	1444
	1445	ENDDO
	1446
	1447	ENDIF
	1448
	1449	!
	1450	!-- Solve the linear equation system
	1451	CALL tridia_1dd( ddy2, ddx2, ny, nx, i, work_triy, tri(:,:,:,tn) )
	1452
	1453	IF ( host(1:3) == 'nec' ) THEN
	1454	!
	1455	!-- Code optimized for vector processors
	1456	CALL fft_y_m( work_triy, ny, 'backward' )
	1457
	1458	DO k = 1, nz
	1459
	1460	m = 0
	1461	DO n = 1, pdims(2)
[1003]	1462	DO j = 1, nny
[1]	1463	ar(j,k,i,n) = work_triy(m,k)
	1464	m = m + 1
	1465	ENDDO
	1466	ENDDO
	1467
	1468	ENDDO
	1469
	1470	ELSE
	1471	!
	1472	!-- Cache optimized code
	1473	DO k = 1, nz
	1474
	1475	DO j = 0, ny
	1476	work_ffty(j) = work_triy(j,k)
	1477	ENDDO
	1478
[1106]	1479	CALL fft_y_1d( work_ffty, 'backward' )
[1]	1480
	1481	m = 0
	1482	DO n = 1, pdims(2)
[1003]	1483	DO j = 1, nny
[1]	1484	ar(j,k,i,n) = work_ffty(m)
	1485	m = m + 1
	1486	ENDDO
	1487	ENDDO
	1488
	1489	ENDDO
	1490
	1491	ENDIF
	1492
	1493	ENDDO
	1494
	1495	DEALLOCATE( tri )
	1496
[1106]	1497	CALL cpu_log( log_point_s(39), 'fft_y_1d + tridia', 'stop' )
[1]	1498
	1499	END SUBROUTINE ffty_tri_ffty
	1500
	1501	END MODULE poisfft_mod

Note: See TracBrowser for help on using the repository browser.

Download in other formats:

| Impressum | ©Leibniz Universität Hannover |