Home

Context Navigation

source: palm/trunk/SOURCE/poisfft.f90 @ 1804

Last change on this file since 1804 was 1804, checked in by maronga, 8 years ago
removed parameter file check. update of mrungui for compilation with qt5
Property svn:keywords set to `Id`
File size: 45.5 KB

Rev	Line
[1682]	1	!> @file poisfft.f90
[1036]	2	!--------------------------------------------------------------------------------!
	3	! This file is part of PALM.
	4	!
	5	! PALM is free software: you can redistribute it and/or modify it under the terms
	6	! of the GNU General Public License as published by the Free Software Foundation,
	7	! either version 3 of the License, or (at your option) any later version.
	8	!
	9	! PALM is distributed in the hope that it will be useful, but WITHOUT ANY
	10	! WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
	11	! A PARTICULAR PURPOSE. See the GNU General Public License for more details.
	12	!
	13	! You should have received a copy of the GNU General Public License along with
	14	! PALM. If not, see <http://www.gnu.org/licenses/>.
	15	!
[1310]	16	! Copyright 1997-2014 Leibniz Universitaet Hannover
[1036]	17	!--------------------------------------------------------------------------------!
	18	!
[484]	19	! Current revisions:
[1]	20	! -----------------
[1804]	21	! Removed code for parameter file check (__check)
[1683]	22	!
[1321]	23	! Former revisions:
	24	! -----------------
	25	! $Id: poisfft.f90 1804 2016-04-05 16:30:18Z maronga $
	26	!
[1683]	27	! 1682 2015-10-07 23:56:08Z knoop
	28	! Code annotations made doxygen readable
	29	!
[1483]	30	! 1482 2014-10-18 12:34:45Z raasch
	31	! use 2d-decomposition, if accelerator boards are used
	32	!
[1407]	33	! 1406 2014-05-16 13:47:01Z raasch
	34	! bugfix for pgi 14.4: declare create moved after array declaration
	35	!
[1321]	36	! 1320 2014-03-20 08:40:49Z raasch
[1320]	37	! ONLY-attribute added to USE-statements,
	38	! kind-parameters added to all INTEGER and REAL declaration statements,
	39	! kinds are defined in new module kinds,
	40	! old module precision_kind is removed,
	41	! revision history before 2012 removed,
	42	! comment fields (!:) to be used for variable explanations added to
	43	! all variable declaration statements
[1217]	44	!
[1319]	45	! 1318 2014-03-17 13:35:16Z raasch
	46	! module interfaces removed
	47	!
[1307]	48	! 1306 2014-03-13 14:30:59Z raasch
	49	! openmp sections removed from the overlap branch,
	50	! second argument removed from parameter list
	51	!
[1217]	52	! 1216 2013-08-26 09:31:42Z raasch
[1216]	53	! resorting of arrays moved to separate routines resort_for_...,
	54	! one argument, used as temporary work array, removed from all transpose
	55	! routines
	56	! overlapping fft / transposition implemented
[1112]	57	!
[1213]	58	! 1212 2013-08-15 08:46:27Z raasch
	59	! tridia routines moved to seperate module tridia_solver
	60	!
[1209]	61	! 1208 2013-08-13 06:41:49Z raasch
	62	! acc-update clauses added for "ar" so that ffts other than cufft can also be
	63	! used (although they are not ported and will give a poor performance)
	64	!
[1112]	65	! 1111 2013-03-08 23:54:10Z raasch
[1111]	66	! further openACC porting of non-parallel (MPI) branch:
	67	! tridiagonal routines split into extermal subroutines (instead using CONTAINS),
	68	! no distinction between parallel/non-parallel in poisfft and tridia any more,
[1112]	69	! tridia routines moved to end of file because of probable bug in PGI compiler 12.5
[1111]	70	! (otherwise "invalid device function" is indicated during runtime),
	71	! optimization of tridia routines: constant elements and coefficients of tri are
	72	! stored in seperate arrays ddzuw and tric, last dimension of tri reduced from 5
	73	! to 2,
	74	! poisfft_init is now called internally from poisfft, maketri is called from
	75	! poisfft_init,
	76	! ibc_p_b = 2 removed
[1]	77	!
[1107]	78	! 1106 2013-03-04 05:31:38Z raasch
	79	! routines fftx, ffty, fftxp, fftyp removed, calls replaced by fft_x, fft_y,
	80	! in the 1D-decomposition routines fft_x, ffty are replaced by fft_x_1d,
	81	! fft_y_1d
	82	!
[1104]	83	! 1103 2013-02-20 02:15:53Z raasch
	84	! tri, ar, and ar1 arguments in tridia-routines (2d) are removed because they
	85	! sometimes cause segmentation faults with intel 12.1 compiler
	86	!
[1093]	87	! 1092 2013-02-02 11:24:22Z raasch
	88	! unused variables removed
	89	!
[1037]	90	! 1036 2012-10-22 13:43:42Z raasch
	91	! code put under GPL (PALM 3.9)
	92	!
[1014]	93	! 2012-09-21 07:03:55Z raasch
	94	! FLOAT type conversion replaced by REAL
	95	!
[1004]	96	! 1003 2012-09-14 14:35:53Z raasch
	97	! indices nxa, nya, etc. replaced by nx, ny, etc.
	98	!
[941]	99	! 940 2012-07-09 14:31:00Z raasch
	100	! special handling of tri-array as an argument in tridia_1dd routines switched
	101	! off because it caused segmentation faults with intel 12.1 compiler
	102	!
[878]	103	! 877 2012-04-03 11:21:44Z suehring
	104	! Bugfix: Avoid divisions by zero in case of using a 'neumann' bc for the
	105	! pressure at the top of the model domain.
	106	!
[810]	107	! 809 2012-01-30 13:32:58Z maronga
	108	! Bugfix: replaced .AND. and .NOT. with && and ! in the preprocessor directives
	109	!
[808]	110	! 807 2012-01-25 11:53:51Z maronga
	111	! New cpp directive "__check" implemented which is used by check_namelist_files
	112	! (most of the code is unneeded by check_namelist_files).
	113	!
[1]	114	! Revision 1.1 1997/07/24 11:24:14 raasch
	115	! Initial revision
	116	!
	117	!
	118	! Description:
	119	! ------------
[1682]	120	!> Solves the Poisson equation with a 2D spectral method
	121	!> d^2 p / dx^2 + d^2 p / dy^2 + d^2 p / dz^2 = s
	122	!>
	123	!> Input:
	124	!> real ar contains (nnz,nny,nnx) elements of the velocity divergence,
	125	!> starting from (1,nys,nxl)
	126	!>
	127	!> Output:
	128	!> real ar contains the solution for perturbation pressure p
[1]	129	!------------------------------------------------------------------------------!
[1682]	130	MODULE poisfft_mod
	131
[1]	132
[1320]	133	USE fft_xy, &
	134	ONLY: fft_init, fft_y, fft_y_1d, fft_y_m, fft_x, fft_x_1d, fft_x_m
[1]	135
[1320]	136	USE indices, &
	137	ONLY: nnx, nny, nx, nxl, nxr, ny, nys, nyn, nz
	138
	139	USE transpose_indices, &
	140	ONLY: nxl_y, nxl_z, nxr_y, nxr_z, nys_x, nys_z, nyn_x, nyn_z, nzb_x, &
	141	nzb_y, nzt_x, nzt_y
	142
	143	USE tridia_solver, &
	144	ONLY: tridia_1dd, tridia_init, tridia_substi, tridia_substi_overlap
	145
[1]	146	IMPLICIT NONE
	147
[1111]	148	LOGICAL, SAVE :: poisfft_initialized = .FALSE.
	149
[1]	150	PRIVATE
[807]	151
[1]	152	PUBLIC poisfft, poisfft_init
	153
	154	INTERFACE poisfft
	155	MODULE PROCEDURE poisfft
	156	END INTERFACE poisfft
	157
	158	INTERFACE poisfft_init
	159	MODULE PROCEDURE poisfft_init
	160	END INTERFACE poisfft_init
	161
[807]	162
[1]	163	CONTAINS
	164
[1682]	165	!------------------------------------------------------------------------------!
	166	! Description:
	167	! ------------
	168	!> @todo Missing subroutine description.
	169	!------------------------------------------------------------------------------!
[1]	170	SUBROUTINE poisfft_init
	171
[1320]	172	USE arrays_3d, &
	173	ONLY: ddzu_pres, ddzw
[1111]	174
[1320]	175	USE kinds
	176
[1111]	177	IMPLICIT NONE
	178
[1682]	179	INTEGER(iwp) :: k !<
[1111]	180
	181
[1]	182	CALL fft_init
	183
[1212]	184	CALL tridia_init
[1111]	185
	186	poisfft_initialized = .TRUE.
	187
[1]	188	END SUBROUTINE poisfft_init
	189
[1111]	190
[1804]	191
[1682]	192	!------------------------------------------------------------------------------!
	193	! Description:
	194	! ------------
	195	!> Two-dimensional Fourier Transformation in x- and y-direction.
	196	!------------------------------------------------------------------------------!
[1306]	197	SUBROUTINE poisfft( ar )
[1]	198
[1320]	199	USE control_parameters, &
	200	ONLY: fft_method, transpose_compute_overlap
	201
	202	USE cpulog, &
	203	ONLY: cpu_log, cpu_log_nowait, log_point_s
	204
	205	USE kinds
	206
[1]	207	USE pegrid
	208
	209	IMPLICIT NONE
	210
[1682]	211	INTEGER(iwp) :: ii !<
	212	INTEGER(iwp) :: iind !<
	213	INTEGER(iwp) :: inew !<
	214	INTEGER(iwp) :: jj !<
	215	INTEGER(iwp) :: jind !<
	216	INTEGER(iwp) :: jnew !<
	217	INTEGER(iwp) :: ki !<
	218	INTEGER(iwp) :: kk !<
	219	INTEGER(iwp) :: knew !<
	220	INTEGER(iwp) :: n !<
	221	INTEGER(iwp) :: nblk !<
	222	INTEGER(iwp) :: nnx_y !<
	223	INTEGER(iwp) :: nny_z !<
	224	INTEGER(iwp) :: nnz_t !<
	225	INTEGER(iwp) :: nnz_x !<
	226	INTEGER(iwp) :: nxl_y_bound !<
	227	INTEGER(iwp) :: nxr_y_bound !<
[1]	228
[1682]	229	INTEGER(iwp), DIMENSION(4) :: isave !<
[1320]	230
[1682]	231	REAL(wp), DIMENSION(1:nz,nys:nyn,nxl:nxr) :: ar !<
	232	REAL(wp), DIMENSION(nys:nyn,nxl:nxr,1:nz) :: ar_inv !<
[1216]	233	!$acc declare create( ar_inv )
[1]	234
[1682]	235	REAL(wp), DIMENSION(:,:,:), ALLOCATABLE :: ar1 !<
	236	REAL(wp), DIMENSION(:,:,:), ALLOCATABLE :: f_in !<
	237	REAL(wp), DIMENSION(:,:,:), ALLOCATABLE :: f_inv !<
	238	REAL(wp), DIMENSION(:,:,:), ALLOCATABLE :: f_out_y !<
	239	REAL(wp), DIMENSION(:,:,:), ALLOCATABLE :: f_out_z !<
[1216]	240
	241
[1]	242	CALL cpu_log( log_point_s(3), 'poisfft', 'start' )
	243
[1111]	244	IF ( .NOT. poisfft_initialized ) CALL poisfft_init
	245
[1]	246	!
	247	!-- Two-dimensional Fourier Transformation in x- and y-direction.
[1482]	248	IF ( pdims(2) == 1 .AND. pdims(1) > 1 .AND. num_acc_per_node == 0 ) &
	249	THEN
[1]	250
	251	!
	252	!-- 1d-domain-decomposition along x:
	253	!-- FFT along y and transposition y --> x
[1216]	254	CALL ffty_tr_yx( ar, ar )
[1]	255
	256	!
	257	!-- FFT along x, solving the tridiagonal system and backward FFT
	258	CALL fftx_tri_fftx( ar )
	259
	260	!
	261	!-- Transposition x --> y and backward FFT along y
[1216]	262	CALL tr_xy_ffty( ar, ar )
[1]	263
[1482]	264	ELSEIF ( pdims(1) == 1 .AND. pdims(2) > 1 .AND. num_acc_per_node == 0 ) &
	265	THEN
[1]	266
	267	!
	268	!-- 1d-domain-decomposition along y:
	269	!-- FFT along x and transposition x --> y
[1216]	270	CALL fftx_tr_xy( ar, ar )
[1]	271
	272	!
	273	!-- FFT along y, solving the tridiagonal system and backward FFT
	274	CALL ffty_tri_ffty( ar )
	275
	276	!
	277	!-- Transposition y --> x and backward FFT along x
[1216]	278	CALL tr_yx_fftx( ar, ar )
[1]	279
[1216]	280	ELSEIF ( .NOT. transpose_compute_overlap ) THEN
[1]	281
	282	!
[1111]	283	!-- 2d-domain-decomposition or no decomposition (1 PE run)
[1]	284	!-- Transposition z --> x
	285	CALL cpu_log( log_point_s(5), 'transpo forward', 'start' )
[1216]	286	CALL resort_for_zx( ar, ar_inv )
	287	CALL transpose_zx( ar_inv, ar )
[1]	288	CALL cpu_log( log_point_s(5), 'transpo forward', 'pause' )
	289
	290	CALL cpu_log( log_point_s(4), 'fft_x', 'start' )
[1208]	291	IF ( fft_method /= 'system-specific' ) THEN
	292	!$acc update host( ar )
	293	ENDIF
[1106]	294	CALL fft_x( ar, 'forward' )
[1208]	295	IF ( fft_method /= 'system-specific' ) THEN
	296	!$acc update device( ar )
	297	ENDIF
[1]	298	CALL cpu_log( log_point_s(4), 'fft_x', 'pause' )
	299
	300	!
	301	!-- Transposition x --> y
	302	CALL cpu_log( log_point_s(5), 'transpo forward', 'continue' )
[1216]	303	CALL resort_for_xy( ar, ar_inv )
	304	CALL transpose_xy( ar_inv, ar )
[1]	305	CALL cpu_log( log_point_s(5), 'transpo forward', 'pause' )
	306
	307	CALL cpu_log( log_point_s(7), 'fft_y', 'start' )
[1208]	308	IF ( fft_method /= 'system-specific' ) THEN
	309	!$acc update host( ar )
	310	ENDIF
[1216]	311	CALL fft_y( ar, 'forward', ar_tr = ar, &
	312	nxl_y_bound = nxl_y, nxr_y_bound = nxr_y, &
	313	nxl_y_l = nxl_y, nxr_y_l = nxr_y )
[1208]	314	IF ( fft_method /= 'system-specific' ) THEN
	315	!$acc update device( ar )
	316	ENDIF
[1]	317	CALL cpu_log( log_point_s(7), 'fft_y', 'pause' )
	318
	319	!
	320	!-- Transposition y --> z
	321	CALL cpu_log( log_point_s(5), 'transpo forward', 'continue' )
[1216]	322	CALL resort_for_yz( ar, ar_inv )
	323	CALL transpose_yz( ar_inv, ar )
[1]	324	CALL cpu_log( log_point_s(5), 'transpo forward', 'stop' )
	325
	326	!
[1106]	327	!-- Solve the tridiagonal equation system along z
[1]	328	CALL cpu_log( log_point_s(6), 'tridia', 'start' )
[1212]	329	CALL tridia_substi( ar )
[1]	330	CALL cpu_log( log_point_s(6), 'tridia', 'stop' )
	331
	332	!
	333	!-- Inverse Fourier Transformation
	334	!-- Transposition z --> y
	335	CALL cpu_log( log_point_s(8), 'transpo invers', 'start' )
[1216]	336	CALL transpose_zy( ar, ar_inv )
	337	CALL resort_for_zy( ar_inv, ar )
[1]	338	CALL cpu_log( log_point_s(8), 'transpo invers', 'pause' )
	339
	340	CALL cpu_log( log_point_s(7), 'fft_y', 'continue' )
[1208]	341	IF ( fft_method /= 'system-specific' ) THEN
	342	!$acc update host( ar )
	343	ENDIF
[1216]	344	CALL fft_y( ar, 'backward', ar_tr = ar, &
	345	nxl_y_bound = nxl_y, nxr_y_bound = nxr_y, &
	346	nxl_y_l = nxl_y, nxr_y_l = nxr_y )
[1208]	347	IF ( fft_method /= 'system-specific' ) THEN
	348	!$acc update device( ar )
	349	ENDIF
[1]	350	CALL cpu_log( log_point_s(7), 'fft_y', 'stop' )
	351
	352	!
	353	!-- Transposition y --> x
	354	CALL cpu_log( log_point_s(8), 'transpo invers', 'continue' )
[1216]	355	CALL transpose_yx( ar, ar_inv )
	356	CALL resort_for_yx( ar_inv, ar )
[1]	357	CALL cpu_log( log_point_s(8), 'transpo invers', 'pause' )
	358
	359	CALL cpu_log( log_point_s(4), 'fft_x', 'continue' )
[1208]	360	IF ( fft_method /= 'system-specific' ) THEN
	361	!$acc update host( ar )
	362	ENDIF
[1106]	363	CALL fft_x( ar, 'backward' )
[1208]	364	IF ( fft_method /= 'system-specific' ) THEN
	365	!$acc update device( ar )
	366	ENDIF
[1]	367	CALL cpu_log( log_point_s(4), 'fft_x', 'stop' )
	368
	369	!
	370	!-- Transposition x --> z
	371	CALL cpu_log( log_point_s(8), 'transpo invers', 'continue' )
[1216]	372	CALL transpose_xz( ar, ar_inv )
	373	CALL resort_for_xz( ar_inv, ar )
[1]	374	CALL cpu_log( log_point_s(8), 'transpo invers', 'stop' )
	375
[1216]	376	ELSE
	377
	378	!
	379	!-- 2d-domain-decomposition or no decomposition (1 PE run) with
	380	!-- overlapping transposition / fft
[1318]	381	!-- cputime logging must not use barriers, which would prevent overlapping
[1216]	382	ALLOCATE( f_out_y(0:ny,nxl_y:nxr_y,nzb_y:nzt_y), &
	383	f_out_z(0:nx,nys_x:nyn_x,nzb_x:nzt_x) )
	384	!
	385	!-- Transposition z --> x + subsequent fft along x
	386	ALLOCATE( f_inv(nys:nyn,nxl:nxr,1:nz) )
	387	CALL resort_for_zx( ar, f_inv )
	388	!
	389	!-- Save original indices and gridpoint counter
	390	isave(1) = nz
	391	isave(2) = nzb_x
	392	isave(3) = nzt_x
	393	isave(4) = sendrecvcount_zx
	394	!
	395	!-- Set new indices for transformation
	396	nblk = nz / pdims(1)
	397	nz = pdims(1)
	398	nnz_x = 1
	399	nzb_x = 1 + myidx * nnz_x
	400	nzt_x = ( myidx + 1 ) * nnz_x
	401	sendrecvcount_zx = nnx * nny * nnz_x
	402
[1306]	403	ALLOCATE( ar1(0:nx,nys_x:nyn_x,nzb_x:nzt_x) )
[1216]	404	ALLOCATE( f_in(nys:nyn,nxl:nxr,1:nz) )
	405
[1306]	406	DO kk = 1, nblk
[1216]	407
[1306]	408	IF ( kk == 1 ) THEN
[1318]	409	CALL cpu_log( log_point_s(5), 'transpo forward', 'start', cpu_log_nowait )
[1306]	410	ELSE
[1318]	411	CALL cpu_log( log_point_s(5), 'transpo forward', 'continue', cpu_log_nowait )
[1306]	412	ENDIF
[1216]	413
[1306]	414	DO knew = 1, nz
	415	ki = kk + nblk * ( knew - 1 )
	416	f_in(:,:,knew) = f_inv(:,:,ki)
	417	ENDDO
[1216]	418
[1306]	419	CALL transpose_zx( f_in, ar1(:,:,:))
	420	CALL cpu_log( log_point_s(5), 'transpo forward', 'pause' )
[1216]	421
[1306]	422	IF ( kk == 1 ) THEN
[1318]	423	CALL cpu_log( log_point_s(4), 'fft_x', 'start', cpu_log_nowait )
[1306]	424	ELSE
[1318]	425	CALL cpu_log( log_point_s(4), 'fft_x', 'continue', cpu_log_nowait )
[1216]	426	ENDIF
	427
[1306]	428	n = isave(2) + kk - 1
	429	CALL fft_x( ar1(:,:,:), 'forward', ar_2d = f_out_z(:,:,n))
	430	CALL cpu_log( log_point_s(4), 'fft_x', 'pause' )
[1216]	431
	432	ENDDO
	433	!
	434	!-- Restore original indices/counters
	435	nz = isave(1)
	436	nzb_x = isave(2)
	437	nzt_x = isave(3)
	438	sendrecvcount_zx = isave(4)
	439
	440	DEALLOCATE( ar1, f_in, f_inv )
	441
	442	!
	443	!-- Transposition x --> y + subsequent fft along y
	444	ALLOCATE( f_inv(nys_x:nyn_x,nzb_x:nzt_x,0:nx) )
	445	CALL resort_for_xy( f_out_z, f_inv )
	446	!
	447	!-- Save original indices and gridpoint counter
	448	isave(1) = nx
	449	isave(2) = nxl_y
	450	isave(3) = nxr_y
	451	isave(4) = sendrecvcount_xy
	452	!
	453	!-- Set new indices for transformation
	454	nblk = ( ( nx+1 ) / pdims(2) ) - 1
	455	nx = pdims(2)
	456	nnx_y = 1
	457	nxl_y = myidy * nnx_y
	458	nxr_y = ( myidy + 1 ) * nnx_y - 1
	459	sendrecvcount_xy = nnx_y * ( nyn_x-nys_x+1 ) * ( nzt_x-nzb_x+1 )
	460
[1306]	461	ALLOCATE( ar1(0:ny,nxl_y:nxr_y,nzb_y:nzt_y) )
[1216]	462	ALLOCATE( f_in(nys_x:nyn_x,nzb_x:nzt_x,0:nx) )
	463
[1306]	464	DO ii = 0, nblk
[1216]	465
[1318]	466	CALL cpu_log( log_point_s(5), 'transpo forward', 'continue', cpu_log_nowait )
[1216]	467
[1306]	468	DO inew = 0, nx-1
	469	iind = ii + ( nblk + 1 ) * inew
	470	f_in(:,:,inew) = f_inv(:,:,iind)
	471	ENDDO
[1216]	472
[1306]	473	CALL transpose_xy( f_in, ar1(:,:,:) )
[1216]	474
[1306]	475	CALL cpu_log( log_point_s(5), 'transpo forward', 'pause' )
[1216]	476
[1306]	477	IF ( ii == 1 ) THEN
[1318]	478	CALL cpu_log( log_point_s(7), 'fft_y', 'start', cpu_log_nowait )
[1306]	479	ELSE
[1318]	480	CALL cpu_log( log_point_s(7), 'fft_y', 'continue', cpu_log_nowait )
[1216]	481	ENDIF
	482
[1306]	483	nxl_y_bound = isave(2)
	484	nxr_y_bound = isave(3)
	485	n = isave(2) + ii
	486	CALL fft_y( ar1(:,:,:), 'forward', ar_tr = f_out_y, &
	487	nxl_y_bound = nxl_y_bound, nxr_y_bound = nxr_y_bound, &
	488	nxl_y_l = n, nxr_y_l = n )
[1216]	489
[1306]	490	CALL cpu_log( log_point_s(7), 'fft_y', 'pause' )
[1216]	491
	492	ENDDO
	493	!
	494	!-- Restore original indices/counters
	495	nx = isave(1)
	496	nxl_y = isave(2)
	497	nxr_y = isave(3)
	498	sendrecvcount_xy = isave(4)
	499
	500	DEALLOCATE( ar1, f_in, f_inv )
	501
	502	!
	503	!-- Transposition y --> z + subsequent tridia + resort for z --> y
	504	ALLOCATE( f_inv(nxl_y:nxr_y,nzb_y:nzt_y,0:ny) )
	505	CALL resort_for_yz( f_out_y, f_inv )
	506	!
	507	!-- Save original indices and gridpoint counter
	508	isave(1) = ny
	509	isave(2) = nys_z
	510	isave(3) = nyn_z
	511	isave(4) = sendrecvcount_yz
	512	!
	513	!-- Set new indices for transformation
	514	nblk = ( ( ny+1 ) / pdims(1) ) - 1
	515	ny = pdims(1)
	516	nny_z = 1
	517	nys_z = myidx * nny_z
	518	nyn_z = ( myidx + 1 ) * nny_z - 1
	519	sendrecvcount_yz = ( nxr_y-nxl_y+1 ) * nny_z * ( nzt_y-nzb_y+1 )
	520
[1306]	521	ALLOCATE( ar1(nxl_z:nxr_z,nys_z:nyn_z,1:nz) )
[1216]	522	ALLOCATE( f_in(nxl_y:nxr_y,nzb_y:nzt_y,0:ny) )
	523
[1306]	524	DO jj = 0, nblk
[1216]	525	!
[1306]	526	!-- Forward Fourier Transformation
	527	!-- Transposition y --> z
[1318]	528	CALL cpu_log( log_point_s(5), 'transpo forward', 'continue', cpu_log_nowait )
[1216]	529
[1306]	530	DO jnew = 0, ny-1
	531	jind = jj + ( nblk + 1 ) * jnew
	532	f_in(:,:,jnew) = f_inv(:,:,jind)
	533	ENDDO
[1216]	534
[1306]	535	CALL transpose_yz( f_in, ar1(:,:,:) )
[1216]	536
[1306]	537	IF ( jj == nblk ) THEN
	538	CALL cpu_log( log_point_s(5), 'transpo forward', 'stop' )
	539	ELSE
	540	CALL cpu_log( log_point_s(5), 'transpo forward', 'pause' )
[1216]	541	ENDIF
	542
	543	!
[1306]	544	!-- Solve the tridiagonal equation system along z
[1318]	545	CALL cpu_log( log_point_s(6), 'tridia', 'start', cpu_log_nowait )
[1216]	546
[1306]	547	n = isave(2) + jj
	548	CALL tridia_substi_overlap( ar1(:,:,:), n )
[1216]	549
[1306]	550	CALL cpu_log( log_point_s(6), 'tridia', 'stop' )
[1216]	551
[1306]	552	!
	553	!-- Inverse Fourier Transformation
	554	!-- Transposition z --> y
	555	!-- Only one thread should call MPI routines, therefore forward and
	556	!-- backward tranpose are in the same section
	557	IF ( jj == 0 ) THEN
[1318]	558	CALL cpu_log( log_point_s(8), 'transpo invers', 'start', cpu_log_nowait )
[1306]	559	ELSE
[1318]	560	CALL cpu_log( log_point_s(8), 'transpo invers', 'continue', cpu_log_nowait )
[1216]	561	ENDIF
	562
[1306]	563	CALL transpose_zy( ar1(:,:,:), f_in )
[1216]	564
[1306]	565	DO jnew = 0, ny-1
	566	jind = jj + ( nblk + 1 ) * jnew
	567	f_inv(:,:,jind) = f_in(:,:,jnew)
	568	ENDDO
[1216]	569
[1306]	570	CALL cpu_log( log_point_s(8), 'transpo invers', 'pause' )
[1216]	571
	572	ENDDO
	573	!
	574	!-- Restore original indices/counters
	575	ny = isave(1)
	576	nys_z = isave(2)
	577	nyn_z = isave(3)
	578	sendrecvcount_yz = isave(4)
	579
	580	CALL resort_for_zy( f_inv, f_out_y )
	581
	582	DEALLOCATE( ar1, f_in, f_inv )
	583
	584	!
	585	!-- fft along y backward + subsequent transposition y --> x
	586	ALLOCATE( f_inv(nys_x:nyn_x,nzb_x:nzt_x,0:nx) )
	587	!
	588	!-- Save original indices and gridpoint counter
	589	isave(1) = nx
	590	isave(2) = nxl_y
	591	isave(3) = nxr_y
	592	isave(4) = sendrecvcount_xy
	593	!
	594	!-- Set new indices for transformation
	595	nblk = (( nx+1 ) / pdims(2) ) - 1
	596	nx = pdims(2)
	597	nnx_y = 1
	598	nxl_y = myidy * nnx_y
	599	nxr_y = ( myidy + 1 ) * nnx_y - 1
	600	sendrecvcount_xy = nnx_y * ( nyn_x-nys_x+1 ) * ( nzt_x-nzb_x+1 )
	601
[1306]	602	ALLOCATE( ar1(0:ny,nxl_y:nxr_y,nzb_y:nzt_y) )
[1216]	603	ALLOCATE( f_in(nys_x:nyn_x,nzb_x:nzt_x,0:nx) )
	604
[1306]	605	DO ii = 0, nblk
[1216]	606
[1318]	607	CALL cpu_log( log_point_s(7), 'fft_y', 'continue', cpu_log_nowait )
[1216]	608
[1306]	609	n = isave(2) + ii
	610	nxl_y_bound = isave(2)
	611	nxr_y_bound = isave(3)
[1216]	612
[1306]	613	CALL fft_y( ar1(:,:,:), 'backward', ar_tr = f_out_y, &
	614	nxl_y_bound = nxl_y_bound, nxr_y_bound = nxr_y_bound, &
	615	nxl_y_l = n, nxr_y_l = n )
[1216]	616
[1306]	617	IF ( ii == nblk ) THEN
	618	CALL cpu_log( log_point_s(7), 'fft_y', 'stop' )
	619	ELSE
	620	CALL cpu_log( log_point_s(7), 'fft_y', 'pause' )
[1216]	621	ENDIF
	622
[1318]	623	CALL cpu_log( log_point_s(8), 'transpo invers', 'continue', cpu_log_nowait )
[1216]	624
[1306]	625	CALL transpose_yx( ar1(:,:,:), f_in )
[1216]	626
[1306]	627	DO inew = 0, nx-1
	628	iind = ii + (nblk+1) * inew
	629	f_inv(:,:,iind) = f_in(:,:,inew)
	630	ENDDO
[1216]	631
[1306]	632	CALL cpu_log( log_point_s(8), 'transpo invers', 'pause' )
[1216]	633
	634	ENDDO
	635	!
	636	!-- Restore original indices/counters
	637	nx = isave(1)
	638	nxl_y = isave(2)
	639	nxr_y = isave(3)
	640	sendrecvcount_xy = isave(4)
	641
	642	CALL resort_for_yx( f_inv, f_out_z )
	643
	644	DEALLOCATE( ar1, f_in, f_inv )
	645
	646	!
	647	!-- fft along x backward + subsequent final transposition x --> z
	648	ALLOCATE( f_inv(nys:nyn,nxl:nxr,1:nz) )
	649	!
	650	!-- Save original indices and gridpoint counter
	651	isave(1) = nz
	652	isave(2) = nzb_x
	653	isave(3) = nzt_x
	654	isave(4) = sendrecvcount_zx
	655	!
	656	!-- Set new indices for transformation
	657	nblk = nz / pdims(1)
	658	nz = pdims(1)
	659	nnz_x = 1
	660	nzb_x = 1 + myidx * nnz_x
	661	nzt_x = ( myidx + 1 ) * nnz_x
	662	sendrecvcount_zx = nnx * nny * nnz_x
	663
[1306]	664	ALLOCATE( ar1(0:nx,nys_x:nyn_x,nzb_x:nzt_x) )
[1216]	665	ALLOCATE( f_in(nys:nyn,nxl:nxr,1:nz) )
	666
[1306]	667	DO kk = 1, nblk
[1216]	668
[1318]	669	CALL cpu_log( log_point_s(4), 'fft_x', 'continue', cpu_log_nowait )
[1216]	670
[1306]	671	n = isave(2) + kk - 1
	672	CALL fft_x( ar1(:,:,:), 'backward', f_out_z(:,:,n))
[1216]	673
[1306]	674	IF ( kk == nblk ) THEN
	675	CALL cpu_log( log_point_s(4), 'fft_x', 'stop' )
	676	ELSE
	677	CALL cpu_log( log_point_s(4), 'fft_x', 'pause' )
[1216]	678	ENDIF
	679
[1318]	680	CALL cpu_log( log_point_s(8), 'transpo invers', 'continue', cpu_log_nowait )
[1216]	681
[1306]	682	CALL transpose_xz( ar1(:,:,:), f_in )
[1216]	683
[1306]	684	DO knew = 1, nz
	685	ki = kk + nblk * (knew-1)
	686	f_inv(:,:,ki) = f_in(:,:,knew)
	687	ENDDO
[1216]	688
[1306]	689	IF ( kk == nblk ) THEN
	690	CALL cpu_log( log_point_s(8), 'transpo invers', 'stop' )
	691	ELSE
	692	CALL cpu_log( log_point_s(8), 'transpo invers', 'pause' )
[1216]	693	ENDIF
	694
	695	ENDDO
	696	!
	697	!-- Restore original indices/counters
	698	nz = isave(1)
	699	nzb_x = isave(2)
	700	nzt_x = isave(3)
	701	sendrecvcount_zx = isave(4)
	702
	703	CALL resort_for_xz( f_inv, ar )
	704
	705	DEALLOCATE( ar1, f_in, f_inv )
	706
[1]	707	ENDIF
	708
	709	CALL cpu_log( log_point_s(3), 'poisfft', 'stop' )
	710
	711	END SUBROUTINE poisfft
	712
	713
	714	!------------------------------------------------------------------------------!
[1682]	715	! Description:
	716	! ------------
	717	!> Fourier-transformation along y with subsequent transposition y --> x for
	718	!> a 1d-decomposition along x.
	719	!>
	720	!> @attention The performance of this routine is much faster on the NEC-SX6,
	721	!> if the first index of work_ffty_vec is odd. Otherwise
	722	!> memory bank conflicts may occur (especially if the index is a
	723	!> multiple of 128). That's why work_ffty_vec is dimensioned as
	724	!> 0:ny+1.
	725	!> Of course, this will not work if users are using an odd number
	726	!> of gridpoints along y.
[1]	727	!------------------------------------------------------------------------------!
[1682]	728	SUBROUTINE ffty_tr_yx( f_in, f_out )
[1]	729
[1682]	730
[1320]	731	USE control_parameters, &
	732	ONLY: host
	733
	734	USE cpulog, &
	735	ONLY: cpu_log, log_point_s
	736
	737	USE kinds
	738
[1]	739	USE pegrid
	740
	741	IMPLICIT NONE
	742
[1682]	743	INTEGER(iwp) :: i !<
	744	INTEGER(iwp) :: iend !<
	745	INTEGER(iwp) :: iouter !<
	746	INTEGER(iwp) :: ir !<
	747	INTEGER(iwp) :: j !<
	748	INTEGER(iwp) :: k !<
[1]	749
[1682]	750	INTEGER(iwp), PARAMETER :: stridex = 4 !<
[1320]	751
[1682]	752	REAL(wp), DIMENSION(0:ny,stridex) :: work_ffty !<
[1]	753	#if defined( __nec )
[1682]	754	REAL(wp), DIMENSION(0:ny+1,1:nz,nxl:nxr) :: work_ffty_vec !<
[1]	755	#endif
[1682]	756	REAL(wp), DIMENSION(1:nz,0:ny,nxl:nxr) :: f_in !<
	757	REAL(wp), DIMENSION(nnx,1:nz,nys_x:nyn_x,pdims(1)) :: f_out !<
	758	REAL(wp), DIMENSION(nxl:nxr,1:nz,0:ny) :: work !<
[1]	759
	760	!
	761	!-- Carry out the FFT along y, where all data are present due to the
	762	!-- 1d-decomposition along x. Resort the data in a way that x becomes
	763	!-- the first index.
[1106]	764	CALL cpu_log( log_point_s(7), 'fft_y_1d', 'start' )
[1]	765
	766	IF ( host(1:3) == 'nec' ) THEN
	767	#if defined( __nec )
	768	!
	769	!-- Code optimized for vector processors
[85]	770	!$OMP PARALLEL PRIVATE ( i, j, k )
[1]	771	!$OMP DO
	772	DO i = nxl, nxr
	773
	774	DO j = 0, ny
	775	DO k = 1, nz
	776	work_ffty_vec(j,k,i) = f_in(k,j,i)
	777	ENDDO
	778	ENDDO
	779
	780	CALL fft_y_m( work_ffty_vec(:,:,i), ny+1, 'forward' )
	781
	782	ENDDO
	783
	784	!$OMP DO
	785	DO k = 1, nz
	786	DO j = 0, ny
	787	DO i = nxl, nxr
	788	work(i,k,j) = work_ffty_vec(j,k,i)
	789	ENDDO
	790	ENDDO
	791	ENDDO
	792	!$OMP END PARALLEL
	793	#endif
	794
	795	ELSE
	796
	797	!
	798	!-- Cache optimized code.
	799	!-- The i-(x-)direction is split into a strided outer loop and an inner
	800	!-- loop for better cache performance
	801	!$OMP PARALLEL PRIVATE (i,iend,iouter,ir,j,k,work_ffty)
	802	!$OMP DO
	803	DO iouter = nxl, nxr, stridex
	804
	805	iend = MIN( iouter+stridex-1, nxr ) ! Upper bound for inner i loop
	806
	807	DO k = 1, nz
	808
	809	DO i = iouter, iend
	810
	811	ir = i-iouter+1 ! counter within a stride
	812	DO j = 0, ny
	813	work_ffty(j,ir) = f_in(k,j,i)
	814	ENDDO
	815	!
	816	!-- FFT along y
[1106]	817	CALL fft_y_1d( work_ffty(:,ir), 'forward' )
[1]	818
	819	ENDDO
	820
	821	!
	822	!-- Resort
	823	DO j = 0, ny
	824	DO i = iouter, iend
	825	work(i,k,j) = work_ffty(j,i-iouter+1)
	826	ENDDO
	827	ENDDO
	828
	829	ENDDO
	830
	831	ENDDO
	832	!$OMP END PARALLEL
	833
	834	ENDIF
[1106]	835	CALL cpu_log( log_point_s(7), 'fft_y_1d', 'pause' )
[1]	836
	837	!
	838	!-- Transpose array
[1111]	839	#if defined( __parallel )
[1]	840	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
[622]	841	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
[1]	842	CALL MPI_ALLTOALL( work(nxl,1,0), sendrecvcount_xy, MPI_REAL, &
	843	f_out(1,1,nys_x,1), sendrecvcount_xy, MPI_REAL, &
	844	comm1dx, ierr )
	845	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
[1111]	846	#endif
[1]	847
	848	END SUBROUTINE ffty_tr_yx
	849
	850
	851	!------------------------------------------------------------------------------!
[1682]	852	! Description:
	853	! ------------
	854	!> Transposition x --> y with a subsequent backward Fourier transformation for
	855	!> a 1d-decomposition along x
[1]	856	!------------------------------------------------------------------------------!
[1682]	857	SUBROUTINE tr_xy_ffty( f_in, f_out )
[1]	858
[1682]	859
[1320]	860	USE control_parameters, &
	861	ONLY: host
	862
	863	USE cpulog, &
	864	ONLY: cpu_log, log_point_s
	865
	866	USE kinds
	867
[1]	868	USE pegrid
	869
	870	IMPLICIT NONE
	871
[1682]	872	INTEGER(iwp) :: i !<
	873	INTEGER(iwp) :: iend !<
	874	INTEGER(iwp) :: iouter !<
	875	INTEGER(iwp) :: ir !<
	876	INTEGER(iwp) :: j !<
	877	INTEGER(iwp) :: k !<
[1]	878
[1682]	879	INTEGER(iwp), PARAMETER :: stridex = 4 !<
[1320]	880
[1682]	881	REAL(wp), DIMENSION(0:ny,stridex) :: work_ffty !<
[1]	882	#if defined( __nec )
[1682]	883	REAL(wp), DIMENSION(0:ny+1,1:nz,nxl:nxr) :: work_ffty_vec !<
[1]	884	#endif
[1682]	885	REAL(wp), DIMENSION(nnx,1:nz,nys_x:nyn_x,pdims(1)) :: f_in !<
	886	REAL(wp), DIMENSION(1:nz,0:ny,nxl:nxr) :: f_out !<
	887	REAL(wp), DIMENSION(nxl:nxr,1:nz,0:ny) :: work !<
[1]	888
	889	!
	890	!-- Transpose array
[1111]	891	#if defined( __parallel )
[1]	892	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
[622]	893	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
[1]	894	CALL MPI_ALLTOALL( f_in(1,1,nys_x,1), sendrecvcount_xy, MPI_REAL, &
	895	work(nxl,1,0), sendrecvcount_xy, MPI_REAL, &
	896	comm1dx, ierr )
	897	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
[1111]	898	#endif
[1]	899
	900	!
	901	!-- Resort the data in a way that y becomes the first index and carry out the
	902	!-- backward fft along y.
[1106]	903	CALL cpu_log( log_point_s(7), 'fft_y_1d', 'continue' )
[1]	904
	905	IF ( host(1:3) == 'nec' ) THEN
	906	#if defined( __nec )
	907	!
	908	!-- Code optimized for vector processors
[85]	909	!$OMP PARALLEL PRIVATE ( i, j, k )
[1]	910	!$OMP DO
	911	DO k = 1, nz
	912	DO j = 0, ny
	913	DO i = nxl, nxr
	914	work_ffty_vec(j,k,i) = work(i,k,j)
	915	ENDDO
	916	ENDDO
	917	ENDDO
	918
	919	!$OMP DO
	920	DO i = nxl, nxr
	921
	922	CALL fft_y_m( work_ffty_vec(:,:,i), ny+1, 'backward' )
	923
	924	DO j = 0, ny
	925	DO k = 1, nz
	926	f_out(k,j,i) = work_ffty_vec(j,k,i)
	927	ENDDO
	928	ENDDO
	929
	930	ENDDO
	931	!$OMP END PARALLEL
	932	#endif
	933
	934	ELSE
	935
	936	!
	937	!-- Cache optimized code.
	938	!-- The i-(x-)direction is split into a strided outer loop and an inner
	939	!-- loop for better cache performance
	940	!$OMP PARALLEL PRIVATE ( i, iend, iouter, ir, j, k, work_ffty )
	941	!$OMP DO
	942	DO iouter = nxl, nxr, stridex
	943
	944	iend = MIN( iouter+stridex-1, nxr ) ! Upper bound for inner i loop
	945
	946	DO k = 1, nz
	947	!
	948	!-- Resort
	949	DO j = 0, ny
	950	DO i = iouter, iend
	951	work_ffty(j,i-iouter+1) = work(i,k,j)
	952	ENDDO
	953	ENDDO
	954
	955	DO i = iouter, iend
	956
	957	!
	958	!-- FFT along y
	959	ir = i-iouter+1 ! counter within a stride
[1106]	960	CALL fft_y_1d( work_ffty(:,ir), 'backward' )
[1]	961
	962	DO j = 0, ny
	963	f_out(k,j,i) = work_ffty(j,ir)
	964	ENDDO
	965	ENDDO
	966
	967	ENDDO
	968
	969	ENDDO
	970	!$OMP END PARALLEL
	971
	972	ENDIF
	973
[1106]	974	CALL cpu_log( log_point_s(7), 'fft_y_1d', 'stop' )
[1]	975
	976	END SUBROUTINE tr_xy_ffty
	977
	978
	979	!------------------------------------------------------------------------------!
[1682]	980	! Description:
	981	! ------------
	982	!> FFT along x, solution of the tridiagonal system and backward FFT for
	983	!> a 1d-decomposition along x
	984	!>
	985	!> @warning this subroutine may still not work for hybrid parallelization
	986	!> with OpenMP (for possible necessary changes see the original
	987	!> routine poisfft_hybrid, developed by Klaus Ketelsen, May 2002)
[1]	988	!------------------------------------------------------------------------------!
[1682]	989	SUBROUTINE fftx_tri_fftx( ar )
[1]	990
[1682]	991
[1320]	992	USE control_parameters, &
	993	ONLY: host
	994
	995	USE cpulog, &
	996	ONLY: cpu_log, log_point_s
	997
	998	USE grid_variables, &
	999	ONLY: ddx2, ddy2
	1000
	1001	USE kinds
	1002
[1]	1003	USE pegrid
	1004
	1005	IMPLICIT NONE
	1006
[1682]	1007	INTEGER(iwp) :: i !<
	1008	INTEGER(iwp) :: j !<
	1009	INTEGER(iwp) :: k !<
	1010	INTEGER(iwp) :: m !<
	1011	INTEGER(iwp) :: n !<
	1012	INTEGER(iwp) :: omp_get_thread_num !<
	1013	INTEGER(iwp) :: tn !<
[1]	1014
[1682]	1015	REAL(wp), DIMENSION(0:nx) :: work_fftx !<
	1016	REAL(wp), DIMENSION(0:nx,1:nz) :: work_trix !<
	1017	REAL(wp), DIMENSION(nnx,1:nz,nys_x:nyn_x,pdims(1)) :: ar !<
	1018	REAL(wp), DIMENSION(:,:,:,:), ALLOCATABLE :: tri !<
[1]	1019
	1020
[1106]	1021	CALL cpu_log( log_point_s(33), 'fft_x_1d + tridia', 'start' )
[1]	1022
	1023	ALLOCATE( tri(5,0:nx,0:nz-1,0:threads_per_task-1) )
	1024
	1025	tn = 0 ! Default thread number in case of one thread
	1026	!$OMP PARALLEL DO PRIVATE ( i, j, k, m, n, tn, work_fftx, work_trix )
	1027	DO j = nys_x, nyn_x
	1028
	1029	!$ tn = omp_get_thread_num()
	1030
	1031	IF ( host(1:3) == 'nec' ) THEN
	1032	!
	1033	!-- Code optimized for vector processors
	1034	DO k = 1, nz
	1035
	1036	m = 0
	1037	DO n = 1, pdims(1)
[1003]	1038	DO i = 1, nnx
[1]	1039	work_trix(m,k) = ar(i,k,j,n)
	1040	m = m + 1
	1041	ENDDO
	1042	ENDDO
	1043
	1044	ENDDO
	1045
	1046	CALL fft_x_m( work_trix, 'forward' )
	1047
	1048	ELSE
	1049	!
	1050	!-- Cache optimized code
	1051	DO k = 1, nz
	1052
	1053	m = 0
	1054	DO n = 1, pdims(1)
[1003]	1055	DO i = 1, nnx
[1]	1056	work_fftx(m) = ar(i,k,j,n)
	1057	m = m + 1
	1058	ENDDO
	1059	ENDDO
	1060
[1106]	1061	CALL fft_x_1d( work_fftx, 'forward' )
[1]	1062
	1063	DO i = 0, nx
	1064	work_trix(i,k) = work_fftx(i)
	1065	ENDDO
	1066
	1067	ENDDO
	1068
	1069	ENDIF
	1070
	1071	!
	1072	!-- Solve the linear equation system
	1073	CALL tridia_1dd( ddx2, ddy2, nx, ny, j, work_trix, tri(:,:,:,tn) )
	1074
	1075	IF ( host(1:3) == 'nec' ) THEN
	1076	!
	1077	!-- Code optimized for vector processors
	1078	CALL fft_x_m( work_trix, 'backward' )
	1079
	1080	DO k = 1, nz
	1081
	1082	m = 0
	1083	DO n = 1, pdims(1)
[1003]	1084	DO i = 1, nnx
[1]	1085	ar(i,k,j,n) = work_trix(m,k)
	1086	m = m + 1
	1087	ENDDO
	1088	ENDDO
	1089
	1090	ENDDO
	1091
	1092	ELSE
	1093	!
	1094	!-- Cache optimized code
	1095	DO k = 1, nz
	1096
	1097	DO i = 0, nx
	1098	work_fftx(i) = work_trix(i,k)
	1099	ENDDO
	1100
[1106]	1101	CALL fft_x_1d( work_fftx, 'backward' )
[1]	1102
	1103	m = 0
	1104	DO n = 1, pdims(1)
[1003]	1105	DO i = 1, nnx
[1]	1106	ar(i,k,j,n) = work_fftx(m)
	1107	m = m + 1
	1108	ENDDO
	1109	ENDDO
	1110
	1111	ENDDO
	1112
	1113	ENDIF
	1114
	1115	ENDDO
	1116
	1117	DEALLOCATE( tri )
	1118
[1106]	1119	CALL cpu_log( log_point_s(33), 'fft_x_1d + tridia', 'stop' )
[1]	1120
	1121	END SUBROUTINE fftx_tri_fftx
	1122
	1123
	1124	!------------------------------------------------------------------------------!
[1682]	1125	! Description:
	1126	! ------------
	1127	!> Fourier-transformation along x with subsequent transposition x --> y for
	1128	!> a 1d-decomposition along y.
	1129	!>
	1130	!> @attention NEC-branch of this routine may significantly profit from
	1131	!> further optimizations. So far, performance is much worse than
	1132	!> for routine ffty_tr_yx (more than three times slower).
[1]	1133	!------------------------------------------------------------------------------!
[1682]	1134	SUBROUTINE fftx_tr_xy( f_in, f_out )
[1]	1135
[1682]	1136
[1320]	1137	USE control_parameters, &
	1138	ONLY: host
	1139
	1140	USE cpulog, &
	1141	ONLY: cpu_log, log_point_s
	1142
	1143	USE kinds
	1144
[1]	1145	USE pegrid
	1146
	1147	IMPLICIT NONE
	1148
[1682]	1149	INTEGER(iwp) :: i !<
	1150	INTEGER(iwp) :: j !<
	1151	INTEGER(iwp) :: k !<
[1]	1152
[1682]	1153	REAL(wp), DIMENSION(0:nx,1:nz,nys:nyn) :: work_fftx !<
	1154	REAL(wp), DIMENSION(1:nz,nys:nyn,0:nx) :: f_in !<
	1155	REAL(wp), DIMENSION(nny,1:nz,nxl_y:nxr_y,pdims(2)) :: f_out !<
	1156	REAL(wp), DIMENSION(nys:nyn,1:nz,0:nx) :: work !<
[1]	1157
	1158	!
	1159	!-- Carry out the FFT along x, where all data are present due to the
	1160	!-- 1d-decomposition along y. Resort the data in a way that y becomes
	1161	!-- the first index.
[1106]	1162	CALL cpu_log( log_point_s(4), 'fft_x_1d', 'start' )
[1]	1163
	1164	IF ( host(1:3) == 'nec' ) THEN
	1165	!
	1166	!-- Code for vector processors
[85]	1167	!$OMP PARALLEL PRIVATE ( i, j, k )
[1]	1168	!$OMP DO
	1169	DO i = 0, nx
	1170
	1171	DO j = nys, nyn
	1172	DO k = 1, nz
	1173	work_fftx(i,k,j) = f_in(k,j,i)
	1174	ENDDO
	1175	ENDDO
	1176
	1177	ENDDO
	1178
	1179	!$OMP DO
	1180	DO j = nys, nyn
	1181
	1182	CALL fft_x_m( work_fftx(:,:,j), 'forward' )
	1183
	1184	DO k = 1, nz
	1185	DO i = 0, nx
	1186	work(j,k,i) = work_fftx(i,k,j)
	1187	ENDDO
	1188	ENDDO
	1189
	1190	ENDDO
	1191	!$OMP END PARALLEL
	1192
	1193	ELSE
	1194
	1195	!
	1196	!-- Cache optimized code (there might be still a potential for better
	1197	!-- optimization).
[696]	1198	!$OMP PARALLEL PRIVATE (i,j,k)
[1]	1199	!$OMP DO
	1200	DO i = 0, nx
	1201
	1202	DO j = nys, nyn
	1203	DO k = 1, nz
	1204	work_fftx(i,k,j) = f_in(k,j,i)
	1205	ENDDO
	1206	ENDDO
	1207
	1208	ENDDO
	1209
	1210	!$OMP DO
	1211	DO j = nys, nyn
	1212	DO k = 1, nz
	1213
[1106]	1214	CALL fft_x_1d( work_fftx(0:nx,k,j), 'forward' )
[1]	1215
	1216	DO i = 0, nx
	1217	work(j,k,i) = work_fftx(i,k,j)
	1218	ENDDO
	1219	ENDDO
	1220
	1221	ENDDO
	1222	!$OMP END PARALLEL
	1223
	1224	ENDIF
[1106]	1225	CALL cpu_log( log_point_s(4), 'fft_x_1d', 'pause' )
[1]	1226
	1227	!
	1228	!-- Transpose array
[1111]	1229	#if defined( __parallel )
[1]	1230	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
[622]	1231	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
[1]	1232	CALL MPI_ALLTOALL( work(nys,1,0), sendrecvcount_xy, MPI_REAL, &
	1233	f_out(1,1,nxl_y,1), sendrecvcount_xy, MPI_REAL, &
	1234	comm1dy, ierr )
	1235	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
[1111]	1236	#endif
[1]	1237
	1238	END SUBROUTINE fftx_tr_xy
	1239
	1240
	1241	!------------------------------------------------------------------------------!
[1682]	1242	! Description:
	1243	! ------------
	1244	!> Transposition y --> x with a subsequent backward Fourier transformation for
	1245	!> a 1d-decomposition along x.
[1]	1246	!------------------------------------------------------------------------------!
[1682]	1247	SUBROUTINE tr_yx_fftx( f_in, f_out )
[1]	1248
[1682]	1249
[1320]	1250	USE control_parameters, &
	1251	ONLY: host
	1252
	1253	USE cpulog, &
	1254	ONLY: cpu_log, log_point_s
	1255
	1256	USE kinds
	1257
[1]	1258	USE pegrid
	1259
	1260	IMPLICIT NONE
	1261
[1682]	1262	INTEGER(iwp) :: i !<
	1263	INTEGER(iwp) :: j !<
	1264	INTEGER(iwp) :: k !<
[1]	1265
[1682]	1266	REAL(wp), DIMENSION(0:nx,1:nz,nys:nyn) :: work_fftx !<
	1267	REAL(wp), DIMENSION(nny,1:nz,nxl_y:nxr_y,pdims(2)) :: f_in !<
	1268	REAL(wp), DIMENSION(1:nz,nys:nyn,0:nx) :: f_out !<
	1269	REAL(wp), DIMENSION(nys:nyn,1:nz,0:nx) :: work !<
[1]	1270
	1271	!
	1272	!-- Transpose array
[1111]	1273	#if defined( __parallel )
[1]	1274	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
[622]	1275	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
[1]	1276	CALL MPI_ALLTOALL( f_in(1,1,nxl_y,1), sendrecvcount_xy, MPI_REAL, &
	1277	work(nys,1,0), sendrecvcount_xy, MPI_REAL, &
	1278	comm1dy, ierr )
	1279	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
[1111]	1280	#endif
[1]	1281
	1282	!
	1283	!-- Carry out the FFT along x, where all data are present due to the
	1284	!-- 1d-decomposition along y. Resort the data in a way that y becomes
	1285	!-- the first index.
[1106]	1286	CALL cpu_log( log_point_s(4), 'fft_x_1d', 'continue' )
[1]	1287
	1288	IF ( host(1:3) == 'nec' ) THEN
	1289	!
	1290	!-- Code optimized for vector processors
[85]	1291	!$OMP PARALLEL PRIVATE ( i, j, k )
[1]	1292	!$OMP DO
	1293	DO j = nys, nyn
	1294
	1295	DO k = 1, nz
	1296	DO i = 0, nx
	1297	work_fftx(i,k,j) = work(j,k,i)
	1298	ENDDO
	1299	ENDDO
	1300
	1301	CALL fft_x_m( work_fftx(:,:,j), 'backward' )
	1302
	1303	ENDDO
	1304
	1305	!$OMP DO
	1306	DO i = 0, nx
	1307	DO j = nys, nyn
	1308	DO k = 1, nz
	1309	f_out(k,j,i) = work_fftx(i,k,j)
	1310	ENDDO
	1311	ENDDO
	1312	ENDDO
	1313	!$OMP END PARALLEL
	1314
	1315	ELSE
	1316
	1317	!
	1318	!-- Cache optimized code (there might be still a potential for better
	1319	!-- optimization).
[696]	1320	!$OMP PARALLEL PRIVATE (i,j,k)
[1]	1321	!$OMP DO
	1322	DO j = nys, nyn
	1323	DO k = 1, nz
	1324
	1325	DO i = 0, nx
	1326	work_fftx(i,k,j) = work(j,k,i)
	1327	ENDDO
	1328
[1106]	1329	CALL fft_x_1d( work_fftx(0:nx,k,j), 'backward' )
[1]	1330
	1331	ENDDO
	1332	ENDDO
	1333
	1334	!$OMP DO
	1335	DO i = 0, nx
	1336	DO j = nys, nyn
	1337	DO k = 1, nz
	1338	f_out(k,j,i) = work_fftx(i,k,j)
	1339	ENDDO
	1340	ENDDO
	1341	ENDDO
	1342	!$OMP END PARALLEL
	1343
	1344	ENDIF
[1106]	1345	CALL cpu_log( log_point_s(4), 'fft_x_1d', 'stop' )
[1]	1346
	1347	END SUBROUTINE tr_yx_fftx
	1348
	1349
	1350	!------------------------------------------------------------------------------!
[1682]	1351	! Description:
	1352	! ------------
	1353	!> FFT along y, solution of the tridiagonal system and backward FFT for
	1354	!> a 1d-decomposition along y.
	1355	!>
	1356	!> @warning this subroutine may still not work for hybrid parallelization
	1357	!> with OpenMP (for possible necessary changes see the original
	1358	!> routine poisfft_hybrid, developed by Klaus Ketelsen, May 2002)
[1]	1359	!------------------------------------------------------------------------------!
[1682]	1360	SUBROUTINE ffty_tri_ffty( ar )
[1]	1361
[1682]	1362
[1320]	1363	USE control_parameters, &
	1364	ONLY: host
	1365
	1366	USE cpulog, &
	1367	ONLY: cpu_log, log_point_s
	1368
	1369	USE grid_variables, &
	1370	ONLY: ddx2, ddy2
	1371
	1372	USE kinds
	1373
[1]	1374	USE pegrid
	1375
	1376	IMPLICIT NONE
	1377
[1682]	1378	INTEGER(iwp) :: i !<
	1379	INTEGER(iwp) :: j !<
	1380	INTEGER(iwp) :: k !<
	1381	INTEGER(iwp) :: m !<
	1382	INTEGER(iwp) :: n !<
	1383	INTEGER(iwp) :: omp_get_thread_num !<
	1384	INTEGER(iwp) :: tn !<
[1]	1385
[1682]	1386	REAL(wp), DIMENSION(0:ny) :: work_ffty !<
	1387	REAL(wp), DIMENSION(0:ny,1:nz) :: work_triy !<
	1388	REAL(wp), DIMENSION(nny,1:nz,nxl_y:nxr_y,pdims(2)) :: ar !<
	1389	REAL(wp), DIMENSION(:,:,:,:), ALLOCATABLE :: tri !<
[1]	1390
	1391
[1106]	1392	CALL cpu_log( log_point_s(39), 'fft_y_1d + tridia', 'start' )
[1]	1393
	1394	ALLOCATE( tri(5,0:ny,0:nz-1,0:threads_per_task-1) )
	1395
	1396	tn = 0 ! Default thread number in case of one thread
[696]	1397	!$OMP PARALLEL DO PRIVATE ( i, j, k, m, n, tn, work_ffty, work_triy )
[1]	1398	DO i = nxl_y, nxr_y
	1399
	1400	!$ tn = omp_get_thread_num()
	1401
	1402	IF ( host(1:3) == 'nec' ) THEN
	1403	!
	1404	!-- Code optimized for vector processors
	1405	DO k = 1, nz
	1406
	1407	m = 0
	1408	DO n = 1, pdims(2)
[1003]	1409	DO j = 1, nny
[1]	1410	work_triy(m,k) = ar(j,k,i,n)
	1411	m = m + 1
	1412	ENDDO
	1413	ENDDO
	1414
	1415	ENDDO
	1416
	1417	CALL fft_y_m( work_triy, ny, 'forward' )
	1418
	1419	ELSE
	1420	!
	1421	!-- Cache optimized code
	1422	DO k = 1, nz
	1423
	1424	m = 0
	1425	DO n = 1, pdims(2)
[1003]	1426	DO j = 1, nny
[1]	1427	work_ffty(m) = ar(j,k,i,n)
	1428	m = m + 1
	1429	ENDDO
	1430	ENDDO
	1431
[1106]	1432	CALL fft_y_1d( work_ffty, 'forward' )
[1]	1433
	1434	DO j = 0, ny
	1435	work_triy(j,k) = work_ffty(j)
	1436	ENDDO
	1437
	1438	ENDDO
	1439
	1440	ENDIF
	1441
	1442	!
	1443	!-- Solve the linear equation system
	1444	CALL tridia_1dd( ddy2, ddx2, ny, nx, i, work_triy, tri(:,:,:,tn) )
	1445
	1446	IF ( host(1:3) == 'nec' ) THEN
	1447	!
	1448	!-- Code optimized for vector processors
	1449	CALL fft_y_m( work_triy, ny, 'backward' )
	1450
	1451	DO k = 1, nz
	1452
	1453	m = 0
	1454	DO n = 1, pdims(2)
[1003]	1455	DO j = 1, nny
[1]	1456	ar(j,k,i,n) = work_triy(m,k)
	1457	m = m + 1
	1458	ENDDO
	1459	ENDDO
	1460
	1461	ENDDO
	1462
	1463	ELSE
	1464	!
	1465	!-- Cache optimized code
	1466	DO k = 1, nz
	1467
	1468	DO j = 0, ny
	1469	work_ffty(j) = work_triy(j,k)
	1470	ENDDO
	1471
[1106]	1472	CALL fft_y_1d( work_ffty, 'backward' )
[1]	1473
	1474	m = 0
	1475	DO n = 1, pdims(2)
[1003]	1476	DO j = 1, nny
[1]	1477	ar(j,k,i,n) = work_ffty(m)
	1478	m = m + 1
	1479	ENDDO
	1480	ENDDO
	1481
	1482	ENDDO
	1483
	1484	ENDIF
	1485
	1486	ENDDO
	1487
	1488	DEALLOCATE( tri )
	1489
[1106]	1490	CALL cpu_log( log_point_s(39), 'fft_y_1d + tridia', 'stop' )
[1]	1491
	1492	END SUBROUTINE ffty_tri_ffty
	1493
	1494	END MODULE poisfft_mod

Note: See TracBrowser for help on using the repository browser.

Download in other formats:

| Impressum | ©Leibniz Universität Hannover |