Home

Context Navigation

source: palm/trunk/SOURCE/poisfft_mod.f90 @ 2039

Last change on this file since 2039 was 2001, checked in by knoop, 8 years ago
last commit documented
Property svn:keywords set to `Id`
File size: 45.7 KB

Rev	Line
[1850]	1	!> @file poisfft_mod.f90
[2000]	2	!------------------------------------------------------------------------------!
[1036]	3	! This file is part of PALM.
	4	!
[2000]	5	! PALM is free software: you can redistribute it and/or modify it under the
	6	! terms of the GNU General Public License as published by the Free Software
	7	! Foundation, either version 3 of the License, or (at your option) any later
	8	! version.
[1036]	9	!
	10	! PALM is distributed in the hope that it will be useful, but WITHOUT ANY
	11	! WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
	12	! A PARTICULAR PURPOSE. See the GNU General Public License for more details.
	13	!
	14	! You should have received a copy of the GNU General Public License along with
	15	! PALM. If not, see <http://www.gnu.org/licenses/>.
	16	!
[2000]	17	! Copyright 1997-2016 Leibniz Universitaet Hannover
	18	!------------------------------------------------------------------------------!
[1036]	19	!
[484]	20	! Current revisions:
[1]	21	! -----------------
[1683]	22	!
[2001]	23	!
[1321]	24	! Former revisions:
	25	! -----------------
	26	! $Id: poisfft_mod.f90 2001 2016-08-20 18:41:22Z gronemeier $
	27	!
[2001]	28	! 2000 2016-08-20 18:09:15Z knoop
	29	! Forced header and separation lines into 80 columns
	30	!
[1851]	31	! 1850 2016-04-08 13:29:27Z maronga
	32	! Module renamed
	33	!
	34	!
[1805]	35	! 1804 2016-04-05 16:30:18Z maronga
	36	! Removed code for parameter file check (__check)
	37	!
[1683]	38	! 1682 2015-10-07 23:56:08Z knoop
	39	! Code annotations made doxygen readable
	40	!
[1483]	41	! 1482 2014-10-18 12:34:45Z raasch
	42	! use 2d-decomposition, if accelerator boards are used
	43	!
[1407]	44	! 1406 2014-05-16 13:47:01Z raasch
	45	! bugfix for pgi 14.4: declare create moved after array declaration
	46	!
[1321]	47	! 1320 2014-03-20 08:40:49Z raasch
[1320]	48	! ONLY-attribute added to USE-statements,
	49	! kind-parameters added to all INTEGER and REAL declaration statements,
	50	! kinds are defined in new module kinds,
	51	! old module precision_kind is removed,
	52	! revision history before 2012 removed,
	53	! comment fields (!:) to be used for variable explanations added to
	54	! all variable declaration statements
[1217]	55	!
[1319]	56	! 1318 2014-03-17 13:35:16Z raasch
	57	! module interfaces removed
	58	!
[1307]	59	! 1306 2014-03-13 14:30:59Z raasch
	60	! openmp sections removed from the overlap branch,
	61	! second argument removed from parameter list
	62	!
[1217]	63	! 1216 2013-08-26 09:31:42Z raasch
[1216]	64	! resorting of arrays moved to separate routines resort_for_...,
	65	! one argument, used as temporary work array, removed from all transpose
	66	! routines
	67	! overlapping fft / transposition implemented
[1112]	68	!
[1213]	69	! 1212 2013-08-15 08:46:27Z raasch
	70	! tridia routines moved to seperate module tridia_solver
	71	!
[1209]	72	! 1208 2013-08-13 06:41:49Z raasch
	73	! acc-update clauses added for "ar" so that ffts other than cufft can also be
	74	! used (although they are not ported and will give a poor performance)
	75	!
[1112]	76	! 1111 2013-03-08 23:54:10Z raasch
[1111]	77	! further openACC porting of non-parallel (MPI) branch:
	78	! tridiagonal routines split into extermal subroutines (instead using CONTAINS),
	79	! no distinction between parallel/non-parallel in poisfft and tridia any more,
[1112]	80	! tridia routines moved to end of file because of probable bug in PGI compiler 12.5
[1111]	81	! (otherwise "invalid device function" is indicated during runtime),
	82	! optimization of tridia routines: constant elements and coefficients of tri are
	83	! stored in seperate arrays ddzuw and tric, last dimension of tri reduced from 5
	84	! to 2,
	85	! poisfft_init is now called internally from poisfft, maketri is called from
	86	! poisfft_init,
	87	! ibc_p_b = 2 removed
[1]	88	!
[1107]	89	! 1106 2013-03-04 05:31:38Z raasch
	90	! routines fftx, ffty, fftxp, fftyp removed, calls replaced by fft_x, fft_y,
	91	! in the 1D-decomposition routines fft_x, ffty are replaced by fft_x_1d,
	92	! fft_y_1d
	93	!
[1104]	94	! 1103 2013-02-20 02:15:53Z raasch
	95	! tri, ar, and ar1 arguments in tridia-routines (2d) are removed because they
	96	! sometimes cause segmentation faults with intel 12.1 compiler
	97	!
[1093]	98	! 1092 2013-02-02 11:24:22Z raasch
	99	! unused variables removed
	100	!
[1037]	101	! 1036 2012-10-22 13:43:42Z raasch
	102	! code put under GPL (PALM 3.9)
	103	!
[1014]	104	! 2012-09-21 07:03:55Z raasch
	105	! FLOAT type conversion replaced by REAL
	106	!
[1004]	107	! 1003 2012-09-14 14:35:53Z raasch
	108	! indices nxa, nya, etc. replaced by nx, ny, etc.
	109	!
[941]	110	! 940 2012-07-09 14:31:00Z raasch
	111	! special handling of tri-array as an argument in tridia_1dd routines switched
	112	! off because it caused segmentation faults with intel 12.1 compiler
	113	!
[878]	114	! 877 2012-04-03 11:21:44Z suehring
	115	! Bugfix: Avoid divisions by zero in case of using a 'neumann' bc for the
	116	! pressure at the top of the model domain.
	117	!
[810]	118	! 809 2012-01-30 13:32:58Z maronga
	119	! Bugfix: replaced .AND. and .NOT. with && and ! in the preprocessor directives
	120	!
[808]	121	! 807 2012-01-25 11:53:51Z maronga
	122	! New cpp directive "__check" implemented which is used by check_namelist_files
	123	! (most of the code is unneeded by check_namelist_files).
	124	!
[1]	125	! Revision 1.1 1997/07/24 11:24:14 raasch
	126	! Initial revision
	127	!
	128	!
	129	! Description:
	130	! ------------
[1682]	131	!> Solves the Poisson equation with a 2D spectral method
	132	!> d^2 p / dx^2 + d^2 p / dy^2 + d^2 p / dz^2 = s
	133	!>
	134	!> Input:
	135	!> real ar contains (nnz,nny,nnx) elements of the velocity divergence,
	136	!> starting from (1,nys,nxl)
	137	!>
	138	!> Output:
	139	!> real ar contains the solution for perturbation pressure p
[1]	140	!------------------------------------------------------------------------------!
[1682]	141	MODULE poisfft_mod
	142
[1]	143
[1320]	144	USE fft_xy, &
	145	ONLY: fft_init, fft_y, fft_y_1d, fft_y_m, fft_x, fft_x_1d, fft_x_m
[1]	146
[1320]	147	USE indices, &
	148	ONLY: nnx, nny, nx, nxl, nxr, ny, nys, nyn, nz
	149
	150	USE transpose_indices, &
	151	ONLY: nxl_y, nxl_z, nxr_y, nxr_z, nys_x, nys_z, nyn_x, nyn_z, nzb_x, &
	152	nzb_y, nzt_x, nzt_y
	153
	154	USE tridia_solver, &
	155	ONLY: tridia_1dd, tridia_init, tridia_substi, tridia_substi_overlap
	156
[1]	157	IMPLICIT NONE
	158
[1111]	159	LOGICAL, SAVE :: poisfft_initialized = .FALSE.
	160
[1]	161	PRIVATE
[807]	162
[1]	163	PUBLIC poisfft, poisfft_init
	164
	165	INTERFACE poisfft
	166	MODULE PROCEDURE poisfft
	167	END INTERFACE poisfft
	168
	169	INTERFACE poisfft_init
	170	MODULE PROCEDURE poisfft_init
	171	END INTERFACE poisfft_init
	172
[807]	173
[1]	174	CONTAINS
	175
[1682]	176	!------------------------------------------------------------------------------!
	177	! Description:
	178	! ------------
	179	!> @todo Missing subroutine description.
	180	!------------------------------------------------------------------------------!
[1]	181	SUBROUTINE poisfft_init
	182
[1320]	183	USE arrays_3d, &
	184	ONLY: ddzu_pres, ddzw
[1111]	185
[1320]	186	USE kinds
	187
[1111]	188	IMPLICIT NONE
	189
[1682]	190	INTEGER(iwp) :: k !<
[1111]	191
	192
[1]	193	CALL fft_init
	194
[1212]	195	CALL tridia_init
[1111]	196
	197	poisfft_initialized = .TRUE.
	198
[1]	199	END SUBROUTINE poisfft_init
	200
[1111]	201
[1804]	202
[1682]	203	!------------------------------------------------------------------------------!
	204	! Description:
	205	! ------------
	206	!> Two-dimensional Fourier Transformation in x- and y-direction.
	207	!------------------------------------------------------------------------------!
[1306]	208	SUBROUTINE poisfft( ar )
[1]	209
[1320]	210	USE control_parameters, &
	211	ONLY: fft_method, transpose_compute_overlap
	212
	213	USE cpulog, &
	214	ONLY: cpu_log, cpu_log_nowait, log_point_s
	215
	216	USE kinds
	217
[1]	218	USE pegrid
	219
	220	IMPLICIT NONE
	221
[1682]	222	INTEGER(iwp) :: ii !<
	223	INTEGER(iwp) :: iind !<
	224	INTEGER(iwp) :: inew !<
	225	INTEGER(iwp) :: jj !<
	226	INTEGER(iwp) :: jind !<
	227	INTEGER(iwp) :: jnew !<
	228	INTEGER(iwp) :: ki !<
	229	INTEGER(iwp) :: kk !<
	230	INTEGER(iwp) :: knew !<
	231	INTEGER(iwp) :: n !<
	232	INTEGER(iwp) :: nblk !<
	233	INTEGER(iwp) :: nnx_y !<
	234	INTEGER(iwp) :: nny_z !<
	235	INTEGER(iwp) :: nnz_t !<
	236	INTEGER(iwp) :: nnz_x !<
	237	INTEGER(iwp) :: nxl_y_bound !<
	238	INTEGER(iwp) :: nxr_y_bound !<
[1]	239
[1682]	240	INTEGER(iwp), DIMENSION(4) :: isave !<
[1320]	241
[1682]	242	REAL(wp), DIMENSION(1:nz,nys:nyn,nxl:nxr) :: ar !<
	243	REAL(wp), DIMENSION(nys:nyn,nxl:nxr,1:nz) :: ar_inv !<
[1216]	244	!$acc declare create( ar_inv )
[1]	245
[1682]	246	REAL(wp), DIMENSION(:,:,:), ALLOCATABLE :: ar1 !<
	247	REAL(wp), DIMENSION(:,:,:), ALLOCATABLE :: f_in !<
	248	REAL(wp), DIMENSION(:,:,:), ALLOCATABLE :: f_inv !<
	249	REAL(wp), DIMENSION(:,:,:), ALLOCATABLE :: f_out_y !<
	250	REAL(wp), DIMENSION(:,:,:), ALLOCATABLE :: f_out_z !<
[1216]	251
	252
[1]	253	CALL cpu_log( log_point_s(3), 'poisfft', 'start' )
	254
[1111]	255	IF ( .NOT. poisfft_initialized ) CALL poisfft_init
	256
[1]	257	!
	258	!-- Two-dimensional Fourier Transformation in x- and y-direction.
[1482]	259	IF ( pdims(2) == 1 .AND. pdims(1) > 1 .AND. num_acc_per_node == 0 ) &
	260	THEN
[1]	261
	262	!
	263	!-- 1d-domain-decomposition along x:
	264	!-- FFT along y and transposition y --> x
[1216]	265	CALL ffty_tr_yx( ar, ar )
[1]	266
	267	!
	268	!-- FFT along x, solving the tridiagonal system and backward FFT
	269	CALL fftx_tri_fftx( ar )
	270
	271	!
	272	!-- Transposition x --> y and backward FFT along y
[1216]	273	CALL tr_xy_ffty( ar, ar )
[1]	274
[1482]	275	ELSEIF ( pdims(1) == 1 .AND. pdims(2) > 1 .AND. num_acc_per_node == 0 ) &
	276	THEN
[1]	277
	278	!
	279	!-- 1d-domain-decomposition along y:
	280	!-- FFT along x and transposition x --> y
[1216]	281	CALL fftx_tr_xy( ar, ar )
[1]	282
	283	!
	284	!-- FFT along y, solving the tridiagonal system and backward FFT
	285	CALL ffty_tri_ffty( ar )
	286
	287	!
	288	!-- Transposition y --> x and backward FFT along x
[1216]	289	CALL tr_yx_fftx( ar, ar )
[1]	290
[1216]	291	ELSEIF ( .NOT. transpose_compute_overlap ) THEN
[1]	292
	293	!
[1111]	294	!-- 2d-domain-decomposition or no decomposition (1 PE run)
[1]	295	!-- Transposition z --> x
	296	CALL cpu_log( log_point_s(5), 'transpo forward', 'start' )
[1216]	297	CALL resort_for_zx( ar, ar_inv )
	298	CALL transpose_zx( ar_inv, ar )
[1]	299	CALL cpu_log( log_point_s(5), 'transpo forward', 'pause' )
	300
	301	CALL cpu_log( log_point_s(4), 'fft_x', 'start' )
[1208]	302	IF ( fft_method /= 'system-specific' ) THEN
	303	!$acc update host( ar )
	304	ENDIF
[1106]	305	CALL fft_x( ar, 'forward' )
[1208]	306	IF ( fft_method /= 'system-specific' ) THEN
	307	!$acc update device( ar )
	308	ENDIF
[1]	309	CALL cpu_log( log_point_s(4), 'fft_x', 'pause' )
	310
	311	!
	312	!-- Transposition x --> y
	313	CALL cpu_log( log_point_s(5), 'transpo forward', 'continue' )
[1216]	314	CALL resort_for_xy( ar, ar_inv )
	315	CALL transpose_xy( ar_inv, ar )
[1]	316	CALL cpu_log( log_point_s(5), 'transpo forward', 'pause' )
	317
	318	CALL cpu_log( log_point_s(7), 'fft_y', 'start' )
[1208]	319	IF ( fft_method /= 'system-specific' ) THEN
	320	!$acc update host( ar )
	321	ENDIF
[1216]	322	CALL fft_y( ar, 'forward', ar_tr = ar, &
	323	nxl_y_bound = nxl_y, nxr_y_bound = nxr_y, &
	324	nxl_y_l = nxl_y, nxr_y_l = nxr_y )
[1208]	325	IF ( fft_method /= 'system-specific' ) THEN
	326	!$acc update device( ar )
	327	ENDIF
[1]	328	CALL cpu_log( log_point_s(7), 'fft_y', 'pause' )
	329
	330	!
	331	!-- Transposition y --> z
	332	CALL cpu_log( log_point_s(5), 'transpo forward', 'continue' )
[1216]	333	CALL resort_for_yz( ar, ar_inv )
	334	CALL transpose_yz( ar_inv, ar )
[1]	335	CALL cpu_log( log_point_s(5), 'transpo forward', 'stop' )
	336
	337	!
[1106]	338	!-- Solve the tridiagonal equation system along z
[1]	339	CALL cpu_log( log_point_s(6), 'tridia', 'start' )
[1212]	340	CALL tridia_substi( ar )
[1]	341	CALL cpu_log( log_point_s(6), 'tridia', 'stop' )
	342
	343	!
	344	!-- Inverse Fourier Transformation
	345	!-- Transposition z --> y
	346	CALL cpu_log( log_point_s(8), 'transpo invers', 'start' )
[1216]	347	CALL transpose_zy( ar, ar_inv )
	348	CALL resort_for_zy( ar_inv, ar )
[1]	349	CALL cpu_log( log_point_s(8), 'transpo invers', 'pause' )
	350
	351	CALL cpu_log( log_point_s(7), 'fft_y', 'continue' )
[1208]	352	IF ( fft_method /= 'system-specific' ) THEN
	353	!$acc update host( ar )
	354	ENDIF
[1216]	355	CALL fft_y( ar, 'backward', ar_tr = ar, &
	356	nxl_y_bound = nxl_y, nxr_y_bound = nxr_y, &
	357	nxl_y_l = nxl_y, nxr_y_l = nxr_y )
[1208]	358	IF ( fft_method /= 'system-specific' ) THEN
	359	!$acc update device( ar )
	360	ENDIF
[1]	361	CALL cpu_log( log_point_s(7), 'fft_y', 'stop' )
	362
	363	!
	364	!-- Transposition y --> x
	365	CALL cpu_log( log_point_s(8), 'transpo invers', 'continue' )
[1216]	366	CALL transpose_yx( ar, ar_inv )
	367	CALL resort_for_yx( ar_inv, ar )
[1]	368	CALL cpu_log( log_point_s(8), 'transpo invers', 'pause' )
	369
	370	CALL cpu_log( log_point_s(4), 'fft_x', 'continue' )
[1208]	371	IF ( fft_method /= 'system-specific' ) THEN
	372	!$acc update host( ar )
	373	ENDIF
[1106]	374	CALL fft_x( ar, 'backward' )
[1208]	375	IF ( fft_method /= 'system-specific' ) THEN
	376	!$acc update device( ar )
	377	ENDIF
[1]	378	CALL cpu_log( log_point_s(4), 'fft_x', 'stop' )
	379
	380	!
	381	!-- Transposition x --> z
	382	CALL cpu_log( log_point_s(8), 'transpo invers', 'continue' )
[1216]	383	CALL transpose_xz( ar, ar_inv )
	384	CALL resort_for_xz( ar_inv, ar )
[1]	385	CALL cpu_log( log_point_s(8), 'transpo invers', 'stop' )
	386
[1216]	387	ELSE
	388
	389	!
	390	!-- 2d-domain-decomposition or no decomposition (1 PE run) with
	391	!-- overlapping transposition / fft
[1318]	392	!-- cputime logging must not use barriers, which would prevent overlapping
[1216]	393	ALLOCATE( f_out_y(0:ny,nxl_y:nxr_y,nzb_y:nzt_y), &
	394	f_out_z(0:nx,nys_x:nyn_x,nzb_x:nzt_x) )
	395	!
	396	!-- Transposition z --> x + subsequent fft along x
	397	ALLOCATE( f_inv(nys:nyn,nxl:nxr,1:nz) )
	398	CALL resort_for_zx( ar, f_inv )
	399	!
	400	!-- Save original indices and gridpoint counter
	401	isave(1) = nz
	402	isave(2) = nzb_x
	403	isave(3) = nzt_x
	404	isave(4) = sendrecvcount_zx
	405	!
	406	!-- Set new indices for transformation
	407	nblk = nz / pdims(1)
	408	nz = pdims(1)
	409	nnz_x = 1
	410	nzb_x = 1 + myidx * nnz_x
	411	nzt_x = ( myidx + 1 ) * nnz_x
	412	sendrecvcount_zx = nnx * nny * nnz_x
	413
[1306]	414	ALLOCATE( ar1(0:nx,nys_x:nyn_x,nzb_x:nzt_x) )
[1216]	415	ALLOCATE( f_in(nys:nyn,nxl:nxr,1:nz) )
	416
[1306]	417	DO kk = 1, nblk
[1216]	418
[1306]	419	IF ( kk == 1 ) THEN
[1318]	420	CALL cpu_log( log_point_s(5), 'transpo forward', 'start', cpu_log_nowait )
[1306]	421	ELSE
[1318]	422	CALL cpu_log( log_point_s(5), 'transpo forward', 'continue', cpu_log_nowait )
[1306]	423	ENDIF
[1216]	424
[1306]	425	DO knew = 1, nz
	426	ki = kk + nblk * ( knew - 1 )
	427	f_in(:,:,knew) = f_inv(:,:,ki)
	428	ENDDO
[1216]	429
[1306]	430	CALL transpose_zx( f_in, ar1(:,:,:))
	431	CALL cpu_log( log_point_s(5), 'transpo forward', 'pause' )
[1216]	432
[1306]	433	IF ( kk == 1 ) THEN
[1318]	434	CALL cpu_log( log_point_s(4), 'fft_x', 'start', cpu_log_nowait )
[1306]	435	ELSE
[1318]	436	CALL cpu_log( log_point_s(4), 'fft_x', 'continue', cpu_log_nowait )
[1216]	437	ENDIF
	438
[1306]	439	n = isave(2) + kk - 1
	440	CALL fft_x( ar1(:,:,:), 'forward', ar_2d = f_out_z(:,:,n))
	441	CALL cpu_log( log_point_s(4), 'fft_x', 'pause' )
[1216]	442
	443	ENDDO
	444	!
	445	!-- Restore original indices/counters
	446	nz = isave(1)
	447	nzb_x = isave(2)
	448	nzt_x = isave(3)
	449	sendrecvcount_zx = isave(4)
	450
	451	DEALLOCATE( ar1, f_in, f_inv )
	452
	453	!
	454	!-- Transposition x --> y + subsequent fft along y
	455	ALLOCATE( f_inv(nys_x:nyn_x,nzb_x:nzt_x,0:nx) )
	456	CALL resort_for_xy( f_out_z, f_inv )
	457	!
	458	!-- Save original indices and gridpoint counter
	459	isave(1) = nx
	460	isave(2) = nxl_y
	461	isave(3) = nxr_y
	462	isave(4) = sendrecvcount_xy
	463	!
	464	!-- Set new indices for transformation
	465	nblk = ( ( nx+1 ) / pdims(2) ) - 1
	466	nx = pdims(2)
	467	nnx_y = 1
	468	nxl_y = myidy * nnx_y
	469	nxr_y = ( myidy + 1 ) * nnx_y - 1
	470	sendrecvcount_xy = nnx_y * ( nyn_x-nys_x+1 ) * ( nzt_x-nzb_x+1 )
	471
[1306]	472	ALLOCATE( ar1(0:ny,nxl_y:nxr_y,nzb_y:nzt_y) )
[1216]	473	ALLOCATE( f_in(nys_x:nyn_x,nzb_x:nzt_x,0:nx) )
	474
[1306]	475	DO ii = 0, nblk
[1216]	476
[1318]	477	CALL cpu_log( log_point_s(5), 'transpo forward', 'continue', cpu_log_nowait )
[1216]	478
[1306]	479	DO inew = 0, nx-1
	480	iind = ii + ( nblk + 1 ) * inew
	481	f_in(:,:,inew) = f_inv(:,:,iind)
	482	ENDDO
[1216]	483
[1306]	484	CALL transpose_xy( f_in, ar1(:,:,:) )
[1216]	485
[1306]	486	CALL cpu_log( log_point_s(5), 'transpo forward', 'pause' )
[1216]	487
[1306]	488	IF ( ii == 1 ) THEN
[1318]	489	CALL cpu_log( log_point_s(7), 'fft_y', 'start', cpu_log_nowait )
[1306]	490	ELSE
[1318]	491	CALL cpu_log( log_point_s(7), 'fft_y', 'continue', cpu_log_nowait )
[1216]	492	ENDIF
	493
[1306]	494	nxl_y_bound = isave(2)
	495	nxr_y_bound = isave(3)
	496	n = isave(2) + ii
	497	CALL fft_y( ar1(:,:,:), 'forward', ar_tr = f_out_y, &
	498	nxl_y_bound = nxl_y_bound, nxr_y_bound = nxr_y_bound, &
	499	nxl_y_l = n, nxr_y_l = n )
[1216]	500
[1306]	501	CALL cpu_log( log_point_s(7), 'fft_y', 'pause' )
[1216]	502
	503	ENDDO
	504	!
	505	!-- Restore original indices/counters
	506	nx = isave(1)
	507	nxl_y = isave(2)
	508	nxr_y = isave(3)
	509	sendrecvcount_xy = isave(4)
	510
	511	DEALLOCATE( ar1, f_in, f_inv )
	512
	513	!
	514	!-- Transposition y --> z + subsequent tridia + resort for z --> y
	515	ALLOCATE( f_inv(nxl_y:nxr_y,nzb_y:nzt_y,0:ny) )
	516	CALL resort_for_yz( f_out_y, f_inv )
	517	!
	518	!-- Save original indices and gridpoint counter
	519	isave(1) = ny
	520	isave(2) = nys_z
	521	isave(3) = nyn_z
	522	isave(4) = sendrecvcount_yz
	523	!
	524	!-- Set new indices for transformation
	525	nblk = ( ( ny+1 ) / pdims(1) ) - 1
	526	ny = pdims(1)
	527	nny_z = 1
	528	nys_z = myidx * nny_z
	529	nyn_z = ( myidx + 1 ) * nny_z - 1
	530	sendrecvcount_yz = ( nxr_y-nxl_y+1 ) * nny_z * ( nzt_y-nzb_y+1 )
	531
[1306]	532	ALLOCATE( ar1(nxl_z:nxr_z,nys_z:nyn_z,1:nz) )
[1216]	533	ALLOCATE( f_in(nxl_y:nxr_y,nzb_y:nzt_y,0:ny) )
	534
[1306]	535	DO jj = 0, nblk
[1216]	536	!
[1306]	537	!-- Forward Fourier Transformation
	538	!-- Transposition y --> z
[1318]	539	CALL cpu_log( log_point_s(5), 'transpo forward', 'continue', cpu_log_nowait )
[1216]	540
[1306]	541	DO jnew = 0, ny-1
	542	jind = jj + ( nblk + 1 ) * jnew
	543	f_in(:,:,jnew) = f_inv(:,:,jind)
	544	ENDDO
[1216]	545
[1306]	546	CALL transpose_yz( f_in, ar1(:,:,:) )
[1216]	547
[1306]	548	IF ( jj == nblk ) THEN
	549	CALL cpu_log( log_point_s(5), 'transpo forward', 'stop' )
	550	ELSE
	551	CALL cpu_log( log_point_s(5), 'transpo forward', 'pause' )
[1216]	552	ENDIF
	553
	554	!
[1306]	555	!-- Solve the tridiagonal equation system along z
[1318]	556	CALL cpu_log( log_point_s(6), 'tridia', 'start', cpu_log_nowait )
[1216]	557
[1306]	558	n = isave(2) + jj
	559	CALL tridia_substi_overlap( ar1(:,:,:), n )
[1216]	560
[1306]	561	CALL cpu_log( log_point_s(6), 'tridia', 'stop' )
[1216]	562
[1306]	563	!
	564	!-- Inverse Fourier Transformation
	565	!-- Transposition z --> y
	566	!-- Only one thread should call MPI routines, therefore forward and
	567	!-- backward tranpose are in the same section
	568	IF ( jj == 0 ) THEN
[1318]	569	CALL cpu_log( log_point_s(8), 'transpo invers', 'start', cpu_log_nowait )
[1306]	570	ELSE
[1318]	571	CALL cpu_log( log_point_s(8), 'transpo invers', 'continue', cpu_log_nowait )
[1216]	572	ENDIF
	573
[1306]	574	CALL transpose_zy( ar1(:,:,:), f_in )
[1216]	575
[1306]	576	DO jnew = 0, ny-1
	577	jind = jj + ( nblk + 1 ) * jnew
	578	f_inv(:,:,jind) = f_in(:,:,jnew)
	579	ENDDO
[1216]	580
[1306]	581	CALL cpu_log( log_point_s(8), 'transpo invers', 'pause' )
[1216]	582
	583	ENDDO
	584	!
	585	!-- Restore original indices/counters
	586	ny = isave(1)
	587	nys_z = isave(2)
	588	nyn_z = isave(3)
	589	sendrecvcount_yz = isave(4)
	590
	591	CALL resort_for_zy( f_inv, f_out_y )
	592
	593	DEALLOCATE( ar1, f_in, f_inv )
	594
	595	!
	596	!-- fft along y backward + subsequent transposition y --> x
	597	ALLOCATE( f_inv(nys_x:nyn_x,nzb_x:nzt_x,0:nx) )
	598	!
	599	!-- Save original indices and gridpoint counter
	600	isave(1) = nx
	601	isave(2) = nxl_y
	602	isave(3) = nxr_y
	603	isave(4) = sendrecvcount_xy
	604	!
	605	!-- Set new indices for transformation
	606	nblk = (( nx+1 ) / pdims(2) ) - 1
	607	nx = pdims(2)
	608	nnx_y = 1
	609	nxl_y = myidy * nnx_y
	610	nxr_y = ( myidy + 1 ) * nnx_y - 1
	611	sendrecvcount_xy = nnx_y * ( nyn_x-nys_x+1 ) * ( nzt_x-nzb_x+1 )
	612
[1306]	613	ALLOCATE( ar1(0:ny,nxl_y:nxr_y,nzb_y:nzt_y) )
[1216]	614	ALLOCATE( f_in(nys_x:nyn_x,nzb_x:nzt_x,0:nx) )
	615
[1306]	616	DO ii = 0, nblk
[1216]	617
[1318]	618	CALL cpu_log( log_point_s(7), 'fft_y', 'continue', cpu_log_nowait )
[1216]	619
[1306]	620	n = isave(2) + ii
	621	nxl_y_bound = isave(2)
	622	nxr_y_bound = isave(3)
[1216]	623
[1306]	624	CALL fft_y( ar1(:,:,:), 'backward', ar_tr = f_out_y, &
	625	nxl_y_bound = nxl_y_bound, nxr_y_bound = nxr_y_bound, &
	626	nxl_y_l = n, nxr_y_l = n )
[1216]	627
[1306]	628	IF ( ii == nblk ) THEN
	629	CALL cpu_log( log_point_s(7), 'fft_y', 'stop' )
	630	ELSE
	631	CALL cpu_log( log_point_s(7), 'fft_y', 'pause' )
[1216]	632	ENDIF
	633
[1318]	634	CALL cpu_log( log_point_s(8), 'transpo invers', 'continue', cpu_log_nowait )
[1216]	635
[1306]	636	CALL transpose_yx( ar1(:,:,:), f_in )
[1216]	637
[1306]	638	DO inew = 0, nx-1
	639	iind = ii + (nblk+1) * inew
	640	f_inv(:,:,iind) = f_in(:,:,inew)
	641	ENDDO
[1216]	642
[1306]	643	CALL cpu_log( log_point_s(8), 'transpo invers', 'pause' )
[1216]	644
	645	ENDDO
	646	!
	647	!-- Restore original indices/counters
	648	nx = isave(1)
	649	nxl_y = isave(2)
	650	nxr_y = isave(3)
	651	sendrecvcount_xy = isave(4)
	652
	653	CALL resort_for_yx( f_inv, f_out_z )
	654
	655	DEALLOCATE( ar1, f_in, f_inv )
	656
	657	!
	658	!-- fft along x backward + subsequent final transposition x --> z
	659	ALLOCATE( f_inv(nys:nyn,nxl:nxr,1:nz) )
	660	!
	661	!-- Save original indices and gridpoint counter
	662	isave(1) = nz
	663	isave(2) = nzb_x
	664	isave(3) = nzt_x
	665	isave(4) = sendrecvcount_zx
	666	!
	667	!-- Set new indices for transformation
	668	nblk = nz / pdims(1)
	669	nz = pdims(1)
	670	nnz_x = 1
	671	nzb_x = 1 + myidx * nnz_x
	672	nzt_x = ( myidx + 1 ) * nnz_x
	673	sendrecvcount_zx = nnx * nny * nnz_x
	674
[1306]	675	ALLOCATE( ar1(0:nx,nys_x:nyn_x,nzb_x:nzt_x) )
[1216]	676	ALLOCATE( f_in(nys:nyn,nxl:nxr,1:nz) )
	677
[1306]	678	DO kk = 1, nblk
[1216]	679
[1318]	680	CALL cpu_log( log_point_s(4), 'fft_x', 'continue', cpu_log_nowait )
[1216]	681
[1306]	682	n = isave(2) + kk - 1
	683	CALL fft_x( ar1(:,:,:), 'backward', f_out_z(:,:,n))
[1216]	684
[1306]	685	IF ( kk == nblk ) THEN
	686	CALL cpu_log( log_point_s(4), 'fft_x', 'stop' )
	687	ELSE
	688	CALL cpu_log( log_point_s(4), 'fft_x', 'pause' )
[1216]	689	ENDIF
	690
[1318]	691	CALL cpu_log( log_point_s(8), 'transpo invers', 'continue', cpu_log_nowait )
[1216]	692
[1306]	693	CALL transpose_xz( ar1(:,:,:), f_in )
[1216]	694
[1306]	695	DO knew = 1, nz
	696	ki = kk + nblk * (knew-1)
	697	f_inv(:,:,ki) = f_in(:,:,knew)
	698	ENDDO
[1216]	699
[1306]	700	IF ( kk == nblk ) THEN
	701	CALL cpu_log( log_point_s(8), 'transpo invers', 'stop' )
	702	ELSE
	703	CALL cpu_log( log_point_s(8), 'transpo invers', 'pause' )
[1216]	704	ENDIF
	705
	706	ENDDO
	707	!
	708	!-- Restore original indices/counters
	709	nz = isave(1)
	710	nzb_x = isave(2)
	711	nzt_x = isave(3)
	712	sendrecvcount_zx = isave(4)
	713
	714	CALL resort_for_xz( f_inv, ar )
	715
	716	DEALLOCATE( ar1, f_in, f_inv )
	717
[1]	718	ENDIF
	719
	720	CALL cpu_log( log_point_s(3), 'poisfft', 'stop' )
	721
	722	END SUBROUTINE poisfft
	723
	724
	725	!------------------------------------------------------------------------------!
[1682]	726	! Description:
	727	! ------------
	728	!> Fourier-transformation along y with subsequent transposition y --> x for
	729	!> a 1d-decomposition along x.
	730	!>
	731	!> @attention The performance of this routine is much faster on the NEC-SX6,
	732	!> if the first index of work_ffty_vec is odd. Otherwise
	733	!> memory bank conflicts may occur (especially if the index is a
	734	!> multiple of 128). That's why work_ffty_vec is dimensioned as
	735	!> 0:ny+1.
	736	!> Of course, this will not work if users are using an odd number
	737	!> of gridpoints along y.
[1]	738	!------------------------------------------------------------------------------!
[1682]	739	SUBROUTINE ffty_tr_yx( f_in, f_out )
[1]	740
[1682]	741
[1320]	742	USE control_parameters, &
	743	ONLY: host
	744
	745	USE cpulog, &
	746	ONLY: cpu_log, log_point_s
	747
	748	USE kinds
	749
[1]	750	USE pegrid
	751
	752	IMPLICIT NONE
	753
[1682]	754	INTEGER(iwp) :: i !<
	755	INTEGER(iwp) :: iend !<
	756	INTEGER(iwp) :: iouter !<
	757	INTEGER(iwp) :: ir !<
	758	INTEGER(iwp) :: j !<
	759	INTEGER(iwp) :: k !<
[1]	760
[1682]	761	INTEGER(iwp), PARAMETER :: stridex = 4 !<
[1320]	762
[1682]	763	REAL(wp), DIMENSION(0:ny,stridex) :: work_ffty !<
[1]	764	#if defined( __nec )
[1682]	765	REAL(wp), DIMENSION(0:ny+1,1:nz,nxl:nxr) :: work_ffty_vec !<
[1]	766	#endif
[1682]	767	REAL(wp), DIMENSION(1:nz,0:ny,nxl:nxr) :: f_in !<
	768	REAL(wp), DIMENSION(nnx,1:nz,nys_x:nyn_x,pdims(1)) :: f_out !<
	769	REAL(wp), DIMENSION(nxl:nxr,1:nz,0:ny) :: work !<
[1]	770
	771	!
	772	!-- Carry out the FFT along y, where all data are present due to the
	773	!-- 1d-decomposition along x. Resort the data in a way that x becomes
	774	!-- the first index.
[1106]	775	CALL cpu_log( log_point_s(7), 'fft_y_1d', 'start' )
[1]	776
	777	IF ( host(1:3) == 'nec' ) THEN
	778	#if defined( __nec )
	779	!
	780	!-- Code optimized for vector processors
[85]	781	!$OMP PARALLEL PRIVATE ( i, j, k )
[1]	782	!$OMP DO
	783	DO i = nxl, nxr
	784
	785	DO j = 0, ny
	786	DO k = 1, nz
	787	work_ffty_vec(j,k,i) = f_in(k,j,i)
	788	ENDDO
	789	ENDDO
	790
	791	CALL fft_y_m( work_ffty_vec(:,:,i), ny+1, 'forward' )
	792
	793	ENDDO
	794
	795	!$OMP DO
	796	DO k = 1, nz
	797	DO j = 0, ny
	798	DO i = nxl, nxr
	799	work(i,k,j) = work_ffty_vec(j,k,i)
	800	ENDDO
	801	ENDDO
	802	ENDDO
	803	!$OMP END PARALLEL
	804	#endif
	805
	806	ELSE
	807
	808	!
	809	!-- Cache optimized code.
	810	!-- The i-(x-)direction is split into a strided outer loop and an inner
	811	!-- loop for better cache performance
	812	!$OMP PARALLEL PRIVATE (i,iend,iouter,ir,j,k,work_ffty)
	813	!$OMP DO
	814	DO iouter = nxl, nxr, stridex
	815
	816	iend = MIN( iouter+stridex-1, nxr ) ! Upper bound for inner i loop
	817
	818	DO k = 1, nz
	819
	820	DO i = iouter, iend
	821
	822	ir = i-iouter+1 ! counter within a stride
	823	DO j = 0, ny
	824	work_ffty(j,ir) = f_in(k,j,i)
	825	ENDDO
	826	!
	827	!-- FFT along y
[1106]	828	CALL fft_y_1d( work_ffty(:,ir), 'forward' )
[1]	829
	830	ENDDO
	831
	832	!
	833	!-- Resort
	834	DO j = 0, ny
	835	DO i = iouter, iend
	836	work(i,k,j) = work_ffty(j,i-iouter+1)
	837	ENDDO
	838	ENDDO
	839
	840	ENDDO
	841
	842	ENDDO
	843	!$OMP END PARALLEL
	844
	845	ENDIF
[1106]	846	CALL cpu_log( log_point_s(7), 'fft_y_1d', 'pause' )
[1]	847
	848	!
	849	!-- Transpose array
[1111]	850	#if defined( __parallel )
[1]	851	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
[622]	852	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
[1]	853	CALL MPI_ALLTOALL( work(nxl,1,0), sendrecvcount_xy, MPI_REAL, &
	854	f_out(1,1,nys_x,1), sendrecvcount_xy, MPI_REAL, &
	855	comm1dx, ierr )
	856	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
[1111]	857	#endif
[1]	858
	859	END SUBROUTINE ffty_tr_yx
	860
	861
	862	!------------------------------------------------------------------------------!
[1682]	863	! Description:
	864	! ------------
	865	!> Transposition x --> y with a subsequent backward Fourier transformation for
	866	!> a 1d-decomposition along x
[1]	867	!------------------------------------------------------------------------------!
[1682]	868	SUBROUTINE tr_xy_ffty( f_in, f_out )
[1]	869
[1682]	870
[1320]	871	USE control_parameters, &
	872	ONLY: host
	873
	874	USE cpulog, &
	875	ONLY: cpu_log, log_point_s
	876
	877	USE kinds
	878
[1]	879	USE pegrid
	880
	881	IMPLICIT NONE
	882
[1682]	883	INTEGER(iwp) :: i !<
	884	INTEGER(iwp) :: iend !<
	885	INTEGER(iwp) :: iouter !<
	886	INTEGER(iwp) :: ir !<
	887	INTEGER(iwp) :: j !<
	888	INTEGER(iwp) :: k !<
[1]	889
[1682]	890	INTEGER(iwp), PARAMETER :: stridex = 4 !<
[1320]	891
[1682]	892	REAL(wp), DIMENSION(0:ny,stridex) :: work_ffty !<
[1]	893	#if defined( __nec )
[1682]	894	REAL(wp), DIMENSION(0:ny+1,1:nz,nxl:nxr) :: work_ffty_vec !<
[1]	895	#endif
[1682]	896	REAL(wp), DIMENSION(nnx,1:nz,nys_x:nyn_x,pdims(1)) :: f_in !<
	897	REAL(wp), DIMENSION(1:nz,0:ny,nxl:nxr) :: f_out !<
	898	REAL(wp), DIMENSION(nxl:nxr,1:nz,0:ny) :: work !<
[1]	899
	900	!
	901	!-- Transpose array
[1111]	902	#if defined( __parallel )
[1]	903	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
[622]	904	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
[1]	905	CALL MPI_ALLTOALL( f_in(1,1,nys_x,1), sendrecvcount_xy, MPI_REAL, &
	906	work(nxl,1,0), sendrecvcount_xy, MPI_REAL, &
	907	comm1dx, ierr )
	908	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
[1111]	909	#endif
[1]	910
	911	!
	912	!-- Resort the data in a way that y becomes the first index and carry out the
	913	!-- backward fft along y.
[1106]	914	CALL cpu_log( log_point_s(7), 'fft_y_1d', 'continue' )
[1]	915
	916	IF ( host(1:3) == 'nec' ) THEN
	917	#if defined( __nec )
	918	!
	919	!-- Code optimized for vector processors
[85]	920	!$OMP PARALLEL PRIVATE ( i, j, k )
[1]	921	!$OMP DO
	922	DO k = 1, nz
	923	DO j = 0, ny
	924	DO i = nxl, nxr
	925	work_ffty_vec(j,k,i) = work(i,k,j)
	926	ENDDO
	927	ENDDO
	928	ENDDO
	929
	930	!$OMP DO
	931	DO i = nxl, nxr
	932
	933	CALL fft_y_m( work_ffty_vec(:,:,i), ny+1, 'backward' )
	934
	935	DO j = 0, ny
	936	DO k = 1, nz
	937	f_out(k,j,i) = work_ffty_vec(j,k,i)
	938	ENDDO
	939	ENDDO
	940
	941	ENDDO
	942	!$OMP END PARALLEL
	943	#endif
	944
	945	ELSE
	946
	947	!
	948	!-- Cache optimized code.
	949	!-- The i-(x-)direction is split into a strided outer loop and an inner
	950	!-- loop for better cache performance
	951	!$OMP PARALLEL PRIVATE ( i, iend, iouter, ir, j, k, work_ffty )
	952	!$OMP DO
	953	DO iouter = nxl, nxr, stridex
	954
	955	iend = MIN( iouter+stridex-1, nxr ) ! Upper bound for inner i loop
	956
	957	DO k = 1, nz
	958	!
	959	!-- Resort
	960	DO j = 0, ny
	961	DO i = iouter, iend
	962	work_ffty(j,i-iouter+1) = work(i,k,j)
	963	ENDDO
	964	ENDDO
	965
	966	DO i = iouter, iend
	967
	968	!
	969	!-- FFT along y
	970	ir = i-iouter+1 ! counter within a stride
[1106]	971	CALL fft_y_1d( work_ffty(:,ir), 'backward' )
[1]	972
	973	DO j = 0, ny
	974	f_out(k,j,i) = work_ffty(j,ir)
	975	ENDDO
	976	ENDDO
	977
	978	ENDDO
	979
	980	ENDDO
	981	!$OMP END PARALLEL
	982
	983	ENDIF
	984
[1106]	985	CALL cpu_log( log_point_s(7), 'fft_y_1d', 'stop' )
[1]	986
	987	END SUBROUTINE tr_xy_ffty
	988
	989
	990	!------------------------------------------------------------------------------!
[1682]	991	! Description:
	992	! ------------
	993	!> FFT along x, solution of the tridiagonal system and backward FFT for
	994	!> a 1d-decomposition along x
	995	!>
	996	!> @warning this subroutine may still not work for hybrid parallelization
	997	!> with OpenMP (for possible necessary changes see the original
	998	!> routine poisfft_hybrid, developed by Klaus Ketelsen, May 2002)
[1]	999	!------------------------------------------------------------------------------!
[1682]	1000	SUBROUTINE fftx_tri_fftx( ar )
[1]	1001
[1682]	1002
[1320]	1003	USE control_parameters, &
	1004	ONLY: host
	1005
	1006	USE cpulog, &
	1007	ONLY: cpu_log, log_point_s
	1008
	1009	USE grid_variables, &
	1010	ONLY: ddx2, ddy2
	1011
	1012	USE kinds
	1013
[1]	1014	USE pegrid
	1015
	1016	IMPLICIT NONE
	1017
[1682]	1018	INTEGER(iwp) :: i !<
	1019	INTEGER(iwp) :: j !<
	1020	INTEGER(iwp) :: k !<
	1021	INTEGER(iwp) :: m !<
	1022	INTEGER(iwp) :: n !<
	1023	INTEGER(iwp) :: omp_get_thread_num !<
	1024	INTEGER(iwp) :: tn !<
[1]	1025
[1682]	1026	REAL(wp), DIMENSION(0:nx) :: work_fftx !<
	1027	REAL(wp), DIMENSION(0:nx,1:nz) :: work_trix !<
	1028	REAL(wp), DIMENSION(nnx,1:nz,nys_x:nyn_x,pdims(1)) :: ar !<
	1029	REAL(wp), DIMENSION(:,:,:,:), ALLOCATABLE :: tri !<
[1]	1030
	1031
[1106]	1032	CALL cpu_log( log_point_s(33), 'fft_x_1d + tridia', 'start' )
[1]	1033
	1034	ALLOCATE( tri(5,0:nx,0:nz-1,0:threads_per_task-1) )
	1035
	1036	tn = 0 ! Default thread number in case of one thread
	1037	!$OMP PARALLEL DO PRIVATE ( i, j, k, m, n, tn, work_fftx, work_trix )
	1038	DO j = nys_x, nyn_x
	1039
	1040	!$ tn = omp_get_thread_num()
	1041
	1042	IF ( host(1:3) == 'nec' ) THEN
	1043	!
	1044	!-- Code optimized for vector processors
	1045	DO k = 1, nz
	1046
	1047	m = 0
	1048	DO n = 1, pdims(1)
[1003]	1049	DO i = 1, nnx
[1]	1050	work_trix(m,k) = ar(i,k,j,n)
	1051	m = m + 1
	1052	ENDDO
	1053	ENDDO
	1054
	1055	ENDDO
	1056
	1057	CALL fft_x_m( work_trix, 'forward' )
	1058
	1059	ELSE
	1060	!
	1061	!-- Cache optimized code
	1062	DO k = 1, nz
	1063
	1064	m = 0
	1065	DO n = 1, pdims(1)
[1003]	1066	DO i = 1, nnx
[1]	1067	work_fftx(m) = ar(i,k,j,n)
	1068	m = m + 1
	1069	ENDDO
	1070	ENDDO
	1071
[1106]	1072	CALL fft_x_1d( work_fftx, 'forward' )
[1]	1073
	1074	DO i = 0, nx
	1075	work_trix(i,k) = work_fftx(i)
	1076	ENDDO
	1077
	1078	ENDDO
	1079
	1080	ENDIF
	1081
	1082	!
	1083	!-- Solve the linear equation system
	1084	CALL tridia_1dd( ddx2, ddy2, nx, ny, j, work_trix, tri(:,:,:,tn) )
	1085
	1086	IF ( host(1:3) == 'nec' ) THEN
	1087	!
	1088	!-- Code optimized for vector processors
	1089	CALL fft_x_m( work_trix, 'backward' )
	1090
	1091	DO k = 1, nz
	1092
	1093	m = 0
	1094	DO n = 1, pdims(1)
[1003]	1095	DO i = 1, nnx
[1]	1096	ar(i,k,j,n) = work_trix(m,k)
	1097	m = m + 1
	1098	ENDDO
	1099	ENDDO
	1100
	1101	ENDDO
	1102
	1103	ELSE
	1104	!
	1105	!-- Cache optimized code
	1106	DO k = 1, nz
	1107
	1108	DO i = 0, nx
	1109	work_fftx(i) = work_trix(i,k)
	1110	ENDDO
	1111
[1106]	1112	CALL fft_x_1d( work_fftx, 'backward' )
[1]	1113
	1114	m = 0
	1115	DO n = 1, pdims(1)
[1003]	1116	DO i = 1, nnx
[1]	1117	ar(i,k,j,n) = work_fftx(m)
	1118	m = m + 1
	1119	ENDDO
	1120	ENDDO
	1121
	1122	ENDDO
	1123
	1124	ENDIF
	1125
	1126	ENDDO
	1127
	1128	DEALLOCATE( tri )
	1129
[1106]	1130	CALL cpu_log( log_point_s(33), 'fft_x_1d + tridia', 'stop' )
[1]	1131
	1132	END SUBROUTINE fftx_tri_fftx
	1133
	1134
	1135	!------------------------------------------------------------------------------!
[1682]	1136	! Description:
	1137	! ------------
	1138	!> Fourier-transformation along x with subsequent transposition x --> y for
	1139	!> a 1d-decomposition along y.
	1140	!>
	1141	!> @attention NEC-branch of this routine may significantly profit from
	1142	!> further optimizations. So far, performance is much worse than
	1143	!> for routine ffty_tr_yx (more than three times slower).
[1]	1144	!------------------------------------------------------------------------------!
[1682]	1145	SUBROUTINE fftx_tr_xy( f_in, f_out )
[1]	1146
[1682]	1147
[1320]	1148	USE control_parameters, &
	1149	ONLY: host
	1150
	1151	USE cpulog, &
	1152	ONLY: cpu_log, log_point_s
	1153
	1154	USE kinds
	1155
[1]	1156	USE pegrid
	1157
	1158	IMPLICIT NONE
	1159
[1682]	1160	INTEGER(iwp) :: i !<
	1161	INTEGER(iwp) :: j !<
	1162	INTEGER(iwp) :: k !<
[1]	1163
[1682]	1164	REAL(wp), DIMENSION(0:nx,1:nz,nys:nyn) :: work_fftx !<
	1165	REAL(wp), DIMENSION(1:nz,nys:nyn,0:nx) :: f_in !<
	1166	REAL(wp), DIMENSION(nny,1:nz,nxl_y:nxr_y,pdims(2)) :: f_out !<
	1167	REAL(wp), DIMENSION(nys:nyn,1:nz,0:nx) :: work !<
[1]	1168
	1169	!
	1170	!-- Carry out the FFT along x, where all data are present due to the
	1171	!-- 1d-decomposition along y. Resort the data in a way that y becomes
	1172	!-- the first index.
[1106]	1173	CALL cpu_log( log_point_s(4), 'fft_x_1d', 'start' )
[1]	1174
	1175	IF ( host(1:3) == 'nec' ) THEN
	1176	!
	1177	!-- Code for vector processors
[85]	1178	!$OMP PARALLEL PRIVATE ( i, j, k )
[1]	1179	!$OMP DO
	1180	DO i = 0, nx
	1181
	1182	DO j = nys, nyn
	1183	DO k = 1, nz
	1184	work_fftx(i,k,j) = f_in(k,j,i)
	1185	ENDDO
	1186	ENDDO
	1187
	1188	ENDDO
	1189
	1190	!$OMP DO
	1191	DO j = nys, nyn
	1192
	1193	CALL fft_x_m( work_fftx(:,:,j), 'forward' )
	1194
	1195	DO k = 1, nz
	1196	DO i = 0, nx
	1197	work(j,k,i) = work_fftx(i,k,j)
	1198	ENDDO
	1199	ENDDO
	1200
	1201	ENDDO
	1202	!$OMP END PARALLEL
	1203
	1204	ELSE
	1205
	1206	!
	1207	!-- Cache optimized code (there might be still a potential for better
	1208	!-- optimization).
[696]	1209	!$OMP PARALLEL PRIVATE (i,j,k)
[1]	1210	!$OMP DO
	1211	DO i = 0, nx
	1212
	1213	DO j = nys, nyn
	1214	DO k = 1, nz
	1215	work_fftx(i,k,j) = f_in(k,j,i)
	1216	ENDDO
	1217	ENDDO
	1218
	1219	ENDDO
	1220
	1221	!$OMP DO
	1222	DO j = nys, nyn
	1223	DO k = 1, nz
	1224
[1106]	1225	CALL fft_x_1d( work_fftx(0:nx,k,j), 'forward' )
[1]	1226
	1227	DO i = 0, nx
	1228	work(j,k,i) = work_fftx(i,k,j)
	1229	ENDDO
	1230	ENDDO
	1231
	1232	ENDDO
	1233	!$OMP END PARALLEL
	1234
	1235	ENDIF
[1106]	1236	CALL cpu_log( log_point_s(4), 'fft_x_1d', 'pause' )
[1]	1237
	1238	!
	1239	!-- Transpose array
[1111]	1240	#if defined( __parallel )
[1]	1241	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
[622]	1242	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
[1]	1243	CALL MPI_ALLTOALL( work(nys,1,0), sendrecvcount_xy, MPI_REAL, &
	1244	f_out(1,1,nxl_y,1), sendrecvcount_xy, MPI_REAL, &
	1245	comm1dy, ierr )
	1246	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
[1111]	1247	#endif
[1]	1248
	1249	END SUBROUTINE fftx_tr_xy
	1250
	1251
	1252	!------------------------------------------------------------------------------!
[1682]	1253	! Description:
	1254	! ------------
	1255	!> Transposition y --> x with a subsequent backward Fourier transformation for
	1256	!> a 1d-decomposition along x.
[1]	1257	!------------------------------------------------------------------------------!
[1682]	1258	SUBROUTINE tr_yx_fftx( f_in, f_out )
[1]	1259
[1682]	1260
[1320]	1261	USE control_parameters, &
	1262	ONLY: host
	1263
	1264	USE cpulog, &
	1265	ONLY: cpu_log, log_point_s
	1266
	1267	USE kinds
	1268
[1]	1269	USE pegrid
	1270
	1271	IMPLICIT NONE
	1272
[1682]	1273	INTEGER(iwp) :: i !<
	1274	INTEGER(iwp) :: j !<
	1275	INTEGER(iwp) :: k !<
[1]	1276
[1682]	1277	REAL(wp), DIMENSION(0:nx,1:nz,nys:nyn) :: work_fftx !<
	1278	REAL(wp), DIMENSION(nny,1:nz,nxl_y:nxr_y,pdims(2)) :: f_in !<
	1279	REAL(wp), DIMENSION(1:nz,nys:nyn,0:nx) :: f_out !<
	1280	REAL(wp), DIMENSION(nys:nyn,1:nz,0:nx) :: work !<
[1]	1281
	1282	!
	1283	!-- Transpose array
[1111]	1284	#if defined( __parallel )
[1]	1285	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
[622]	1286	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
[1]	1287	CALL MPI_ALLTOALL( f_in(1,1,nxl_y,1), sendrecvcount_xy, MPI_REAL, &
	1288	work(nys,1,0), sendrecvcount_xy, MPI_REAL, &
	1289	comm1dy, ierr )
	1290	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
[1111]	1291	#endif
[1]	1292
	1293	!
	1294	!-- Carry out the FFT along x, where all data are present due to the
	1295	!-- 1d-decomposition along y. Resort the data in a way that y becomes
	1296	!-- the first index.
[1106]	1297	CALL cpu_log( log_point_s(4), 'fft_x_1d', 'continue' )
[1]	1298
	1299	IF ( host(1:3) == 'nec' ) THEN
	1300	!
	1301	!-- Code optimized for vector processors
[85]	1302	!$OMP PARALLEL PRIVATE ( i, j, k )
[1]	1303	!$OMP DO
	1304	DO j = nys, nyn
	1305
	1306	DO k = 1, nz
	1307	DO i = 0, nx
	1308	work_fftx(i,k,j) = work(j,k,i)
	1309	ENDDO
	1310	ENDDO
	1311
	1312	CALL fft_x_m( work_fftx(:,:,j), 'backward' )
	1313
	1314	ENDDO
	1315
	1316	!$OMP DO
	1317	DO i = 0, nx
	1318	DO j = nys, nyn
	1319	DO k = 1, nz
	1320	f_out(k,j,i) = work_fftx(i,k,j)
	1321	ENDDO
	1322	ENDDO
	1323	ENDDO
	1324	!$OMP END PARALLEL
	1325
	1326	ELSE
	1327
	1328	!
	1329	!-- Cache optimized code (there might be still a potential for better
	1330	!-- optimization).
[696]	1331	!$OMP PARALLEL PRIVATE (i,j,k)
[1]	1332	!$OMP DO
	1333	DO j = nys, nyn
	1334	DO k = 1, nz
	1335
	1336	DO i = 0, nx
	1337	work_fftx(i,k,j) = work(j,k,i)
	1338	ENDDO
	1339
[1106]	1340	CALL fft_x_1d( work_fftx(0:nx,k,j), 'backward' )
[1]	1341
	1342	ENDDO
	1343	ENDDO
	1344
	1345	!$OMP DO
	1346	DO i = 0, nx
	1347	DO j = nys, nyn
	1348	DO k = 1, nz
	1349	f_out(k,j,i) = work_fftx(i,k,j)
	1350	ENDDO
	1351	ENDDO
	1352	ENDDO
	1353	!$OMP END PARALLEL
	1354
	1355	ENDIF
[1106]	1356	CALL cpu_log( log_point_s(4), 'fft_x_1d', 'stop' )
[1]	1357
	1358	END SUBROUTINE tr_yx_fftx
	1359
	1360
	1361	!------------------------------------------------------------------------------!
[1682]	1362	! Description:
	1363	! ------------
	1364	!> FFT along y, solution of the tridiagonal system and backward FFT for
	1365	!> a 1d-decomposition along y.
	1366	!>
	1367	!> @warning this subroutine may still not work for hybrid parallelization
	1368	!> with OpenMP (for possible necessary changes see the original
	1369	!> routine poisfft_hybrid, developed by Klaus Ketelsen, May 2002)
[1]	1370	!------------------------------------------------------------------------------!
[1682]	1371	SUBROUTINE ffty_tri_ffty( ar )
[1]	1372
[1682]	1373
[1320]	1374	USE control_parameters, &
	1375	ONLY: host
	1376
	1377	USE cpulog, &
	1378	ONLY: cpu_log, log_point_s
	1379
	1380	USE grid_variables, &
	1381	ONLY: ddx2, ddy2
	1382
	1383	USE kinds
	1384
[1]	1385	USE pegrid
	1386
	1387	IMPLICIT NONE
	1388
[1682]	1389	INTEGER(iwp) :: i !<
	1390	INTEGER(iwp) :: j !<
	1391	INTEGER(iwp) :: k !<
	1392	INTEGER(iwp) :: m !<
	1393	INTEGER(iwp) :: n !<
	1394	INTEGER(iwp) :: omp_get_thread_num !<
	1395	INTEGER(iwp) :: tn !<
[1]	1396
[1682]	1397	REAL(wp), DIMENSION(0:ny) :: work_ffty !<
	1398	REAL(wp), DIMENSION(0:ny,1:nz) :: work_triy !<
	1399	REAL(wp), DIMENSION(nny,1:nz,nxl_y:nxr_y,pdims(2)) :: ar !<
	1400	REAL(wp), DIMENSION(:,:,:,:), ALLOCATABLE :: tri !<
[1]	1401
	1402
[1106]	1403	CALL cpu_log( log_point_s(39), 'fft_y_1d + tridia', 'start' )
[1]	1404
	1405	ALLOCATE( tri(5,0:ny,0:nz-1,0:threads_per_task-1) )
	1406
	1407	tn = 0 ! Default thread number in case of one thread
[696]	1408	!$OMP PARALLEL DO PRIVATE ( i, j, k, m, n, tn, work_ffty, work_triy )
[1]	1409	DO i = nxl_y, nxr_y
	1410
	1411	!$ tn = omp_get_thread_num()
	1412
	1413	IF ( host(1:3) == 'nec' ) THEN
	1414	!
	1415	!-- Code optimized for vector processors
	1416	DO k = 1, nz
	1417
	1418	m = 0
	1419	DO n = 1, pdims(2)
[1003]	1420	DO j = 1, nny
[1]	1421	work_triy(m,k) = ar(j,k,i,n)
	1422	m = m + 1
	1423	ENDDO
	1424	ENDDO
	1425
	1426	ENDDO
	1427
	1428	CALL fft_y_m( work_triy, ny, 'forward' )
	1429
	1430	ELSE
	1431	!
	1432	!-- Cache optimized code
	1433	DO k = 1, nz
	1434
	1435	m = 0
	1436	DO n = 1, pdims(2)
[1003]	1437	DO j = 1, nny
[1]	1438	work_ffty(m) = ar(j,k,i,n)
	1439	m = m + 1
	1440	ENDDO
	1441	ENDDO
	1442
[1106]	1443	CALL fft_y_1d( work_ffty, 'forward' )
[1]	1444
	1445	DO j = 0, ny
	1446	work_triy(j,k) = work_ffty(j)
	1447	ENDDO
	1448
	1449	ENDDO
	1450
	1451	ENDIF
	1452
	1453	!
	1454	!-- Solve the linear equation system
	1455	CALL tridia_1dd( ddy2, ddx2, ny, nx, i, work_triy, tri(:,:,:,tn) )
	1456
	1457	IF ( host(1:3) == 'nec' ) THEN
	1458	!
	1459	!-- Code optimized for vector processors
	1460	CALL fft_y_m( work_triy, ny, 'backward' )
	1461
	1462	DO k = 1, nz
	1463
	1464	m = 0
	1465	DO n = 1, pdims(2)
[1003]	1466	DO j = 1, nny
[1]	1467	ar(j,k,i,n) = work_triy(m,k)
	1468	m = m + 1
	1469	ENDDO
	1470	ENDDO
	1471
	1472	ENDDO
	1473
	1474	ELSE
	1475	!
	1476	!-- Cache optimized code
	1477	DO k = 1, nz
	1478
	1479	DO j = 0, ny
	1480	work_ffty(j) = work_triy(j,k)
	1481	ENDDO
	1482
[1106]	1483	CALL fft_y_1d( work_ffty, 'backward' )
[1]	1484
	1485	m = 0
	1486	DO n = 1, pdims(2)
[1003]	1487	DO j = 1, nny
[1]	1488	ar(j,k,i,n) = work_ffty(m)
	1489	m = m + 1
	1490	ENDDO
	1491	ENDDO
	1492
	1493	ENDDO
	1494
	1495	ENDIF
	1496
	1497	ENDDO
	1498
	1499	DEALLOCATE( tri )
	1500
[1106]	1501	CALL cpu_log( log_point_s(39), 'fft_y_1d + tridia', 'stop' )
[1]	1502
	1503	END SUBROUTINE ffty_tri_ffty
	1504
	1505	END MODULE poisfft_mod

Note: See TracBrowser for help on using the repository browser.

Download in other formats:

| Impressum | ©Leibniz Universität Hannover |