Home

Context Navigation

source: palm/trunk/SOURCE/poisfft.f90 @ 1489

Last change on this file since 1489 was 1483, checked in by raasch, 10 years ago
last commit documented
Property svn:keywords set to `Id`
File size: 45.0 KB

Rev	Line
[1]	1	MODULE poisfft_mod
	2
[1036]	3	!--------------------------------------------------------------------------------!
	4	! This file is part of PALM.
	5	!
	6	! PALM is free software: you can redistribute it and/or modify it under the terms
	7	! of the GNU General Public License as published by the Free Software Foundation,
	8	! either version 3 of the License, or (at your option) any later version.
	9	!
	10	! PALM is distributed in the hope that it will be useful, but WITHOUT ANY
	11	! WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
	12	! A PARTICULAR PURPOSE. See the GNU General Public License for more details.
	13	!
	14	! You should have received a copy of the GNU General Public License along with
	15	! PALM. If not, see <http://www.gnu.org/licenses/>.
	16	!
[1310]	17	! Copyright 1997-2014 Leibniz Universitaet Hannover
[1036]	18	!--------------------------------------------------------------------------------!
	19	!
[484]	20	! Current revisions:
[1]	21	! -----------------
[1407]	22	!
[1483]	23	!
[1321]	24	! Former revisions:
	25	! -----------------
	26	! $Id: poisfft.f90 1483 2014-10-18 12:51:13Z raasch $
	27	!
[1483]	28	! 1482 2014-10-18 12:34:45Z raasch
	29	! use 2d-decomposition, if accelerator boards are used
	30	!
[1407]	31	! 1406 2014-05-16 13:47:01Z raasch
	32	! bugfix for pgi 14.4: declare create moved after array declaration
	33	!
[1321]	34	! 1320 2014-03-20 08:40:49Z raasch
[1320]	35	! ONLY-attribute added to USE-statements,
	36	! kind-parameters added to all INTEGER and REAL declaration statements,
	37	! kinds are defined in new module kinds,
	38	! old module precision_kind is removed,
	39	! revision history before 2012 removed,
	40	! comment fields (!:) to be used for variable explanations added to
	41	! all variable declaration statements
[1217]	42	!
[1319]	43	! 1318 2014-03-17 13:35:16Z raasch
	44	! module interfaces removed
	45	!
[1307]	46	! 1306 2014-03-13 14:30:59Z raasch
	47	! openmp sections removed from the overlap branch,
	48	! second argument removed from parameter list
	49	!
[1217]	50	! 1216 2013-08-26 09:31:42Z raasch
[1216]	51	! resorting of arrays moved to separate routines resort_for_...,
	52	! one argument, used as temporary work array, removed from all transpose
	53	! routines
	54	! overlapping fft / transposition implemented
[1112]	55	!
[1213]	56	! 1212 2013-08-15 08:46:27Z raasch
	57	! tridia routines moved to seperate module tridia_solver
	58	!
[1209]	59	! 1208 2013-08-13 06:41:49Z raasch
	60	! acc-update clauses added for "ar" so that ffts other than cufft can also be
	61	! used (although they are not ported and will give a poor performance)
	62	!
[1112]	63	! 1111 2013-03-08 23:54:10Z raasch
[1111]	64	! further openACC porting of non-parallel (MPI) branch:
	65	! tridiagonal routines split into extermal subroutines (instead using CONTAINS),
	66	! no distinction between parallel/non-parallel in poisfft and tridia any more,
[1112]	67	! tridia routines moved to end of file because of probable bug in PGI compiler 12.5
[1111]	68	! (otherwise "invalid device function" is indicated during runtime),
	69	! optimization of tridia routines: constant elements and coefficients of tri are
	70	! stored in seperate arrays ddzuw and tric, last dimension of tri reduced from 5
	71	! to 2,
	72	! poisfft_init is now called internally from poisfft, maketri is called from
	73	! poisfft_init,
	74	! ibc_p_b = 2 removed
[1]	75	!
[1107]	76	! 1106 2013-03-04 05:31:38Z raasch
	77	! routines fftx, ffty, fftxp, fftyp removed, calls replaced by fft_x, fft_y,
	78	! in the 1D-decomposition routines fft_x, ffty are replaced by fft_x_1d,
	79	! fft_y_1d
	80	!
[1104]	81	! 1103 2013-02-20 02:15:53Z raasch
	82	! tri, ar, and ar1 arguments in tridia-routines (2d) are removed because they
	83	! sometimes cause segmentation faults with intel 12.1 compiler
	84	!
[1093]	85	! 1092 2013-02-02 11:24:22Z raasch
	86	! unused variables removed
	87	!
[1037]	88	! 1036 2012-10-22 13:43:42Z raasch
	89	! code put under GPL (PALM 3.9)
	90	!
[1014]	91	! 2012-09-21 07:03:55Z raasch
	92	! FLOAT type conversion replaced by REAL
	93	!
[1004]	94	! 1003 2012-09-14 14:35:53Z raasch
	95	! indices nxa, nya, etc. replaced by nx, ny, etc.
	96	!
[941]	97	! 940 2012-07-09 14:31:00Z raasch
	98	! special handling of tri-array as an argument in tridia_1dd routines switched
	99	! off because it caused segmentation faults with intel 12.1 compiler
	100	!
[878]	101	! 877 2012-04-03 11:21:44Z suehring
	102	! Bugfix: Avoid divisions by zero in case of using a 'neumann' bc for the
	103	! pressure at the top of the model domain.
	104	!
[810]	105	! 809 2012-01-30 13:32:58Z maronga
	106	! Bugfix: replaced .AND. and .NOT. with && and ! in the preprocessor directives
	107	!
[808]	108	! 807 2012-01-25 11:53:51Z maronga
	109	! New cpp directive "__check" implemented which is used by check_namelist_files
	110	! (most of the code is unneeded by check_namelist_files).
	111	!
[1]	112	! Revision 1.1 1997/07/24 11:24:14 raasch
	113	! Initial revision
	114	!
	115	!
	116	! Description:
	117	! ------------
[1306]	118	! Original version by Stephan Siano (pois3d), as of July 23, 1996
	119	! Adapted for 2D-domain-decomposition by Siegfried Raasch, July 3, 1997
	120	!
	121	! Solves the Poisson equation with a 2D spectral method
	122	! d^2 p / dx^2 + d^2 p / dy^2 + d^2 p / dz^2 = s
	123	!
	124	! Input:
	125	! real ar contains (nnz,nny,nnx) elements of the velocity divergence,
	126	! starting from (1,nys,nxl)
	127	!
	128	! Output:
	129	! real ar contains the solution for perturbation pressure p
[1]	130	!------------------------------------------------------------------------------!
	131
[1320]	132	USE fft_xy, &
	133	ONLY: fft_init, fft_y, fft_y_1d, fft_y_m, fft_x, fft_x_1d, fft_x_m
[1]	134
[1320]	135	USE indices, &
	136	ONLY: nnx, nny, nx, nxl, nxr, ny, nys, nyn, nz
	137
	138	USE transpose_indices, &
	139	ONLY: nxl_y, nxl_z, nxr_y, nxr_z, nys_x, nys_z, nyn_x, nyn_z, nzb_x, &
	140	nzb_y, nzt_x, nzt_y
	141
	142	USE tridia_solver, &
	143	ONLY: tridia_1dd, tridia_init, tridia_substi, tridia_substi_overlap
	144
[1]	145	IMPLICIT NONE
	146
[1111]	147	LOGICAL, SAVE :: poisfft_initialized = .FALSE.
	148
[1]	149	PRIVATE
[807]	150
[809]	151	#if ! defined ( __check )
[1]	152	PUBLIC poisfft, poisfft_init
	153
	154	INTERFACE poisfft
	155	MODULE PROCEDURE poisfft
	156	END INTERFACE poisfft
	157
	158	INTERFACE poisfft_init
	159	MODULE PROCEDURE poisfft_init
	160	END INTERFACE poisfft_init
[807]	161	#else
	162	PUBLIC poisfft_init
[1]	163
[807]	164	INTERFACE poisfft_init
	165	MODULE PROCEDURE poisfft_init
	166	END INTERFACE poisfft_init
	167	#endif
	168
[1]	169	CONTAINS
	170
	171	SUBROUTINE poisfft_init
	172
[1320]	173	USE arrays_3d, &
	174	ONLY: ddzu_pres, ddzw
[1111]	175
[1320]	176	USE kinds
	177
[1111]	178	IMPLICIT NONE
	179
[1320]	180	INTEGER(iwp) :: k !:
[1111]	181
	182
[1]	183	CALL fft_init
	184
[1212]	185	CALL tridia_init
[1111]	186
	187	poisfft_initialized = .TRUE.
	188
[1]	189	END SUBROUTINE poisfft_init
	190
[1111]	191
[809]	192	#if ! defined ( __check )
[1306]	193	SUBROUTINE poisfft( ar )
[1]	194
[1320]	195	USE control_parameters, &
	196	ONLY: fft_method, transpose_compute_overlap
	197
	198	USE cpulog, &
	199	ONLY: cpu_log, cpu_log_nowait, log_point_s
	200
	201	USE kinds
	202
[1]	203	USE pegrid
	204
	205	IMPLICIT NONE
	206
[1320]	207	INTEGER(iwp) :: ii !:
	208	INTEGER(iwp) :: iind !:
	209	INTEGER(iwp) :: inew !:
	210	INTEGER(iwp) :: jj !:
	211	INTEGER(iwp) :: jind !:
	212	INTEGER(iwp) :: jnew !:
	213	INTEGER(iwp) :: ki !:
	214	INTEGER(iwp) :: kk !:
	215	INTEGER(iwp) :: knew !:
	216	INTEGER(iwp) :: n !:
	217	INTEGER(iwp) :: nblk !:
	218	INTEGER(iwp) :: nnx_y !:
	219	INTEGER(iwp) :: nny_z !:
	220	INTEGER(iwp) :: nnz_t !:
	221	INTEGER(iwp) :: nnz_x !:
	222	INTEGER(iwp) :: nxl_y_bound !:
	223	INTEGER(iwp) :: nxr_y_bound !:
[1]	224
[1320]	225	INTEGER(iwp), DIMENSION(4) :: isave !:
	226
	227	REAL(wp), DIMENSION(1:nz,nys:nyn,nxl:nxr) :: ar !:
[1406]	228	REAL(wp), DIMENSION(nys:nyn,nxl:nxr,1:nz) :: ar_inv !:
[1216]	229	!$acc declare create( ar_inv )
[1]	230
[1320]	231	REAL(wp), DIMENSION(:,:,:), ALLOCATABLE :: ar1 !:
	232	REAL(wp), DIMENSION(:,:,:), ALLOCATABLE :: f_in !:
	233	REAL(wp), DIMENSION(:,:,:), ALLOCATABLE :: f_inv !:
	234	REAL(wp), DIMENSION(:,:,:), ALLOCATABLE :: f_out_y !:
	235	REAL(wp), DIMENSION(:,:,:), ALLOCATABLE :: f_out_z !:
[1216]	236
	237
[1]	238	CALL cpu_log( log_point_s(3), 'poisfft', 'start' )
	239
[1111]	240	IF ( .NOT. poisfft_initialized ) CALL poisfft_init
	241
[1]	242	!
	243	!-- Two-dimensional Fourier Transformation in x- and y-direction.
[1482]	244	IF ( pdims(2) == 1 .AND. pdims(1) > 1 .AND. num_acc_per_node == 0 ) &
	245	THEN
[1]	246
	247	!
	248	!-- 1d-domain-decomposition along x:
	249	!-- FFT along y and transposition y --> x
[1216]	250	CALL ffty_tr_yx( ar, ar )
[1]	251
	252	!
	253	!-- FFT along x, solving the tridiagonal system and backward FFT
	254	CALL fftx_tri_fftx( ar )
	255
	256	!
	257	!-- Transposition x --> y and backward FFT along y
[1216]	258	CALL tr_xy_ffty( ar, ar )
[1]	259
[1482]	260	ELSEIF ( pdims(1) == 1 .AND. pdims(2) > 1 .AND. num_acc_per_node == 0 ) &
	261	THEN
[1]	262
	263	!
	264	!-- 1d-domain-decomposition along y:
	265	!-- FFT along x and transposition x --> y
[1216]	266	CALL fftx_tr_xy( ar, ar )
[1]	267
	268	!
	269	!-- FFT along y, solving the tridiagonal system and backward FFT
	270	CALL ffty_tri_ffty( ar )
	271
	272	!
	273	!-- Transposition y --> x and backward FFT along x
[1216]	274	CALL tr_yx_fftx( ar, ar )
[1]	275
[1216]	276	ELSEIF ( .NOT. transpose_compute_overlap ) THEN
[1]	277
	278	!
[1111]	279	!-- 2d-domain-decomposition or no decomposition (1 PE run)
[1]	280	!-- Transposition z --> x
	281	CALL cpu_log( log_point_s(5), 'transpo forward', 'start' )
[1216]	282	CALL resort_for_zx( ar, ar_inv )
	283	CALL transpose_zx( ar_inv, ar )
[1]	284	CALL cpu_log( log_point_s(5), 'transpo forward', 'pause' )
	285
	286	CALL cpu_log( log_point_s(4), 'fft_x', 'start' )
[1208]	287	IF ( fft_method /= 'system-specific' ) THEN
	288	!$acc update host( ar )
	289	ENDIF
[1106]	290	CALL fft_x( ar, 'forward' )
[1208]	291	IF ( fft_method /= 'system-specific' ) THEN
	292	!$acc update device( ar )
	293	ENDIF
[1]	294	CALL cpu_log( log_point_s(4), 'fft_x', 'pause' )
	295
	296	!
	297	!-- Transposition x --> y
	298	CALL cpu_log( log_point_s(5), 'transpo forward', 'continue' )
[1216]	299	CALL resort_for_xy( ar, ar_inv )
	300	CALL transpose_xy( ar_inv, ar )
[1]	301	CALL cpu_log( log_point_s(5), 'transpo forward', 'pause' )
	302
	303	CALL cpu_log( log_point_s(7), 'fft_y', 'start' )
[1208]	304	IF ( fft_method /= 'system-specific' ) THEN
	305	!$acc update host( ar )
	306	ENDIF
[1216]	307	CALL fft_y( ar, 'forward', ar_tr = ar, &
	308	nxl_y_bound = nxl_y, nxr_y_bound = nxr_y, &
	309	nxl_y_l = nxl_y, nxr_y_l = nxr_y )
[1208]	310	IF ( fft_method /= 'system-specific' ) THEN
	311	!$acc update device( ar )
	312	ENDIF
[1]	313	CALL cpu_log( log_point_s(7), 'fft_y', 'pause' )
	314
	315	!
	316	!-- Transposition y --> z
	317	CALL cpu_log( log_point_s(5), 'transpo forward', 'continue' )
[1216]	318	CALL resort_for_yz( ar, ar_inv )
	319	CALL transpose_yz( ar_inv, ar )
[1]	320	CALL cpu_log( log_point_s(5), 'transpo forward', 'stop' )
	321
	322	!
[1106]	323	!-- Solve the tridiagonal equation system along z
[1]	324	CALL cpu_log( log_point_s(6), 'tridia', 'start' )
[1212]	325	CALL tridia_substi( ar )
[1]	326	CALL cpu_log( log_point_s(6), 'tridia', 'stop' )
	327
	328	!
	329	!-- Inverse Fourier Transformation
	330	!-- Transposition z --> y
	331	CALL cpu_log( log_point_s(8), 'transpo invers', 'start' )
[1216]	332	CALL transpose_zy( ar, ar_inv )
	333	CALL resort_for_zy( ar_inv, ar )
[1]	334	CALL cpu_log( log_point_s(8), 'transpo invers', 'pause' )
	335
	336	CALL cpu_log( log_point_s(7), 'fft_y', 'continue' )
[1208]	337	IF ( fft_method /= 'system-specific' ) THEN
	338	!$acc update host( ar )
	339	ENDIF
[1216]	340	CALL fft_y( ar, 'backward', ar_tr = ar, &
	341	nxl_y_bound = nxl_y, nxr_y_bound = nxr_y, &
	342	nxl_y_l = nxl_y, nxr_y_l = nxr_y )
[1208]	343	IF ( fft_method /= 'system-specific' ) THEN
	344	!$acc update device( ar )
	345	ENDIF
[1]	346	CALL cpu_log( log_point_s(7), 'fft_y', 'stop' )
	347
	348	!
	349	!-- Transposition y --> x
	350	CALL cpu_log( log_point_s(8), 'transpo invers', 'continue' )
[1216]	351	CALL transpose_yx( ar, ar_inv )
	352	CALL resort_for_yx( ar_inv, ar )
[1]	353	CALL cpu_log( log_point_s(8), 'transpo invers', 'pause' )
	354
	355	CALL cpu_log( log_point_s(4), 'fft_x', 'continue' )
[1208]	356	IF ( fft_method /= 'system-specific' ) THEN
	357	!$acc update host( ar )
	358	ENDIF
[1106]	359	CALL fft_x( ar, 'backward' )
[1208]	360	IF ( fft_method /= 'system-specific' ) THEN
	361	!$acc update device( ar )
	362	ENDIF
[1]	363	CALL cpu_log( log_point_s(4), 'fft_x', 'stop' )
	364
	365	!
	366	!-- Transposition x --> z
	367	CALL cpu_log( log_point_s(8), 'transpo invers', 'continue' )
[1216]	368	CALL transpose_xz( ar, ar_inv )
	369	CALL resort_for_xz( ar_inv, ar )
[1]	370	CALL cpu_log( log_point_s(8), 'transpo invers', 'stop' )
	371
[1216]	372	ELSE
	373
	374	!
	375	!-- 2d-domain-decomposition or no decomposition (1 PE run) with
	376	!-- overlapping transposition / fft
[1318]	377	!-- cputime logging must not use barriers, which would prevent overlapping
[1216]	378	ALLOCATE( f_out_y(0:ny,nxl_y:nxr_y,nzb_y:nzt_y), &
	379	f_out_z(0:nx,nys_x:nyn_x,nzb_x:nzt_x) )
	380	!
	381	!-- Transposition z --> x + subsequent fft along x
	382	ALLOCATE( f_inv(nys:nyn,nxl:nxr,1:nz) )
	383	CALL resort_for_zx( ar, f_inv )
	384	!
	385	!-- Save original indices and gridpoint counter
	386	isave(1) = nz
	387	isave(2) = nzb_x
	388	isave(3) = nzt_x
	389	isave(4) = sendrecvcount_zx
	390	!
	391	!-- Set new indices for transformation
	392	nblk = nz / pdims(1)
	393	nz = pdims(1)
	394	nnz_x = 1
	395	nzb_x = 1 + myidx * nnz_x
	396	nzt_x = ( myidx + 1 ) * nnz_x
	397	sendrecvcount_zx = nnx * nny * nnz_x
	398
[1306]	399	ALLOCATE( ar1(0:nx,nys_x:nyn_x,nzb_x:nzt_x) )
[1216]	400	ALLOCATE( f_in(nys:nyn,nxl:nxr,1:nz) )
	401
[1306]	402	DO kk = 1, nblk
[1216]	403
[1306]	404	IF ( kk == 1 ) THEN
[1318]	405	CALL cpu_log( log_point_s(5), 'transpo forward', 'start', cpu_log_nowait )
[1306]	406	ELSE
[1318]	407	CALL cpu_log( log_point_s(5), 'transpo forward', 'continue', cpu_log_nowait )
[1306]	408	ENDIF
[1216]	409
[1306]	410	DO knew = 1, nz
	411	ki = kk + nblk * ( knew - 1 )
	412	f_in(:,:,knew) = f_inv(:,:,ki)
	413	ENDDO
[1216]	414
[1306]	415	CALL transpose_zx( f_in, ar1(:,:,:))
	416	CALL cpu_log( log_point_s(5), 'transpo forward', 'pause' )
[1216]	417
[1306]	418	IF ( kk == 1 ) THEN
[1318]	419	CALL cpu_log( log_point_s(4), 'fft_x', 'start', cpu_log_nowait )
[1306]	420	ELSE
[1318]	421	CALL cpu_log( log_point_s(4), 'fft_x', 'continue', cpu_log_nowait )
[1216]	422	ENDIF
	423
[1306]	424	n = isave(2) + kk - 1
	425	CALL fft_x( ar1(:,:,:), 'forward', ar_2d = f_out_z(:,:,n))
	426	CALL cpu_log( log_point_s(4), 'fft_x', 'pause' )
[1216]	427
	428	ENDDO
	429	!
	430	!-- Restore original indices/counters
	431	nz = isave(1)
	432	nzb_x = isave(2)
	433	nzt_x = isave(3)
	434	sendrecvcount_zx = isave(4)
	435
	436	DEALLOCATE( ar1, f_in, f_inv )
	437
	438	!
	439	!-- Transposition x --> y + subsequent fft along y
	440	ALLOCATE( f_inv(nys_x:nyn_x,nzb_x:nzt_x,0:nx) )
	441	CALL resort_for_xy( f_out_z, f_inv )
	442	!
	443	!-- Save original indices and gridpoint counter
	444	isave(1) = nx
	445	isave(2) = nxl_y
	446	isave(3) = nxr_y
	447	isave(4) = sendrecvcount_xy
	448	!
	449	!-- Set new indices for transformation
	450	nblk = ( ( nx+1 ) / pdims(2) ) - 1
	451	nx = pdims(2)
	452	nnx_y = 1
	453	nxl_y = myidy * nnx_y
	454	nxr_y = ( myidy + 1 ) * nnx_y - 1
	455	sendrecvcount_xy = nnx_y * ( nyn_x-nys_x+1 ) * ( nzt_x-nzb_x+1 )
	456
[1306]	457	ALLOCATE( ar1(0:ny,nxl_y:nxr_y,nzb_y:nzt_y) )
[1216]	458	ALLOCATE( f_in(nys_x:nyn_x,nzb_x:nzt_x,0:nx) )
	459
[1306]	460	DO ii = 0, nblk
[1216]	461
[1318]	462	CALL cpu_log( log_point_s(5), 'transpo forward', 'continue', cpu_log_nowait )
[1216]	463
[1306]	464	DO inew = 0, nx-1
	465	iind = ii + ( nblk + 1 ) * inew
	466	f_in(:,:,inew) = f_inv(:,:,iind)
	467	ENDDO
[1216]	468
[1306]	469	CALL transpose_xy( f_in, ar1(:,:,:) )
[1216]	470
[1306]	471	CALL cpu_log( log_point_s(5), 'transpo forward', 'pause' )
[1216]	472
[1306]	473	IF ( ii == 1 ) THEN
[1318]	474	CALL cpu_log( log_point_s(7), 'fft_y', 'start', cpu_log_nowait )
[1306]	475	ELSE
[1318]	476	CALL cpu_log( log_point_s(7), 'fft_y', 'continue', cpu_log_nowait )
[1216]	477	ENDIF
	478
[1306]	479	nxl_y_bound = isave(2)
	480	nxr_y_bound = isave(3)
	481	n = isave(2) + ii
	482	CALL fft_y( ar1(:,:,:), 'forward', ar_tr = f_out_y, &
	483	nxl_y_bound = nxl_y_bound, nxr_y_bound = nxr_y_bound, &
	484	nxl_y_l = n, nxr_y_l = n )
[1216]	485
[1306]	486	CALL cpu_log( log_point_s(7), 'fft_y', 'pause' )
[1216]	487
	488	ENDDO
	489	!
	490	!-- Restore original indices/counters
	491	nx = isave(1)
	492	nxl_y = isave(2)
	493	nxr_y = isave(3)
	494	sendrecvcount_xy = isave(4)
	495
	496	DEALLOCATE( ar1, f_in, f_inv )
	497
	498	!
	499	!-- Transposition y --> z + subsequent tridia + resort for z --> y
	500	ALLOCATE( f_inv(nxl_y:nxr_y,nzb_y:nzt_y,0:ny) )
	501	CALL resort_for_yz( f_out_y, f_inv )
	502	!
	503	!-- Save original indices and gridpoint counter
	504	isave(1) = ny
	505	isave(2) = nys_z
	506	isave(3) = nyn_z
	507	isave(4) = sendrecvcount_yz
	508	!
	509	!-- Set new indices for transformation
	510	nblk = ( ( ny+1 ) / pdims(1) ) - 1
	511	ny = pdims(1)
	512	nny_z = 1
	513	nys_z = myidx * nny_z
	514	nyn_z = ( myidx + 1 ) * nny_z - 1
	515	sendrecvcount_yz = ( nxr_y-nxl_y+1 ) * nny_z * ( nzt_y-nzb_y+1 )
	516
[1306]	517	ALLOCATE( ar1(nxl_z:nxr_z,nys_z:nyn_z,1:nz) )
[1216]	518	ALLOCATE( f_in(nxl_y:nxr_y,nzb_y:nzt_y,0:ny) )
	519
[1306]	520	DO jj = 0, nblk
[1216]	521	!
[1306]	522	!-- Forward Fourier Transformation
	523	!-- Transposition y --> z
[1318]	524	CALL cpu_log( log_point_s(5), 'transpo forward', 'continue', cpu_log_nowait )
[1216]	525
[1306]	526	DO jnew = 0, ny-1
	527	jind = jj + ( nblk + 1 ) * jnew
	528	f_in(:,:,jnew) = f_inv(:,:,jind)
	529	ENDDO
[1216]	530
[1306]	531	CALL transpose_yz( f_in, ar1(:,:,:) )
[1216]	532
[1306]	533	IF ( jj == nblk ) THEN
	534	CALL cpu_log( log_point_s(5), 'transpo forward', 'stop' )
	535	ELSE
	536	CALL cpu_log( log_point_s(5), 'transpo forward', 'pause' )
[1216]	537	ENDIF
	538
	539	!
[1306]	540	!-- Solve the tridiagonal equation system along z
[1318]	541	CALL cpu_log( log_point_s(6), 'tridia', 'start', cpu_log_nowait )
[1216]	542
[1306]	543	n = isave(2) + jj
	544	CALL tridia_substi_overlap( ar1(:,:,:), n )
[1216]	545
[1306]	546	CALL cpu_log( log_point_s(6), 'tridia', 'stop' )
[1216]	547
[1306]	548	!
	549	!-- Inverse Fourier Transformation
	550	!-- Transposition z --> y
	551	!-- Only one thread should call MPI routines, therefore forward and
	552	!-- backward tranpose are in the same section
	553	IF ( jj == 0 ) THEN
[1318]	554	CALL cpu_log( log_point_s(8), 'transpo invers', 'start', cpu_log_nowait )
[1306]	555	ELSE
[1318]	556	CALL cpu_log( log_point_s(8), 'transpo invers', 'continue', cpu_log_nowait )
[1216]	557	ENDIF
	558
[1306]	559	CALL transpose_zy( ar1(:,:,:), f_in )
[1216]	560
[1306]	561	DO jnew = 0, ny-1
	562	jind = jj + ( nblk + 1 ) * jnew
	563	f_inv(:,:,jind) = f_in(:,:,jnew)
	564	ENDDO
[1216]	565
[1306]	566	CALL cpu_log( log_point_s(8), 'transpo invers', 'pause' )
[1216]	567
	568	ENDDO
	569	!
	570	!-- Restore original indices/counters
	571	ny = isave(1)
	572	nys_z = isave(2)
	573	nyn_z = isave(3)
	574	sendrecvcount_yz = isave(4)
	575
	576	CALL resort_for_zy( f_inv, f_out_y )
	577
	578	DEALLOCATE( ar1, f_in, f_inv )
	579
	580	!
	581	!-- fft along y backward + subsequent transposition y --> x
	582	ALLOCATE( f_inv(nys_x:nyn_x,nzb_x:nzt_x,0:nx) )
	583	!
	584	!-- Save original indices and gridpoint counter
	585	isave(1) = nx
	586	isave(2) = nxl_y
	587	isave(3) = nxr_y
	588	isave(4) = sendrecvcount_xy
	589	!
	590	!-- Set new indices for transformation
	591	nblk = (( nx+1 ) / pdims(2) ) - 1
	592	nx = pdims(2)
	593	nnx_y = 1
	594	nxl_y = myidy * nnx_y
	595	nxr_y = ( myidy + 1 ) * nnx_y - 1
	596	sendrecvcount_xy = nnx_y * ( nyn_x-nys_x+1 ) * ( nzt_x-nzb_x+1 )
	597
[1306]	598	ALLOCATE( ar1(0:ny,nxl_y:nxr_y,nzb_y:nzt_y) )
[1216]	599	ALLOCATE( f_in(nys_x:nyn_x,nzb_x:nzt_x,0:nx) )
	600
[1306]	601	DO ii = 0, nblk
[1216]	602
[1318]	603	CALL cpu_log( log_point_s(7), 'fft_y', 'continue', cpu_log_nowait )
[1216]	604
[1306]	605	n = isave(2) + ii
	606	nxl_y_bound = isave(2)
	607	nxr_y_bound = isave(3)
[1216]	608
[1306]	609	CALL fft_y( ar1(:,:,:), 'backward', ar_tr = f_out_y, &
	610	nxl_y_bound = nxl_y_bound, nxr_y_bound = nxr_y_bound, &
	611	nxl_y_l = n, nxr_y_l = n )
[1216]	612
[1306]	613	IF ( ii == nblk ) THEN
	614	CALL cpu_log( log_point_s(7), 'fft_y', 'stop' )
	615	ELSE
	616	CALL cpu_log( log_point_s(7), 'fft_y', 'pause' )
[1216]	617	ENDIF
	618
[1318]	619	CALL cpu_log( log_point_s(8), 'transpo invers', 'continue', cpu_log_nowait )
[1216]	620
[1306]	621	CALL transpose_yx( ar1(:,:,:), f_in )
[1216]	622
[1306]	623	DO inew = 0, nx-1
	624	iind = ii + (nblk+1) * inew
	625	f_inv(:,:,iind) = f_in(:,:,inew)
	626	ENDDO
[1216]	627
[1306]	628	CALL cpu_log( log_point_s(8), 'transpo invers', 'pause' )
[1216]	629
	630	ENDDO
	631	!
	632	!-- Restore original indices/counters
	633	nx = isave(1)
	634	nxl_y = isave(2)
	635	nxr_y = isave(3)
	636	sendrecvcount_xy = isave(4)
	637
	638	CALL resort_for_yx( f_inv, f_out_z )
	639
	640	DEALLOCATE( ar1, f_in, f_inv )
	641
	642	!
	643	!-- fft along x backward + subsequent final transposition x --> z
	644	ALLOCATE( f_inv(nys:nyn,nxl:nxr,1:nz) )
	645	!
	646	!-- Save original indices and gridpoint counter
	647	isave(1) = nz
	648	isave(2) = nzb_x
	649	isave(3) = nzt_x
	650	isave(4) = sendrecvcount_zx
	651	!
	652	!-- Set new indices for transformation
	653	nblk = nz / pdims(1)
	654	nz = pdims(1)
	655	nnz_x = 1
	656	nzb_x = 1 + myidx * nnz_x
	657	nzt_x = ( myidx + 1 ) * nnz_x
	658	sendrecvcount_zx = nnx * nny * nnz_x
	659
[1306]	660	ALLOCATE( ar1(0:nx,nys_x:nyn_x,nzb_x:nzt_x) )
[1216]	661	ALLOCATE( f_in(nys:nyn,nxl:nxr,1:nz) )
	662
[1306]	663	DO kk = 1, nblk
[1216]	664
[1318]	665	CALL cpu_log( log_point_s(4), 'fft_x', 'continue', cpu_log_nowait )
[1216]	666
[1306]	667	n = isave(2) + kk - 1
	668	CALL fft_x( ar1(:,:,:), 'backward', f_out_z(:,:,n))
[1216]	669
[1306]	670	IF ( kk == nblk ) THEN
	671	CALL cpu_log( log_point_s(4), 'fft_x', 'stop' )
	672	ELSE
	673	CALL cpu_log( log_point_s(4), 'fft_x', 'pause' )
[1216]	674	ENDIF
	675
[1318]	676	CALL cpu_log( log_point_s(8), 'transpo invers', 'continue', cpu_log_nowait )
[1216]	677
[1306]	678	CALL transpose_xz( ar1(:,:,:), f_in )
[1216]	679
[1306]	680	DO knew = 1, nz
	681	ki = kk + nblk * (knew-1)
	682	f_inv(:,:,ki) = f_in(:,:,knew)
	683	ENDDO
[1216]	684
[1306]	685	IF ( kk == nblk ) THEN
	686	CALL cpu_log( log_point_s(8), 'transpo invers', 'stop' )
	687	ELSE
	688	CALL cpu_log( log_point_s(8), 'transpo invers', 'pause' )
[1216]	689	ENDIF
	690
	691	ENDDO
	692	!
	693	!-- Restore original indices/counters
	694	nz = isave(1)
	695	nzb_x = isave(2)
	696	nzt_x = isave(3)
	697	sendrecvcount_zx = isave(4)
	698
	699	CALL resort_for_xz( f_inv, ar )
	700
	701	DEALLOCATE( ar1, f_in, f_inv )
	702
[1]	703	ENDIF
	704
	705	CALL cpu_log( log_point_s(3), 'poisfft', 'stop' )
	706
	707	END SUBROUTINE poisfft
	708
	709
	710
[1216]	711	SUBROUTINE ffty_tr_yx( f_in, f_out )
[1]	712
	713	!------------------------------------------------------------------------------!
	714	! Fourier-transformation along y with subsequent transposition y --> x for
	715	! a 1d-decomposition along x
	716	!
	717	! ATTENTION: The performance of this routine is much faster on the NEC-SX6,
	718	! if the first index of work_ffty_vec is odd. Otherwise
	719	! memory bank conflicts may occur (especially if the index is a
	720	! multiple of 128). That's why work_ffty_vec is dimensioned as
	721	! 0:ny+1.
	722	! Of course, this will not work if users are using an odd number
	723	! of gridpoints along y.
	724	!------------------------------------------------------------------------------!
	725
[1320]	726	USE control_parameters, &
	727	ONLY: host
	728
	729	USE cpulog, &
	730	ONLY: cpu_log, log_point_s
	731
	732	USE kinds
	733
[1]	734	USE pegrid
	735
	736	IMPLICIT NONE
	737
[1320]	738	INTEGER(iwp) :: i !:
	739	INTEGER(iwp) :: iend !:
	740	INTEGER(iwp) :: iouter !:
	741	INTEGER(iwp) :: ir !:
	742	INTEGER(iwp) :: j !:
	743	INTEGER(iwp) :: k !:
[1]	744
[1320]	745	INTEGER(iwp), PARAMETER :: stridex = 4 !:
	746
	747	REAL(wp), DIMENSION(0:ny,stridex) :: work_ffty !:
[1]	748	#if defined( __nec )
[1320]	749	REAL(wp), DIMENSION(0:ny+1,1:nz,nxl:nxr) :: work_ffty_vec !:
[1]	750	#endif
[1320]	751	REAL(wp), DIMENSION(1:nz,0:ny,nxl:nxr) :: f_in !:
	752	REAL(wp), DIMENSION(nnx,1:nz,nys_x:nyn_x,pdims(1)) :: f_out !:
	753	REAL(wp), DIMENSION(nxl:nxr,1:nz,0:ny) :: work !:
[1]	754
	755	!
	756	!-- Carry out the FFT along y, where all data are present due to the
	757	!-- 1d-decomposition along x. Resort the data in a way that x becomes
	758	!-- the first index.
[1106]	759	CALL cpu_log( log_point_s(7), 'fft_y_1d', 'start' )
[1]	760
	761	IF ( host(1:3) == 'nec' ) THEN
	762	#if defined( __nec )
	763	!
	764	!-- Code optimized for vector processors
[85]	765	!$OMP PARALLEL PRIVATE ( i, j, k )
[1]	766	!$OMP DO
	767	DO i = nxl, nxr
	768
	769	DO j = 0, ny
	770	DO k = 1, nz
	771	work_ffty_vec(j,k,i) = f_in(k,j,i)
	772	ENDDO
	773	ENDDO
	774
	775	CALL fft_y_m( work_ffty_vec(:,:,i), ny+1, 'forward' )
	776
	777	ENDDO
	778
	779	!$OMP DO
	780	DO k = 1, nz
	781	DO j = 0, ny
	782	DO i = nxl, nxr
	783	work(i,k,j) = work_ffty_vec(j,k,i)
	784	ENDDO
	785	ENDDO
	786	ENDDO
	787	!$OMP END PARALLEL
	788	#endif
	789
	790	ELSE
	791
	792	!
	793	!-- Cache optimized code.
	794	!-- The i-(x-)direction is split into a strided outer loop and an inner
	795	!-- loop for better cache performance
	796	!$OMP PARALLEL PRIVATE (i,iend,iouter,ir,j,k,work_ffty)
	797	!$OMP DO
	798	DO iouter = nxl, nxr, stridex
	799
	800	iend = MIN( iouter+stridex-1, nxr ) ! Upper bound for inner i loop
	801
	802	DO k = 1, nz
	803
	804	DO i = iouter, iend
	805
	806	ir = i-iouter+1 ! counter within a stride
	807	DO j = 0, ny
	808	work_ffty(j,ir) = f_in(k,j,i)
	809	ENDDO
	810	!
	811	!-- FFT along y
[1106]	812	CALL fft_y_1d( work_ffty(:,ir), 'forward' )
[1]	813
	814	ENDDO
	815
	816	!
	817	!-- Resort
	818	DO j = 0, ny
	819	DO i = iouter, iend
	820	work(i,k,j) = work_ffty(j,i-iouter+1)
	821	ENDDO
	822	ENDDO
	823
	824	ENDDO
	825
	826	ENDDO
	827	!$OMP END PARALLEL
	828
	829	ENDIF
[1106]	830	CALL cpu_log( log_point_s(7), 'fft_y_1d', 'pause' )
[1]	831
	832	!
	833	!-- Transpose array
[1111]	834	#if defined( __parallel )
[1]	835	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
[622]	836	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
[1]	837	CALL MPI_ALLTOALL( work(nxl,1,0), sendrecvcount_xy, MPI_REAL, &
	838	f_out(1,1,nys_x,1), sendrecvcount_xy, MPI_REAL, &
	839	comm1dx, ierr )
	840	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
[1111]	841	#endif
[1]	842
	843	END SUBROUTINE ffty_tr_yx
	844
	845
[1216]	846	SUBROUTINE tr_xy_ffty( f_in, f_out )
[1]	847
	848	!------------------------------------------------------------------------------!
	849	! Transposition x --> y with a subsequent backward Fourier transformation for
	850	! a 1d-decomposition along x
	851	!------------------------------------------------------------------------------!
	852
[1320]	853	USE control_parameters, &
	854	ONLY: host
	855
	856	USE cpulog, &
	857	ONLY: cpu_log, log_point_s
	858
	859	USE kinds
	860
[1]	861	USE pegrid
	862
	863	IMPLICIT NONE
	864
[1320]	865	INTEGER(iwp) :: i !:
	866	INTEGER(iwp) :: iend !:
	867	INTEGER(iwp) :: iouter !:
	868	INTEGER(iwp) :: ir !:
	869	INTEGER(iwp) :: j !:
	870	INTEGER(iwp) :: k !:
[1]	871
[1320]	872	INTEGER(iwp), PARAMETER :: stridex = 4 !:
	873
	874	REAL(wp), DIMENSION(0:ny,stridex) :: work_ffty !:
[1]	875	#if defined( __nec )
[1320]	876	REAL(wp), DIMENSION(0:ny+1,1:nz,nxl:nxr) :: work_ffty_vec !:
[1]	877	#endif
[1320]	878	REAL(wp), DIMENSION(nnx,1:nz,nys_x:nyn_x,pdims(1)) :: f_in !:
	879	REAL(wp), DIMENSION(1:nz,0:ny,nxl:nxr) :: f_out !:
	880	REAL(wp), DIMENSION(nxl:nxr,1:nz,0:ny) :: work !:
[1]	881
	882	!
	883	!-- Transpose array
[1111]	884	#if defined( __parallel )
[1]	885	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
[622]	886	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
[1]	887	CALL MPI_ALLTOALL( f_in(1,1,nys_x,1), sendrecvcount_xy, MPI_REAL, &
	888	work(nxl,1,0), sendrecvcount_xy, MPI_REAL, &
	889	comm1dx, ierr )
	890	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
[1111]	891	#endif
[1]	892
	893	!
	894	!-- Resort the data in a way that y becomes the first index and carry out the
	895	!-- backward fft along y.
[1106]	896	CALL cpu_log( log_point_s(7), 'fft_y_1d', 'continue' )
[1]	897
	898	IF ( host(1:3) == 'nec' ) THEN
	899	#if defined( __nec )
	900	!
	901	!-- Code optimized for vector processors
[85]	902	!$OMP PARALLEL PRIVATE ( i, j, k )
[1]	903	!$OMP DO
	904	DO k = 1, nz
	905	DO j = 0, ny
	906	DO i = nxl, nxr
	907	work_ffty_vec(j,k,i) = work(i,k,j)
	908	ENDDO
	909	ENDDO
	910	ENDDO
	911
	912	!$OMP DO
	913	DO i = nxl, nxr
	914
	915	CALL fft_y_m( work_ffty_vec(:,:,i), ny+1, 'backward' )
	916
	917	DO j = 0, ny
	918	DO k = 1, nz
	919	f_out(k,j,i) = work_ffty_vec(j,k,i)
	920	ENDDO
	921	ENDDO
	922
	923	ENDDO
	924	!$OMP END PARALLEL
	925	#endif
	926
	927	ELSE
	928
	929	!
	930	!-- Cache optimized code.
	931	!-- The i-(x-)direction is split into a strided outer loop and an inner
	932	!-- loop for better cache performance
	933	!$OMP PARALLEL PRIVATE ( i, iend, iouter, ir, j, k, work_ffty )
	934	!$OMP DO
	935	DO iouter = nxl, nxr, stridex
	936
	937	iend = MIN( iouter+stridex-1, nxr ) ! Upper bound for inner i loop
	938
	939	DO k = 1, nz
	940	!
	941	!-- Resort
	942	DO j = 0, ny
	943	DO i = iouter, iend
	944	work_ffty(j,i-iouter+1) = work(i,k,j)
	945	ENDDO
	946	ENDDO
	947
	948	DO i = iouter, iend
	949
	950	!
	951	!-- FFT along y
	952	ir = i-iouter+1 ! counter within a stride
[1106]	953	CALL fft_y_1d( work_ffty(:,ir), 'backward' )
[1]	954
	955	DO j = 0, ny
	956	f_out(k,j,i) = work_ffty(j,ir)
	957	ENDDO
	958	ENDDO
	959
	960	ENDDO
	961
	962	ENDDO
	963	!$OMP END PARALLEL
	964
	965	ENDIF
	966
[1106]	967	CALL cpu_log( log_point_s(7), 'fft_y_1d', 'stop' )
[1]	968
	969	END SUBROUTINE tr_xy_ffty
	970
	971
	972	SUBROUTINE fftx_tri_fftx( ar )
	973
	974	!------------------------------------------------------------------------------!
	975	! FFT along x, solution of the tridiagonal system and backward FFT for
	976	! a 1d-decomposition along x
	977	!
	978	! WARNING: this subroutine may still not work for hybrid parallelization
	979	! with OpenMP (for possible necessary changes see the original
	980	! routine poisfft_hybrid, developed by Klaus Ketelsen, May 2002)
	981	!------------------------------------------------------------------------------!
	982
[1320]	983	USE control_parameters, &
	984	ONLY: host
	985
	986	USE cpulog, &
	987	ONLY: cpu_log, log_point_s
	988
	989	USE grid_variables, &
	990	ONLY: ddx2, ddy2
	991
	992	USE kinds
	993
[1]	994	USE pegrid
	995
	996	IMPLICIT NONE
	997
[1320]	998	INTEGER(iwp) :: i !:
	999	INTEGER(iwp) :: j !:
	1000	INTEGER(iwp) :: k !:
	1001	INTEGER(iwp) :: m !:
	1002	INTEGER(iwp) :: n !:
	1003	INTEGER(iwp) :: omp_get_thread_num !:
	1004	INTEGER(iwp) :: tn !:
[1]	1005
[1320]	1006	REAL(wp), DIMENSION(0:nx) :: work_fftx !:
	1007	REAL(wp), DIMENSION(0:nx,1:nz) :: work_trix !:
	1008	REAL(wp), DIMENSION(nnx,1:nz,nys_x:nyn_x,pdims(1)) :: ar !:
	1009	REAL(wp), DIMENSION(:,:,:,:), ALLOCATABLE :: tri !:
[1]	1010
	1011
[1106]	1012	CALL cpu_log( log_point_s(33), 'fft_x_1d + tridia', 'start' )
[1]	1013
	1014	ALLOCATE( tri(5,0:nx,0:nz-1,0:threads_per_task-1) )
	1015
	1016	tn = 0 ! Default thread number in case of one thread
	1017	!$OMP PARALLEL DO PRIVATE ( i, j, k, m, n, tn, work_fftx, work_trix )
	1018	DO j = nys_x, nyn_x
	1019
	1020	!$ tn = omp_get_thread_num()
	1021
	1022	IF ( host(1:3) == 'nec' ) THEN
	1023	!
	1024	!-- Code optimized for vector processors
	1025	DO k = 1, nz
	1026
	1027	m = 0
	1028	DO n = 1, pdims(1)
[1003]	1029	DO i = 1, nnx
[1]	1030	work_trix(m,k) = ar(i,k,j,n)
	1031	m = m + 1
	1032	ENDDO
	1033	ENDDO
	1034
	1035	ENDDO
	1036
	1037	CALL fft_x_m( work_trix, 'forward' )
	1038
	1039	ELSE
	1040	!
	1041	!-- Cache optimized code
	1042	DO k = 1, nz
	1043
	1044	m = 0
	1045	DO n = 1, pdims(1)
[1003]	1046	DO i = 1, nnx
[1]	1047	work_fftx(m) = ar(i,k,j,n)
	1048	m = m + 1
	1049	ENDDO
	1050	ENDDO
	1051
[1106]	1052	CALL fft_x_1d( work_fftx, 'forward' )
[1]	1053
	1054	DO i = 0, nx
	1055	work_trix(i,k) = work_fftx(i)
	1056	ENDDO
	1057
	1058	ENDDO
	1059
	1060	ENDIF
	1061
	1062	!
	1063	!-- Solve the linear equation system
	1064	CALL tridia_1dd( ddx2, ddy2, nx, ny, j, work_trix, tri(:,:,:,tn) )
	1065
	1066	IF ( host(1:3) == 'nec' ) THEN
	1067	!
	1068	!-- Code optimized for vector processors
	1069	CALL fft_x_m( work_trix, 'backward' )
	1070
	1071	DO k = 1, nz
	1072
	1073	m = 0
	1074	DO n = 1, pdims(1)
[1003]	1075	DO i = 1, nnx
[1]	1076	ar(i,k,j,n) = work_trix(m,k)
	1077	m = m + 1
	1078	ENDDO
	1079	ENDDO
	1080
	1081	ENDDO
	1082
	1083	ELSE
	1084	!
	1085	!-- Cache optimized code
	1086	DO k = 1, nz
	1087
	1088	DO i = 0, nx
	1089	work_fftx(i) = work_trix(i,k)
	1090	ENDDO
	1091
[1106]	1092	CALL fft_x_1d( work_fftx, 'backward' )
[1]	1093
	1094	m = 0
	1095	DO n = 1, pdims(1)
[1003]	1096	DO i = 1, nnx
[1]	1097	ar(i,k,j,n) = work_fftx(m)
	1098	m = m + 1
	1099	ENDDO
	1100	ENDDO
	1101
	1102	ENDDO
	1103
	1104	ENDIF
	1105
	1106	ENDDO
	1107
	1108	DEALLOCATE( tri )
	1109
[1106]	1110	CALL cpu_log( log_point_s(33), 'fft_x_1d + tridia', 'stop' )
[1]	1111
	1112	END SUBROUTINE fftx_tri_fftx
	1113
	1114
[1216]	1115	SUBROUTINE fftx_tr_xy( f_in, f_out )
[1]	1116
	1117	!------------------------------------------------------------------------------!
	1118	! Fourier-transformation along x with subsequent transposition x --> y for
	1119	! a 1d-decomposition along y
	1120	!
	1121	! ATTENTION: The NEC-branch of this routine may significantly profit from
	1122	! further optimizations. So far, performance is much worse than
	1123	! for routine ffty_tr_yx (more than three times slower).
	1124	!------------------------------------------------------------------------------!
	1125
[1320]	1126	USE control_parameters, &
	1127	ONLY: host
	1128
	1129	USE cpulog, &
	1130	ONLY: cpu_log, log_point_s
	1131
	1132	USE kinds
	1133
[1]	1134	USE pegrid
	1135
	1136	IMPLICIT NONE
	1137
[1320]	1138	INTEGER(iwp) :: i !:
	1139	INTEGER(iwp) :: j !:
	1140	INTEGER(iwp) :: k !:
[1]	1141
[1320]	1142	REAL(wp), DIMENSION(0:nx,1:nz,nys:nyn) :: work_fftx !:
	1143	REAL(wp), DIMENSION(1:nz,nys:nyn,0:nx) :: f_in !:
	1144	REAL(wp), DIMENSION(nny,1:nz,nxl_y:nxr_y,pdims(2)) :: f_out !:
	1145	REAL(wp), DIMENSION(nys:nyn,1:nz,0:nx) :: work !:
[1]	1146
	1147	!
	1148	!-- Carry out the FFT along x, where all data are present due to the
	1149	!-- 1d-decomposition along y. Resort the data in a way that y becomes
	1150	!-- the first index.
[1106]	1151	CALL cpu_log( log_point_s(4), 'fft_x_1d', 'start' )
[1]	1152
	1153	IF ( host(1:3) == 'nec' ) THEN
	1154	!
	1155	!-- Code for vector processors
[85]	1156	!$OMP PARALLEL PRIVATE ( i, j, k )
[1]	1157	!$OMP DO
	1158	DO i = 0, nx
	1159
	1160	DO j = nys, nyn
	1161	DO k = 1, nz
	1162	work_fftx(i,k,j) = f_in(k,j,i)
	1163	ENDDO
	1164	ENDDO
	1165
	1166	ENDDO
	1167
	1168	!$OMP DO
	1169	DO j = nys, nyn
	1170
	1171	CALL fft_x_m( work_fftx(:,:,j), 'forward' )
	1172
	1173	DO k = 1, nz
	1174	DO i = 0, nx
	1175	work(j,k,i) = work_fftx(i,k,j)
	1176	ENDDO
	1177	ENDDO
	1178
	1179	ENDDO
	1180	!$OMP END PARALLEL
	1181
	1182	ELSE
	1183
	1184	!
	1185	!-- Cache optimized code (there might be still a potential for better
	1186	!-- optimization).
[696]	1187	!$OMP PARALLEL PRIVATE (i,j,k)
[1]	1188	!$OMP DO
	1189	DO i = 0, nx
	1190
	1191	DO j = nys, nyn
	1192	DO k = 1, nz
	1193	work_fftx(i,k,j) = f_in(k,j,i)
	1194	ENDDO
	1195	ENDDO
	1196
	1197	ENDDO
	1198
	1199	!$OMP DO
	1200	DO j = nys, nyn
	1201	DO k = 1, nz
	1202
[1106]	1203	CALL fft_x_1d( work_fftx(0:nx,k,j), 'forward' )
[1]	1204
	1205	DO i = 0, nx
	1206	work(j,k,i) = work_fftx(i,k,j)
	1207	ENDDO
	1208	ENDDO
	1209
	1210	ENDDO
	1211	!$OMP END PARALLEL
	1212
	1213	ENDIF
[1106]	1214	CALL cpu_log( log_point_s(4), 'fft_x_1d', 'pause' )
[1]	1215
	1216	!
	1217	!-- Transpose array
[1111]	1218	#if defined( __parallel )
[1]	1219	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
[622]	1220	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
[1]	1221	CALL MPI_ALLTOALL( work(nys,1,0), sendrecvcount_xy, MPI_REAL, &
	1222	f_out(1,1,nxl_y,1), sendrecvcount_xy, MPI_REAL, &
	1223	comm1dy, ierr )
	1224	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
[1111]	1225	#endif
[1]	1226
	1227	END SUBROUTINE fftx_tr_xy
	1228
	1229
[1216]	1230	SUBROUTINE tr_yx_fftx( f_in, f_out )
[1]	1231
	1232	!------------------------------------------------------------------------------!
	1233	! Transposition y --> x with a subsequent backward Fourier transformation for
	1234	! a 1d-decomposition along x
	1235	!------------------------------------------------------------------------------!
	1236
[1320]	1237	USE control_parameters, &
	1238	ONLY: host
	1239
	1240	USE cpulog, &
	1241	ONLY: cpu_log, log_point_s
	1242
	1243	USE kinds
	1244
[1]	1245	USE pegrid
	1246
	1247	IMPLICIT NONE
	1248
[1320]	1249	INTEGER(iwp) :: i !:
	1250	INTEGER(iwp) :: j !:
	1251	INTEGER(iwp) :: k !:
[1]	1252
[1320]	1253	REAL(wp), DIMENSION(0:nx,1:nz,nys:nyn) :: work_fftx !:
	1254	REAL(wp), DIMENSION(nny,1:nz,nxl_y:nxr_y,pdims(2)) :: f_in !:
	1255	REAL(wp), DIMENSION(1:nz,nys:nyn,0:nx) :: f_out !:
	1256	REAL(wp), DIMENSION(nys:nyn,1:nz,0:nx) :: work !:
[1]	1257
	1258	!
	1259	!-- Transpose array
[1111]	1260	#if defined( __parallel )
[1]	1261	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
[622]	1262	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
[1]	1263	CALL MPI_ALLTOALL( f_in(1,1,nxl_y,1), sendrecvcount_xy, MPI_REAL, &
	1264	work(nys,1,0), sendrecvcount_xy, MPI_REAL, &
	1265	comm1dy, ierr )
	1266	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
[1111]	1267	#endif
[1]	1268
	1269	!
	1270	!-- Carry out the FFT along x, where all data are present due to the
	1271	!-- 1d-decomposition along y. Resort the data in a way that y becomes
	1272	!-- the first index.
[1106]	1273	CALL cpu_log( log_point_s(4), 'fft_x_1d', 'continue' )
[1]	1274
	1275	IF ( host(1:3) == 'nec' ) THEN
	1276	!
	1277	!-- Code optimized for vector processors
[85]	1278	!$OMP PARALLEL PRIVATE ( i, j, k )
[1]	1279	!$OMP DO
	1280	DO j = nys, nyn
	1281
	1282	DO k = 1, nz
	1283	DO i = 0, nx
	1284	work_fftx(i,k,j) = work(j,k,i)
	1285	ENDDO
	1286	ENDDO
	1287
	1288	CALL fft_x_m( work_fftx(:,:,j), 'backward' )
	1289
	1290	ENDDO
	1291
	1292	!$OMP DO
	1293	DO i = 0, nx
	1294	DO j = nys, nyn
	1295	DO k = 1, nz
	1296	f_out(k,j,i) = work_fftx(i,k,j)
	1297	ENDDO
	1298	ENDDO
	1299	ENDDO
	1300	!$OMP END PARALLEL
	1301
	1302	ELSE
	1303
	1304	!
	1305	!-- Cache optimized code (there might be still a potential for better
	1306	!-- optimization).
[696]	1307	!$OMP PARALLEL PRIVATE (i,j,k)
[1]	1308	!$OMP DO
	1309	DO j = nys, nyn
	1310	DO k = 1, nz
	1311
	1312	DO i = 0, nx
	1313	work_fftx(i,k,j) = work(j,k,i)
	1314	ENDDO
	1315
[1106]	1316	CALL fft_x_1d( work_fftx(0:nx,k,j), 'backward' )
[1]	1317
	1318	ENDDO
	1319	ENDDO
	1320
	1321	!$OMP DO
	1322	DO i = 0, nx
	1323	DO j = nys, nyn
	1324	DO k = 1, nz
	1325	f_out(k,j,i) = work_fftx(i,k,j)
	1326	ENDDO
	1327	ENDDO
	1328	ENDDO
	1329	!$OMP END PARALLEL
	1330
	1331	ENDIF
[1106]	1332	CALL cpu_log( log_point_s(4), 'fft_x_1d', 'stop' )
[1]	1333
	1334	END SUBROUTINE tr_yx_fftx
	1335
	1336
	1337	SUBROUTINE ffty_tri_ffty( ar )
	1338
	1339	!------------------------------------------------------------------------------!
	1340	! FFT along y, solution of the tridiagonal system and backward FFT for
	1341	! a 1d-decomposition along y
	1342	!
	1343	! WARNING: this subroutine may still not work for hybrid parallelization
	1344	! with OpenMP (for possible necessary changes see the original
	1345	! routine poisfft_hybrid, developed by Klaus Ketelsen, May 2002)
	1346	!------------------------------------------------------------------------------!
	1347
[1320]	1348	USE control_parameters, &
	1349	ONLY: host
	1350
	1351	USE cpulog, &
	1352	ONLY: cpu_log, log_point_s
	1353
	1354	USE grid_variables, &
	1355	ONLY: ddx2, ddy2
	1356
	1357	USE kinds
	1358
[1]	1359	USE pegrid
	1360
	1361	IMPLICIT NONE
	1362
[1320]	1363	INTEGER(iwp) :: i !:
	1364	INTEGER(iwp) :: j !:
	1365	INTEGER(iwp) :: k !:
	1366	INTEGER(iwp) :: m !:
	1367	INTEGER(iwp) :: n !:
	1368	INTEGER(iwp) :: omp_get_thread_num !:
	1369	INTEGER(iwp) :: tn !:
[1]	1370
[1320]	1371	REAL(wp), DIMENSION(0:ny) :: work_ffty !:
	1372	REAL(wp), DIMENSION(0:ny,1:nz) :: work_triy !:
	1373	REAL(wp), DIMENSION(nny,1:nz,nxl_y:nxr_y,pdims(2)) :: ar !:
	1374	REAL(wp), DIMENSION(:,:,:,:), ALLOCATABLE :: tri !:
[1]	1375
	1376
[1106]	1377	CALL cpu_log( log_point_s(39), 'fft_y_1d + tridia', 'start' )
[1]	1378
	1379	ALLOCATE( tri(5,0:ny,0:nz-1,0:threads_per_task-1) )
	1380
	1381	tn = 0 ! Default thread number in case of one thread
[696]	1382	!$OMP PARALLEL DO PRIVATE ( i, j, k, m, n, tn, work_ffty, work_triy )
[1]	1383	DO i = nxl_y, nxr_y
	1384
	1385	!$ tn = omp_get_thread_num()
	1386
	1387	IF ( host(1:3) == 'nec' ) THEN
	1388	!
	1389	!-- Code optimized for vector processors
	1390	DO k = 1, nz
	1391
	1392	m = 0
	1393	DO n = 1, pdims(2)
[1003]	1394	DO j = 1, nny
[1]	1395	work_triy(m,k) = ar(j,k,i,n)
	1396	m = m + 1
	1397	ENDDO
	1398	ENDDO
	1399
	1400	ENDDO
	1401
	1402	CALL fft_y_m( work_triy, ny, 'forward' )
	1403
	1404	ELSE
	1405	!
	1406	!-- Cache optimized code
	1407	DO k = 1, nz
	1408
	1409	m = 0
	1410	DO n = 1, pdims(2)
[1003]	1411	DO j = 1, nny
[1]	1412	work_ffty(m) = ar(j,k,i,n)
	1413	m = m + 1
	1414	ENDDO
	1415	ENDDO
	1416
[1106]	1417	CALL fft_y_1d( work_ffty, 'forward' )
[1]	1418
	1419	DO j = 0, ny
	1420	work_triy(j,k) = work_ffty(j)
	1421	ENDDO
	1422
	1423	ENDDO
	1424
	1425	ENDIF
	1426
	1427	!
	1428	!-- Solve the linear equation system
	1429	CALL tridia_1dd( ddy2, ddx2, ny, nx, i, work_triy, tri(:,:,:,tn) )
	1430
	1431	IF ( host(1:3) == 'nec' ) THEN
	1432	!
	1433	!-- Code optimized for vector processors
	1434	CALL fft_y_m( work_triy, ny, 'backward' )
	1435
	1436	DO k = 1, nz
	1437
	1438	m = 0
	1439	DO n = 1, pdims(2)
[1003]	1440	DO j = 1, nny
[1]	1441	ar(j,k,i,n) = work_triy(m,k)
	1442	m = m + 1
	1443	ENDDO
	1444	ENDDO
	1445
	1446	ENDDO
	1447
	1448	ELSE
	1449	!
	1450	!-- Cache optimized code
	1451	DO k = 1, nz
	1452
	1453	DO j = 0, ny
	1454	work_ffty(j) = work_triy(j,k)
	1455	ENDDO
	1456
[1106]	1457	CALL fft_y_1d( work_ffty, 'backward' )
[1]	1458
	1459	m = 0
	1460	DO n = 1, pdims(2)
[1003]	1461	DO j = 1, nny
[1]	1462	ar(j,k,i,n) = work_ffty(m)
	1463	m = m + 1
	1464	ENDDO
	1465	ENDDO
	1466
	1467	ENDDO
	1468
	1469	ENDIF
	1470
	1471	ENDDO
	1472
	1473	DEALLOCATE( tri )
	1474
[1106]	1475	CALL cpu_log( log_point_s(39), 'fft_y_1d + tridia', 'stop' )
[1]	1476
	1477	END SUBROUTINE ffty_tri_ffty
	1478
	1479	#endif
	1480
	1481	END MODULE poisfft_mod

Note: See TracBrowser for help on using the repository browser.

Download in other formats:

| Impressum | ©Leibniz Universität Hannover |