Home

Context Navigation

source: palm/trunk/SOURCE/poisfft.f90 @ 1463

Last change on this file since 1463 was 1407, checked in by raasch, 11 years ago
last commit documented
Property svn:keywords set to `Id`
File size: 44.9 KB

Rev	Line
[1]	1	MODULE poisfft_mod
	2
[1036]	3	!--------------------------------------------------------------------------------!
	4	! This file is part of PALM.
	5	!
	6	! PALM is free software: you can redistribute it and/or modify it under the terms
	7	! of the GNU General Public License as published by the Free Software Foundation,
	8	! either version 3 of the License, or (at your option) any later version.
	9	!
	10	! PALM is distributed in the hope that it will be useful, but WITHOUT ANY
	11	! WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
	12	! A PARTICULAR PURPOSE. See the GNU General Public License for more details.
	13	!
	14	! You should have received a copy of the GNU General Public License along with
	15	! PALM. If not, see <http://www.gnu.org/licenses/>.
	16	!
[1310]	17	! Copyright 1997-2014 Leibniz Universitaet Hannover
[1036]	18	!--------------------------------------------------------------------------------!
	19	!
[484]	20	! Current revisions:
[1]	21	! -----------------
[1407]	22	!
	23	!
[1321]	24	! Former revisions:
	25	! -----------------
	26	! $Id: poisfft.f90 1407 2014-05-16 14:06:08Z suehring $
	27	!
[1407]	28	! 1406 2014-05-16 13:47:01Z raasch
	29	! bugfix for pgi 14.4: declare create moved after array declaration
	30	!
[1321]	31	! 1320 2014-03-20 08:40:49Z raasch
[1320]	32	! ONLY-attribute added to USE-statements,
	33	! kind-parameters added to all INTEGER and REAL declaration statements,
	34	! kinds are defined in new module kinds,
	35	! old module precision_kind is removed,
	36	! revision history before 2012 removed,
	37	! comment fields (!:) to be used for variable explanations added to
	38	! all variable declaration statements
[1217]	39	!
[1319]	40	! 1318 2014-03-17 13:35:16Z raasch
	41	! module interfaces removed
	42	!
[1307]	43	! 1306 2014-03-13 14:30:59Z raasch
	44	! openmp sections removed from the overlap branch,
	45	! second argument removed from parameter list
	46	!
[1217]	47	! 1216 2013-08-26 09:31:42Z raasch
[1216]	48	! resorting of arrays moved to separate routines resort_for_...,
	49	! one argument, used as temporary work array, removed from all transpose
	50	! routines
	51	! overlapping fft / transposition implemented
[1112]	52	!
[1213]	53	! 1212 2013-08-15 08:46:27Z raasch
	54	! tridia routines moved to seperate module tridia_solver
	55	!
[1209]	56	! 1208 2013-08-13 06:41:49Z raasch
	57	! acc-update clauses added for "ar" so that ffts other than cufft can also be
	58	! used (although they are not ported and will give a poor performance)
	59	!
[1112]	60	! 1111 2013-03-08 23:54:10Z raasch
[1111]	61	! further openACC porting of non-parallel (MPI) branch:
	62	! tridiagonal routines split into extermal subroutines (instead using CONTAINS),
	63	! no distinction between parallel/non-parallel in poisfft and tridia any more,
[1112]	64	! tridia routines moved to end of file because of probable bug in PGI compiler 12.5
[1111]	65	! (otherwise "invalid device function" is indicated during runtime),
	66	! optimization of tridia routines: constant elements and coefficients of tri are
	67	! stored in seperate arrays ddzuw and tric, last dimension of tri reduced from 5
	68	! to 2,
	69	! poisfft_init is now called internally from poisfft, maketri is called from
	70	! poisfft_init,
	71	! ibc_p_b = 2 removed
[1]	72	!
[1107]	73	! 1106 2013-03-04 05:31:38Z raasch
	74	! routines fftx, ffty, fftxp, fftyp removed, calls replaced by fft_x, fft_y,
	75	! in the 1D-decomposition routines fft_x, ffty are replaced by fft_x_1d,
	76	! fft_y_1d
	77	!
[1104]	78	! 1103 2013-02-20 02:15:53Z raasch
	79	! tri, ar, and ar1 arguments in tridia-routines (2d) are removed because they
	80	! sometimes cause segmentation faults with intel 12.1 compiler
	81	!
[1093]	82	! 1092 2013-02-02 11:24:22Z raasch
	83	! unused variables removed
	84	!
[1037]	85	! 1036 2012-10-22 13:43:42Z raasch
	86	! code put under GPL (PALM 3.9)
	87	!
[1014]	88	! 2012-09-21 07:03:55Z raasch
	89	! FLOAT type conversion replaced by REAL
	90	!
[1004]	91	! 1003 2012-09-14 14:35:53Z raasch
	92	! indices nxa, nya, etc. replaced by nx, ny, etc.
	93	!
[941]	94	! 940 2012-07-09 14:31:00Z raasch
	95	! special handling of tri-array as an argument in tridia_1dd routines switched
	96	! off because it caused segmentation faults with intel 12.1 compiler
	97	!
[878]	98	! 877 2012-04-03 11:21:44Z suehring
	99	! Bugfix: Avoid divisions by zero in case of using a 'neumann' bc for the
	100	! pressure at the top of the model domain.
	101	!
[810]	102	! 809 2012-01-30 13:32:58Z maronga
	103	! Bugfix: replaced .AND. and .NOT. with && and ! in the preprocessor directives
	104	!
[808]	105	! 807 2012-01-25 11:53:51Z maronga
	106	! New cpp directive "__check" implemented which is used by check_namelist_files
	107	! (most of the code is unneeded by check_namelist_files).
	108	!
[1]	109	! Revision 1.1 1997/07/24 11:24:14 raasch
	110	! Initial revision
	111	!
	112	!
	113	! Description:
	114	! ------------
[1306]	115	! Original version by Stephan Siano (pois3d), as of July 23, 1996
	116	! Adapted for 2D-domain-decomposition by Siegfried Raasch, July 3, 1997
	117	!
	118	! Solves the Poisson equation with a 2D spectral method
	119	! d^2 p / dx^2 + d^2 p / dy^2 + d^2 p / dz^2 = s
	120	!
	121	! Input:
	122	! real ar contains (nnz,nny,nnx) elements of the velocity divergence,
	123	! starting from (1,nys,nxl)
	124	!
	125	! Output:
	126	! real ar contains the solution for perturbation pressure p
[1]	127	!------------------------------------------------------------------------------!
	128
[1320]	129	USE fft_xy, &
	130	ONLY: fft_init, fft_y, fft_y_1d, fft_y_m, fft_x, fft_x_1d, fft_x_m
[1]	131
[1320]	132	USE indices, &
	133	ONLY: nnx, nny, nx, nxl, nxr, ny, nys, nyn, nz
	134
	135	USE transpose_indices, &
	136	ONLY: nxl_y, nxl_z, nxr_y, nxr_z, nys_x, nys_z, nyn_x, nyn_z, nzb_x, &
	137	nzb_y, nzt_x, nzt_y
	138
	139	USE tridia_solver, &
	140	ONLY: tridia_1dd, tridia_init, tridia_substi, tridia_substi_overlap
	141
[1]	142	IMPLICIT NONE
	143
[1111]	144	LOGICAL, SAVE :: poisfft_initialized = .FALSE.
	145
[1]	146	PRIVATE
[807]	147
[809]	148	#if ! defined ( __check )
[1]	149	PUBLIC poisfft, poisfft_init
	150
	151	INTERFACE poisfft
	152	MODULE PROCEDURE poisfft
	153	END INTERFACE poisfft
	154
	155	INTERFACE poisfft_init
	156	MODULE PROCEDURE poisfft_init
	157	END INTERFACE poisfft_init
[807]	158	#else
	159	PUBLIC poisfft_init
[1]	160
[807]	161	INTERFACE poisfft_init
	162	MODULE PROCEDURE poisfft_init
	163	END INTERFACE poisfft_init
	164	#endif
	165
[1]	166	CONTAINS
	167
	168	SUBROUTINE poisfft_init
	169
[1320]	170	USE arrays_3d, &
	171	ONLY: ddzu_pres, ddzw
[1111]	172
[1320]	173	USE kinds
	174
[1111]	175	IMPLICIT NONE
	176
[1320]	177	INTEGER(iwp) :: k !:
[1111]	178
	179
[1]	180	CALL fft_init
	181
[1212]	182	CALL tridia_init
[1111]	183
	184	poisfft_initialized = .TRUE.
	185
[1]	186	END SUBROUTINE poisfft_init
	187
[1111]	188
[809]	189	#if ! defined ( __check )
[1306]	190	SUBROUTINE poisfft( ar )
[1]	191
[1320]	192	USE control_parameters, &
	193	ONLY: fft_method, transpose_compute_overlap
	194
	195	USE cpulog, &
	196	ONLY: cpu_log, cpu_log_nowait, log_point_s
	197
	198	USE kinds
	199
[1]	200	USE pegrid
	201
	202	IMPLICIT NONE
	203
[1320]	204	INTEGER(iwp) :: ii !:
	205	INTEGER(iwp) :: iind !:
	206	INTEGER(iwp) :: inew !:
	207	INTEGER(iwp) :: jj !:
	208	INTEGER(iwp) :: jind !:
	209	INTEGER(iwp) :: jnew !:
	210	INTEGER(iwp) :: ki !:
	211	INTEGER(iwp) :: kk !:
	212	INTEGER(iwp) :: knew !:
	213	INTEGER(iwp) :: n !:
	214	INTEGER(iwp) :: nblk !:
	215	INTEGER(iwp) :: nnx_y !:
	216	INTEGER(iwp) :: nny_z !:
	217	INTEGER(iwp) :: nnz_t !:
	218	INTEGER(iwp) :: nnz_x !:
	219	INTEGER(iwp) :: nxl_y_bound !:
	220	INTEGER(iwp) :: nxr_y_bound !:
[1]	221
[1320]	222	INTEGER(iwp), DIMENSION(4) :: isave !:
	223
	224	REAL(wp), DIMENSION(1:nz,nys:nyn,nxl:nxr) :: ar !:
[1406]	225	REAL(wp), DIMENSION(nys:nyn,nxl:nxr,1:nz) :: ar_inv !:
[1216]	226	!$acc declare create( ar_inv )
[1]	227
[1320]	228	REAL(wp), DIMENSION(:,:,:), ALLOCATABLE :: ar1 !:
	229	REAL(wp), DIMENSION(:,:,:), ALLOCATABLE :: f_in !:
	230	REAL(wp), DIMENSION(:,:,:), ALLOCATABLE :: f_inv !:
	231	REAL(wp), DIMENSION(:,:,:), ALLOCATABLE :: f_out_y !:
	232	REAL(wp), DIMENSION(:,:,:), ALLOCATABLE :: f_out_z !:
[1216]	233
	234
[1]	235	CALL cpu_log( log_point_s(3), 'poisfft', 'start' )
	236
[1111]	237	IF ( .NOT. poisfft_initialized ) CALL poisfft_init
	238
[1]	239	!
	240	!-- Two-dimensional Fourier Transformation in x- and y-direction.
[1111]	241	IF ( pdims(2) == 1 .AND. pdims(1) > 1 ) THEN
[1]	242
	243	!
	244	!-- 1d-domain-decomposition along x:
	245	!-- FFT along y and transposition y --> x
[1216]	246	CALL ffty_tr_yx( ar, ar )
[1]	247
	248	!
	249	!-- FFT along x, solving the tridiagonal system and backward FFT
	250	CALL fftx_tri_fftx( ar )
	251
	252	!
	253	!-- Transposition x --> y and backward FFT along y
[1216]	254	CALL tr_xy_ffty( ar, ar )
[1]	255
[1111]	256	ELSEIF ( pdims(1) == 1 .AND. pdims(2) > 1 ) THEN
[1]	257
	258	!
	259	!-- 1d-domain-decomposition along y:
	260	!-- FFT along x and transposition x --> y
[1216]	261	CALL fftx_tr_xy( ar, ar )
[1]	262
	263	!
	264	!-- FFT along y, solving the tridiagonal system and backward FFT
	265	CALL ffty_tri_ffty( ar )
	266
	267	!
	268	!-- Transposition y --> x and backward FFT along x
[1216]	269	CALL tr_yx_fftx( ar, ar )
[1]	270
[1216]	271	ELSEIF ( .NOT. transpose_compute_overlap ) THEN
[1]	272
	273	!
[1111]	274	!-- 2d-domain-decomposition or no decomposition (1 PE run)
[1]	275	!-- Transposition z --> x
	276	CALL cpu_log( log_point_s(5), 'transpo forward', 'start' )
[1216]	277	CALL resort_for_zx( ar, ar_inv )
	278	CALL transpose_zx( ar_inv, ar )
[1]	279	CALL cpu_log( log_point_s(5), 'transpo forward', 'pause' )
	280
	281	CALL cpu_log( log_point_s(4), 'fft_x', 'start' )
[1208]	282	IF ( fft_method /= 'system-specific' ) THEN
	283	!$acc update host( ar )
	284	ENDIF
[1106]	285	CALL fft_x( ar, 'forward' )
[1208]	286	IF ( fft_method /= 'system-specific' ) THEN
	287	!$acc update device( ar )
	288	ENDIF
[1]	289	CALL cpu_log( log_point_s(4), 'fft_x', 'pause' )
	290
	291	!
	292	!-- Transposition x --> y
	293	CALL cpu_log( log_point_s(5), 'transpo forward', 'continue' )
[1216]	294	CALL resort_for_xy( ar, ar_inv )
	295	CALL transpose_xy( ar_inv, ar )
[1]	296	CALL cpu_log( log_point_s(5), 'transpo forward', 'pause' )
	297
	298	CALL cpu_log( log_point_s(7), 'fft_y', 'start' )
[1208]	299	IF ( fft_method /= 'system-specific' ) THEN
	300	!$acc update host( ar )
	301	ENDIF
[1216]	302	CALL fft_y( ar, 'forward', ar_tr = ar, &
	303	nxl_y_bound = nxl_y, nxr_y_bound = nxr_y, &
	304	nxl_y_l = nxl_y, nxr_y_l = nxr_y )
[1208]	305	IF ( fft_method /= 'system-specific' ) THEN
	306	!$acc update device( ar )
	307	ENDIF
[1]	308	CALL cpu_log( log_point_s(7), 'fft_y', 'pause' )
	309
	310	!
	311	!-- Transposition y --> z
	312	CALL cpu_log( log_point_s(5), 'transpo forward', 'continue' )
[1216]	313	CALL resort_for_yz( ar, ar_inv )
	314	CALL transpose_yz( ar_inv, ar )
[1]	315	CALL cpu_log( log_point_s(5), 'transpo forward', 'stop' )
	316
	317	!
[1106]	318	!-- Solve the tridiagonal equation system along z
[1]	319	CALL cpu_log( log_point_s(6), 'tridia', 'start' )
[1212]	320	CALL tridia_substi( ar )
[1]	321	CALL cpu_log( log_point_s(6), 'tridia', 'stop' )
	322
	323	!
	324	!-- Inverse Fourier Transformation
	325	!-- Transposition z --> y
	326	CALL cpu_log( log_point_s(8), 'transpo invers', 'start' )
[1216]	327	CALL transpose_zy( ar, ar_inv )
	328	CALL resort_for_zy( ar_inv, ar )
[1]	329	CALL cpu_log( log_point_s(8), 'transpo invers', 'pause' )
	330
	331	CALL cpu_log( log_point_s(7), 'fft_y', 'continue' )
[1208]	332	IF ( fft_method /= 'system-specific' ) THEN
	333	!$acc update host( ar )
	334	ENDIF
[1216]	335	CALL fft_y( ar, 'backward', ar_tr = ar, &
	336	nxl_y_bound = nxl_y, nxr_y_bound = nxr_y, &
	337	nxl_y_l = nxl_y, nxr_y_l = nxr_y )
[1208]	338	IF ( fft_method /= 'system-specific' ) THEN
	339	!$acc update device( ar )
	340	ENDIF
[1]	341	CALL cpu_log( log_point_s(7), 'fft_y', 'stop' )
	342
	343	!
	344	!-- Transposition y --> x
	345	CALL cpu_log( log_point_s(8), 'transpo invers', 'continue' )
[1216]	346	CALL transpose_yx( ar, ar_inv )
	347	CALL resort_for_yx( ar_inv, ar )
[1]	348	CALL cpu_log( log_point_s(8), 'transpo invers', 'pause' )
	349
	350	CALL cpu_log( log_point_s(4), 'fft_x', 'continue' )
[1208]	351	IF ( fft_method /= 'system-specific' ) THEN
	352	!$acc update host( ar )
	353	ENDIF
[1106]	354	CALL fft_x( ar, 'backward' )
[1208]	355	IF ( fft_method /= 'system-specific' ) THEN
	356	!$acc update device( ar )
	357	ENDIF
[1]	358	CALL cpu_log( log_point_s(4), 'fft_x', 'stop' )
	359
	360	!
	361	!-- Transposition x --> z
	362	CALL cpu_log( log_point_s(8), 'transpo invers', 'continue' )
[1216]	363	CALL transpose_xz( ar, ar_inv )
	364	CALL resort_for_xz( ar_inv, ar )
[1]	365	CALL cpu_log( log_point_s(8), 'transpo invers', 'stop' )
	366
[1216]	367	ELSE
	368
	369	!
	370	!-- 2d-domain-decomposition or no decomposition (1 PE run) with
	371	!-- overlapping transposition / fft
[1318]	372	!-- cputime logging must not use barriers, which would prevent overlapping
[1216]	373	ALLOCATE( f_out_y(0:ny,nxl_y:nxr_y,nzb_y:nzt_y), &
	374	f_out_z(0:nx,nys_x:nyn_x,nzb_x:nzt_x) )
	375	!
	376	!-- Transposition z --> x + subsequent fft along x
	377	ALLOCATE( f_inv(nys:nyn,nxl:nxr,1:nz) )
	378	CALL resort_for_zx( ar, f_inv )
	379	!
	380	!-- Save original indices and gridpoint counter
	381	isave(1) = nz
	382	isave(2) = nzb_x
	383	isave(3) = nzt_x
	384	isave(4) = sendrecvcount_zx
	385	!
	386	!-- Set new indices for transformation
	387	nblk = nz / pdims(1)
	388	nz = pdims(1)
	389	nnz_x = 1
	390	nzb_x = 1 + myidx * nnz_x
	391	nzt_x = ( myidx + 1 ) * nnz_x
	392	sendrecvcount_zx = nnx * nny * nnz_x
	393
[1306]	394	ALLOCATE( ar1(0:nx,nys_x:nyn_x,nzb_x:nzt_x) )
[1216]	395	ALLOCATE( f_in(nys:nyn,nxl:nxr,1:nz) )
	396
[1306]	397	DO kk = 1, nblk
[1216]	398
[1306]	399	IF ( kk == 1 ) THEN
[1318]	400	CALL cpu_log( log_point_s(5), 'transpo forward', 'start', cpu_log_nowait )
[1306]	401	ELSE
[1318]	402	CALL cpu_log( log_point_s(5), 'transpo forward', 'continue', cpu_log_nowait )
[1306]	403	ENDIF
[1216]	404
[1306]	405	DO knew = 1, nz
	406	ki = kk + nblk * ( knew - 1 )
	407	f_in(:,:,knew) = f_inv(:,:,ki)
	408	ENDDO
[1216]	409
[1306]	410	CALL transpose_zx( f_in, ar1(:,:,:))
	411	CALL cpu_log( log_point_s(5), 'transpo forward', 'pause' )
[1216]	412
[1306]	413	IF ( kk == 1 ) THEN
[1318]	414	CALL cpu_log( log_point_s(4), 'fft_x', 'start', cpu_log_nowait )
[1306]	415	ELSE
[1318]	416	CALL cpu_log( log_point_s(4), 'fft_x', 'continue', cpu_log_nowait )
[1216]	417	ENDIF
	418
[1306]	419	n = isave(2) + kk - 1
	420	CALL fft_x( ar1(:,:,:), 'forward', ar_2d = f_out_z(:,:,n))
	421	CALL cpu_log( log_point_s(4), 'fft_x', 'pause' )
[1216]	422
	423	ENDDO
	424	!
	425	!-- Restore original indices/counters
	426	nz = isave(1)
	427	nzb_x = isave(2)
	428	nzt_x = isave(3)
	429	sendrecvcount_zx = isave(4)
	430
	431	DEALLOCATE( ar1, f_in, f_inv )
	432
	433	!
	434	!-- Transposition x --> y + subsequent fft along y
	435	ALLOCATE( f_inv(nys_x:nyn_x,nzb_x:nzt_x,0:nx) )
	436	CALL resort_for_xy( f_out_z, f_inv )
	437	!
	438	!-- Save original indices and gridpoint counter
	439	isave(1) = nx
	440	isave(2) = nxl_y
	441	isave(3) = nxr_y
	442	isave(4) = sendrecvcount_xy
	443	!
	444	!-- Set new indices for transformation
	445	nblk = ( ( nx+1 ) / pdims(2) ) - 1
	446	nx = pdims(2)
	447	nnx_y = 1
	448	nxl_y = myidy * nnx_y
	449	nxr_y = ( myidy + 1 ) * nnx_y - 1
	450	sendrecvcount_xy = nnx_y * ( nyn_x-nys_x+1 ) * ( nzt_x-nzb_x+1 )
	451
[1306]	452	ALLOCATE( ar1(0:ny,nxl_y:nxr_y,nzb_y:nzt_y) )
[1216]	453	ALLOCATE( f_in(nys_x:nyn_x,nzb_x:nzt_x,0:nx) )
	454
[1306]	455	DO ii = 0, nblk
[1216]	456
[1318]	457	CALL cpu_log( log_point_s(5), 'transpo forward', 'continue', cpu_log_nowait )
[1216]	458
[1306]	459	DO inew = 0, nx-1
	460	iind = ii + ( nblk + 1 ) * inew
	461	f_in(:,:,inew) = f_inv(:,:,iind)
	462	ENDDO
[1216]	463
[1306]	464	CALL transpose_xy( f_in, ar1(:,:,:) )
[1216]	465
[1306]	466	CALL cpu_log( log_point_s(5), 'transpo forward', 'pause' )
[1216]	467
[1306]	468	IF ( ii == 1 ) THEN
[1318]	469	CALL cpu_log( log_point_s(7), 'fft_y', 'start', cpu_log_nowait )
[1306]	470	ELSE
[1318]	471	CALL cpu_log( log_point_s(7), 'fft_y', 'continue', cpu_log_nowait )
[1216]	472	ENDIF
	473
[1306]	474	nxl_y_bound = isave(2)
	475	nxr_y_bound = isave(3)
	476	n = isave(2) + ii
	477	CALL fft_y( ar1(:,:,:), 'forward', ar_tr = f_out_y, &
	478	nxl_y_bound = nxl_y_bound, nxr_y_bound = nxr_y_bound, &
	479	nxl_y_l = n, nxr_y_l = n )
[1216]	480
[1306]	481	CALL cpu_log( log_point_s(7), 'fft_y', 'pause' )
[1216]	482
	483	ENDDO
	484	!
	485	!-- Restore original indices/counters
	486	nx = isave(1)
	487	nxl_y = isave(2)
	488	nxr_y = isave(3)
	489	sendrecvcount_xy = isave(4)
	490
	491	DEALLOCATE( ar1, f_in, f_inv )
	492
	493	!
	494	!-- Transposition y --> z + subsequent tridia + resort for z --> y
	495	ALLOCATE( f_inv(nxl_y:nxr_y,nzb_y:nzt_y,0:ny) )
	496	CALL resort_for_yz( f_out_y, f_inv )
	497	!
	498	!-- Save original indices and gridpoint counter
	499	isave(1) = ny
	500	isave(2) = nys_z
	501	isave(3) = nyn_z
	502	isave(4) = sendrecvcount_yz
	503	!
	504	!-- Set new indices for transformation
	505	nblk = ( ( ny+1 ) / pdims(1) ) - 1
	506	ny = pdims(1)
	507	nny_z = 1
	508	nys_z = myidx * nny_z
	509	nyn_z = ( myidx + 1 ) * nny_z - 1
	510	sendrecvcount_yz = ( nxr_y-nxl_y+1 ) * nny_z * ( nzt_y-nzb_y+1 )
	511
[1306]	512	ALLOCATE( ar1(nxl_z:nxr_z,nys_z:nyn_z,1:nz) )
[1216]	513	ALLOCATE( f_in(nxl_y:nxr_y,nzb_y:nzt_y,0:ny) )
	514
[1306]	515	DO jj = 0, nblk
[1216]	516	!
[1306]	517	!-- Forward Fourier Transformation
	518	!-- Transposition y --> z
[1318]	519	CALL cpu_log( log_point_s(5), 'transpo forward', 'continue', cpu_log_nowait )
[1216]	520
[1306]	521	DO jnew = 0, ny-1
	522	jind = jj + ( nblk + 1 ) * jnew
	523	f_in(:,:,jnew) = f_inv(:,:,jind)
	524	ENDDO
[1216]	525
[1306]	526	CALL transpose_yz( f_in, ar1(:,:,:) )
[1216]	527
[1306]	528	IF ( jj == nblk ) THEN
	529	CALL cpu_log( log_point_s(5), 'transpo forward', 'stop' )
	530	ELSE
	531	CALL cpu_log( log_point_s(5), 'transpo forward', 'pause' )
[1216]	532	ENDIF
	533
	534	!
[1306]	535	!-- Solve the tridiagonal equation system along z
[1318]	536	CALL cpu_log( log_point_s(6), 'tridia', 'start', cpu_log_nowait )
[1216]	537
[1306]	538	n = isave(2) + jj
	539	CALL tridia_substi_overlap( ar1(:,:,:), n )
[1216]	540
[1306]	541	CALL cpu_log( log_point_s(6), 'tridia', 'stop' )
[1216]	542
[1306]	543	!
	544	!-- Inverse Fourier Transformation
	545	!-- Transposition z --> y
	546	!-- Only one thread should call MPI routines, therefore forward and
	547	!-- backward tranpose are in the same section
	548	IF ( jj == 0 ) THEN
[1318]	549	CALL cpu_log( log_point_s(8), 'transpo invers', 'start', cpu_log_nowait )
[1306]	550	ELSE
[1318]	551	CALL cpu_log( log_point_s(8), 'transpo invers', 'continue', cpu_log_nowait )
[1216]	552	ENDIF
	553
[1306]	554	CALL transpose_zy( ar1(:,:,:), f_in )
[1216]	555
[1306]	556	DO jnew = 0, ny-1
	557	jind = jj + ( nblk + 1 ) * jnew
	558	f_inv(:,:,jind) = f_in(:,:,jnew)
	559	ENDDO
[1216]	560
[1306]	561	CALL cpu_log( log_point_s(8), 'transpo invers', 'pause' )
[1216]	562
	563	ENDDO
	564	!
	565	!-- Restore original indices/counters
	566	ny = isave(1)
	567	nys_z = isave(2)
	568	nyn_z = isave(3)
	569	sendrecvcount_yz = isave(4)
	570
	571	CALL resort_for_zy( f_inv, f_out_y )
	572
	573	DEALLOCATE( ar1, f_in, f_inv )
	574
	575	!
	576	!-- fft along y backward + subsequent transposition y --> x
	577	ALLOCATE( f_inv(nys_x:nyn_x,nzb_x:nzt_x,0:nx) )
	578	!
	579	!-- Save original indices and gridpoint counter
	580	isave(1) = nx
	581	isave(2) = nxl_y
	582	isave(3) = nxr_y
	583	isave(4) = sendrecvcount_xy
	584	!
	585	!-- Set new indices for transformation
	586	nblk = (( nx+1 ) / pdims(2) ) - 1
	587	nx = pdims(2)
	588	nnx_y = 1
	589	nxl_y = myidy * nnx_y
	590	nxr_y = ( myidy + 1 ) * nnx_y - 1
	591	sendrecvcount_xy = nnx_y * ( nyn_x-nys_x+1 ) * ( nzt_x-nzb_x+1 )
	592
[1306]	593	ALLOCATE( ar1(0:ny,nxl_y:nxr_y,nzb_y:nzt_y) )
[1216]	594	ALLOCATE( f_in(nys_x:nyn_x,nzb_x:nzt_x,0:nx) )
	595
[1306]	596	DO ii = 0, nblk
[1216]	597
[1318]	598	CALL cpu_log( log_point_s(7), 'fft_y', 'continue', cpu_log_nowait )
[1216]	599
[1306]	600	n = isave(2) + ii
	601	nxl_y_bound = isave(2)
	602	nxr_y_bound = isave(3)
[1216]	603
[1306]	604	CALL fft_y( ar1(:,:,:), 'backward', ar_tr = f_out_y, &
	605	nxl_y_bound = nxl_y_bound, nxr_y_bound = nxr_y_bound, &
	606	nxl_y_l = n, nxr_y_l = n )
[1216]	607
[1306]	608	IF ( ii == nblk ) THEN
	609	CALL cpu_log( log_point_s(7), 'fft_y', 'stop' )
	610	ELSE
	611	CALL cpu_log( log_point_s(7), 'fft_y', 'pause' )
[1216]	612	ENDIF
	613
[1318]	614	CALL cpu_log( log_point_s(8), 'transpo invers', 'continue', cpu_log_nowait )
[1216]	615
[1306]	616	CALL transpose_yx( ar1(:,:,:), f_in )
[1216]	617
[1306]	618	DO inew = 0, nx-1
	619	iind = ii + (nblk+1) * inew
	620	f_inv(:,:,iind) = f_in(:,:,inew)
	621	ENDDO
[1216]	622
[1306]	623	CALL cpu_log( log_point_s(8), 'transpo invers', 'pause' )
[1216]	624
	625	ENDDO
	626	!
	627	!-- Restore original indices/counters
	628	nx = isave(1)
	629	nxl_y = isave(2)
	630	nxr_y = isave(3)
	631	sendrecvcount_xy = isave(4)
	632
	633	CALL resort_for_yx( f_inv, f_out_z )
	634
	635	DEALLOCATE( ar1, f_in, f_inv )
	636
	637	!
	638	!-- fft along x backward + subsequent final transposition x --> z
	639	ALLOCATE( f_inv(nys:nyn,nxl:nxr,1:nz) )
	640	!
	641	!-- Save original indices and gridpoint counter
	642	isave(1) = nz
	643	isave(2) = nzb_x
	644	isave(3) = nzt_x
	645	isave(4) = sendrecvcount_zx
	646	!
	647	!-- Set new indices for transformation
	648	nblk = nz / pdims(1)
	649	nz = pdims(1)
	650	nnz_x = 1
	651	nzb_x = 1 + myidx * nnz_x
	652	nzt_x = ( myidx + 1 ) * nnz_x
	653	sendrecvcount_zx = nnx * nny * nnz_x
	654
[1306]	655	ALLOCATE( ar1(0:nx,nys_x:nyn_x,nzb_x:nzt_x) )
[1216]	656	ALLOCATE( f_in(nys:nyn,nxl:nxr,1:nz) )
	657
[1306]	658	DO kk = 1, nblk
[1216]	659
[1318]	660	CALL cpu_log( log_point_s(4), 'fft_x', 'continue', cpu_log_nowait )
[1216]	661
[1306]	662	n = isave(2) + kk - 1
	663	CALL fft_x( ar1(:,:,:), 'backward', f_out_z(:,:,n))
[1216]	664
[1306]	665	IF ( kk == nblk ) THEN
	666	CALL cpu_log( log_point_s(4), 'fft_x', 'stop' )
	667	ELSE
	668	CALL cpu_log( log_point_s(4), 'fft_x', 'pause' )
[1216]	669	ENDIF
	670
[1318]	671	CALL cpu_log( log_point_s(8), 'transpo invers', 'continue', cpu_log_nowait )
[1216]	672
[1306]	673	CALL transpose_xz( ar1(:,:,:), f_in )
[1216]	674
[1306]	675	DO knew = 1, nz
	676	ki = kk + nblk * (knew-1)
	677	f_inv(:,:,ki) = f_in(:,:,knew)
	678	ENDDO
[1216]	679
[1306]	680	IF ( kk == nblk ) THEN
	681	CALL cpu_log( log_point_s(8), 'transpo invers', 'stop' )
	682	ELSE
	683	CALL cpu_log( log_point_s(8), 'transpo invers', 'pause' )
[1216]	684	ENDIF
	685
	686	ENDDO
	687	!
	688	!-- Restore original indices/counters
	689	nz = isave(1)
	690	nzb_x = isave(2)
	691	nzt_x = isave(3)
	692	sendrecvcount_zx = isave(4)
	693
	694	CALL resort_for_xz( f_inv, ar )
	695
	696	DEALLOCATE( ar1, f_in, f_inv )
	697
[1]	698	ENDIF
	699
	700	CALL cpu_log( log_point_s(3), 'poisfft', 'stop' )
	701
	702	END SUBROUTINE poisfft
	703
	704
	705
[1216]	706	SUBROUTINE ffty_tr_yx( f_in, f_out )
[1]	707
	708	!------------------------------------------------------------------------------!
	709	! Fourier-transformation along y with subsequent transposition y --> x for
	710	! a 1d-decomposition along x
	711	!
	712	! ATTENTION: The performance of this routine is much faster on the NEC-SX6,
	713	! if the first index of work_ffty_vec is odd. Otherwise
	714	! memory bank conflicts may occur (especially if the index is a
	715	! multiple of 128). That's why work_ffty_vec is dimensioned as
	716	! 0:ny+1.
	717	! Of course, this will not work if users are using an odd number
	718	! of gridpoints along y.
	719	!------------------------------------------------------------------------------!
	720
[1320]	721	USE control_parameters, &
	722	ONLY: host
	723
	724	USE cpulog, &
	725	ONLY: cpu_log, log_point_s
	726
	727	USE kinds
	728
[1]	729	USE pegrid
	730
	731	IMPLICIT NONE
	732
[1320]	733	INTEGER(iwp) :: i !:
	734	INTEGER(iwp) :: iend !:
	735	INTEGER(iwp) :: iouter !:
	736	INTEGER(iwp) :: ir !:
	737	INTEGER(iwp) :: j !:
	738	INTEGER(iwp) :: k !:
[1]	739
[1320]	740	INTEGER(iwp), PARAMETER :: stridex = 4 !:
	741
	742	REAL(wp), DIMENSION(0:ny,stridex) :: work_ffty !:
[1]	743	#if defined( __nec )
[1320]	744	REAL(wp), DIMENSION(0:ny+1,1:nz,nxl:nxr) :: work_ffty_vec !:
[1]	745	#endif
[1320]	746	REAL(wp), DIMENSION(1:nz,0:ny,nxl:nxr) :: f_in !:
	747	REAL(wp), DIMENSION(nnx,1:nz,nys_x:nyn_x,pdims(1)) :: f_out !:
	748	REAL(wp), DIMENSION(nxl:nxr,1:nz,0:ny) :: work !:
[1]	749
	750	!
	751	!-- Carry out the FFT along y, where all data are present due to the
	752	!-- 1d-decomposition along x. Resort the data in a way that x becomes
	753	!-- the first index.
[1106]	754	CALL cpu_log( log_point_s(7), 'fft_y_1d', 'start' )
[1]	755
	756	IF ( host(1:3) == 'nec' ) THEN
	757	#if defined( __nec )
	758	!
	759	!-- Code optimized for vector processors
[85]	760	!$OMP PARALLEL PRIVATE ( i, j, k )
[1]	761	!$OMP DO
	762	DO i = nxl, nxr
	763
	764	DO j = 0, ny
	765	DO k = 1, nz
	766	work_ffty_vec(j,k,i) = f_in(k,j,i)
	767	ENDDO
	768	ENDDO
	769
	770	CALL fft_y_m( work_ffty_vec(:,:,i), ny+1, 'forward' )
	771
	772	ENDDO
	773
	774	!$OMP DO
	775	DO k = 1, nz
	776	DO j = 0, ny
	777	DO i = nxl, nxr
	778	work(i,k,j) = work_ffty_vec(j,k,i)
	779	ENDDO
	780	ENDDO
	781	ENDDO
	782	!$OMP END PARALLEL
	783	#endif
	784
	785	ELSE
	786
	787	!
	788	!-- Cache optimized code.
	789	!-- The i-(x-)direction is split into a strided outer loop and an inner
	790	!-- loop for better cache performance
	791	!$OMP PARALLEL PRIVATE (i,iend,iouter,ir,j,k,work_ffty)
	792	!$OMP DO
	793	DO iouter = nxl, nxr, stridex
	794
	795	iend = MIN( iouter+stridex-1, nxr ) ! Upper bound for inner i loop
	796
	797	DO k = 1, nz
	798
	799	DO i = iouter, iend
	800
	801	ir = i-iouter+1 ! counter within a stride
	802	DO j = 0, ny
	803	work_ffty(j,ir) = f_in(k,j,i)
	804	ENDDO
	805	!
	806	!-- FFT along y
[1106]	807	CALL fft_y_1d( work_ffty(:,ir), 'forward' )
[1]	808
	809	ENDDO
	810
	811	!
	812	!-- Resort
	813	DO j = 0, ny
	814	DO i = iouter, iend
	815	work(i,k,j) = work_ffty(j,i-iouter+1)
	816	ENDDO
	817	ENDDO
	818
	819	ENDDO
	820
	821	ENDDO
	822	!$OMP END PARALLEL
	823
	824	ENDIF
[1106]	825	CALL cpu_log( log_point_s(7), 'fft_y_1d', 'pause' )
[1]	826
	827	!
	828	!-- Transpose array
[1111]	829	#if defined( __parallel )
[1]	830	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
[622]	831	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
[1]	832	CALL MPI_ALLTOALL( work(nxl,1,0), sendrecvcount_xy, MPI_REAL, &
	833	f_out(1,1,nys_x,1), sendrecvcount_xy, MPI_REAL, &
	834	comm1dx, ierr )
	835	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
[1111]	836	#endif
[1]	837
	838	END SUBROUTINE ffty_tr_yx
	839
	840
[1216]	841	SUBROUTINE tr_xy_ffty( f_in, f_out )
[1]	842
	843	!------------------------------------------------------------------------------!
	844	! Transposition x --> y with a subsequent backward Fourier transformation for
	845	! a 1d-decomposition along x
	846	!------------------------------------------------------------------------------!
	847
[1320]	848	USE control_parameters, &
	849	ONLY: host
	850
	851	USE cpulog, &
	852	ONLY: cpu_log, log_point_s
	853
	854	USE kinds
	855
[1]	856	USE pegrid
	857
	858	IMPLICIT NONE
	859
[1320]	860	INTEGER(iwp) :: i !:
	861	INTEGER(iwp) :: iend !:
	862	INTEGER(iwp) :: iouter !:
	863	INTEGER(iwp) :: ir !:
	864	INTEGER(iwp) :: j !:
	865	INTEGER(iwp) :: k !:
[1]	866
[1320]	867	INTEGER(iwp), PARAMETER :: stridex = 4 !:
	868
	869	REAL(wp), DIMENSION(0:ny,stridex) :: work_ffty !:
[1]	870	#if defined( __nec )
[1320]	871	REAL(wp), DIMENSION(0:ny+1,1:nz,nxl:nxr) :: work_ffty_vec !:
[1]	872	#endif
[1320]	873	REAL(wp), DIMENSION(nnx,1:nz,nys_x:nyn_x,pdims(1)) :: f_in !:
	874	REAL(wp), DIMENSION(1:nz,0:ny,nxl:nxr) :: f_out !:
	875	REAL(wp), DIMENSION(nxl:nxr,1:nz,0:ny) :: work !:
[1]	876
	877	!
	878	!-- Transpose array
[1111]	879	#if defined( __parallel )
[1]	880	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
[622]	881	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
[1]	882	CALL MPI_ALLTOALL( f_in(1,1,nys_x,1), sendrecvcount_xy, MPI_REAL, &
	883	work(nxl,1,0), sendrecvcount_xy, MPI_REAL, &
	884	comm1dx, ierr )
	885	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
[1111]	886	#endif
[1]	887
	888	!
	889	!-- Resort the data in a way that y becomes the first index and carry out the
	890	!-- backward fft along y.
[1106]	891	CALL cpu_log( log_point_s(7), 'fft_y_1d', 'continue' )
[1]	892
	893	IF ( host(1:3) == 'nec' ) THEN
	894	#if defined( __nec )
	895	!
	896	!-- Code optimized for vector processors
[85]	897	!$OMP PARALLEL PRIVATE ( i, j, k )
[1]	898	!$OMP DO
	899	DO k = 1, nz
	900	DO j = 0, ny
	901	DO i = nxl, nxr
	902	work_ffty_vec(j,k,i) = work(i,k,j)
	903	ENDDO
	904	ENDDO
	905	ENDDO
	906
	907	!$OMP DO
	908	DO i = nxl, nxr
	909
	910	CALL fft_y_m( work_ffty_vec(:,:,i), ny+1, 'backward' )
	911
	912	DO j = 0, ny
	913	DO k = 1, nz
	914	f_out(k,j,i) = work_ffty_vec(j,k,i)
	915	ENDDO
	916	ENDDO
	917
	918	ENDDO
	919	!$OMP END PARALLEL
	920	#endif
	921
	922	ELSE
	923
	924	!
	925	!-- Cache optimized code.
	926	!-- The i-(x-)direction is split into a strided outer loop and an inner
	927	!-- loop for better cache performance
	928	!$OMP PARALLEL PRIVATE ( i, iend, iouter, ir, j, k, work_ffty )
	929	!$OMP DO
	930	DO iouter = nxl, nxr, stridex
	931
	932	iend = MIN( iouter+stridex-1, nxr ) ! Upper bound for inner i loop
	933
	934	DO k = 1, nz
	935	!
	936	!-- Resort
	937	DO j = 0, ny
	938	DO i = iouter, iend
	939	work_ffty(j,i-iouter+1) = work(i,k,j)
	940	ENDDO
	941	ENDDO
	942
	943	DO i = iouter, iend
	944
	945	!
	946	!-- FFT along y
	947	ir = i-iouter+1 ! counter within a stride
[1106]	948	CALL fft_y_1d( work_ffty(:,ir), 'backward' )
[1]	949
	950	DO j = 0, ny
	951	f_out(k,j,i) = work_ffty(j,ir)
	952	ENDDO
	953	ENDDO
	954
	955	ENDDO
	956
	957	ENDDO
	958	!$OMP END PARALLEL
	959
	960	ENDIF
	961
[1106]	962	CALL cpu_log( log_point_s(7), 'fft_y_1d', 'stop' )
[1]	963
	964	END SUBROUTINE tr_xy_ffty
	965
	966
	967	SUBROUTINE fftx_tri_fftx( ar )
	968
	969	!------------------------------------------------------------------------------!
	970	! FFT along x, solution of the tridiagonal system and backward FFT for
	971	! a 1d-decomposition along x
	972	!
	973	! WARNING: this subroutine may still not work for hybrid parallelization
	974	! with OpenMP (for possible necessary changes see the original
	975	! routine poisfft_hybrid, developed by Klaus Ketelsen, May 2002)
	976	!------------------------------------------------------------------------------!
	977
[1320]	978	USE control_parameters, &
	979	ONLY: host
	980
	981	USE cpulog, &
	982	ONLY: cpu_log, log_point_s
	983
	984	USE grid_variables, &
	985	ONLY: ddx2, ddy2
	986
	987	USE kinds
	988
[1]	989	USE pegrid
	990
	991	IMPLICIT NONE
	992
[1320]	993	INTEGER(iwp) :: i !:
	994	INTEGER(iwp) :: j !:
	995	INTEGER(iwp) :: k !:
	996	INTEGER(iwp) :: m !:
	997	INTEGER(iwp) :: n !:
	998	INTEGER(iwp) :: omp_get_thread_num !:
	999	INTEGER(iwp) :: tn !:
[1]	1000
[1320]	1001	REAL(wp), DIMENSION(0:nx) :: work_fftx !:
	1002	REAL(wp), DIMENSION(0:nx,1:nz) :: work_trix !:
	1003	REAL(wp), DIMENSION(nnx,1:nz,nys_x:nyn_x,pdims(1)) :: ar !:
	1004	REAL(wp), DIMENSION(:,:,:,:), ALLOCATABLE :: tri !:
[1]	1005
	1006
[1106]	1007	CALL cpu_log( log_point_s(33), 'fft_x_1d + tridia', 'start' )
[1]	1008
	1009	ALLOCATE( tri(5,0:nx,0:nz-1,0:threads_per_task-1) )
	1010
	1011	tn = 0 ! Default thread number in case of one thread
	1012	!$OMP PARALLEL DO PRIVATE ( i, j, k, m, n, tn, work_fftx, work_trix )
	1013	DO j = nys_x, nyn_x
	1014
	1015	!$ tn = omp_get_thread_num()
	1016
	1017	IF ( host(1:3) == 'nec' ) THEN
	1018	!
	1019	!-- Code optimized for vector processors
	1020	DO k = 1, nz
	1021
	1022	m = 0
	1023	DO n = 1, pdims(1)
[1003]	1024	DO i = 1, nnx
[1]	1025	work_trix(m,k) = ar(i,k,j,n)
	1026	m = m + 1
	1027	ENDDO
	1028	ENDDO
	1029
	1030	ENDDO
	1031
	1032	CALL fft_x_m( work_trix, 'forward' )
	1033
	1034	ELSE
	1035	!
	1036	!-- Cache optimized code
	1037	DO k = 1, nz
	1038
	1039	m = 0
	1040	DO n = 1, pdims(1)
[1003]	1041	DO i = 1, nnx
[1]	1042	work_fftx(m) = ar(i,k,j,n)
	1043	m = m + 1
	1044	ENDDO
	1045	ENDDO
	1046
[1106]	1047	CALL fft_x_1d( work_fftx, 'forward' )
[1]	1048
	1049	DO i = 0, nx
	1050	work_trix(i,k) = work_fftx(i)
	1051	ENDDO
	1052
	1053	ENDDO
	1054
	1055	ENDIF
	1056
	1057	!
	1058	!-- Solve the linear equation system
	1059	CALL tridia_1dd( ddx2, ddy2, nx, ny, j, work_trix, tri(:,:,:,tn) )
	1060
	1061	IF ( host(1:3) == 'nec' ) THEN
	1062	!
	1063	!-- Code optimized for vector processors
	1064	CALL fft_x_m( work_trix, 'backward' )
	1065
	1066	DO k = 1, nz
	1067
	1068	m = 0
	1069	DO n = 1, pdims(1)
[1003]	1070	DO i = 1, nnx
[1]	1071	ar(i,k,j,n) = work_trix(m,k)
	1072	m = m + 1
	1073	ENDDO
	1074	ENDDO
	1075
	1076	ENDDO
	1077
	1078	ELSE
	1079	!
	1080	!-- Cache optimized code
	1081	DO k = 1, nz
	1082
	1083	DO i = 0, nx
	1084	work_fftx(i) = work_trix(i,k)
	1085	ENDDO
	1086
[1106]	1087	CALL fft_x_1d( work_fftx, 'backward' )
[1]	1088
	1089	m = 0
	1090	DO n = 1, pdims(1)
[1003]	1091	DO i = 1, nnx
[1]	1092	ar(i,k,j,n) = work_fftx(m)
	1093	m = m + 1
	1094	ENDDO
	1095	ENDDO
	1096
	1097	ENDDO
	1098
	1099	ENDIF
	1100
	1101	ENDDO
	1102
	1103	DEALLOCATE( tri )
	1104
[1106]	1105	CALL cpu_log( log_point_s(33), 'fft_x_1d + tridia', 'stop' )
[1]	1106
	1107	END SUBROUTINE fftx_tri_fftx
	1108
	1109
[1216]	1110	SUBROUTINE fftx_tr_xy( f_in, f_out )
[1]	1111
	1112	!------------------------------------------------------------------------------!
	1113	! Fourier-transformation along x with subsequent transposition x --> y for
	1114	! a 1d-decomposition along y
	1115	!
	1116	! ATTENTION: The NEC-branch of this routine may significantly profit from
	1117	! further optimizations. So far, performance is much worse than
	1118	! for routine ffty_tr_yx (more than three times slower).
	1119	!------------------------------------------------------------------------------!
	1120
[1320]	1121	USE control_parameters, &
	1122	ONLY: host
	1123
	1124	USE cpulog, &
	1125	ONLY: cpu_log, log_point_s
	1126
	1127	USE kinds
	1128
[1]	1129	USE pegrid
	1130
	1131	IMPLICIT NONE
	1132
[1320]	1133	INTEGER(iwp) :: i !:
	1134	INTEGER(iwp) :: j !:
	1135	INTEGER(iwp) :: k !:
[1]	1136
[1320]	1137	REAL(wp), DIMENSION(0:nx,1:nz,nys:nyn) :: work_fftx !:
	1138	REAL(wp), DIMENSION(1:nz,nys:nyn,0:nx) :: f_in !:
	1139	REAL(wp), DIMENSION(nny,1:nz,nxl_y:nxr_y,pdims(2)) :: f_out !:
	1140	REAL(wp), DIMENSION(nys:nyn,1:nz,0:nx) :: work !:
[1]	1141
	1142	!
	1143	!-- Carry out the FFT along x, where all data are present due to the
	1144	!-- 1d-decomposition along y. Resort the data in a way that y becomes
	1145	!-- the first index.
[1106]	1146	CALL cpu_log( log_point_s(4), 'fft_x_1d', 'start' )
[1]	1147
	1148	IF ( host(1:3) == 'nec' ) THEN
	1149	!
	1150	!-- Code for vector processors
[85]	1151	!$OMP PARALLEL PRIVATE ( i, j, k )
[1]	1152	!$OMP DO
	1153	DO i = 0, nx
	1154
	1155	DO j = nys, nyn
	1156	DO k = 1, nz
	1157	work_fftx(i,k,j) = f_in(k,j,i)
	1158	ENDDO
	1159	ENDDO
	1160
	1161	ENDDO
	1162
	1163	!$OMP DO
	1164	DO j = nys, nyn
	1165
	1166	CALL fft_x_m( work_fftx(:,:,j), 'forward' )
	1167
	1168	DO k = 1, nz
	1169	DO i = 0, nx
	1170	work(j,k,i) = work_fftx(i,k,j)
	1171	ENDDO
	1172	ENDDO
	1173
	1174	ENDDO
	1175	!$OMP END PARALLEL
	1176
	1177	ELSE
	1178
	1179	!
	1180	!-- Cache optimized code (there might be still a potential for better
	1181	!-- optimization).
[696]	1182	!$OMP PARALLEL PRIVATE (i,j,k)
[1]	1183	!$OMP DO
	1184	DO i = 0, nx
	1185
	1186	DO j = nys, nyn
	1187	DO k = 1, nz
	1188	work_fftx(i,k,j) = f_in(k,j,i)
	1189	ENDDO
	1190	ENDDO
	1191
	1192	ENDDO
	1193
	1194	!$OMP DO
	1195	DO j = nys, nyn
	1196	DO k = 1, nz
	1197
[1106]	1198	CALL fft_x_1d( work_fftx(0:nx,k,j), 'forward' )
[1]	1199
	1200	DO i = 0, nx
	1201	work(j,k,i) = work_fftx(i,k,j)
	1202	ENDDO
	1203	ENDDO
	1204
	1205	ENDDO
	1206	!$OMP END PARALLEL
	1207
	1208	ENDIF
[1106]	1209	CALL cpu_log( log_point_s(4), 'fft_x_1d', 'pause' )
[1]	1210
	1211	!
	1212	!-- Transpose array
[1111]	1213	#if defined( __parallel )
[1]	1214	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
[622]	1215	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
[1]	1216	CALL MPI_ALLTOALL( work(nys,1,0), sendrecvcount_xy, MPI_REAL, &
	1217	f_out(1,1,nxl_y,1), sendrecvcount_xy, MPI_REAL, &
	1218	comm1dy, ierr )
	1219	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
[1111]	1220	#endif
[1]	1221
	1222	END SUBROUTINE fftx_tr_xy
	1223
	1224
[1216]	1225	SUBROUTINE tr_yx_fftx( f_in, f_out )
[1]	1226
	1227	!------------------------------------------------------------------------------!
	1228	! Transposition y --> x with a subsequent backward Fourier transformation for
	1229	! a 1d-decomposition along x
	1230	!------------------------------------------------------------------------------!
	1231
[1320]	1232	USE control_parameters, &
	1233	ONLY: host
	1234
	1235	USE cpulog, &
	1236	ONLY: cpu_log, log_point_s
	1237
	1238	USE kinds
	1239
[1]	1240	USE pegrid
	1241
	1242	IMPLICIT NONE
	1243
[1320]	1244	INTEGER(iwp) :: i !:
	1245	INTEGER(iwp) :: j !:
	1246	INTEGER(iwp) :: k !:
[1]	1247
[1320]	1248	REAL(wp), DIMENSION(0:nx,1:nz,nys:nyn) :: work_fftx !:
	1249	REAL(wp), DIMENSION(nny,1:nz,nxl_y:nxr_y,pdims(2)) :: f_in !:
	1250	REAL(wp), DIMENSION(1:nz,nys:nyn,0:nx) :: f_out !:
	1251	REAL(wp), DIMENSION(nys:nyn,1:nz,0:nx) :: work !:
[1]	1252
	1253	!
	1254	!-- Transpose array
[1111]	1255	#if defined( __parallel )
[1]	1256	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
[622]	1257	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
[1]	1258	CALL MPI_ALLTOALL( f_in(1,1,nxl_y,1), sendrecvcount_xy, MPI_REAL, &
	1259	work(nys,1,0), sendrecvcount_xy, MPI_REAL, &
	1260	comm1dy, ierr )
	1261	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
[1111]	1262	#endif
[1]	1263
	1264	!
	1265	!-- Carry out the FFT along x, where all data are present due to the
	1266	!-- 1d-decomposition along y. Resort the data in a way that y becomes
	1267	!-- the first index.
[1106]	1268	CALL cpu_log( log_point_s(4), 'fft_x_1d', 'continue' )
[1]	1269
	1270	IF ( host(1:3) == 'nec' ) THEN
	1271	!
	1272	!-- Code optimized for vector processors
[85]	1273	!$OMP PARALLEL PRIVATE ( i, j, k )
[1]	1274	!$OMP DO
	1275	DO j = nys, nyn
	1276
	1277	DO k = 1, nz
	1278	DO i = 0, nx
	1279	work_fftx(i,k,j) = work(j,k,i)
	1280	ENDDO
	1281	ENDDO
	1282
	1283	CALL fft_x_m( work_fftx(:,:,j), 'backward' )
	1284
	1285	ENDDO
	1286
	1287	!$OMP DO
	1288	DO i = 0, nx
	1289	DO j = nys, nyn
	1290	DO k = 1, nz
	1291	f_out(k,j,i) = work_fftx(i,k,j)
	1292	ENDDO
	1293	ENDDO
	1294	ENDDO
	1295	!$OMP END PARALLEL
	1296
	1297	ELSE
	1298
	1299	!
	1300	!-- Cache optimized code (there might be still a potential for better
	1301	!-- optimization).
[696]	1302	!$OMP PARALLEL PRIVATE (i,j,k)
[1]	1303	!$OMP DO
	1304	DO j = nys, nyn
	1305	DO k = 1, nz
	1306
	1307	DO i = 0, nx
	1308	work_fftx(i,k,j) = work(j,k,i)
	1309	ENDDO
	1310
[1106]	1311	CALL fft_x_1d( work_fftx(0:nx,k,j), 'backward' )
[1]	1312
	1313	ENDDO
	1314	ENDDO
	1315
	1316	!$OMP DO
	1317	DO i = 0, nx
	1318	DO j = nys, nyn
	1319	DO k = 1, nz
	1320	f_out(k,j,i) = work_fftx(i,k,j)
	1321	ENDDO
	1322	ENDDO
	1323	ENDDO
	1324	!$OMP END PARALLEL
	1325
	1326	ENDIF
[1106]	1327	CALL cpu_log( log_point_s(4), 'fft_x_1d', 'stop' )
[1]	1328
	1329	END SUBROUTINE tr_yx_fftx
	1330
	1331
	1332	SUBROUTINE ffty_tri_ffty( ar )
	1333
	1334	!------------------------------------------------------------------------------!
	1335	! FFT along y, solution of the tridiagonal system and backward FFT for
	1336	! a 1d-decomposition along y
	1337	!
	1338	! WARNING: this subroutine may still not work for hybrid parallelization
	1339	! with OpenMP (for possible necessary changes see the original
	1340	! routine poisfft_hybrid, developed by Klaus Ketelsen, May 2002)
	1341	!------------------------------------------------------------------------------!
	1342
[1320]	1343	USE control_parameters, &
	1344	ONLY: host
	1345
	1346	USE cpulog, &
	1347	ONLY: cpu_log, log_point_s
	1348
	1349	USE grid_variables, &
	1350	ONLY: ddx2, ddy2
	1351
	1352	USE kinds
	1353
[1]	1354	USE pegrid
	1355
	1356	IMPLICIT NONE
	1357
[1320]	1358	INTEGER(iwp) :: i !:
	1359	INTEGER(iwp) :: j !:
	1360	INTEGER(iwp) :: k !:
	1361	INTEGER(iwp) :: m !:
	1362	INTEGER(iwp) :: n !:
	1363	INTEGER(iwp) :: omp_get_thread_num !:
	1364	INTEGER(iwp) :: tn !:
[1]	1365
[1320]	1366	REAL(wp), DIMENSION(0:ny) :: work_ffty !:
	1367	REAL(wp), DIMENSION(0:ny,1:nz) :: work_triy !:
	1368	REAL(wp), DIMENSION(nny,1:nz,nxl_y:nxr_y,pdims(2)) :: ar !:
	1369	REAL(wp), DIMENSION(:,:,:,:), ALLOCATABLE :: tri !:
[1]	1370
	1371
[1106]	1372	CALL cpu_log( log_point_s(39), 'fft_y_1d + tridia', 'start' )
[1]	1373
	1374	ALLOCATE( tri(5,0:ny,0:nz-1,0:threads_per_task-1) )
	1375
	1376	tn = 0 ! Default thread number in case of one thread
[696]	1377	!$OMP PARALLEL DO PRIVATE ( i, j, k, m, n, tn, work_ffty, work_triy )
[1]	1378	DO i = nxl_y, nxr_y
	1379
	1380	!$ tn = omp_get_thread_num()
	1381
	1382	IF ( host(1:3) == 'nec' ) THEN
	1383	!
	1384	!-- Code optimized for vector processors
	1385	DO k = 1, nz
	1386
	1387	m = 0
	1388	DO n = 1, pdims(2)
[1003]	1389	DO j = 1, nny
[1]	1390	work_triy(m,k) = ar(j,k,i,n)
	1391	m = m + 1
	1392	ENDDO
	1393	ENDDO
	1394
	1395	ENDDO
	1396
	1397	CALL fft_y_m( work_triy, ny, 'forward' )
	1398
	1399	ELSE
	1400	!
	1401	!-- Cache optimized code
	1402	DO k = 1, nz
	1403
	1404	m = 0
	1405	DO n = 1, pdims(2)
[1003]	1406	DO j = 1, nny
[1]	1407	work_ffty(m) = ar(j,k,i,n)
	1408	m = m + 1
	1409	ENDDO
	1410	ENDDO
	1411
[1106]	1412	CALL fft_y_1d( work_ffty, 'forward' )
[1]	1413
	1414	DO j = 0, ny
	1415	work_triy(j,k) = work_ffty(j)
	1416	ENDDO
	1417
	1418	ENDDO
	1419
	1420	ENDIF
	1421
	1422	!
	1423	!-- Solve the linear equation system
	1424	CALL tridia_1dd( ddy2, ddx2, ny, nx, i, work_triy, tri(:,:,:,tn) )
	1425
	1426	IF ( host(1:3) == 'nec' ) THEN
	1427	!
	1428	!-- Code optimized for vector processors
	1429	CALL fft_y_m( work_triy, ny, 'backward' )
	1430
	1431	DO k = 1, nz
	1432
	1433	m = 0
	1434	DO n = 1, pdims(2)
[1003]	1435	DO j = 1, nny
[1]	1436	ar(j,k,i,n) = work_triy(m,k)
	1437	m = m + 1
	1438	ENDDO
	1439	ENDDO
	1440
	1441	ENDDO
	1442
	1443	ELSE
	1444	!
	1445	!-- Cache optimized code
	1446	DO k = 1, nz
	1447
	1448	DO j = 0, ny
	1449	work_ffty(j) = work_triy(j,k)
	1450	ENDDO
	1451
[1106]	1452	CALL fft_y_1d( work_ffty, 'backward' )
[1]	1453
	1454	m = 0
	1455	DO n = 1, pdims(2)
[1003]	1456	DO j = 1, nny
[1]	1457	ar(j,k,i,n) = work_ffty(m)
	1458	m = m + 1
	1459	ENDDO
	1460	ENDDO
	1461
	1462	ENDDO
	1463
	1464	ENDIF
	1465
	1466	ENDDO
	1467
	1468	DEALLOCATE( tri )
	1469
[1106]	1470	CALL cpu_log( log_point_s(39), 'fft_y_1d + tridia', 'stop' )
[1]	1471
	1472	END SUBROUTINE ffty_tri_ffty
	1473
	1474	#endif
	1475
	1476	END MODULE poisfft_mod

Note: See TracBrowser for help on using the repository browser.

Download in other formats:

| Impressum | ©Leibniz Universität Hannover |