Home

Context Navigation

poisfft.f90 @ 1812

Last change on this file since 1812 was 1805, checked in by maronga, 9 years ago
last commit documented
Property svn:keywords set to `Id`
File size: 45.5 KB

Rev	Line
[1682]	1	!> @file poisfft.f90
[1036]	2	!--------------------------------------------------------------------------------!
	3	! This file is part of PALM.
	4	!
	5	! PALM is free software: you can redistribute it and/or modify it under the terms
	6	! of the GNU General Public License as published by the Free Software Foundation,
	7	! either version 3 of the License, or (at your option) any later version.
	8	!
	9	! PALM is distributed in the hope that it will be useful, but WITHOUT ANY
	10	! WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
	11	! A PARTICULAR PURPOSE. See the GNU General Public License for more details.
	12	!
	13	! You should have received a copy of the GNU General Public License along with
	14	! PALM. If not, see <http://www.gnu.org/licenses/>.
	15	!
[1310]	16	! Copyright 1997-2014 Leibniz Universitaet Hannover
[1036]	17	!--------------------------------------------------------------------------------!
	18	!
[484]	19	! Current revisions:
[1]	20	! -----------------
[1683]	21	!
[1805]	22	!
[1321]	23	! Former revisions:
	24	! -----------------
	25	! $Id: poisfft.f90 1805 2016-04-05 16:32:34Z knoop $
	26	!
[1805]	27	! 1804 2016-04-05 16:30:18Z maronga
	28	! Removed code for parameter file check (__check)
	29	!
[1683]	30	! 1682 2015-10-07 23:56:08Z knoop
	31	! Code annotations made doxygen readable
	32	!
[1483]	33	! 1482 2014-10-18 12:34:45Z raasch
	34	! use 2d-decomposition, if accelerator boards are used
	35	!
[1407]	36	! 1406 2014-05-16 13:47:01Z raasch
	37	! bugfix for pgi 14.4: declare create moved after array declaration
	38	!
[1321]	39	! 1320 2014-03-20 08:40:49Z raasch
[1320]	40	! ONLY-attribute added to USE-statements,
	41	! kind-parameters added to all INTEGER and REAL declaration statements,
	42	! kinds are defined in new module kinds,
	43	! old module precision_kind is removed,
	44	! revision history before 2012 removed,
	45	! comment fields (!:) to be used for variable explanations added to
	46	! all variable declaration statements
[1217]	47	!
[1319]	48	! 1318 2014-03-17 13:35:16Z raasch
	49	! module interfaces removed
	50	!
[1307]	51	! 1306 2014-03-13 14:30:59Z raasch
	52	! openmp sections removed from the overlap branch,
	53	! second argument removed from parameter list
	54	!
[1217]	55	! 1216 2013-08-26 09:31:42Z raasch
[1216]	56	! resorting of arrays moved to separate routines resort_for_...,
	57	! one argument, used as temporary work array, removed from all transpose
	58	! routines
	59	! overlapping fft / transposition implemented
[1112]	60	!
[1213]	61	! 1212 2013-08-15 08:46:27Z raasch
	62	! tridia routines moved to seperate module tridia_solver
	63	!
[1209]	64	! 1208 2013-08-13 06:41:49Z raasch
	65	! acc-update clauses added for "ar" so that ffts other than cufft can also be
	66	! used (although they are not ported and will give a poor performance)
	67	!
[1112]	68	! 1111 2013-03-08 23:54:10Z raasch
[1111]	69	! further openACC porting of non-parallel (MPI) branch:
	70	! tridiagonal routines split into extermal subroutines (instead using CONTAINS),
	71	! no distinction between parallel/non-parallel in poisfft and tridia any more,
[1112]	72	! tridia routines moved to end of file because of probable bug in PGI compiler 12.5
[1111]	73	! (otherwise "invalid device function" is indicated during runtime),
	74	! optimization of tridia routines: constant elements and coefficients of tri are
	75	! stored in seperate arrays ddzuw and tric, last dimension of tri reduced from 5
	76	! to 2,
	77	! poisfft_init is now called internally from poisfft, maketri is called from
	78	! poisfft_init,
	79	! ibc_p_b = 2 removed
[1]	80	!
[1107]	81	! 1106 2013-03-04 05:31:38Z raasch
	82	! routines fftx, ffty, fftxp, fftyp removed, calls replaced by fft_x, fft_y,
	83	! in the 1D-decomposition routines fft_x, ffty are replaced by fft_x_1d,
	84	! fft_y_1d
	85	!
[1104]	86	! 1103 2013-02-20 02:15:53Z raasch
	87	! tri, ar, and ar1 arguments in tridia-routines (2d) are removed because they
	88	! sometimes cause segmentation faults with intel 12.1 compiler
	89	!
[1093]	90	! 1092 2013-02-02 11:24:22Z raasch
	91	! unused variables removed
	92	!
[1037]	93	! 1036 2012-10-22 13:43:42Z raasch
	94	! code put under GPL (PALM 3.9)
	95	!
[1014]	96	! 2012-09-21 07:03:55Z raasch
	97	! FLOAT type conversion replaced by REAL
	98	!
[1004]	99	! 1003 2012-09-14 14:35:53Z raasch
	100	! indices nxa, nya, etc. replaced by nx, ny, etc.
	101	!
[941]	102	! 940 2012-07-09 14:31:00Z raasch
	103	! special handling of tri-array as an argument in tridia_1dd routines switched
	104	! off because it caused segmentation faults with intel 12.1 compiler
	105	!
[878]	106	! 877 2012-04-03 11:21:44Z suehring
	107	! Bugfix: Avoid divisions by zero in case of using a 'neumann' bc for the
	108	! pressure at the top of the model domain.
	109	!
[810]	110	! 809 2012-01-30 13:32:58Z maronga
	111	! Bugfix: replaced .AND. and .NOT. with && and ! in the preprocessor directives
	112	!
[808]	113	! 807 2012-01-25 11:53:51Z maronga
	114	! New cpp directive "__check" implemented which is used by check_namelist_files
	115	! (most of the code is unneeded by check_namelist_files).
	116	!
[1]	117	! Revision 1.1 1997/07/24 11:24:14 raasch
	118	! Initial revision
	119	!
	120	!
	121	! Description:
	122	! ------------
[1682]	123	!> Solves the Poisson equation with a 2D spectral method
	124	!> d^2 p / dx^2 + d^2 p / dy^2 + d^2 p / dz^2 = s
	125	!>
	126	!> Input:
	127	!> real ar contains (nnz,nny,nnx) elements of the velocity divergence,
	128	!> starting from (1,nys,nxl)
	129	!>
	130	!> Output:
	131	!> real ar contains the solution for perturbation pressure p
[1]	132	!------------------------------------------------------------------------------!
[1682]	133	MODULE poisfft_mod
	134
[1]	135
[1320]	136	USE fft_xy, &
	137	ONLY: fft_init, fft_y, fft_y_1d, fft_y_m, fft_x, fft_x_1d, fft_x_m
[1]	138
[1320]	139	USE indices, &
	140	ONLY: nnx, nny, nx, nxl, nxr, ny, nys, nyn, nz
	141
	142	USE transpose_indices, &
	143	ONLY: nxl_y, nxl_z, nxr_y, nxr_z, nys_x, nys_z, nyn_x, nyn_z, nzb_x, &
	144	nzb_y, nzt_x, nzt_y
	145
	146	USE tridia_solver, &
	147	ONLY: tridia_1dd, tridia_init, tridia_substi, tridia_substi_overlap
	148
[1]	149	IMPLICIT NONE
	150
[1111]	151	LOGICAL, SAVE :: poisfft_initialized = .FALSE.
	152
[1]	153	PRIVATE
[807]	154
[1]	155	PUBLIC poisfft, poisfft_init
	156
	157	INTERFACE poisfft
	158	MODULE PROCEDURE poisfft
	159	END INTERFACE poisfft
	160
	161	INTERFACE poisfft_init
	162	MODULE PROCEDURE poisfft_init
	163	END INTERFACE poisfft_init
	164
[807]	165
[1]	166	CONTAINS
	167
[1682]	168	!------------------------------------------------------------------------------!
	169	! Description:
	170	! ------------
	171	!> @todo Missing subroutine description.
	172	!------------------------------------------------------------------------------!
[1]	173	SUBROUTINE poisfft_init
	174
[1320]	175	USE arrays_3d, &
	176	ONLY: ddzu_pres, ddzw
[1111]	177
[1320]	178	USE kinds
	179
[1111]	180	IMPLICIT NONE
	181
[1682]	182	INTEGER(iwp) :: k !<
[1111]	183
	184
[1]	185	CALL fft_init
	186
[1212]	187	CALL tridia_init
[1111]	188
	189	poisfft_initialized = .TRUE.
	190
[1]	191	END SUBROUTINE poisfft_init
	192
[1111]	193
[1804]	194
[1682]	195	!------------------------------------------------------------------------------!
	196	! Description:
	197	! ------------
	198	!> Two-dimensional Fourier Transformation in x- and y-direction.
	199	!------------------------------------------------------------------------------!
[1306]	200	SUBROUTINE poisfft( ar )
[1]	201
[1320]	202	USE control_parameters, &
	203	ONLY: fft_method, transpose_compute_overlap
	204
	205	USE cpulog, &
	206	ONLY: cpu_log, cpu_log_nowait, log_point_s
	207
	208	USE kinds
	209
[1]	210	USE pegrid
	211
	212	IMPLICIT NONE
	213
[1682]	214	INTEGER(iwp) :: ii !<
	215	INTEGER(iwp) :: iind !<
	216	INTEGER(iwp) :: inew !<
	217	INTEGER(iwp) :: jj !<
	218	INTEGER(iwp) :: jind !<
	219	INTEGER(iwp) :: jnew !<
	220	INTEGER(iwp) :: ki !<
	221	INTEGER(iwp) :: kk !<
	222	INTEGER(iwp) :: knew !<
	223	INTEGER(iwp) :: n !<
	224	INTEGER(iwp) :: nblk !<
	225	INTEGER(iwp) :: nnx_y !<
	226	INTEGER(iwp) :: nny_z !<
	227	INTEGER(iwp) :: nnz_t !<
	228	INTEGER(iwp) :: nnz_x !<
	229	INTEGER(iwp) :: nxl_y_bound !<
	230	INTEGER(iwp) :: nxr_y_bound !<
[1]	231
[1682]	232	INTEGER(iwp), DIMENSION(4) :: isave !<
[1320]	233
[1682]	234	REAL(wp), DIMENSION(1:nz,nys:nyn,nxl:nxr) :: ar !<
	235	REAL(wp), DIMENSION(nys:nyn,nxl:nxr,1:nz) :: ar_inv !<
[1216]	236	!$acc declare create( ar_inv )
[1]	237
[1682]	238	REAL(wp), DIMENSION(:,:,:), ALLOCATABLE :: ar1 !<
	239	REAL(wp), DIMENSION(:,:,:), ALLOCATABLE :: f_in !<
	240	REAL(wp), DIMENSION(:,:,:), ALLOCATABLE :: f_inv !<
	241	REAL(wp), DIMENSION(:,:,:), ALLOCATABLE :: f_out_y !<
	242	REAL(wp), DIMENSION(:,:,:), ALLOCATABLE :: f_out_z !<
[1216]	243
	244
[1]	245	CALL cpu_log( log_point_s(3), 'poisfft', 'start' )
	246
[1111]	247	IF ( .NOT. poisfft_initialized ) CALL poisfft_init
	248
[1]	249	!
	250	!-- Two-dimensional Fourier Transformation in x- and y-direction.
[1482]	251	IF ( pdims(2) == 1 .AND. pdims(1) > 1 .AND. num_acc_per_node == 0 ) &
	252	THEN
[1]	253
	254	!
	255	!-- 1d-domain-decomposition along x:
	256	!-- FFT along y and transposition y --> x
[1216]	257	CALL ffty_tr_yx( ar, ar )
[1]	258
	259	!
	260	!-- FFT along x, solving the tridiagonal system and backward FFT
	261	CALL fftx_tri_fftx( ar )
	262
	263	!
	264	!-- Transposition x --> y and backward FFT along y
[1216]	265	CALL tr_xy_ffty( ar, ar )
[1]	266
[1482]	267	ELSEIF ( pdims(1) == 1 .AND. pdims(2) > 1 .AND. num_acc_per_node == 0 ) &
	268	THEN
[1]	269
	270	!
	271	!-- 1d-domain-decomposition along y:
	272	!-- FFT along x and transposition x --> y
[1216]	273	CALL fftx_tr_xy( ar, ar )
[1]	274
	275	!
	276	!-- FFT along y, solving the tridiagonal system and backward FFT
	277	CALL ffty_tri_ffty( ar )
	278
	279	!
	280	!-- Transposition y --> x and backward FFT along x
[1216]	281	CALL tr_yx_fftx( ar, ar )
[1]	282
[1216]	283	ELSEIF ( .NOT. transpose_compute_overlap ) THEN
[1]	284
	285	!
[1111]	286	!-- 2d-domain-decomposition or no decomposition (1 PE run)
[1]	287	!-- Transposition z --> x
	288	CALL cpu_log( log_point_s(5), 'transpo forward', 'start' )
[1216]	289	CALL resort_for_zx( ar, ar_inv )
	290	CALL transpose_zx( ar_inv, ar )
[1]	291	CALL cpu_log( log_point_s(5), 'transpo forward', 'pause' )
	292
	293	CALL cpu_log( log_point_s(4), 'fft_x', 'start' )
[1208]	294	IF ( fft_method /= 'system-specific' ) THEN
	295	!$acc update host( ar )
	296	ENDIF
[1106]	297	CALL fft_x( ar, 'forward' )
[1208]	298	IF ( fft_method /= 'system-specific' ) THEN
	299	!$acc update device( ar )
	300	ENDIF
[1]	301	CALL cpu_log( log_point_s(4), 'fft_x', 'pause' )
	302
	303	!
	304	!-- Transposition x --> y
	305	CALL cpu_log( log_point_s(5), 'transpo forward', 'continue' )
[1216]	306	CALL resort_for_xy( ar, ar_inv )
	307	CALL transpose_xy( ar_inv, ar )
[1]	308	CALL cpu_log( log_point_s(5), 'transpo forward', 'pause' )
	309
	310	CALL cpu_log( log_point_s(7), 'fft_y', 'start' )
[1208]	311	IF ( fft_method /= 'system-specific' ) THEN
	312	!$acc update host( ar )
	313	ENDIF
[1216]	314	CALL fft_y( ar, 'forward', ar_tr = ar, &
	315	nxl_y_bound = nxl_y, nxr_y_bound = nxr_y, &
	316	nxl_y_l = nxl_y, nxr_y_l = nxr_y )
[1208]	317	IF ( fft_method /= 'system-specific' ) THEN
	318	!$acc update device( ar )
	319	ENDIF
[1]	320	CALL cpu_log( log_point_s(7), 'fft_y', 'pause' )
	321
	322	!
	323	!-- Transposition y --> z
	324	CALL cpu_log( log_point_s(5), 'transpo forward', 'continue' )
[1216]	325	CALL resort_for_yz( ar, ar_inv )
	326	CALL transpose_yz( ar_inv, ar )
[1]	327	CALL cpu_log( log_point_s(5), 'transpo forward', 'stop' )
	328
	329	!
[1106]	330	!-- Solve the tridiagonal equation system along z
[1]	331	CALL cpu_log( log_point_s(6), 'tridia', 'start' )
[1212]	332	CALL tridia_substi( ar )
[1]	333	CALL cpu_log( log_point_s(6), 'tridia', 'stop' )
	334
	335	!
	336	!-- Inverse Fourier Transformation
	337	!-- Transposition z --> y
	338	CALL cpu_log( log_point_s(8), 'transpo invers', 'start' )
[1216]	339	CALL transpose_zy( ar, ar_inv )
	340	CALL resort_for_zy( ar_inv, ar )
[1]	341	CALL cpu_log( log_point_s(8), 'transpo invers', 'pause' )
	342
	343	CALL cpu_log( log_point_s(7), 'fft_y', 'continue' )
[1208]	344	IF ( fft_method /= 'system-specific' ) THEN
	345	!$acc update host( ar )
	346	ENDIF
[1216]	347	CALL fft_y( ar, 'backward', ar_tr = ar, &
	348	nxl_y_bound = nxl_y, nxr_y_bound = nxr_y, &
	349	nxl_y_l = nxl_y, nxr_y_l = nxr_y )
[1208]	350	IF ( fft_method /= 'system-specific' ) THEN
	351	!$acc update device( ar )
	352	ENDIF
[1]	353	CALL cpu_log( log_point_s(7), 'fft_y', 'stop' )
	354
	355	!
	356	!-- Transposition y --> x
	357	CALL cpu_log( log_point_s(8), 'transpo invers', 'continue' )
[1216]	358	CALL transpose_yx( ar, ar_inv )
	359	CALL resort_for_yx( ar_inv, ar )
[1]	360	CALL cpu_log( log_point_s(8), 'transpo invers', 'pause' )
	361
	362	CALL cpu_log( log_point_s(4), 'fft_x', 'continue' )
[1208]	363	IF ( fft_method /= 'system-specific' ) THEN
	364	!$acc update host( ar )
	365	ENDIF
[1106]	366	CALL fft_x( ar, 'backward' )
[1208]	367	IF ( fft_method /= 'system-specific' ) THEN
	368	!$acc update device( ar )
	369	ENDIF
[1]	370	CALL cpu_log( log_point_s(4), 'fft_x', 'stop' )
	371
	372	!
	373	!-- Transposition x --> z
	374	CALL cpu_log( log_point_s(8), 'transpo invers', 'continue' )
[1216]	375	CALL transpose_xz( ar, ar_inv )
	376	CALL resort_for_xz( ar_inv, ar )
[1]	377	CALL cpu_log( log_point_s(8), 'transpo invers', 'stop' )
	378
[1216]	379	ELSE
	380
	381	!
	382	!-- 2d-domain-decomposition or no decomposition (1 PE run) with
	383	!-- overlapping transposition / fft
[1318]	384	!-- cputime logging must not use barriers, which would prevent overlapping
[1216]	385	ALLOCATE( f_out_y(0:ny,nxl_y:nxr_y,nzb_y:nzt_y), &
	386	f_out_z(0:nx,nys_x:nyn_x,nzb_x:nzt_x) )
	387	!
	388	!-- Transposition z --> x + subsequent fft along x
	389	ALLOCATE( f_inv(nys:nyn,nxl:nxr,1:nz) )
	390	CALL resort_for_zx( ar, f_inv )
	391	!
	392	!-- Save original indices and gridpoint counter
	393	isave(1) = nz
	394	isave(2) = nzb_x
	395	isave(3) = nzt_x
	396	isave(4) = sendrecvcount_zx
	397	!
	398	!-- Set new indices for transformation
	399	nblk = nz / pdims(1)
	400	nz = pdims(1)
	401	nnz_x = 1
	402	nzb_x = 1 + myidx * nnz_x
	403	nzt_x = ( myidx + 1 ) * nnz_x
	404	sendrecvcount_zx = nnx * nny * nnz_x
	405
[1306]	406	ALLOCATE( ar1(0:nx,nys_x:nyn_x,nzb_x:nzt_x) )
[1216]	407	ALLOCATE( f_in(nys:nyn,nxl:nxr,1:nz) )
	408
[1306]	409	DO kk = 1, nblk
[1216]	410
[1306]	411	IF ( kk == 1 ) THEN
[1318]	412	CALL cpu_log( log_point_s(5), 'transpo forward', 'start', cpu_log_nowait )
[1306]	413	ELSE
[1318]	414	CALL cpu_log( log_point_s(5), 'transpo forward', 'continue', cpu_log_nowait )
[1306]	415	ENDIF
[1216]	416
[1306]	417	DO knew = 1, nz
	418	ki = kk + nblk * ( knew - 1 )
	419	f_in(:,:,knew) = f_inv(:,:,ki)
	420	ENDDO
[1216]	421
[1306]	422	CALL transpose_zx( f_in, ar1(:,:,:))
	423	CALL cpu_log( log_point_s(5), 'transpo forward', 'pause' )
[1216]	424
[1306]	425	IF ( kk == 1 ) THEN
[1318]	426	CALL cpu_log( log_point_s(4), 'fft_x', 'start', cpu_log_nowait )
[1306]	427	ELSE
[1318]	428	CALL cpu_log( log_point_s(4), 'fft_x', 'continue', cpu_log_nowait )
[1216]	429	ENDIF
	430
[1306]	431	n = isave(2) + kk - 1
	432	CALL fft_x( ar1(:,:,:), 'forward', ar_2d = f_out_z(:,:,n))
	433	CALL cpu_log( log_point_s(4), 'fft_x', 'pause' )
[1216]	434
	435	ENDDO
	436	!
	437	!-- Restore original indices/counters
	438	nz = isave(1)
	439	nzb_x = isave(2)
	440	nzt_x = isave(3)
	441	sendrecvcount_zx = isave(4)
	442
	443	DEALLOCATE( ar1, f_in, f_inv )
	444
	445	!
	446	!-- Transposition x --> y + subsequent fft along y
	447	ALLOCATE( f_inv(nys_x:nyn_x,nzb_x:nzt_x,0:nx) )
	448	CALL resort_for_xy( f_out_z, f_inv )
	449	!
	450	!-- Save original indices and gridpoint counter
	451	isave(1) = nx
	452	isave(2) = nxl_y
	453	isave(3) = nxr_y
	454	isave(4) = sendrecvcount_xy
	455	!
	456	!-- Set new indices for transformation
	457	nblk = ( ( nx+1 ) / pdims(2) ) - 1
	458	nx = pdims(2)
	459	nnx_y = 1
	460	nxl_y = myidy * nnx_y
	461	nxr_y = ( myidy + 1 ) * nnx_y - 1
	462	sendrecvcount_xy = nnx_y * ( nyn_x-nys_x+1 ) * ( nzt_x-nzb_x+1 )
	463
[1306]	464	ALLOCATE( ar1(0:ny,nxl_y:nxr_y,nzb_y:nzt_y) )
[1216]	465	ALLOCATE( f_in(nys_x:nyn_x,nzb_x:nzt_x,0:nx) )
	466
[1306]	467	DO ii = 0, nblk
[1216]	468
[1318]	469	CALL cpu_log( log_point_s(5), 'transpo forward', 'continue', cpu_log_nowait )
[1216]	470
[1306]	471	DO inew = 0, nx-1
	472	iind = ii + ( nblk + 1 ) * inew
	473	f_in(:,:,inew) = f_inv(:,:,iind)
	474	ENDDO
[1216]	475
[1306]	476	CALL transpose_xy( f_in, ar1(:,:,:) )
[1216]	477
[1306]	478	CALL cpu_log( log_point_s(5), 'transpo forward', 'pause' )
[1216]	479
[1306]	480	IF ( ii == 1 ) THEN
[1318]	481	CALL cpu_log( log_point_s(7), 'fft_y', 'start', cpu_log_nowait )
[1306]	482	ELSE
[1318]	483	CALL cpu_log( log_point_s(7), 'fft_y', 'continue', cpu_log_nowait )
[1216]	484	ENDIF
	485
[1306]	486	nxl_y_bound = isave(2)
	487	nxr_y_bound = isave(3)
	488	n = isave(2) + ii
	489	CALL fft_y( ar1(:,:,:), 'forward', ar_tr = f_out_y, &
	490	nxl_y_bound = nxl_y_bound, nxr_y_bound = nxr_y_bound, &
	491	nxl_y_l = n, nxr_y_l = n )
[1216]	492
[1306]	493	CALL cpu_log( log_point_s(7), 'fft_y', 'pause' )
[1216]	494
	495	ENDDO
	496	!
	497	!-- Restore original indices/counters
	498	nx = isave(1)
	499	nxl_y = isave(2)
	500	nxr_y = isave(3)
	501	sendrecvcount_xy = isave(4)
	502
	503	DEALLOCATE( ar1, f_in, f_inv )
	504
	505	!
	506	!-- Transposition y --> z + subsequent tridia + resort for z --> y
	507	ALLOCATE( f_inv(nxl_y:nxr_y,nzb_y:nzt_y,0:ny) )
	508	CALL resort_for_yz( f_out_y, f_inv )
	509	!
	510	!-- Save original indices and gridpoint counter
	511	isave(1) = ny
	512	isave(2) = nys_z
	513	isave(3) = nyn_z
	514	isave(4) = sendrecvcount_yz
	515	!
	516	!-- Set new indices for transformation
	517	nblk = ( ( ny+1 ) / pdims(1) ) - 1
	518	ny = pdims(1)
	519	nny_z = 1
	520	nys_z = myidx * nny_z
	521	nyn_z = ( myidx + 1 ) * nny_z - 1
	522	sendrecvcount_yz = ( nxr_y-nxl_y+1 ) * nny_z * ( nzt_y-nzb_y+1 )
	523
[1306]	524	ALLOCATE( ar1(nxl_z:nxr_z,nys_z:nyn_z,1:nz) )
[1216]	525	ALLOCATE( f_in(nxl_y:nxr_y,nzb_y:nzt_y,0:ny) )
	526
[1306]	527	DO jj = 0, nblk
[1216]	528	!
[1306]	529	!-- Forward Fourier Transformation
	530	!-- Transposition y --> z
[1318]	531	CALL cpu_log( log_point_s(5), 'transpo forward', 'continue', cpu_log_nowait )
[1216]	532
[1306]	533	DO jnew = 0, ny-1
	534	jind = jj + ( nblk + 1 ) * jnew
	535	f_in(:,:,jnew) = f_inv(:,:,jind)
	536	ENDDO
[1216]	537
[1306]	538	CALL transpose_yz( f_in, ar1(:,:,:) )
[1216]	539
[1306]	540	IF ( jj == nblk ) THEN
	541	CALL cpu_log( log_point_s(5), 'transpo forward', 'stop' )
	542	ELSE
	543	CALL cpu_log( log_point_s(5), 'transpo forward', 'pause' )
[1216]	544	ENDIF
	545
	546	!
[1306]	547	!-- Solve the tridiagonal equation system along z
[1318]	548	CALL cpu_log( log_point_s(6), 'tridia', 'start', cpu_log_nowait )
[1216]	549
[1306]	550	n = isave(2) + jj
	551	CALL tridia_substi_overlap( ar1(:,:,:), n )
[1216]	552
[1306]	553	CALL cpu_log( log_point_s(6), 'tridia', 'stop' )
[1216]	554
[1306]	555	!
	556	!-- Inverse Fourier Transformation
	557	!-- Transposition z --> y
	558	!-- Only one thread should call MPI routines, therefore forward and
	559	!-- backward tranpose are in the same section
	560	IF ( jj == 0 ) THEN
[1318]	561	CALL cpu_log( log_point_s(8), 'transpo invers', 'start', cpu_log_nowait )
[1306]	562	ELSE
[1318]	563	CALL cpu_log( log_point_s(8), 'transpo invers', 'continue', cpu_log_nowait )
[1216]	564	ENDIF
	565
[1306]	566	CALL transpose_zy( ar1(:,:,:), f_in )
[1216]	567
[1306]	568	DO jnew = 0, ny-1
	569	jind = jj + ( nblk + 1 ) * jnew
	570	f_inv(:,:,jind) = f_in(:,:,jnew)
	571	ENDDO
[1216]	572
[1306]	573	CALL cpu_log( log_point_s(8), 'transpo invers', 'pause' )
[1216]	574
	575	ENDDO
	576	!
	577	!-- Restore original indices/counters
	578	ny = isave(1)
	579	nys_z = isave(2)
	580	nyn_z = isave(3)
	581	sendrecvcount_yz = isave(4)
	582
	583	CALL resort_for_zy( f_inv, f_out_y )
	584
	585	DEALLOCATE( ar1, f_in, f_inv )
	586
	587	!
	588	!-- fft along y backward + subsequent transposition y --> x
	589	ALLOCATE( f_inv(nys_x:nyn_x,nzb_x:nzt_x,0:nx) )
	590	!
	591	!-- Save original indices and gridpoint counter
	592	isave(1) = nx
	593	isave(2) = nxl_y
	594	isave(3) = nxr_y
	595	isave(4) = sendrecvcount_xy
	596	!
	597	!-- Set new indices for transformation
	598	nblk = (( nx+1 ) / pdims(2) ) - 1
	599	nx = pdims(2)
	600	nnx_y = 1
	601	nxl_y = myidy * nnx_y
	602	nxr_y = ( myidy + 1 ) * nnx_y - 1
	603	sendrecvcount_xy = nnx_y * ( nyn_x-nys_x+1 ) * ( nzt_x-nzb_x+1 )
	604
[1306]	605	ALLOCATE( ar1(0:ny,nxl_y:nxr_y,nzb_y:nzt_y) )
[1216]	606	ALLOCATE( f_in(nys_x:nyn_x,nzb_x:nzt_x,0:nx) )
	607
[1306]	608	DO ii = 0, nblk
[1216]	609
[1318]	610	CALL cpu_log( log_point_s(7), 'fft_y', 'continue', cpu_log_nowait )
[1216]	611
[1306]	612	n = isave(2) + ii
	613	nxl_y_bound = isave(2)
	614	nxr_y_bound = isave(3)
[1216]	615
[1306]	616	CALL fft_y( ar1(:,:,:), 'backward', ar_tr = f_out_y, &
	617	nxl_y_bound = nxl_y_bound, nxr_y_bound = nxr_y_bound, &
	618	nxl_y_l = n, nxr_y_l = n )
[1216]	619
[1306]	620	IF ( ii == nblk ) THEN
	621	CALL cpu_log( log_point_s(7), 'fft_y', 'stop' )
	622	ELSE
	623	CALL cpu_log( log_point_s(7), 'fft_y', 'pause' )
[1216]	624	ENDIF
	625
[1318]	626	CALL cpu_log( log_point_s(8), 'transpo invers', 'continue', cpu_log_nowait )
[1216]	627
[1306]	628	CALL transpose_yx( ar1(:,:,:), f_in )
[1216]	629
[1306]	630	DO inew = 0, nx-1
	631	iind = ii + (nblk+1) * inew
	632	f_inv(:,:,iind) = f_in(:,:,inew)
	633	ENDDO
[1216]	634
[1306]	635	CALL cpu_log( log_point_s(8), 'transpo invers', 'pause' )
[1216]	636
	637	ENDDO
	638	!
	639	!-- Restore original indices/counters
	640	nx = isave(1)
	641	nxl_y = isave(2)
	642	nxr_y = isave(3)
	643	sendrecvcount_xy = isave(4)
	644
	645	CALL resort_for_yx( f_inv, f_out_z )
	646
	647	DEALLOCATE( ar1, f_in, f_inv )
	648
	649	!
	650	!-- fft along x backward + subsequent final transposition x --> z
	651	ALLOCATE( f_inv(nys:nyn,nxl:nxr,1:nz) )
	652	!
	653	!-- Save original indices and gridpoint counter
	654	isave(1) = nz
	655	isave(2) = nzb_x
	656	isave(3) = nzt_x
	657	isave(4) = sendrecvcount_zx
	658	!
	659	!-- Set new indices for transformation
	660	nblk = nz / pdims(1)
	661	nz = pdims(1)
	662	nnz_x = 1
	663	nzb_x = 1 + myidx * nnz_x
	664	nzt_x = ( myidx + 1 ) * nnz_x
	665	sendrecvcount_zx = nnx * nny * nnz_x
	666
[1306]	667	ALLOCATE( ar1(0:nx,nys_x:nyn_x,nzb_x:nzt_x) )
[1216]	668	ALLOCATE( f_in(nys:nyn,nxl:nxr,1:nz) )
	669
[1306]	670	DO kk = 1, nblk
[1216]	671
[1318]	672	CALL cpu_log( log_point_s(4), 'fft_x', 'continue', cpu_log_nowait )
[1216]	673
[1306]	674	n = isave(2) + kk - 1
	675	CALL fft_x( ar1(:,:,:), 'backward', f_out_z(:,:,n))
[1216]	676
[1306]	677	IF ( kk == nblk ) THEN
	678	CALL cpu_log( log_point_s(4), 'fft_x', 'stop' )
	679	ELSE
	680	CALL cpu_log( log_point_s(4), 'fft_x', 'pause' )
[1216]	681	ENDIF
	682
[1318]	683	CALL cpu_log( log_point_s(8), 'transpo invers', 'continue', cpu_log_nowait )
[1216]	684
[1306]	685	CALL transpose_xz( ar1(:,:,:), f_in )
[1216]	686
[1306]	687	DO knew = 1, nz
	688	ki = kk + nblk * (knew-1)
	689	f_inv(:,:,ki) = f_in(:,:,knew)
	690	ENDDO
[1216]	691
[1306]	692	IF ( kk == nblk ) THEN
	693	CALL cpu_log( log_point_s(8), 'transpo invers', 'stop' )
	694	ELSE
	695	CALL cpu_log( log_point_s(8), 'transpo invers', 'pause' )
[1216]	696	ENDIF
	697
	698	ENDDO
	699	!
	700	!-- Restore original indices/counters
	701	nz = isave(1)
	702	nzb_x = isave(2)
	703	nzt_x = isave(3)
	704	sendrecvcount_zx = isave(4)
	705
	706	CALL resort_for_xz( f_inv, ar )
	707
	708	DEALLOCATE( ar1, f_in, f_inv )
	709
[1]	710	ENDIF
	711
	712	CALL cpu_log( log_point_s(3), 'poisfft', 'stop' )
	713
	714	END SUBROUTINE poisfft
	715
	716
	717	!------------------------------------------------------------------------------!
[1682]	718	! Description:
	719	! ------------
	720	!> Fourier-transformation along y with subsequent transposition y --> x for
	721	!> a 1d-decomposition along x.
	722	!>
	723	!> @attention The performance of this routine is much faster on the NEC-SX6,
	724	!> if the first index of work_ffty_vec is odd. Otherwise
	725	!> memory bank conflicts may occur (especially if the index is a
	726	!> multiple of 128). That's why work_ffty_vec is dimensioned as
	727	!> 0:ny+1.
	728	!> Of course, this will not work if users are using an odd number
	729	!> of gridpoints along y.
[1]	730	!------------------------------------------------------------------------------!
[1682]	731	SUBROUTINE ffty_tr_yx( f_in, f_out )
[1]	732
[1682]	733
[1320]	734	USE control_parameters, &
	735	ONLY: host
	736
	737	USE cpulog, &
	738	ONLY: cpu_log, log_point_s
	739
	740	USE kinds
	741
[1]	742	USE pegrid
	743
	744	IMPLICIT NONE
	745
[1682]	746	INTEGER(iwp) :: i !<
	747	INTEGER(iwp) :: iend !<
	748	INTEGER(iwp) :: iouter !<
	749	INTEGER(iwp) :: ir !<
	750	INTEGER(iwp) :: j !<
	751	INTEGER(iwp) :: k !<
[1]	752
[1682]	753	INTEGER(iwp), PARAMETER :: stridex = 4 !<
[1320]	754
[1682]	755	REAL(wp), DIMENSION(0:ny,stridex) :: work_ffty !<
[1]	756	#if defined( __nec )
[1682]	757	REAL(wp), DIMENSION(0:ny+1,1:nz,nxl:nxr) :: work_ffty_vec !<
[1]	758	#endif
[1682]	759	REAL(wp), DIMENSION(1:nz,0:ny,nxl:nxr) :: f_in !<
	760	REAL(wp), DIMENSION(nnx,1:nz,nys_x:nyn_x,pdims(1)) :: f_out !<
	761	REAL(wp), DIMENSION(nxl:nxr,1:nz,0:ny) :: work !<
[1]	762
	763	!
	764	!-- Carry out the FFT along y, where all data are present due to the
	765	!-- 1d-decomposition along x. Resort the data in a way that x becomes
	766	!-- the first index.
[1106]	767	CALL cpu_log( log_point_s(7), 'fft_y_1d', 'start' )
[1]	768
	769	IF ( host(1:3) == 'nec' ) THEN
	770	#if defined( __nec )
	771	!
	772	!-- Code optimized for vector processors
[85]	773	!$OMP PARALLEL PRIVATE ( i, j, k )
[1]	774	!$OMP DO
	775	DO i = nxl, nxr
	776
	777	DO j = 0, ny
	778	DO k = 1, nz
	779	work_ffty_vec(j,k,i) = f_in(k,j,i)
	780	ENDDO
	781	ENDDO
	782
	783	CALL fft_y_m( work_ffty_vec(:,:,i), ny+1, 'forward' )
	784
	785	ENDDO
	786
	787	!$OMP DO
	788	DO k = 1, nz
	789	DO j = 0, ny
	790	DO i = nxl, nxr
	791	work(i,k,j) = work_ffty_vec(j,k,i)
	792	ENDDO
	793	ENDDO
	794	ENDDO
	795	!$OMP END PARALLEL
	796	#endif
	797
	798	ELSE
	799
	800	!
	801	!-- Cache optimized code.
	802	!-- The i-(x-)direction is split into a strided outer loop and an inner
	803	!-- loop for better cache performance
	804	!$OMP PARALLEL PRIVATE (i,iend,iouter,ir,j,k,work_ffty)
	805	!$OMP DO
	806	DO iouter = nxl, nxr, stridex
	807
	808	iend = MIN( iouter+stridex-1, nxr ) ! Upper bound for inner i loop
	809
	810	DO k = 1, nz
	811
	812	DO i = iouter, iend
	813
	814	ir = i-iouter+1 ! counter within a stride
	815	DO j = 0, ny
	816	work_ffty(j,ir) = f_in(k,j,i)
	817	ENDDO
	818	!
	819	!-- FFT along y
[1106]	820	CALL fft_y_1d( work_ffty(:,ir), 'forward' )
[1]	821
	822	ENDDO
	823
	824	!
	825	!-- Resort
	826	DO j = 0, ny
	827	DO i = iouter, iend
	828	work(i,k,j) = work_ffty(j,i-iouter+1)
	829	ENDDO
	830	ENDDO
	831
	832	ENDDO
	833
	834	ENDDO
	835	!$OMP END PARALLEL
	836
	837	ENDIF
[1106]	838	CALL cpu_log( log_point_s(7), 'fft_y_1d', 'pause' )
[1]	839
	840	!
	841	!-- Transpose array
[1111]	842	#if defined( __parallel )
[1]	843	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
[622]	844	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
[1]	845	CALL MPI_ALLTOALL( work(nxl,1,0), sendrecvcount_xy, MPI_REAL, &
	846	f_out(1,1,nys_x,1), sendrecvcount_xy, MPI_REAL, &
	847	comm1dx, ierr )
	848	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
[1111]	849	#endif
[1]	850
	851	END SUBROUTINE ffty_tr_yx
	852
	853
	854	!------------------------------------------------------------------------------!
[1682]	855	! Description:
	856	! ------------
	857	!> Transposition x --> y with a subsequent backward Fourier transformation for
	858	!> a 1d-decomposition along x
[1]	859	!------------------------------------------------------------------------------!
[1682]	860	SUBROUTINE tr_xy_ffty( f_in, f_out )
[1]	861
[1682]	862
[1320]	863	USE control_parameters, &
	864	ONLY: host
	865
	866	USE cpulog, &
	867	ONLY: cpu_log, log_point_s
	868
	869	USE kinds
	870
[1]	871	USE pegrid
	872
	873	IMPLICIT NONE
	874
[1682]	875	INTEGER(iwp) :: i !<
	876	INTEGER(iwp) :: iend !<
	877	INTEGER(iwp) :: iouter !<
	878	INTEGER(iwp) :: ir !<
	879	INTEGER(iwp) :: j !<
	880	INTEGER(iwp) :: k !<
[1]	881
[1682]	882	INTEGER(iwp), PARAMETER :: stridex = 4 !<
[1320]	883
[1682]	884	REAL(wp), DIMENSION(0:ny,stridex) :: work_ffty !<
[1]	885	#if defined( __nec )
[1682]	886	REAL(wp), DIMENSION(0:ny+1,1:nz,nxl:nxr) :: work_ffty_vec !<
[1]	887	#endif
[1682]	888	REAL(wp), DIMENSION(nnx,1:nz,nys_x:nyn_x,pdims(1)) :: f_in !<
	889	REAL(wp), DIMENSION(1:nz,0:ny,nxl:nxr) :: f_out !<
	890	REAL(wp), DIMENSION(nxl:nxr,1:nz,0:ny) :: work !<
[1]	891
	892	!
	893	!-- Transpose array
[1111]	894	#if defined( __parallel )
[1]	895	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
[622]	896	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
[1]	897	CALL MPI_ALLTOALL( f_in(1,1,nys_x,1), sendrecvcount_xy, MPI_REAL, &
	898	work(nxl,1,0), sendrecvcount_xy, MPI_REAL, &
	899	comm1dx, ierr )
	900	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
[1111]	901	#endif
[1]	902
	903	!
	904	!-- Resort the data in a way that y becomes the first index and carry out the
	905	!-- backward fft along y.
[1106]	906	CALL cpu_log( log_point_s(7), 'fft_y_1d', 'continue' )
[1]	907
	908	IF ( host(1:3) == 'nec' ) THEN
	909	#if defined( __nec )
	910	!
	911	!-- Code optimized for vector processors
[85]	912	!$OMP PARALLEL PRIVATE ( i, j, k )
[1]	913	!$OMP DO
	914	DO k = 1, nz
	915	DO j = 0, ny
	916	DO i = nxl, nxr
	917	work_ffty_vec(j,k,i) = work(i,k,j)
	918	ENDDO
	919	ENDDO
	920	ENDDO
	921
	922	!$OMP DO
	923	DO i = nxl, nxr
	924
	925	CALL fft_y_m( work_ffty_vec(:,:,i), ny+1, 'backward' )
	926
	927	DO j = 0, ny
	928	DO k = 1, nz
	929	f_out(k,j,i) = work_ffty_vec(j,k,i)
	930	ENDDO
	931	ENDDO
	932
	933	ENDDO
	934	!$OMP END PARALLEL
	935	#endif
	936
	937	ELSE
	938
	939	!
	940	!-- Cache optimized code.
	941	!-- The i-(x-)direction is split into a strided outer loop and an inner
	942	!-- loop for better cache performance
	943	!$OMP PARALLEL PRIVATE ( i, iend, iouter, ir, j, k, work_ffty )
	944	!$OMP DO
	945	DO iouter = nxl, nxr, stridex
	946
	947	iend = MIN( iouter+stridex-1, nxr ) ! Upper bound for inner i loop
	948
	949	DO k = 1, nz
	950	!
	951	!-- Resort
	952	DO j = 0, ny
	953	DO i = iouter, iend
	954	work_ffty(j,i-iouter+1) = work(i,k,j)
	955	ENDDO
	956	ENDDO
	957
	958	DO i = iouter, iend
	959
	960	!
	961	!-- FFT along y
	962	ir = i-iouter+1 ! counter within a stride
[1106]	963	CALL fft_y_1d( work_ffty(:,ir), 'backward' )
[1]	964
	965	DO j = 0, ny
	966	f_out(k,j,i) = work_ffty(j,ir)
	967	ENDDO
	968	ENDDO
	969
	970	ENDDO
	971
	972	ENDDO
	973	!$OMP END PARALLEL
	974
	975	ENDIF
	976
[1106]	977	CALL cpu_log( log_point_s(7), 'fft_y_1d', 'stop' )
[1]	978
	979	END SUBROUTINE tr_xy_ffty
	980
	981
	982	!------------------------------------------------------------------------------!
[1682]	983	! Description:
	984	! ------------
	985	!> FFT along x, solution of the tridiagonal system and backward FFT for
	986	!> a 1d-decomposition along x
	987	!>
	988	!> @warning this subroutine may still not work for hybrid parallelization
	989	!> with OpenMP (for possible necessary changes see the original
	990	!> routine poisfft_hybrid, developed by Klaus Ketelsen, May 2002)
[1]	991	!------------------------------------------------------------------------------!
[1682]	992	SUBROUTINE fftx_tri_fftx( ar )
[1]	993
[1682]	994
[1320]	995	USE control_parameters, &
	996	ONLY: host
	997
	998	USE cpulog, &
	999	ONLY: cpu_log, log_point_s
	1000
	1001	USE grid_variables, &
	1002	ONLY: ddx2, ddy2
	1003
	1004	USE kinds
	1005
[1]	1006	USE pegrid
	1007
	1008	IMPLICIT NONE
	1009
[1682]	1010	INTEGER(iwp) :: i !<
	1011	INTEGER(iwp) :: j !<
	1012	INTEGER(iwp) :: k !<
	1013	INTEGER(iwp) :: m !<
	1014	INTEGER(iwp) :: n !<
	1015	INTEGER(iwp) :: omp_get_thread_num !<
	1016	INTEGER(iwp) :: tn !<
[1]	1017
[1682]	1018	REAL(wp), DIMENSION(0:nx) :: work_fftx !<
	1019	REAL(wp), DIMENSION(0:nx,1:nz) :: work_trix !<
	1020	REAL(wp), DIMENSION(nnx,1:nz,nys_x:nyn_x,pdims(1)) :: ar !<
	1021	REAL(wp), DIMENSION(:,:,:,:), ALLOCATABLE :: tri !<
[1]	1022
	1023
[1106]	1024	CALL cpu_log( log_point_s(33), 'fft_x_1d + tridia', 'start' )
[1]	1025
	1026	ALLOCATE( tri(5,0:nx,0:nz-1,0:threads_per_task-1) )
	1027
	1028	tn = 0 ! Default thread number in case of one thread
	1029	!$OMP PARALLEL DO PRIVATE ( i, j, k, m, n, tn, work_fftx, work_trix )
	1030	DO j = nys_x, nyn_x
	1031
	1032	!$ tn = omp_get_thread_num()
	1033
	1034	IF ( host(1:3) == 'nec' ) THEN
	1035	!
	1036	!-- Code optimized for vector processors
	1037	DO k = 1, nz
	1038
	1039	m = 0
	1040	DO n = 1, pdims(1)
[1003]	1041	DO i = 1, nnx
[1]	1042	work_trix(m,k) = ar(i,k,j,n)
	1043	m = m + 1
	1044	ENDDO
	1045	ENDDO
	1046
	1047	ENDDO
	1048
	1049	CALL fft_x_m( work_trix, 'forward' )
	1050
	1051	ELSE
	1052	!
	1053	!-- Cache optimized code
	1054	DO k = 1, nz
	1055
	1056	m = 0
	1057	DO n = 1, pdims(1)
[1003]	1058	DO i = 1, nnx
[1]	1059	work_fftx(m) = ar(i,k,j,n)
	1060	m = m + 1
	1061	ENDDO
	1062	ENDDO
	1063
[1106]	1064	CALL fft_x_1d( work_fftx, 'forward' )
[1]	1065
	1066	DO i = 0, nx
	1067	work_trix(i,k) = work_fftx(i)
	1068	ENDDO
	1069
	1070	ENDDO
	1071
	1072	ENDIF
	1073
	1074	!
	1075	!-- Solve the linear equation system
	1076	CALL tridia_1dd( ddx2, ddy2, nx, ny, j, work_trix, tri(:,:,:,tn) )
	1077
	1078	IF ( host(1:3) == 'nec' ) THEN
	1079	!
	1080	!-- Code optimized for vector processors
	1081	CALL fft_x_m( work_trix, 'backward' )
	1082
	1083	DO k = 1, nz
	1084
	1085	m = 0
	1086	DO n = 1, pdims(1)
[1003]	1087	DO i = 1, nnx
[1]	1088	ar(i,k,j,n) = work_trix(m,k)
	1089	m = m + 1
	1090	ENDDO
	1091	ENDDO
	1092
	1093	ENDDO
	1094
	1095	ELSE
	1096	!
	1097	!-- Cache optimized code
	1098	DO k = 1, nz
	1099
	1100	DO i = 0, nx
	1101	work_fftx(i) = work_trix(i,k)
	1102	ENDDO
	1103
[1106]	1104	CALL fft_x_1d( work_fftx, 'backward' )
[1]	1105
	1106	m = 0
	1107	DO n = 1, pdims(1)
[1003]	1108	DO i = 1, nnx
[1]	1109	ar(i,k,j,n) = work_fftx(m)
	1110	m = m + 1
	1111	ENDDO
	1112	ENDDO
	1113
	1114	ENDDO
	1115
	1116	ENDIF
	1117
	1118	ENDDO
	1119
	1120	DEALLOCATE( tri )
	1121
[1106]	1122	CALL cpu_log( log_point_s(33), 'fft_x_1d + tridia', 'stop' )
[1]	1123
	1124	END SUBROUTINE fftx_tri_fftx
	1125
	1126
	1127	!------------------------------------------------------------------------------!
[1682]	1128	! Description:
	1129	! ------------
	1130	!> Fourier-transformation along x with subsequent transposition x --> y for
	1131	!> a 1d-decomposition along y.
	1132	!>
	1133	!> @attention NEC-branch of this routine may significantly profit from
	1134	!> further optimizations. So far, performance is much worse than
	1135	!> for routine ffty_tr_yx (more than three times slower).
[1]	1136	!------------------------------------------------------------------------------!
[1682]	1137	SUBROUTINE fftx_tr_xy( f_in, f_out )
[1]	1138
[1682]	1139
[1320]	1140	USE control_parameters, &
	1141	ONLY: host
	1142
	1143	USE cpulog, &
	1144	ONLY: cpu_log, log_point_s
	1145
	1146	USE kinds
	1147
[1]	1148	USE pegrid
	1149
	1150	IMPLICIT NONE
	1151
[1682]	1152	INTEGER(iwp) :: i !<
	1153	INTEGER(iwp) :: j !<
	1154	INTEGER(iwp) :: k !<
[1]	1155
[1682]	1156	REAL(wp), DIMENSION(0:nx,1:nz,nys:nyn) :: work_fftx !<
	1157	REAL(wp), DIMENSION(1:nz,nys:nyn,0:nx) :: f_in !<
	1158	REAL(wp), DIMENSION(nny,1:nz,nxl_y:nxr_y,pdims(2)) :: f_out !<
	1159	REAL(wp), DIMENSION(nys:nyn,1:nz,0:nx) :: work !<
[1]	1160
	1161	!
	1162	!-- Carry out the FFT along x, where all data are present due to the
	1163	!-- 1d-decomposition along y. Resort the data in a way that y becomes
	1164	!-- the first index.
[1106]	1165	CALL cpu_log( log_point_s(4), 'fft_x_1d', 'start' )
[1]	1166
	1167	IF ( host(1:3) == 'nec' ) THEN
	1168	!
	1169	!-- Code for vector processors
[85]	1170	!$OMP PARALLEL PRIVATE ( i, j, k )
[1]	1171	!$OMP DO
	1172	DO i = 0, nx
	1173
	1174	DO j = nys, nyn
	1175	DO k = 1, nz
	1176	work_fftx(i,k,j) = f_in(k,j,i)
	1177	ENDDO
	1178	ENDDO
	1179
	1180	ENDDO
	1181
	1182	!$OMP DO
	1183	DO j = nys, nyn
	1184
	1185	CALL fft_x_m( work_fftx(:,:,j), 'forward' )
	1186
	1187	DO k = 1, nz
	1188	DO i = 0, nx
	1189	work(j,k,i) = work_fftx(i,k,j)
	1190	ENDDO
	1191	ENDDO
	1192
	1193	ENDDO
	1194	!$OMP END PARALLEL
	1195
	1196	ELSE
	1197
	1198	!
	1199	!-- Cache optimized code (there might be still a potential for better
	1200	!-- optimization).
[696]	1201	!$OMP PARALLEL PRIVATE (i,j,k)
[1]	1202	!$OMP DO
	1203	DO i = 0, nx
	1204
	1205	DO j = nys, nyn
	1206	DO k = 1, nz
	1207	work_fftx(i,k,j) = f_in(k,j,i)
	1208	ENDDO
	1209	ENDDO
	1210
	1211	ENDDO
	1212
	1213	!$OMP DO
	1214	DO j = nys, nyn
	1215	DO k = 1, nz
	1216
[1106]	1217	CALL fft_x_1d( work_fftx(0:nx,k,j), 'forward' )
[1]	1218
	1219	DO i = 0, nx
	1220	work(j,k,i) = work_fftx(i,k,j)
	1221	ENDDO
	1222	ENDDO
	1223
	1224	ENDDO
	1225	!$OMP END PARALLEL
	1226
	1227	ENDIF
[1106]	1228	CALL cpu_log( log_point_s(4), 'fft_x_1d', 'pause' )
[1]	1229
	1230	!
	1231	!-- Transpose array
[1111]	1232	#if defined( __parallel )
[1]	1233	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
[622]	1234	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
[1]	1235	CALL MPI_ALLTOALL( work(nys,1,0), sendrecvcount_xy, MPI_REAL, &
	1236	f_out(1,1,nxl_y,1), sendrecvcount_xy, MPI_REAL, &
	1237	comm1dy, ierr )
	1238	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
[1111]	1239	#endif
[1]	1240
	1241	END SUBROUTINE fftx_tr_xy
	1242
	1243
	1244	!------------------------------------------------------------------------------!
[1682]	1245	! Description:
	1246	! ------------
	1247	!> Transposition y --> x with a subsequent backward Fourier transformation for
	1248	!> a 1d-decomposition along x.
[1]	1249	!------------------------------------------------------------------------------!
[1682]	1250	SUBROUTINE tr_yx_fftx( f_in, f_out )
[1]	1251
[1682]	1252
[1320]	1253	USE control_parameters, &
	1254	ONLY: host
	1255
	1256	USE cpulog, &
	1257	ONLY: cpu_log, log_point_s
	1258
	1259	USE kinds
	1260
[1]	1261	USE pegrid
	1262
	1263	IMPLICIT NONE
	1264
[1682]	1265	INTEGER(iwp) :: i !<
	1266	INTEGER(iwp) :: j !<
	1267	INTEGER(iwp) :: k !<
[1]	1268
[1682]	1269	REAL(wp), DIMENSION(0:nx,1:nz,nys:nyn) :: work_fftx !<
	1270	REAL(wp), DIMENSION(nny,1:nz,nxl_y:nxr_y,pdims(2)) :: f_in !<
	1271	REAL(wp), DIMENSION(1:nz,nys:nyn,0:nx) :: f_out !<
	1272	REAL(wp), DIMENSION(nys:nyn,1:nz,0:nx) :: work !<
[1]	1273
	1274	!
	1275	!-- Transpose array
[1111]	1276	#if defined( __parallel )
[1]	1277	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
[622]	1278	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
[1]	1279	CALL MPI_ALLTOALL( f_in(1,1,nxl_y,1), sendrecvcount_xy, MPI_REAL, &
	1280	work(nys,1,0), sendrecvcount_xy, MPI_REAL, &
	1281	comm1dy, ierr )
	1282	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
[1111]	1283	#endif
[1]	1284
	1285	!
	1286	!-- Carry out the FFT along x, where all data are present due to the
	1287	!-- 1d-decomposition along y. Resort the data in a way that y becomes
	1288	!-- the first index.
[1106]	1289	CALL cpu_log( log_point_s(4), 'fft_x_1d', 'continue' )
[1]	1290
	1291	IF ( host(1:3) == 'nec' ) THEN
	1292	!
	1293	!-- Code optimized for vector processors
[85]	1294	!$OMP PARALLEL PRIVATE ( i, j, k )
[1]	1295	!$OMP DO
	1296	DO j = nys, nyn
	1297
	1298	DO k = 1, nz
	1299	DO i = 0, nx
	1300	work_fftx(i,k,j) = work(j,k,i)
	1301	ENDDO
	1302	ENDDO
	1303
	1304	CALL fft_x_m( work_fftx(:,:,j), 'backward' )
	1305
	1306	ENDDO
	1307
	1308	!$OMP DO
	1309	DO i = 0, nx
	1310	DO j = nys, nyn
	1311	DO k = 1, nz
	1312	f_out(k,j,i) = work_fftx(i,k,j)
	1313	ENDDO
	1314	ENDDO
	1315	ENDDO
	1316	!$OMP END PARALLEL
	1317
	1318	ELSE
	1319
	1320	!
	1321	!-- Cache optimized code (there might be still a potential for better
	1322	!-- optimization).
[696]	1323	!$OMP PARALLEL PRIVATE (i,j,k)
[1]	1324	!$OMP DO
	1325	DO j = nys, nyn
	1326	DO k = 1, nz
	1327
	1328	DO i = 0, nx
	1329	work_fftx(i,k,j) = work(j,k,i)
	1330	ENDDO
	1331
[1106]	1332	CALL fft_x_1d( work_fftx(0:nx,k,j), 'backward' )
[1]	1333
	1334	ENDDO
	1335	ENDDO
	1336
	1337	!$OMP DO
	1338	DO i = 0, nx
	1339	DO j = nys, nyn
	1340	DO k = 1, nz
	1341	f_out(k,j,i) = work_fftx(i,k,j)
	1342	ENDDO
	1343	ENDDO
	1344	ENDDO
	1345	!$OMP END PARALLEL
	1346
	1347	ENDIF
[1106]	1348	CALL cpu_log( log_point_s(4), 'fft_x_1d', 'stop' )
[1]	1349
	1350	END SUBROUTINE tr_yx_fftx
	1351
	1352
	1353	!------------------------------------------------------------------------------!
[1682]	1354	! Description:
	1355	! ------------
	1356	!> FFT along y, solution of the tridiagonal system and backward FFT for
	1357	!> a 1d-decomposition along y.
	1358	!>
	1359	!> @warning this subroutine may still not work for hybrid parallelization
	1360	!> with OpenMP (for possible necessary changes see the original
	1361	!> routine poisfft_hybrid, developed by Klaus Ketelsen, May 2002)
[1]	1362	!------------------------------------------------------------------------------!
[1682]	1363	SUBROUTINE ffty_tri_ffty( ar )
[1]	1364
[1682]	1365
[1320]	1366	USE control_parameters, &
	1367	ONLY: host
	1368
	1369	USE cpulog, &
	1370	ONLY: cpu_log, log_point_s
	1371
	1372	USE grid_variables, &
	1373	ONLY: ddx2, ddy2
	1374
	1375	USE kinds
	1376
[1]	1377	USE pegrid
	1378
	1379	IMPLICIT NONE
	1380
[1682]	1381	INTEGER(iwp) :: i !<
	1382	INTEGER(iwp) :: j !<
	1383	INTEGER(iwp) :: k !<
	1384	INTEGER(iwp) :: m !<
	1385	INTEGER(iwp) :: n !<
	1386	INTEGER(iwp) :: omp_get_thread_num !<
	1387	INTEGER(iwp) :: tn !<
[1]	1388
[1682]	1389	REAL(wp), DIMENSION(0:ny) :: work_ffty !<
	1390	REAL(wp), DIMENSION(0:ny,1:nz) :: work_triy !<
	1391	REAL(wp), DIMENSION(nny,1:nz,nxl_y:nxr_y,pdims(2)) :: ar !<
	1392	REAL(wp), DIMENSION(:,:,:,:), ALLOCATABLE :: tri !<
[1]	1393
	1394
[1106]	1395	CALL cpu_log( log_point_s(39), 'fft_y_1d + tridia', 'start' )
[1]	1396
	1397	ALLOCATE( tri(5,0:ny,0:nz-1,0:threads_per_task-1) )
	1398
	1399	tn = 0 ! Default thread number in case of one thread
[696]	1400	!$OMP PARALLEL DO PRIVATE ( i, j, k, m, n, tn, work_ffty, work_triy )
[1]	1401	DO i = nxl_y, nxr_y
	1402
	1403	!$ tn = omp_get_thread_num()
	1404
	1405	IF ( host(1:3) == 'nec' ) THEN
	1406	!
	1407	!-- Code optimized for vector processors
	1408	DO k = 1, nz
	1409
	1410	m = 0
	1411	DO n = 1, pdims(2)
[1003]	1412	DO j = 1, nny
[1]	1413	work_triy(m,k) = ar(j,k,i,n)
	1414	m = m + 1
	1415	ENDDO
	1416	ENDDO
	1417
	1418	ENDDO
	1419
	1420	CALL fft_y_m( work_triy, ny, 'forward' )
	1421
	1422	ELSE
	1423	!
	1424	!-- Cache optimized code
	1425	DO k = 1, nz
	1426
	1427	m = 0
	1428	DO n = 1, pdims(2)
[1003]	1429	DO j = 1, nny
[1]	1430	work_ffty(m) = ar(j,k,i,n)
	1431	m = m + 1
	1432	ENDDO
	1433	ENDDO
	1434
[1106]	1435	CALL fft_y_1d( work_ffty, 'forward' )
[1]	1436
	1437	DO j = 0, ny
	1438	work_triy(j,k) = work_ffty(j)
	1439	ENDDO
	1440
	1441	ENDDO
	1442
	1443	ENDIF
	1444
	1445	!
	1446	!-- Solve the linear equation system
	1447	CALL tridia_1dd( ddy2, ddx2, ny, nx, i, work_triy, tri(:,:,:,tn) )
	1448
	1449	IF ( host(1:3) == 'nec' ) THEN
	1450	!
	1451	!-- Code optimized for vector processors
	1452	CALL fft_y_m( work_triy, ny, 'backward' )
	1453
	1454	DO k = 1, nz
	1455
	1456	m = 0
	1457	DO n = 1, pdims(2)
[1003]	1458	DO j = 1, nny
[1]	1459	ar(j,k,i,n) = work_triy(m,k)
	1460	m = m + 1
	1461	ENDDO
	1462	ENDDO
	1463
	1464	ENDDO
	1465
	1466	ELSE
	1467	!
	1468	!-- Cache optimized code
	1469	DO k = 1, nz
	1470
	1471	DO j = 0, ny
	1472	work_ffty(j) = work_triy(j,k)
	1473	ENDDO
	1474
[1106]	1475	CALL fft_y_1d( work_ffty, 'backward' )
[1]	1476
	1477	m = 0
	1478	DO n = 1, pdims(2)
[1003]	1479	DO j = 1, nny
[1]	1480	ar(j,k,i,n) = work_ffty(m)
	1481	m = m + 1
	1482	ENDDO
	1483	ENDDO
	1484
	1485	ENDDO
	1486
	1487	ENDIF
	1488
	1489	ENDDO
	1490
	1491	DEALLOCATE( tri )
	1492
[1106]	1493	CALL cpu_log( log_point_s(39), 'fft_y_1d + tridia', 'stop' )
[1]	1494
	1495	END SUBROUTINE ffty_tri_ffty
	1496
	1497	END MODULE poisfft_mod

Note: See TracBrowser for help on using the repository browser.

Context Navigation

source: palm/trunk/SOURCE/poisfft.f90 @ 1812

Download in other formats: