Home

Context Navigation

source: palm/trunk/SOURCE/poisfft_mod.f90 @ 3654

Last change on this file since 3654 was 3634, checked in by knoop, 6 years ago
OpenACC port for SPEC
Property svn:keywords set to `Id`
File size: 45.6 KB

Rev	Line
[1850]	1	!> @file poisfft_mod.f90
[2000]	2	!------------------------------------------------------------------------------!
[2696]	3	! This file is part of the PALM model system.
[1036]	4	!
[2000]	5	! PALM is free software: you can redistribute it and/or modify it under the
	6	! terms of the GNU General Public License as published by the Free Software
	7	! Foundation, either version 3 of the License, or (at your option) any later
	8	! version.
[1036]	9	!
	10	! PALM is distributed in the hope that it will be useful, but WITHOUT ANY
	11	! WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
	12	! A PARTICULAR PURPOSE. See the GNU General Public License for more details.
	13	!
	14	! You should have received a copy of the GNU General Public License along with
	15	! PALM. If not, see <http://www.gnu.org/licenses/>.
	16	!
[2718]	17	! Copyright 1997-2018 Leibniz Universitaet Hannover
[2000]	18	!------------------------------------------------------------------------------!
[1036]	19	!
[484]	20	! Current revisions:
[1]	21	! -----------------
[1683]	22	!
[2119]	23	!
[1321]	24	! Former revisions:
	25	! -----------------
	26	! $Id: poisfft_mod.f90 3634 2018-12-18 12:31:28Z suehring $
[3634]	27	! OpenACC port for SPEC
	28	!
	29	! 3241 2018-09-12 15:02:00Z raasch
[3241]	30	! unused variables removed,
	31	! declarations of omp_get_thread_num now as omp-directive
	32	!
	33	! 2718 2018-01-02 08:49:38Z maronga
[2716]	34	! Corrected "Former revisions" section
	35	!
	36	! 2696 2017-12-14 17:12:51Z kanani
	37	! Change in file header (GPL part)
	38	!
	39	! 2300 2017-06-29 13:31:14Z raasch
[2300]	40	! settings depending on host variable removed or replaced by loop_optimization
	41	!
	42	! 2119 2017-01-17 16:51:50Z raasch
[1321]	43	!
[2119]	44	! 2118 2017-01-17 16:38:49Z raasch
	45	! OpenACC directives and related code removed
	46	!
[2001]	47	! 2000 2016-08-20 18:09:15Z knoop
	48	! Forced header and separation lines into 80 columns
	49	!
[1851]	50	! 1850 2016-04-08 13:29:27Z maronga
	51	! Module renamed
	52	!
	53	!
[1805]	54	! 1804 2016-04-05 16:30:18Z maronga
	55	! Removed code for parameter file check (__check)
	56	!
[1683]	57	! 1682 2015-10-07 23:56:08Z knoop
	58	! Code annotations made doxygen readable
	59	!
[1483]	60	! 1482 2014-10-18 12:34:45Z raasch
	61	! use 2d-decomposition, if accelerator boards are used
	62	!
[1407]	63	! 1406 2014-05-16 13:47:01Z raasch
	64	! bugfix for pgi 14.4: declare create moved after array declaration
	65	!
[1321]	66	! 1320 2014-03-20 08:40:49Z raasch
[1320]	67	! ONLY-attribute added to USE-statements,
	68	! kind-parameters added to all INTEGER and REAL declaration statements,
	69	! kinds are defined in new module kinds,
	70	! old module precision_kind is removed,
	71	! revision history before 2012 removed,
	72	! comment fields (!:) to be used for variable explanations added to
	73	! all variable declaration statements
[1217]	74	!
[1319]	75	! 1318 2014-03-17 13:35:16Z raasch
	76	! module interfaces removed
	77	!
[1307]	78	! 1306 2014-03-13 14:30:59Z raasch
	79	! openmp sections removed from the overlap branch,
	80	! second argument removed from parameter list
	81	!
[1217]	82	! 1216 2013-08-26 09:31:42Z raasch
[1216]	83	! resorting of arrays moved to separate routines resort_for_...,
	84	! one argument, used as temporary work array, removed from all transpose
	85	! routines
	86	! overlapping fft / transposition implemented
[1112]	87	!
[1213]	88	! 1212 2013-08-15 08:46:27Z raasch
	89	! tridia routines moved to seperate module tridia_solver
	90	!
[1209]	91	! 1208 2013-08-13 06:41:49Z raasch
	92	! acc-update clauses added for "ar" so that ffts other than cufft can also be
	93	! used (although they are not ported and will give a poor performance)
	94	!
[1112]	95	! 1111 2013-03-08 23:54:10Z raasch
[1111]	96	! further openACC porting of non-parallel (MPI) branch:
	97	! tridiagonal routines split into extermal subroutines (instead using CONTAINS),
	98	! no distinction between parallel/non-parallel in poisfft and tridia any more,
[1112]	99	! tridia routines moved to end of file because of probable bug in PGI compiler 12.5
[1111]	100	! (otherwise "invalid device function" is indicated during runtime),
	101	! optimization of tridia routines: constant elements and coefficients of tri are
	102	! stored in seperate arrays ddzuw and tric, last dimension of tri reduced from 5
	103	! to 2,
	104	! poisfft_init is now called internally from poisfft, maketri is called from
	105	! poisfft_init,
	106	! ibc_p_b = 2 removed
[1]	107	!
[1107]	108	! 1106 2013-03-04 05:31:38Z raasch
	109	! routines fftx, ffty, fftxp, fftyp removed, calls replaced by fft_x, fft_y,
	110	! in the 1D-decomposition routines fft_x, ffty are replaced by fft_x_1d,
	111	! fft_y_1d
	112	!
[1104]	113	! 1103 2013-02-20 02:15:53Z raasch
	114	! tri, ar, and ar1 arguments in tridia-routines (2d) are removed because they
	115	! sometimes cause segmentation faults with intel 12.1 compiler
	116	!
[1093]	117	! 1092 2013-02-02 11:24:22Z raasch
	118	! unused variables removed
	119	!
[1037]	120	! 1036 2012-10-22 13:43:42Z raasch
	121	! code put under GPL (PALM 3.9)
	122	!
[1014]	123	! 2012-09-21 07:03:55Z raasch
	124	! FLOAT type conversion replaced by REAL
	125	!
[1004]	126	! 1003 2012-09-14 14:35:53Z raasch
	127	! indices nxa, nya, etc. replaced by nx, ny, etc.
	128	!
[941]	129	! 940 2012-07-09 14:31:00Z raasch
	130	! special handling of tri-array as an argument in tridia_1dd routines switched
	131	! off because it caused segmentation faults with intel 12.1 compiler
	132	!
[878]	133	! 877 2012-04-03 11:21:44Z suehring
	134	! Bugfix: Avoid divisions by zero in case of using a 'neumann' bc for the
	135	! pressure at the top of the model domain.
	136	!
[810]	137	! 809 2012-01-30 13:32:58Z maronga
	138	! Bugfix: replaced .AND. and .NOT. with && and ! in the preprocessor directives
	139	!
[808]	140	! 807 2012-01-25 11:53:51Z maronga
	141	! New cpp directive "__check" implemented which is used by check_namelist_files
	142	! (most of the code is unneeded by check_namelist_files).
	143	!
[1]	144	! Revision 1.1 1997/07/24 11:24:14 raasch
	145	! Initial revision
	146	!
	147	!
	148	! Description:
	149	! ------------
[1682]	150	!> Solves the Poisson equation with a 2D spectral method
	151	!> d^2 p / dx^2 + d^2 p / dy^2 + d^2 p / dz^2 = s
	152	!>
	153	!> Input:
	154	!> real ar contains (nnz,nny,nnx) elements of the velocity divergence,
	155	!> starting from (1,nys,nxl)
	156	!>
	157	!> Output:
	158	!> real ar contains the solution for perturbation pressure p
[1]	159	!------------------------------------------------------------------------------!
[1682]	160	MODULE poisfft_mod
	161
[1]	162
[1320]	163	USE fft_xy, &
	164	ONLY: fft_init, fft_y, fft_y_1d, fft_y_m, fft_x, fft_x_1d, fft_x_m
[1]	165
[1320]	166	USE indices, &
	167	ONLY: nnx, nny, nx, nxl, nxr, ny, nys, nyn, nz
	168
	169	USE transpose_indices, &
	170	ONLY: nxl_y, nxl_z, nxr_y, nxr_z, nys_x, nys_z, nyn_x, nyn_z, nzb_x, &
	171	nzb_y, nzt_x, nzt_y
	172
	173	USE tridia_solver, &
	174	ONLY: tridia_1dd, tridia_init, tridia_substi, tridia_substi_overlap
	175
[1]	176	IMPLICIT NONE
	177
[1111]	178	LOGICAL, SAVE :: poisfft_initialized = .FALSE.
	179
[1]	180	PRIVATE
[807]	181
[1]	182	PUBLIC poisfft, poisfft_init
	183
	184	INTERFACE poisfft
	185	MODULE PROCEDURE poisfft
	186	END INTERFACE poisfft
	187
	188	INTERFACE poisfft_init
	189	MODULE PROCEDURE poisfft_init
	190	END INTERFACE poisfft_init
	191
[807]	192
[1]	193	CONTAINS
	194
[1682]	195	!------------------------------------------------------------------------------!
	196	! Description:
	197	! ------------
[3241]	198	!> Setup coefficients for FFT and the tridiagonal solver
[1682]	199	!------------------------------------------------------------------------------!
[1]	200	SUBROUTINE poisfft_init
	201
[1111]	202	IMPLICIT NONE
	203
	204
[1]	205	CALL fft_init
	206
[1212]	207	CALL tridia_init
[1111]	208
	209	poisfft_initialized = .TRUE.
	210
[1]	211	END SUBROUTINE poisfft_init
	212
[1111]	213
[1804]	214
[1682]	215	!------------------------------------------------------------------------------!
	216	! Description:
	217	! ------------
	218	!> Two-dimensional Fourier Transformation in x- and y-direction.
	219	!------------------------------------------------------------------------------!
[1306]	220	SUBROUTINE poisfft( ar )
[1]	221
[1320]	222	USE control_parameters, &
[3241]	223	ONLY: transpose_compute_overlap
[1320]	224
	225	USE cpulog, &
	226	ONLY: cpu_log, cpu_log_nowait, log_point_s
	227
	228	USE kinds
	229
[1]	230	USE pegrid
	231
	232	IMPLICIT NONE
	233
[1682]	234	INTEGER(iwp) :: ii !<
	235	INTEGER(iwp) :: iind !<
	236	INTEGER(iwp) :: inew !<
	237	INTEGER(iwp) :: jj !<
	238	INTEGER(iwp) :: jind !<
	239	INTEGER(iwp) :: jnew !<
	240	INTEGER(iwp) :: ki !<
	241	INTEGER(iwp) :: kk !<
	242	INTEGER(iwp) :: knew !<
	243	INTEGER(iwp) :: n !<
	244	INTEGER(iwp) :: nblk !<
	245	INTEGER(iwp) :: nnx_y !<
	246	INTEGER(iwp) :: nny_z !<
	247	INTEGER(iwp) :: nnz_x !<
	248	INTEGER(iwp) :: nxl_y_bound !<
	249	INTEGER(iwp) :: nxr_y_bound !<
[1]	250
[1682]	251	INTEGER(iwp), DIMENSION(4) :: isave !<
[1320]	252
[1682]	253	REAL(wp), DIMENSION(1:nz,nys:nyn,nxl:nxr) :: ar !<
	254	REAL(wp), DIMENSION(nys:nyn,nxl:nxr,1:nz) :: ar_inv !<
[3634]	255	!$ACC DECLARE CREATE(ar_inv)
[1]	256
[1682]	257	REAL(wp), DIMENSION(:,:,:), ALLOCATABLE :: ar1 !<
	258	REAL(wp), DIMENSION(:,:,:), ALLOCATABLE :: f_in !<
	259	REAL(wp), DIMENSION(:,:,:), ALLOCATABLE :: f_inv !<
	260	REAL(wp), DIMENSION(:,:,:), ALLOCATABLE :: f_out_y !<
	261	REAL(wp), DIMENSION(:,:,:), ALLOCATABLE :: f_out_z !<
[1216]	262
	263
[1]	264	CALL cpu_log( log_point_s(3), 'poisfft', 'start' )
	265
[1111]	266	IF ( .NOT. poisfft_initialized ) CALL poisfft_init
	267
[3634]	268	#ifndef _OPENACC
[1]	269	!
	270	!-- Two-dimensional Fourier Transformation in x- and y-direction.
[2118]	271	IF ( pdims(2) == 1 .AND. pdims(1) > 1 ) THEN
[1]	272
	273	!
	274	!-- 1d-domain-decomposition along x:
	275	!-- FFT along y and transposition y --> x
[1216]	276	CALL ffty_tr_yx( ar, ar )
[1]	277
	278	!
	279	!-- FFT along x, solving the tridiagonal system and backward FFT
	280	CALL fftx_tri_fftx( ar )
	281
	282	!
	283	!-- Transposition x --> y and backward FFT along y
[1216]	284	CALL tr_xy_ffty( ar, ar )
[1]	285
[2118]	286	ELSEIF ( pdims(1) == 1 .AND. pdims(2) > 1 ) THEN
[1]	287
	288	!
	289	!-- 1d-domain-decomposition along y:
	290	!-- FFT along x and transposition x --> y
[1216]	291	CALL fftx_tr_xy( ar, ar )
[1]	292
	293	!
	294	!-- FFT along y, solving the tridiagonal system and backward FFT
	295	CALL ffty_tri_ffty( ar )
	296
	297	!
	298	!-- Transposition y --> x and backward FFT along x
[1216]	299	CALL tr_yx_fftx( ar, ar )
[1]	300
[1216]	301	ELSEIF ( .NOT. transpose_compute_overlap ) THEN
[3634]	302	#endif
[1]	303
	304	!
[1111]	305	!-- 2d-domain-decomposition or no decomposition (1 PE run)
[1]	306	!-- Transposition z --> x
	307	CALL cpu_log( log_point_s(5), 'transpo forward', 'start' )
[1216]	308	CALL resort_for_zx( ar, ar_inv )
	309	CALL transpose_zx( ar_inv, ar )
[1]	310	CALL cpu_log( log_point_s(5), 'transpo forward', 'pause' )
	311
	312	CALL cpu_log( log_point_s(4), 'fft_x', 'start' )
[1106]	313	CALL fft_x( ar, 'forward' )
[1]	314	CALL cpu_log( log_point_s(4), 'fft_x', 'pause' )
	315
	316	!
	317	!-- Transposition x --> y
	318	CALL cpu_log( log_point_s(5), 'transpo forward', 'continue' )
[1216]	319	CALL resort_for_xy( ar, ar_inv )
	320	CALL transpose_xy( ar_inv, ar )
[1]	321	CALL cpu_log( log_point_s(5), 'transpo forward', 'pause' )
	322
	323	CALL cpu_log( log_point_s(7), 'fft_y', 'start' )
[1216]	324	CALL fft_y( ar, 'forward', ar_tr = ar, &
	325	nxl_y_bound = nxl_y, nxr_y_bound = nxr_y, &
	326	nxl_y_l = nxl_y, nxr_y_l = nxr_y )
[1]	327	CALL cpu_log( log_point_s(7), 'fft_y', 'pause' )
	328
	329	!
	330	!-- Transposition y --> z
	331	CALL cpu_log( log_point_s(5), 'transpo forward', 'continue' )
[1216]	332	CALL resort_for_yz( ar, ar_inv )
	333	CALL transpose_yz( ar_inv, ar )
[1]	334	CALL cpu_log( log_point_s(5), 'transpo forward', 'stop' )
	335
	336	!
[1106]	337	!-- Solve the tridiagonal equation system along z
[1]	338	CALL cpu_log( log_point_s(6), 'tridia', 'start' )
[1212]	339	CALL tridia_substi( ar )
[1]	340	CALL cpu_log( log_point_s(6), 'tridia', 'stop' )
	341
	342	!
	343	!-- Inverse Fourier Transformation
	344	!-- Transposition z --> y
	345	CALL cpu_log( log_point_s(8), 'transpo invers', 'start' )
[1216]	346	CALL transpose_zy( ar, ar_inv )
	347	CALL resort_for_zy( ar_inv, ar )
[1]	348	CALL cpu_log( log_point_s(8), 'transpo invers', 'pause' )
	349
	350	CALL cpu_log( log_point_s(7), 'fft_y', 'continue' )
[1216]	351	CALL fft_y( ar, 'backward', ar_tr = ar, &
	352	nxl_y_bound = nxl_y, nxr_y_bound = nxr_y, &
	353	nxl_y_l = nxl_y, nxr_y_l = nxr_y )
[1]	354	CALL cpu_log( log_point_s(7), 'fft_y', 'stop' )
	355
	356	!
	357	!-- Transposition y --> x
	358	CALL cpu_log( log_point_s(8), 'transpo invers', 'continue' )
[1216]	359	CALL transpose_yx( ar, ar_inv )
	360	CALL resort_for_yx( ar_inv, ar )
[1]	361	CALL cpu_log( log_point_s(8), 'transpo invers', 'pause' )
	362
	363	CALL cpu_log( log_point_s(4), 'fft_x', 'continue' )
[1106]	364	CALL fft_x( ar, 'backward' )
[1]	365	CALL cpu_log( log_point_s(4), 'fft_x', 'stop' )
	366
	367	!
	368	!-- Transposition x --> z
	369	CALL cpu_log( log_point_s(8), 'transpo invers', 'continue' )
[1216]	370	CALL transpose_xz( ar, ar_inv )
	371	CALL resort_for_xz( ar_inv, ar )
[1]	372	CALL cpu_log( log_point_s(8), 'transpo invers', 'stop' )
	373
[3634]	374	#ifndef _OPENACC
[1216]	375	ELSE
	376
	377	!
	378	!-- 2d-domain-decomposition or no decomposition (1 PE run) with
	379	!-- overlapping transposition / fft
[1318]	380	!-- cputime logging must not use barriers, which would prevent overlapping
[1216]	381	ALLOCATE( f_out_y(0:ny,nxl_y:nxr_y,nzb_y:nzt_y), &
	382	f_out_z(0:nx,nys_x:nyn_x,nzb_x:nzt_x) )
	383	!
	384	!-- Transposition z --> x + subsequent fft along x
	385	ALLOCATE( f_inv(nys:nyn,nxl:nxr,1:nz) )
	386	CALL resort_for_zx( ar, f_inv )
	387	!
	388	!-- Save original indices and gridpoint counter
	389	isave(1) = nz
	390	isave(2) = nzb_x
	391	isave(3) = nzt_x
	392	isave(4) = sendrecvcount_zx
	393	!
	394	!-- Set new indices for transformation
	395	nblk = nz / pdims(1)
	396	nz = pdims(1)
	397	nnz_x = 1
	398	nzb_x = 1 + myidx * nnz_x
	399	nzt_x = ( myidx + 1 ) * nnz_x
	400	sendrecvcount_zx = nnx * nny * nnz_x
	401
[1306]	402	ALLOCATE( ar1(0:nx,nys_x:nyn_x,nzb_x:nzt_x) )
[1216]	403	ALLOCATE( f_in(nys:nyn,nxl:nxr,1:nz) )
	404
[1306]	405	DO kk = 1, nblk
[1216]	406
[1306]	407	IF ( kk == 1 ) THEN
[1318]	408	CALL cpu_log( log_point_s(5), 'transpo forward', 'start', cpu_log_nowait )
[1306]	409	ELSE
[1318]	410	CALL cpu_log( log_point_s(5), 'transpo forward', 'continue', cpu_log_nowait )
[1306]	411	ENDIF
[1216]	412
[1306]	413	DO knew = 1, nz
	414	ki = kk + nblk * ( knew - 1 )
	415	f_in(:,:,knew) = f_inv(:,:,ki)
	416	ENDDO
[1216]	417
[1306]	418	CALL transpose_zx( f_in, ar1(:,:,:))
	419	CALL cpu_log( log_point_s(5), 'transpo forward', 'pause' )
[1216]	420
[1306]	421	IF ( kk == 1 ) THEN
[1318]	422	CALL cpu_log( log_point_s(4), 'fft_x', 'start', cpu_log_nowait )
[1306]	423	ELSE
[1318]	424	CALL cpu_log( log_point_s(4), 'fft_x', 'continue', cpu_log_nowait )
[1216]	425	ENDIF
	426
[1306]	427	n = isave(2) + kk - 1
	428	CALL fft_x( ar1(:,:,:), 'forward', ar_2d = f_out_z(:,:,n))
	429	CALL cpu_log( log_point_s(4), 'fft_x', 'pause' )
[1216]	430
	431	ENDDO
	432	!
	433	!-- Restore original indices/counters
	434	nz = isave(1)
	435	nzb_x = isave(2)
	436	nzt_x = isave(3)
	437	sendrecvcount_zx = isave(4)
	438
	439	DEALLOCATE( ar1, f_in, f_inv )
	440
	441	!
	442	!-- Transposition x --> y + subsequent fft along y
	443	ALLOCATE( f_inv(nys_x:nyn_x,nzb_x:nzt_x,0:nx) )
	444	CALL resort_for_xy( f_out_z, f_inv )
	445	!
	446	!-- Save original indices and gridpoint counter
	447	isave(1) = nx
	448	isave(2) = nxl_y
	449	isave(3) = nxr_y
	450	isave(4) = sendrecvcount_xy
	451	!
	452	!-- Set new indices for transformation
	453	nblk = ( ( nx+1 ) / pdims(2) ) - 1
	454	nx = pdims(2)
	455	nnx_y = 1
	456	nxl_y = myidy * nnx_y
	457	nxr_y = ( myidy + 1 ) * nnx_y - 1
	458	sendrecvcount_xy = nnx_y * ( nyn_x-nys_x+1 ) * ( nzt_x-nzb_x+1 )
	459
[1306]	460	ALLOCATE( ar1(0:ny,nxl_y:nxr_y,nzb_y:nzt_y) )
[1216]	461	ALLOCATE( f_in(nys_x:nyn_x,nzb_x:nzt_x,0:nx) )
	462
[1306]	463	DO ii = 0, nblk
[1216]	464
[1318]	465	CALL cpu_log( log_point_s(5), 'transpo forward', 'continue', cpu_log_nowait )
[1216]	466
[1306]	467	DO inew = 0, nx-1
	468	iind = ii + ( nblk + 1 ) * inew
	469	f_in(:,:,inew) = f_inv(:,:,iind)
	470	ENDDO
[1216]	471
[1306]	472	CALL transpose_xy( f_in, ar1(:,:,:) )
[1216]	473
[1306]	474	CALL cpu_log( log_point_s(5), 'transpo forward', 'pause' )
[1216]	475
[1306]	476	IF ( ii == 1 ) THEN
[1318]	477	CALL cpu_log( log_point_s(7), 'fft_y', 'start', cpu_log_nowait )
[1306]	478	ELSE
[1318]	479	CALL cpu_log( log_point_s(7), 'fft_y', 'continue', cpu_log_nowait )
[1216]	480	ENDIF
	481
[1306]	482	nxl_y_bound = isave(2)
	483	nxr_y_bound = isave(3)
	484	n = isave(2) + ii
	485	CALL fft_y( ar1(:,:,:), 'forward', ar_tr = f_out_y, &
	486	nxl_y_bound = nxl_y_bound, nxr_y_bound = nxr_y_bound, &
	487	nxl_y_l = n, nxr_y_l = n )
[1216]	488
[1306]	489	CALL cpu_log( log_point_s(7), 'fft_y', 'pause' )
[1216]	490
	491	ENDDO
	492	!
	493	!-- Restore original indices/counters
	494	nx = isave(1)
	495	nxl_y = isave(2)
	496	nxr_y = isave(3)
	497	sendrecvcount_xy = isave(4)
	498
	499	DEALLOCATE( ar1, f_in, f_inv )
	500
	501	!
	502	!-- Transposition y --> z + subsequent tridia + resort for z --> y
	503	ALLOCATE( f_inv(nxl_y:nxr_y,nzb_y:nzt_y,0:ny) )
	504	CALL resort_for_yz( f_out_y, f_inv )
	505	!
	506	!-- Save original indices and gridpoint counter
	507	isave(1) = ny
	508	isave(2) = nys_z
	509	isave(3) = nyn_z
	510	isave(4) = sendrecvcount_yz
	511	!
	512	!-- Set new indices for transformation
	513	nblk = ( ( ny+1 ) / pdims(1) ) - 1
	514	ny = pdims(1)
	515	nny_z = 1
	516	nys_z = myidx * nny_z
	517	nyn_z = ( myidx + 1 ) * nny_z - 1
	518	sendrecvcount_yz = ( nxr_y-nxl_y+1 ) * nny_z * ( nzt_y-nzb_y+1 )
	519
[1306]	520	ALLOCATE( ar1(nxl_z:nxr_z,nys_z:nyn_z,1:nz) )
[1216]	521	ALLOCATE( f_in(nxl_y:nxr_y,nzb_y:nzt_y,0:ny) )
	522
[1306]	523	DO jj = 0, nblk
[1216]	524	!
[1306]	525	!-- Forward Fourier Transformation
	526	!-- Transposition y --> z
[1318]	527	CALL cpu_log( log_point_s(5), 'transpo forward', 'continue', cpu_log_nowait )
[1216]	528
[1306]	529	DO jnew = 0, ny-1
	530	jind = jj + ( nblk + 1 ) * jnew
	531	f_in(:,:,jnew) = f_inv(:,:,jind)
	532	ENDDO
[1216]	533
[1306]	534	CALL transpose_yz( f_in, ar1(:,:,:) )
[1216]	535
[1306]	536	IF ( jj == nblk ) THEN
	537	CALL cpu_log( log_point_s(5), 'transpo forward', 'stop' )
	538	ELSE
	539	CALL cpu_log( log_point_s(5), 'transpo forward', 'pause' )
[1216]	540	ENDIF
	541
	542	!
[1306]	543	!-- Solve the tridiagonal equation system along z
[1318]	544	CALL cpu_log( log_point_s(6), 'tridia', 'start', cpu_log_nowait )
[1216]	545
[1306]	546	n = isave(2) + jj
	547	CALL tridia_substi_overlap( ar1(:,:,:), n )
[1216]	548
[1306]	549	CALL cpu_log( log_point_s(6), 'tridia', 'stop' )
[1216]	550
[1306]	551	!
	552	!-- Inverse Fourier Transformation
	553	!-- Transposition z --> y
	554	!-- Only one thread should call MPI routines, therefore forward and
	555	!-- backward tranpose are in the same section
	556	IF ( jj == 0 ) THEN
[1318]	557	CALL cpu_log( log_point_s(8), 'transpo invers', 'start', cpu_log_nowait )
[1306]	558	ELSE
[1318]	559	CALL cpu_log( log_point_s(8), 'transpo invers', 'continue', cpu_log_nowait )
[1216]	560	ENDIF
	561
[1306]	562	CALL transpose_zy( ar1(:,:,:), f_in )
[1216]	563
[1306]	564	DO jnew = 0, ny-1
	565	jind = jj + ( nblk + 1 ) * jnew
	566	f_inv(:,:,jind) = f_in(:,:,jnew)
	567	ENDDO
[1216]	568
[1306]	569	CALL cpu_log( log_point_s(8), 'transpo invers', 'pause' )
[1216]	570
	571	ENDDO
	572	!
	573	!-- Restore original indices/counters
	574	ny = isave(1)
	575	nys_z = isave(2)
	576	nyn_z = isave(3)
	577	sendrecvcount_yz = isave(4)
	578
	579	CALL resort_for_zy( f_inv, f_out_y )
	580
	581	DEALLOCATE( ar1, f_in, f_inv )
	582
	583	!
	584	!-- fft along y backward + subsequent transposition y --> x
	585	ALLOCATE( f_inv(nys_x:nyn_x,nzb_x:nzt_x,0:nx) )
	586	!
	587	!-- Save original indices and gridpoint counter
	588	isave(1) = nx
	589	isave(2) = nxl_y
	590	isave(3) = nxr_y
	591	isave(4) = sendrecvcount_xy
	592	!
	593	!-- Set new indices for transformation
	594	nblk = (( nx+1 ) / pdims(2) ) - 1
	595	nx = pdims(2)
	596	nnx_y = 1
	597	nxl_y = myidy * nnx_y
	598	nxr_y = ( myidy + 1 ) * nnx_y - 1
	599	sendrecvcount_xy = nnx_y * ( nyn_x-nys_x+1 ) * ( nzt_x-nzb_x+1 )
	600
[1306]	601	ALLOCATE( ar1(0:ny,nxl_y:nxr_y,nzb_y:nzt_y) )
[1216]	602	ALLOCATE( f_in(nys_x:nyn_x,nzb_x:nzt_x,0:nx) )
	603
[1306]	604	DO ii = 0, nblk
[1216]	605
[1318]	606	CALL cpu_log( log_point_s(7), 'fft_y', 'continue', cpu_log_nowait )
[1216]	607
[1306]	608	n = isave(2) + ii
	609	nxl_y_bound = isave(2)
	610	nxr_y_bound = isave(3)
[1216]	611
[1306]	612	CALL fft_y( ar1(:,:,:), 'backward', ar_tr = f_out_y, &
	613	nxl_y_bound = nxl_y_bound, nxr_y_bound = nxr_y_bound, &
	614	nxl_y_l = n, nxr_y_l = n )
[1216]	615
[1306]	616	IF ( ii == nblk ) THEN
	617	CALL cpu_log( log_point_s(7), 'fft_y', 'stop' )
	618	ELSE
	619	CALL cpu_log( log_point_s(7), 'fft_y', 'pause' )
[1216]	620	ENDIF
	621
[1318]	622	CALL cpu_log( log_point_s(8), 'transpo invers', 'continue', cpu_log_nowait )
[1216]	623
[1306]	624	CALL transpose_yx( ar1(:,:,:), f_in )
[1216]	625
[1306]	626	DO inew = 0, nx-1
	627	iind = ii + (nblk+1) * inew
	628	f_inv(:,:,iind) = f_in(:,:,inew)
	629	ENDDO
[1216]	630
[1306]	631	CALL cpu_log( log_point_s(8), 'transpo invers', 'pause' )
[1216]	632
	633	ENDDO
	634	!
	635	!-- Restore original indices/counters
	636	nx = isave(1)
	637	nxl_y = isave(2)
	638	nxr_y = isave(3)
	639	sendrecvcount_xy = isave(4)
	640
	641	CALL resort_for_yx( f_inv, f_out_z )
	642
	643	DEALLOCATE( ar1, f_in, f_inv )
	644
	645	!
	646	!-- fft along x backward + subsequent final transposition x --> z
	647	ALLOCATE( f_inv(nys:nyn,nxl:nxr,1:nz) )
	648	!
	649	!-- Save original indices and gridpoint counter
	650	isave(1) = nz
	651	isave(2) = nzb_x
	652	isave(3) = nzt_x
	653	isave(4) = sendrecvcount_zx
	654	!
	655	!-- Set new indices for transformation
	656	nblk = nz / pdims(1)
	657	nz = pdims(1)
	658	nnz_x = 1
	659	nzb_x = 1 + myidx * nnz_x
	660	nzt_x = ( myidx + 1 ) * nnz_x
	661	sendrecvcount_zx = nnx * nny * nnz_x
	662
[1306]	663	ALLOCATE( ar1(0:nx,nys_x:nyn_x,nzb_x:nzt_x) )
[1216]	664	ALLOCATE( f_in(nys:nyn,nxl:nxr,1:nz) )
	665
[1306]	666	DO kk = 1, nblk
[1216]	667
[1318]	668	CALL cpu_log( log_point_s(4), 'fft_x', 'continue', cpu_log_nowait )
[1216]	669
[1306]	670	n = isave(2) + kk - 1
	671	CALL fft_x( ar1(:,:,:), 'backward', f_out_z(:,:,n))
[1216]	672
[1306]	673	IF ( kk == nblk ) THEN
	674	CALL cpu_log( log_point_s(4), 'fft_x', 'stop' )
	675	ELSE
	676	CALL cpu_log( log_point_s(4), 'fft_x', 'pause' )
[1216]	677	ENDIF
	678
[1318]	679	CALL cpu_log( log_point_s(8), 'transpo invers', 'continue', cpu_log_nowait )
[1216]	680
[1306]	681	CALL transpose_xz( ar1(:,:,:), f_in )
[1216]	682
[1306]	683	DO knew = 1, nz
	684	ki = kk + nblk * (knew-1)
	685	f_inv(:,:,ki) = f_in(:,:,knew)
	686	ENDDO
[1216]	687
[1306]	688	IF ( kk == nblk ) THEN
	689	CALL cpu_log( log_point_s(8), 'transpo invers', 'stop' )
	690	ELSE
	691	CALL cpu_log( log_point_s(8), 'transpo invers', 'pause' )
[1216]	692	ENDIF
	693
	694	ENDDO
	695	!
	696	!-- Restore original indices/counters
	697	nz = isave(1)
	698	nzb_x = isave(2)
	699	nzt_x = isave(3)
	700	sendrecvcount_zx = isave(4)
	701
	702	CALL resort_for_xz( f_inv, ar )
	703
	704	DEALLOCATE( ar1, f_in, f_inv )
	705
[1]	706	ENDIF
[3634]	707	#endif
[1]	708
	709	CALL cpu_log( log_point_s(3), 'poisfft', 'stop' )
	710
	711	END SUBROUTINE poisfft
	712
	713
	714	!------------------------------------------------------------------------------!
[1682]	715	! Description:
	716	! ------------
	717	!> Fourier-transformation along y with subsequent transposition y --> x for
	718	!> a 1d-decomposition along x.
	719	!>
	720	!> @attention The performance of this routine is much faster on the NEC-SX6,
	721	!> if the first index of work_ffty_vec is odd. Otherwise
	722	!> memory bank conflicts may occur (especially if the index is a
	723	!> multiple of 128). That's why work_ffty_vec is dimensioned as
	724	!> 0:ny+1.
	725	!> Of course, this will not work if users are using an odd number
	726	!> of gridpoints along y.
[1]	727	!------------------------------------------------------------------------------!
[1682]	728	SUBROUTINE ffty_tr_yx( f_in, f_out )
[1]	729
[1320]	730	USE control_parameters, &
[2300]	731	ONLY: loop_optimization
[1320]	732
	733	USE cpulog, &
	734	ONLY: cpu_log, log_point_s
	735
	736	USE kinds
	737
[1]	738	USE pegrid
	739
	740	IMPLICIT NONE
	741
[1682]	742	INTEGER(iwp) :: i !<
	743	INTEGER(iwp) :: iend !<
	744	INTEGER(iwp) :: iouter !<
	745	INTEGER(iwp) :: ir !<
	746	INTEGER(iwp) :: j !<
	747	INTEGER(iwp) :: k !<
[1]	748
[1682]	749	INTEGER(iwp), PARAMETER :: stridex = 4 !<
[1320]	750
[1682]	751	REAL(wp), DIMENSION(1:nz,0:ny,nxl:nxr) :: f_in !<
	752	REAL(wp), DIMENSION(nnx,1:nz,nys_x:nyn_x,pdims(1)) :: f_out !<
	753	REAL(wp), DIMENSION(nxl:nxr,1:nz,0:ny) :: work !<
[1]	754
[2300]	755	REAL(wp), DIMENSION(:,:), ALLOCATABLE :: work_ffty !<
	756	REAL(wp), DIMENSION(:,:,:), ALLOCATABLE :: work_ffty_vec !<
	757
[1]	758	!
	759	!-- Carry out the FFT along y, where all data are present due to the
	760	!-- 1d-decomposition along x. Resort the data in a way that x becomes
	761	!-- the first index.
[1106]	762	CALL cpu_log( log_point_s(7), 'fft_y_1d', 'start' )
[1]	763
[2300]	764	IF ( loop_optimization == 'vector' ) THEN
	765
	766	ALLOCATE( work_ffty_vec(0:ny+1,1:nz,nxl:nxr) )
[1]	767	!
	768	!-- Code optimized for vector processors
[2300]	769	!$OMP PARALLEL PRIVATE ( i, j, k )
	770	!$OMP DO
[1]	771	DO i = nxl, nxr
	772
	773	DO j = 0, ny
	774	DO k = 1, nz
	775	work_ffty_vec(j,k,i) = f_in(k,j,i)
	776	ENDDO
	777	ENDDO
	778
	779	CALL fft_y_m( work_ffty_vec(:,:,i), ny+1, 'forward' )
	780
	781	ENDDO
	782
[2300]	783	!$OMP DO
[1]	784	DO k = 1, nz
	785	DO j = 0, ny
	786	DO i = nxl, nxr
	787	work(i,k,j) = work_ffty_vec(j,k,i)
	788	ENDDO
	789	ENDDO
	790	ENDDO
[2300]	791	!$OMP END PARALLEL
[1]	792
[2300]	793	DEALLOCATE( work_ffty_vec )
	794
[1]	795	ELSE
	796	!
	797	!-- Cache optimized code.
[2300]	798	ALLOCATE( work_ffty(0:ny,stridex) )
	799	!
[1]	800	!-- The i-(x-)direction is split into a strided outer loop and an inner
	801	!-- loop for better cache performance
[2300]	802	!$OMP PARALLEL PRIVATE (i,iend,iouter,ir,j,k,work_ffty)
	803	!$OMP DO
[1]	804	DO iouter = nxl, nxr, stridex
	805
	806	iend = MIN( iouter+stridex-1, nxr ) ! Upper bound for inner i loop
	807
	808	DO k = 1, nz
	809
	810	DO i = iouter, iend
	811
	812	ir = i-iouter+1 ! counter within a stride
	813	DO j = 0, ny
	814	work_ffty(j,ir) = f_in(k,j,i)
	815	ENDDO
	816	!
	817	!-- FFT along y
[1106]	818	CALL fft_y_1d( work_ffty(:,ir), 'forward' )
[1]	819
	820	ENDDO
	821
	822	!
	823	!-- Resort
	824	DO j = 0, ny
	825	DO i = iouter, iend
	826	work(i,k,j) = work_ffty(j,i-iouter+1)
	827	ENDDO
	828	ENDDO
	829
	830	ENDDO
	831
	832	ENDDO
[2300]	833	!$OMP END PARALLEL
[1]	834
[2300]	835	DEALLOCATE( work_ffty )
	836
[1]	837	ENDIF
[2300]	838
[1106]	839	CALL cpu_log( log_point_s(7), 'fft_y_1d', 'pause' )
[1]	840
	841	!
	842	!-- Transpose array
[1111]	843	#if defined( __parallel )
[1]	844	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
[622]	845	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
[1]	846	CALL MPI_ALLTOALL( work(nxl,1,0), sendrecvcount_xy, MPI_REAL, &
	847	f_out(1,1,nys_x,1), sendrecvcount_xy, MPI_REAL, &
	848	comm1dx, ierr )
	849	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
[1111]	850	#endif
[1]	851
	852	END SUBROUTINE ffty_tr_yx
	853
	854
	855	!------------------------------------------------------------------------------!
[1682]	856	! Description:
	857	! ------------
	858	!> Transposition x --> y with a subsequent backward Fourier transformation for
	859	!> a 1d-decomposition along x
[1]	860	!------------------------------------------------------------------------------!
[1682]	861	SUBROUTINE tr_xy_ffty( f_in, f_out )
[1]	862
[1320]	863	USE control_parameters, &
[2300]	864	ONLY: loop_optimization
[1320]	865
	866	USE cpulog, &
	867	ONLY: cpu_log, log_point_s
	868
	869	USE kinds
	870
[1]	871	USE pegrid
	872
	873	IMPLICIT NONE
	874
[1682]	875	INTEGER(iwp) :: i !<
	876	INTEGER(iwp) :: iend !<
	877	INTEGER(iwp) :: iouter !<
	878	INTEGER(iwp) :: ir !<
	879	INTEGER(iwp) :: j !<
	880	INTEGER(iwp) :: k !<
[1]	881
[1682]	882	INTEGER(iwp), PARAMETER :: stridex = 4 !<
[1320]	883
[1682]	884	REAL(wp), DIMENSION(nnx,1:nz,nys_x:nyn_x,pdims(1)) :: f_in !<
	885	REAL(wp), DIMENSION(1:nz,0:ny,nxl:nxr) :: f_out !<
	886	REAL(wp), DIMENSION(nxl:nxr,1:nz,0:ny) :: work !<
[1]	887
[2300]	888	REAL(wp), DIMENSION(:,:), ALLOCATABLE :: work_ffty !<
	889	REAL(wp), DIMENSION(:,:,:), ALLOCATABLE :: work_ffty_vec !<
	890
[1]	891	!
	892	!-- Transpose array
[1111]	893	#if defined( __parallel )
[1]	894	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
[622]	895	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
[1]	896	CALL MPI_ALLTOALL( f_in(1,1,nys_x,1), sendrecvcount_xy, MPI_REAL, &
	897	work(nxl,1,0), sendrecvcount_xy, MPI_REAL, &
	898	comm1dx, ierr )
	899	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
[1111]	900	#endif
[1]	901
	902	!
	903	!-- Resort the data in a way that y becomes the first index and carry out the
	904	!-- backward fft along y.
[1106]	905	CALL cpu_log( log_point_s(7), 'fft_y_1d', 'continue' )
[1]	906
[2300]	907	IF ( loop_optimization == 'vector' ) THEN
	908
	909	ALLOCATE( work_ffty_vec(0:ny+1,1:nz,nxl:nxr) )
[1]	910	!
	911	!-- Code optimized for vector processors
[2300]	912	!$OMP PARALLEL PRIVATE ( i, j, k )
	913	!$OMP DO
[1]	914	DO k = 1, nz
	915	DO j = 0, ny
	916	DO i = nxl, nxr
	917	work_ffty_vec(j,k,i) = work(i,k,j)
	918	ENDDO
	919	ENDDO
	920	ENDDO
	921
[2300]	922	!$OMP DO
[1]	923	DO i = nxl, nxr
	924
	925	CALL fft_y_m( work_ffty_vec(:,:,i), ny+1, 'backward' )
	926
	927	DO j = 0, ny
	928	DO k = 1, nz
	929	f_out(k,j,i) = work_ffty_vec(j,k,i)
	930	ENDDO
	931	ENDDO
	932
	933	ENDDO
[2300]	934	!$OMP END PARALLEL
[1]	935
[2300]	936	DEALLOCATE( work_ffty_vec )
	937
[1]	938	ELSE
	939	!
	940	!-- Cache optimized code.
[2300]	941	ALLOCATE( work_ffty(0:ny,stridex) )
	942	!
[1]	943	!-- The i-(x-)direction is split into a strided outer loop and an inner
	944	!-- loop for better cache performance
[2300]	945	!$OMP PARALLEL PRIVATE ( i, iend, iouter, ir, j, k, work_ffty )
	946	!$OMP DO
[1]	947	DO iouter = nxl, nxr, stridex
	948
	949	iend = MIN( iouter+stridex-1, nxr ) ! Upper bound for inner i loop
	950
	951	DO k = 1, nz
	952	!
	953	!-- Resort
	954	DO j = 0, ny
	955	DO i = iouter, iend
	956	work_ffty(j,i-iouter+1) = work(i,k,j)
	957	ENDDO
	958	ENDDO
	959
	960	DO i = iouter, iend
	961
	962	!
	963	!-- FFT along y
	964	ir = i-iouter+1 ! counter within a stride
[1106]	965	CALL fft_y_1d( work_ffty(:,ir), 'backward' )
[1]	966
	967	DO j = 0, ny
	968	f_out(k,j,i) = work_ffty(j,ir)
	969	ENDDO
	970	ENDDO
	971
	972	ENDDO
	973
	974	ENDDO
[2300]	975	!$OMP END PARALLEL
[1]	976
[2300]	977	DEALLOCATE( work_ffty )
	978
[1]	979	ENDIF
	980
[1106]	981	CALL cpu_log( log_point_s(7), 'fft_y_1d', 'stop' )
[1]	982
	983	END SUBROUTINE tr_xy_ffty
	984
	985
	986	!------------------------------------------------------------------------------!
[1682]	987	! Description:
	988	! ------------
	989	!> FFT along x, solution of the tridiagonal system and backward FFT for
	990	!> a 1d-decomposition along x
	991	!>
	992	!> @warning this subroutine may still not work for hybrid parallelization
	993	!> with OpenMP (for possible necessary changes see the original
	994	!> routine poisfft_hybrid, developed by Klaus Ketelsen, May 2002)
[1]	995	!------------------------------------------------------------------------------!
[1682]	996	SUBROUTINE fftx_tri_fftx( ar )
[1]	997
[1320]	998	USE control_parameters, &
[2300]	999	ONLY: loop_optimization
[1320]	1000
	1001	USE cpulog, &
	1002	ONLY: cpu_log, log_point_s
	1003
	1004	USE grid_variables, &
	1005	ONLY: ddx2, ddy2
	1006
	1007	USE kinds
	1008
[1]	1009	USE pegrid
	1010
	1011	IMPLICIT NONE
	1012
[1682]	1013	INTEGER(iwp) :: i !<
	1014	INTEGER(iwp) :: j !<
	1015	INTEGER(iwp) :: k !<
	1016	INTEGER(iwp) :: m !<
	1017	INTEGER(iwp) :: n !<
[3241]	1018	!$ INTEGER(iwp) :: omp_get_thread_num !<
[1682]	1019	INTEGER(iwp) :: tn !<
[1]	1020
[1682]	1021	REAL(wp), DIMENSION(0:nx) :: work_fftx !<
	1022	REAL(wp), DIMENSION(0:nx,1:nz) :: work_trix !<
	1023	REAL(wp), DIMENSION(nnx,1:nz,nys_x:nyn_x,pdims(1)) :: ar !<
	1024	REAL(wp), DIMENSION(:,:,:,:), ALLOCATABLE :: tri !<
[1]	1025
	1026
[1106]	1027	CALL cpu_log( log_point_s(33), 'fft_x_1d + tridia', 'start' )
[1]	1028
	1029	ALLOCATE( tri(5,0:nx,0:nz-1,0:threads_per_task-1) )
	1030
	1031	tn = 0 ! Default thread number in case of one thread
	1032	!$OMP PARALLEL DO PRIVATE ( i, j, k, m, n, tn, work_fftx, work_trix )
	1033	DO j = nys_x, nyn_x
	1034
	1035	!$ tn = omp_get_thread_num()
	1036
[2300]	1037	IF ( loop_optimization == 'vector' ) THEN
[1]	1038	!
	1039	!-- Code optimized for vector processors
	1040	DO k = 1, nz
	1041
	1042	m = 0
	1043	DO n = 1, pdims(1)
[1003]	1044	DO i = 1, nnx
[1]	1045	work_trix(m,k) = ar(i,k,j,n)
	1046	m = m + 1
	1047	ENDDO
	1048	ENDDO
	1049
	1050	ENDDO
	1051
	1052	CALL fft_x_m( work_trix, 'forward' )
	1053
	1054	ELSE
	1055	!
	1056	!-- Cache optimized code
	1057	DO k = 1, nz
	1058
	1059	m = 0
	1060	DO n = 1, pdims(1)
[1003]	1061	DO i = 1, nnx
[1]	1062	work_fftx(m) = ar(i,k,j,n)
	1063	m = m + 1
	1064	ENDDO
	1065	ENDDO
	1066
[1106]	1067	CALL fft_x_1d( work_fftx, 'forward' )
[1]	1068
	1069	DO i = 0, nx
	1070	work_trix(i,k) = work_fftx(i)
	1071	ENDDO
	1072
	1073	ENDDO
	1074
	1075	ENDIF
	1076
	1077	!
	1078	!-- Solve the linear equation system
	1079	CALL tridia_1dd( ddx2, ddy2, nx, ny, j, work_trix, tri(:,:,:,tn) )
	1080
[2300]	1081	IF ( loop_optimization == 'vector' ) THEN
[1]	1082	!
	1083	!-- Code optimized for vector processors
	1084	CALL fft_x_m( work_trix, 'backward' )
	1085
	1086	DO k = 1, nz
	1087
	1088	m = 0
	1089	DO n = 1, pdims(1)
[1003]	1090	DO i = 1, nnx
[1]	1091	ar(i,k,j,n) = work_trix(m,k)
	1092	m = m + 1
	1093	ENDDO
	1094	ENDDO
	1095
	1096	ENDDO
	1097
	1098	ELSE
	1099	!
	1100	!-- Cache optimized code
	1101	DO k = 1, nz
	1102
	1103	DO i = 0, nx
	1104	work_fftx(i) = work_trix(i,k)
	1105	ENDDO
	1106
[1106]	1107	CALL fft_x_1d( work_fftx, 'backward' )
[1]	1108
	1109	m = 0
	1110	DO n = 1, pdims(1)
[1003]	1111	DO i = 1, nnx
[1]	1112	ar(i,k,j,n) = work_fftx(m)
	1113	m = m + 1
	1114	ENDDO
	1115	ENDDO
	1116
	1117	ENDDO
	1118
	1119	ENDIF
	1120
	1121	ENDDO
	1122
	1123	DEALLOCATE( tri )
	1124
[1106]	1125	CALL cpu_log( log_point_s(33), 'fft_x_1d + tridia', 'stop' )
[1]	1126
	1127	END SUBROUTINE fftx_tri_fftx
	1128
	1129
	1130	!------------------------------------------------------------------------------!
[1682]	1131	! Description:
	1132	! ------------
	1133	!> Fourier-transformation along x with subsequent transposition x --> y for
	1134	!> a 1d-decomposition along y.
	1135	!>
	1136	!> @attention NEC-branch of this routine may significantly profit from
	1137	!> further optimizations. So far, performance is much worse than
	1138	!> for routine ffty_tr_yx (more than three times slower).
[1]	1139	!------------------------------------------------------------------------------!
[1682]	1140	SUBROUTINE fftx_tr_xy( f_in, f_out )
[1]	1141
[1682]	1142
[1320]	1143	USE control_parameters, &
[2300]	1144	ONLY: loop_optimization
[1320]	1145
	1146	USE cpulog, &
	1147	ONLY: cpu_log, log_point_s
	1148
	1149	USE kinds
	1150
[1]	1151	USE pegrid
	1152
	1153	IMPLICIT NONE
	1154
[1682]	1155	INTEGER(iwp) :: i !<
	1156	INTEGER(iwp) :: j !<
	1157	INTEGER(iwp) :: k !<
[1]	1158
[1682]	1159	REAL(wp), DIMENSION(0:nx,1:nz,nys:nyn) :: work_fftx !<
	1160	REAL(wp), DIMENSION(1:nz,nys:nyn,0:nx) :: f_in !<
	1161	REAL(wp), DIMENSION(nny,1:nz,nxl_y:nxr_y,pdims(2)) :: f_out !<
	1162	REAL(wp), DIMENSION(nys:nyn,1:nz,0:nx) :: work !<
[1]	1163
	1164	!
	1165	!-- Carry out the FFT along x, where all data are present due to the
	1166	!-- 1d-decomposition along y. Resort the data in a way that y becomes
	1167	!-- the first index.
[1106]	1168	CALL cpu_log( log_point_s(4), 'fft_x_1d', 'start' )
[1]	1169
[2300]	1170	IF ( loop_optimization == 'vector' ) THEN
[1]	1171	!
	1172	!-- Code for vector processors
[85]	1173	!$OMP PARALLEL PRIVATE ( i, j, k )
[1]	1174	!$OMP DO
	1175	DO i = 0, nx
	1176
	1177	DO j = nys, nyn
	1178	DO k = 1, nz
	1179	work_fftx(i,k,j) = f_in(k,j,i)
	1180	ENDDO
	1181	ENDDO
	1182
	1183	ENDDO
	1184
	1185	!$OMP DO
	1186	DO j = nys, nyn
	1187
	1188	CALL fft_x_m( work_fftx(:,:,j), 'forward' )
	1189
	1190	DO k = 1, nz
	1191	DO i = 0, nx
	1192	work(j,k,i) = work_fftx(i,k,j)
	1193	ENDDO
	1194	ENDDO
	1195
	1196	ENDDO
	1197	!$OMP END PARALLEL
	1198
	1199	ELSE
	1200
	1201	!
	1202	!-- Cache optimized code (there might be still a potential for better
	1203	!-- optimization).
[696]	1204	!$OMP PARALLEL PRIVATE (i,j,k)
[1]	1205	!$OMP DO
	1206	DO i = 0, nx
	1207
	1208	DO j = nys, nyn
	1209	DO k = 1, nz
	1210	work_fftx(i,k,j) = f_in(k,j,i)
	1211	ENDDO
	1212	ENDDO
	1213
	1214	ENDDO
	1215
	1216	!$OMP DO
	1217	DO j = nys, nyn
	1218	DO k = 1, nz
	1219
[1106]	1220	CALL fft_x_1d( work_fftx(0:nx,k,j), 'forward' )
[1]	1221
	1222	DO i = 0, nx
	1223	work(j,k,i) = work_fftx(i,k,j)
	1224	ENDDO
	1225	ENDDO
	1226
	1227	ENDDO
	1228	!$OMP END PARALLEL
	1229
	1230	ENDIF
[1106]	1231	CALL cpu_log( log_point_s(4), 'fft_x_1d', 'pause' )
[1]	1232
	1233	!
	1234	!-- Transpose array
[1111]	1235	#if defined( __parallel )
[1]	1236	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
[622]	1237	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
[1]	1238	CALL MPI_ALLTOALL( work(nys,1,0), sendrecvcount_xy, MPI_REAL, &
	1239	f_out(1,1,nxl_y,1), sendrecvcount_xy, MPI_REAL, &
	1240	comm1dy, ierr )
	1241	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
[1111]	1242	#endif
[1]	1243
	1244	END SUBROUTINE fftx_tr_xy
	1245
	1246
	1247	!------------------------------------------------------------------------------!
[1682]	1248	! Description:
	1249	! ------------
	1250	!> Transposition y --> x with a subsequent backward Fourier transformation for
	1251	!> a 1d-decomposition along x.
[1]	1252	!------------------------------------------------------------------------------!
[1682]	1253	SUBROUTINE tr_yx_fftx( f_in, f_out )
[1]	1254
[1682]	1255
[1320]	1256	USE control_parameters, &
[2300]	1257	ONLY: loop_optimization
[1320]	1258
	1259	USE cpulog, &
	1260	ONLY: cpu_log, log_point_s
	1261
	1262	USE kinds
	1263
[1]	1264	USE pegrid
	1265
	1266	IMPLICIT NONE
	1267
[1682]	1268	INTEGER(iwp) :: i !<
	1269	INTEGER(iwp) :: j !<
	1270	INTEGER(iwp) :: k !<
[1]	1271
[1682]	1272	REAL(wp), DIMENSION(0:nx,1:nz,nys:nyn) :: work_fftx !<
	1273	REAL(wp), DIMENSION(nny,1:nz,nxl_y:nxr_y,pdims(2)) :: f_in !<
	1274	REAL(wp), DIMENSION(1:nz,nys:nyn,0:nx) :: f_out !<
	1275	REAL(wp), DIMENSION(nys:nyn,1:nz,0:nx) :: work !<
[1]	1276
	1277	!
	1278	!-- Transpose array
[1111]	1279	#if defined( __parallel )
[1]	1280	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
[622]	1281	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
[1]	1282	CALL MPI_ALLTOALL( f_in(1,1,nxl_y,1), sendrecvcount_xy, MPI_REAL, &
	1283	work(nys,1,0), sendrecvcount_xy, MPI_REAL, &
	1284	comm1dy, ierr )
	1285	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
[1111]	1286	#endif
[1]	1287
	1288	!
	1289	!-- Carry out the FFT along x, where all data are present due to the
	1290	!-- 1d-decomposition along y. Resort the data in a way that y becomes
	1291	!-- the first index.
[1106]	1292	CALL cpu_log( log_point_s(4), 'fft_x_1d', 'continue' )
[1]	1293
[2300]	1294	IF ( loop_optimization == 'vector' ) THEN
[1]	1295	!
	1296	!-- Code optimized for vector processors
[85]	1297	!$OMP PARALLEL PRIVATE ( i, j, k )
[1]	1298	!$OMP DO
	1299	DO j = nys, nyn
	1300
	1301	DO k = 1, nz
	1302	DO i = 0, nx
	1303	work_fftx(i,k,j) = work(j,k,i)
	1304	ENDDO
	1305	ENDDO
	1306
	1307	CALL fft_x_m( work_fftx(:,:,j), 'backward' )
	1308
	1309	ENDDO
	1310
	1311	!$OMP DO
	1312	DO i = 0, nx
	1313	DO j = nys, nyn
	1314	DO k = 1, nz
	1315	f_out(k,j,i) = work_fftx(i,k,j)
	1316	ENDDO
	1317	ENDDO
	1318	ENDDO
	1319	!$OMP END PARALLEL
	1320
	1321	ELSE
	1322
	1323	!
	1324	!-- Cache optimized code (there might be still a potential for better
	1325	!-- optimization).
[696]	1326	!$OMP PARALLEL PRIVATE (i,j,k)
[1]	1327	!$OMP DO
	1328	DO j = nys, nyn
	1329	DO k = 1, nz
	1330
	1331	DO i = 0, nx
	1332	work_fftx(i,k,j) = work(j,k,i)
	1333	ENDDO
	1334
[1106]	1335	CALL fft_x_1d( work_fftx(0:nx,k,j), 'backward' )
[1]	1336
	1337	ENDDO
	1338	ENDDO
	1339
	1340	!$OMP DO
	1341	DO i = 0, nx
	1342	DO j = nys, nyn
	1343	DO k = 1, nz
	1344	f_out(k,j,i) = work_fftx(i,k,j)
	1345	ENDDO
	1346	ENDDO
	1347	ENDDO
	1348	!$OMP END PARALLEL
	1349
	1350	ENDIF
[1106]	1351	CALL cpu_log( log_point_s(4), 'fft_x_1d', 'stop' )
[1]	1352
	1353	END SUBROUTINE tr_yx_fftx
	1354
	1355
	1356	!------------------------------------------------------------------------------!
[1682]	1357	! Description:
	1358	! ------------
	1359	!> FFT along y, solution of the tridiagonal system and backward FFT for
	1360	!> a 1d-decomposition along y.
	1361	!>
	1362	!> @warning this subroutine may still not work for hybrid parallelization
	1363	!> with OpenMP (for possible necessary changes see the original
	1364	!> routine poisfft_hybrid, developed by Klaus Ketelsen, May 2002)
[1]	1365	!------------------------------------------------------------------------------!
[1682]	1366	SUBROUTINE ffty_tri_ffty( ar )
[1]	1367
[1682]	1368
[1320]	1369	USE control_parameters, &
[2300]	1370	ONLY: loop_optimization
[1320]	1371
	1372	USE cpulog, &
	1373	ONLY: cpu_log, log_point_s
	1374
	1375	USE grid_variables, &
	1376	ONLY: ddx2, ddy2
	1377
	1378	USE kinds
	1379
[1]	1380	USE pegrid
	1381
	1382	IMPLICIT NONE
	1383
[1682]	1384	INTEGER(iwp) :: i !<
	1385	INTEGER(iwp) :: j !<
	1386	INTEGER(iwp) :: k !<
	1387	INTEGER(iwp) :: m !<
	1388	INTEGER(iwp) :: n !<
[3241]	1389	!$ INTEGER(iwp) :: omp_get_thread_num !<
[1682]	1390	INTEGER(iwp) :: tn !<
[1]	1391
[1682]	1392	REAL(wp), DIMENSION(0:ny) :: work_ffty !<
	1393	REAL(wp), DIMENSION(0:ny,1:nz) :: work_triy !<
	1394	REAL(wp), DIMENSION(nny,1:nz,nxl_y:nxr_y,pdims(2)) :: ar !<
	1395	REAL(wp), DIMENSION(:,:,:,:), ALLOCATABLE :: tri !<
[1]	1396
	1397
[1106]	1398	CALL cpu_log( log_point_s(39), 'fft_y_1d + tridia', 'start' )
[1]	1399
	1400	ALLOCATE( tri(5,0:ny,0:nz-1,0:threads_per_task-1) )
	1401
	1402	tn = 0 ! Default thread number in case of one thread
[696]	1403	!$OMP PARALLEL DO PRIVATE ( i, j, k, m, n, tn, work_ffty, work_triy )
[1]	1404	DO i = nxl_y, nxr_y
	1405
	1406	!$ tn = omp_get_thread_num()
	1407
[2300]	1408	IF ( loop_optimization == 'vector' ) THEN
[1]	1409	!
	1410	!-- Code optimized for vector processors
	1411	DO k = 1, nz
	1412
	1413	m = 0
	1414	DO n = 1, pdims(2)
[1003]	1415	DO j = 1, nny
[1]	1416	work_triy(m,k) = ar(j,k,i,n)
	1417	m = m + 1
	1418	ENDDO
	1419	ENDDO
	1420
	1421	ENDDO
	1422
	1423	CALL fft_y_m( work_triy, ny, 'forward' )
	1424
	1425	ELSE
	1426	!
	1427	!-- Cache optimized code
	1428	DO k = 1, nz
	1429
	1430	m = 0
	1431	DO n = 1, pdims(2)
[1003]	1432	DO j = 1, nny
[1]	1433	work_ffty(m) = ar(j,k,i,n)
	1434	m = m + 1
	1435	ENDDO
	1436	ENDDO
	1437
[1106]	1438	CALL fft_y_1d( work_ffty, 'forward' )
[1]	1439
	1440	DO j = 0, ny
	1441	work_triy(j,k) = work_ffty(j)
	1442	ENDDO
	1443
	1444	ENDDO
	1445
	1446	ENDIF
	1447
	1448	!
	1449	!-- Solve the linear equation system
	1450	CALL tridia_1dd( ddy2, ddx2, ny, nx, i, work_triy, tri(:,:,:,tn) )
	1451
[2300]	1452	IF ( loop_optimization == 'vector' ) THEN
[1]	1453	!
	1454	!-- Code optimized for vector processors
	1455	CALL fft_y_m( work_triy, ny, 'backward' )
	1456
	1457	DO k = 1, nz
	1458
	1459	m = 0
	1460	DO n = 1, pdims(2)
[1003]	1461	DO j = 1, nny
[1]	1462	ar(j,k,i,n) = work_triy(m,k)
	1463	m = m + 1
	1464	ENDDO
	1465	ENDDO
	1466
	1467	ENDDO
	1468
	1469	ELSE
	1470	!
	1471	!-- Cache optimized code
	1472	DO k = 1, nz
	1473
	1474	DO j = 0, ny
	1475	work_ffty(j) = work_triy(j,k)
	1476	ENDDO
	1477
[1106]	1478	CALL fft_y_1d( work_ffty, 'backward' )
[1]	1479
	1480	m = 0
	1481	DO n = 1, pdims(2)
[1003]	1482	DO j = 1, nny
[1]	1483	ar(j,k,i,n) = work_ffty(m)
	1484	m = m + 1
	1485	ENDDO
	1486	ENDDO
	1487
	1488	ENDDO
	1489
	1490	ENDIF
	1491
	1492	ENDDO
	1493
	1494	DEALLOCATE( tri )
	1495
[1106]	1496	CALL cpu_log( log_point_s(39), 'fft_y_1d + tridia', 'stop' )
[1]	1497
	1498	END SUBROUTINE ffty_tri_ffty
	1499
	1500	END MODULE poisfft_mod

Note: See TracBrowser for help on using the repository browser.

Download in other formats:

| Impressum | ©Leibniz Universität Hannover |