Home

Context Navigation

source: palm/trunk/SOURCE/poisfft.f90 @ 1214

Last change on this file since 1214 was 1213, checked in by raasch, 12 years ago
last commit documented
Property svn:keywords set to `Id`
File size: 31.9 KB

Rev	Line
[1]	1	MODULE poisfft_mod
	2
[1036]	3	!--------------------------------------------------------------------------------!
	4	! This file is part of PALM.
	5	!
	6	! PALM is free software: you can redistribute it and/or modify it under the terms
	7	! of the GNU General Public License as published by the Free Software Foundation,
	8	! either version 3 of the License, or (at your option) any later version.
	9	!
	10	! PALM is distributed in the hope that it will be useful, but WITHOUT ANY
	11	! WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
	12	! A PARTICULAR PURPOSE. See the GNU General Public License for more details.
	13	!
	14	! You should have received a copy of the GNU General Public License along with
	15	! PALM. If not, see <http://www.gnu.org/licenses/>.
	16	!
	17	! Copyright 1997-2012 Leibniz University Hannover
	18	!--------------------------------------------------------------------------------!
	19	!
[484]	20	! Current revisions:
[1]	21	! -----------------
[1112]	22	!
[1213]	23	!
[1112]	24	! Former revisions:
	25	! -----------------
	26	! $Id: poisfft.f90 1213 2013-08-15 09:03:50Z kanani $
	27	!
[1213]	28	! 1212 2013-08-15 08:46:27Z raasch
	29	! tridia routines moved to seperate module tridia_solver
	30	!
[1209]	31	! 1208 2013-08-13 06:41:49Z raasch
	32	! acc-update clauses added for "ar" so that ffts other than cufft can also be
	33	! used (although they are not ported and will give a poor performance)
	34	!
[1112]	35	! 1111 2013-03-08 23:54:10Z raasch
[1111]	36	! further openACC porting of non-parallel (MPI) branch:
	37	! tridiagonal routines split into extermal subroutines (instead using CONTAINS),
	38	! no distinction between parallel/non-parallel in poisfft and tridia any more,
[1112]	39	! tridia routines moved to end of file because of probable bug in PGI compiler 12.5
[1111]	40	! (otherwise "invalid device function" is indicated during runtime),
	41	! optimization of tridia routines: constant elements and coefficients of tri are
	42	! stored in seperate arrays ddzuw and tric, last dimension of tri reduced from 5
	43	! to 2,
	44	! poisfft_init is now called internally from poisfft, maketri is called from
	45	! poisfft_init,
	46	! ibc_p_b = 2 removed
[1]	47	!
[1107]	48	! 1106 2013-03-04 05:31:38Z raasch
	49	! routines fftx, ffty, fftxp, fftyp removed, calls replaced by fft_x, fft_y,
	50	! in the 1D-decomposition routines fft_x, ffty are replaced by fft_x_1d,
	51	! fft_y_1d
	52	!
[1104]	53	! 1103 2013-02-20 02:15:53Z raasch
	54	! tri, ar, and ar1 arguments in tridia-routines (2d) are removed because they
	55	! sometimes cause segmentation faults with intel 12.1 compiler
	56	!
[1093]	57	! 1092 2013-02-02 11:24:22Z raasch
	58	! unused variables removed
	59	!
[1037]	60	! 1036 2012-10-22 13:43:42Z raasch
	61	! code put under GPL (PALM 3.9)
	62	!
[1014]	63	! 2012-09-21 07:03:55Z raasch
	64	! FLOAT type conversion replaced by REAL
	65	!
[1004]	66	! 1003 2012-09-14 14:35:53Z raasch
	67	! indices nxa, nya, etc. replaced by nx, ny, etc.
	68	!
[941]	69	! 940 2012-07-09 14:31:00Z raasch
	70	! special handling of tri-array as an argument in tridia_1dd routines switched
	71	! off because it caused segmentation faults with intel 12.1 compiler
	72	!
[878]	73	! 877 2012-04-03 11:21:44Z suehring
	74	! Bugfix: Avoid divisions by zero in case of using a 'neumann' bc for the
	75	! pressure at the top of the model domain.
	76	!
[810]	77	! 809 2012-01-30 13:32:58Z maronga
	78	! Bugfix: replaced .AND. and .NOT. with && and ! in the preprocessor directives
	79	!
[808]	80	! 807 2012-01-25 11:53:51Z maronga
	81	! New cpp directive "__check" implemented which is used by check_namelist_files
	82	! (most of the code is unneeded by check_namelist_files).
	83	!
[764]	84	! 763 2011-10-06 09:32:09Z suehring
	85	! Comment added concerning the last change.
	86	!
[762]	87	! 761 2011-10-05 17:58:52Z suehring
	88	! Bugfix: Avoid divisions by zero in case of using a 'neumann' bc for the
	89	! pressure at the top of the model domain.
	90	!
[697]	91	! 696 2011-03-18 07:03:49Z raasch
	92	! work_fftx removed from PRIVATE clauses in fftx_tr_xy and tr_yx_fftx
	93	!
[684]	94	! 683 2011-02-09 14:25:15Z raasch
	95	! openMP parallelization for 2d-domain-decomposition
	96	!
[668]	97	! 667 2010-12-23 12:06:00Z suehring/gryschka
	98	! ddzu replaced by ddzu_pres due to changes in zu(0)
	99	!
[623]	100	! 622 2010-12-10 08:08:13Z raasch
	101	! optional barriers included in order to speed up collective operations
	102	!
[392]	103	! 377 2009-09-04 11:09:00Z raasch
	104	! __lcmuk changed to __lc to avoid problems with Intel compiler on sgi-ice
	105	!
[198]	106	! 164 2008-05-15 08:46:15Z raasch
	107	! Arguments removed from transpose routines
	108	!
[139]	109	! 128 2007-10-26 13:11:14Z raasch
	110	! Bugfix: wavenumber calculation for even nx in routines maketri
	111	!
[90]	112	! 85 2007-05-11 09:35:14Z raasch
	113	! Bugfix: work_fft*_vec removed from some PRIVATE-declarations
	114	!
[77]	115	! 76 2007-03-29 00:58:32Z raasch
	116	! Tridiagonal coefficients adjusted for Neumann boundary conditions both at
	117	! the bottom and the top.
	118	!
[3]	119	! RCS Log replace by Id keyword, revision history cleaned up
	120	!
[1]	121	! Revision 1.24 2006/08/04 15:00:24 raasch
	122	! Default setting of the thread number tn in case of not using OpenMP
	123	!
	124	! Revision 1.23 2006/02/23 12:48:38 raasch
	125	! Additional compiler directive in routine tridia_1dd for preventing loop
	126	! exchange on NEC-SX6
	127	!
	128	! Revision 1.20 2004/04/30 12:38:09 raasch
	129	! Parts of former poisfft_hybrid moved to this subroutine,
	130	! former subroutine changed to a module, renaming of FFT-subroutines and
	131	! -module, FFTs completely substituted by calls of fft_x and fft_y,
	132	! NAG fft used in the non-parallel case completely removed, l in maketri
	133	! is now a 1d-array, variables passed by modules instead of using parameter
	134	! lists, enlarged transposition arrays introduced
	135	!
	136	! Revision 1.1 1997/07/24 11:24:14 raasch
	137	! Initial revision
	138	!
	139	!
	140	! Description:
	141	! ------------
	142	! See below.
	143	!------------------------------------------------------------------------------!
	144
	145	!--------------------------------------------------------------------------!
	146	! poisfft !
	147	! !
	148	! Original version: Stephan Siano (pois3d) !
	149	! !
	150	! Institute of Meteorology and Climatology, University of Hannover !
	151	! Germany !
	152	! !
	153	! Version as of July 23,1996 !
	154	! !
	155	! !
	156	! Version for parallel computers: Siegfried Raasch !
	157	! !
	158	! Version as of July 03,1997 !
	159	! !
	160	! Solves the Poisson equation with a 2D spectral method !
	161	! d^2 p / dx^2 + d^2 p / dy^2 + d^2 p / dz^2 = s !
	162	! !
	163	! Input: !
	164	! real ar contains in the (nnx,nny,nnz) elements, !
	165	! starting from the element (1,nys,nxl), the !
	166	! values for s !
	167	! real work Temporary array !
	168	! !
	169	! Output: !
	170	! real ar contains the solution for p !
	171	!--------------------------------------------------------------------------!
	172
	173	USE fft_xy
	174	USE indices
	175	USE transpose_indices
[1212]	176	USE tridia_solver
[1]	177
	178	IMPLICIT NONE
	179
[1111]	180	LOGICAL, SAVE :: poisfft_initialized = .FALSE.
	181
[1]	182	PRIVATE
[807]	183
[809]	184	#if ! defined ( __check )
[1]	185	PUBLIC poisfft, poisfft_init
	186
	187	INTERFACE poisfft
	188	MODULE PROCEDURE poisfft
	189	END INTERFACE poisfft
	190
	191	INTERFACE poisfft_init
	192	MODULE PROCEDURE poisfft_init
	193	END INTERFACE poisfft_init
[807]	194	#else
	195	PUBLIC poisfft_init
[1]	196
[807]	197	INTERFACE poisfft_init
	198	MODULE PROCEDURE poisfft_init
	199	END INTERFACE poisfft_init
	200	#endif
	201
[1]	202	CONTAINS
	203
	204	SUBROUTINE poisfft_init
	205
[1111]	206	USE arrays_3d, ONLY: ddzu_pres, ddzw
	207
	208	IMPLICIT NONE
	209
	210	INTEGER :: k
	211
	212
[1]	213	CALL fft_init
	214
[1212]	215	CALL tridia_init
[1111]	216
	217	poisfft_initialized = .TRUE.
	218
[1]	219	END SUBROUTINE poisfft_init
	220
[1111]	221
[809]	222	#if ! defined ( __check )
[1]	223	SUBROUTINE poisfft( ar, work )
	224
[1208]	225	USE control_parameters, ONLY : fft_method
[1]	226	USE cpulog
	227	USE interfaces
	228	USE pegrid
	229
	230	IMPLICIT NONE
	231
[1003]	232	REAL, DIMENSION(1:nz,nys:nyn,nxl:nxr) :: ar, work
[1]	233
	234
	235	CALL cpu_log( log_point_s(3), 'poisfft', 'start' )
	236
[1111]	237	IF ( .NOT. poisfft_initialized ) CALL poisfft_init
	238
[1]	239	!
	240	!-- Two-dimensional Fourier Transformation in x- and y-direction.
[1111]	241	IF ( pdims(2) == 1 .AND. pdims(1) > 1 ) THEN
[1]	242
	243	!
	244	!-- 1d-domain-decomposition along x:
	245	!-- FFT along y and transposition y --> x
	246	CALL ffty_tr_yx( ar, work, ar )
	247
	248	!
	249	!-- FFT along x, solving the tridiagonal system and backward FFT
	250	CALL fftx_tri_fftx( ar )
	251
	252	!
	253	!-- Transposition x --> y and backward FFT along y
	254	CALL tr_xy_ffty( ar, work, ar )
	255
[1111]	256	ELSEIF ( pdims(1) == 1 .AND. pdims(2) > 1 ) THEN
[1]	257
	258	!
	259	!-- 1d-domain-decomposition along y:
	260	!-- FFT along x and transposition x --> y
	261	CALL fftx_tr_xy( ar, work, ar )
	262
	263	!
	264	!-- FFT along y, solving the tridiagonal system and backward FFT
	265	CALL ffty_tri_ffty( ar )
	266
	267	!
	268	!-- Transposition y --> x and backward FFT along x
	269	CALL tr_yx_fftx( ar, work, ar )
	270
	271	ELSE
	272
	273	!
[1111]	274	!-- 2d-domain-decomposition or no decomposition (1 PE run)
[1]	275	!-- Transposition z --> x
	276	CALL cpu_log( log_point_s(5), 'transpo forward', 'start' )
[164]	277	CALL transpose_zx( ar, work, ar )
[1]	278	CALL cpu_log( log_point_s(5), 'transpo forward', 'pause' )
	279
	280	CALL cpu_log( log_point_s(4), 'fft_x', 'start' )
[1208]	281	IF ( fft_method /= 'system-specific' ) THEN
	282	!$acc update host( ar )
	283	ENDIF
[1106]	284	CALL fft_x( ar, 'forward' )
[1208]	285	IF ( fft_method /= 'system-specific' ) THEN
	286	!$acc update device( ar )
	287	ENDIF
[1]	288	CALL cpu_log( log_point_s(4), 'fft_x', 'pause' )
	289
	290	!
	291	!-- Transposition x --> y
	292	CALL cpu_log( log_point_s(5), 'transpo forward', 'continue' )
[164]	293	CALL transpose_xy( ar, work, ar )
[1]	294	CALL cpu_log( log_point_s(5), 'transpo forward', 'pause' )
	295
	296	CALL cpu_log( log_point_s(7), 'fft_y', 'start' )
[1208]	297	IF ( fft_method /= 'system-specific' ) THEN
	298	!$acc update host( ar )
	299	ENDIF
[1106]	300	CALL fft_y( ar, 'forward' )
[1208]	301	IF ( fft_method /= 'system-specific' ) THEN
	302	!$acc update device( ar )
	303	ENDIF
[1]	304	CALL cpu_log( log_point_s(7), 'fft_y', 'pause' )
	305
	306	!
	307	!-- Transposition y --> z
	308	CALL cpu_log( log_point_s(5), 'transpo forward', 'continue' )
[164]	309	CALL transpose_yz( ar, work, ar )
[1]	310	CALL cpu_log( log_point_s(5), 'transpo forward', 'stop' )
	311
	312	!
[1106]	313	!-- Solve the tridiagonal equation system along z
[1]	314	CALL cpu_log( log_point_s(6), 'tridia', 'start' )
[1212]	315	CALL tridia_substi( ar )
[1]	316	CALL cpu_log( log_point_s(6), 'tridia', 'stop' )
	317
	318	!
	319	!-- Inverse Fourier Transformation
	320	!-- Transposition z --> y
	321	CALL cpu_log( log_point_s(8), 'transpo invers', 'start' )
[164]	322	CALL transpose_zy( ar, work, ar )
[1]	323	CALL cpu_log( log_point_s(8), 'transpo invers', 'pause' )
	324
	325	CALL cpu_log( log_point_s(7), 'fft_y', 'continue' )
[1208]	326	IF ( fft_method /= 'system-specific' ) THEN
	327	!$acc update host( ar )
	328	ENDIF
[1106]	329	CALL fft_y( ar, 'backward' )
[1208]	330	IF ( fft_method /= 'system-specific' ) THEN
	331	!$acc update device( ar )
	332	ENDIF
[1]	333	CALL cpu_log( log_point_s(7), 'fft_y', 'stop' )
	334
	335	!
	336	!-- Transposition y --> x
	337	CALL cpu_log( log_point_s(8), 'transpo invers', 'continue' )
[164]	338	CALL transpose_yx( ar, work, ar )
[1]	339	CALL cpu_log( log_point_s(8), 'transpo invers', 'pause' )
	340
	341	CALL cpu_log( log_point_s(4), 'fft_x', 'continue' )
[1208]	342	IF ( fft_method /= 'system-specific' ) THEN
	343	!$acc update host( ar )
	344	ENDIF
[1106]	345	CALL fft_x( ar, 'backward' )
[1208]	346	IF ( fft_method /= 'system-specific' ) THEN
	347	!$acc update device( ar )
	348	ENDIF
[1]	349	CALL cpu_log( log_point_s(4), 'fft_x', 'stop' )
	350
	351	!
	352	!-- Transposition x --> z
	353	CALL cpu_log( log_point_s(8), 'transpo invers', 'continue' )
[164]	354	CALL transpose_xz( ar, work, ar )
[1]	355	CALL cpu_log( log_point_s(8), 'transpo invers', 'stop' )
	356
	357	ENDIF
	358
	359	CALL cpu_log( log_point_s(3), 'poisfft', 'stop' )
	360
	361	END SUBROUTINE poisfft
	362
	363
	364
	365	SUBROUTINE ffty_tr_yx( f_in, work, f_out )
	366
	367	!------------------------------------------------------------------------------!
	368	! Fourier-transformation along y with subsequent transposition y --> x for
	369	! a 1d-decomposition along x
	370	!
	371	! ATTENTION: The performance of this routine is much faster on the NEC-SX6,
	372	! if the first index of work_ffty_vec is odd. Otherwise
	373	! memory bank conflicts may occur (especially if the index is a
	374	! multiple of 128). That's why work_ffty_vec is dimensioned as
	375	! 0:ny+1.
	376	! Of course, this will not work if users are using an odd number
	377	! of gridpoints along y.
	378	!------------------------------------------------------------------------------!
	379
	380	USE control_parameters
	381	USE cpulog
	382	USE indices
	383	USE interfaces
	384	USE pegrid
	385	USE transpose_indices
	386
	387	IMPLICIT NONE
	388
	389	INTEGER :: i, iend, iouter, ir, j, k
	390	INTEGER, PARAMETER :: stridex = 4
	391
	392	REAL, DIMENSION(0:ny,stridex) :: work_ffty
	393	#if defined( __nec )
	394	REAL, DIMENSION(0:ny+1,1:nz,nxl:nxr) :: work_ffty_vec
	395	#endif
[1003]	396	REAL, DIMENSION(1:nz,0:ny,nxl:nxr) :: f_in
	397	REAL, DIMENSION(nnx,1:nz,nys_x:nyn_x,pdims(1)) :: f_out
	398	REAL, DIMENSION(nxl:nxr,1:nz,0:ny) :: work
[1]	399
	400	!
	401	!-- Carry out the FFT along y, where all data are present due to the
	402	!-- 1d-decomposition along x. Resort the data in a way that x becomes
	403	!-- the first index.
[1106]	404	CALL cpu_log( log_point_s(7), 'fft_y_1d', 'start' )
[1]	405
	406	IF ( host(1:3) == 'nec' ) THEN
	407	#if defined( __nec )
	408	!
	409	!-- Code optimized for vector processors
[85]	410	!$OMP PARALLEL PRIVATE ( i, j, k )
[1]	411	!$OMP DO
	412	DO i = nxl, nxr
	413
	414	DO j = 0, ny
	415	DO k = 1, nz
	416	work_ffty_vec(j,k,i) = f_in(k,j,i)
	417	ENDDO
	418	ENDDO
	419
	420	CALL fft_y_m( work_ffty_vec(:,:,i), ny+1, 'forward' )
	421
	422	ENDDO
	423
	424	!$OMP DO
	425	DO k = 1, nz
	426	DO j = 0, ny
	427	DO i = nxl, nxr
	428	work(i,k,j) = work_ffty_vec(j,k,i)
	429	ENDDO
	430	ENDDO
	431	ENDDO
	432	!$OMP END PARALLEL
	433	#endif
	434
	435	ELSE
	436
	437	!
	438	!-- Cache optimized code.
	439	!-- The i-(x-)direction is split into a strided outer loop and an inner
	440	!-- loop for better cache performance
	441	!$OMP PARALLEL PRIVATE (i,iend,iouter,ir,j,k,work_ffty)
	442	!$OMP DO
	443	DO iouter = nxl, nxr, stridex
	444
	445	iend = MIN( iouter+stridex-1, nxr ) ! Upper bound for inner i loop
	446
	447	DO k = 1, nz
	448
	449	DO i = iouter, iend
	450
	451	ir = i-iouter+1 ! counter within a stride
	452	DO j = 0, ny
	453	work_ffty(j,ir) = f_in(k,j,i)
	454	ENDDO
	455	!
	456	!-- FFT along y
[1106]	457	CALL fft_y_1d( work_ffty(:,ir), 'forward' )
[1]	458
	459	ENDDO
	460
	461	!
	462	!-- Resort
	463	DO j = 0, ny
	464	DO i = iouter, iend
	465	work(i,k,j) = work_ffty(j,i-iouter+1)
	466	ENDDO
	467	ENDDO
	468
	469	ENDDO
	470
	471	ENDDO
	472	!$OMP END PARALLEL
	473
	474	ENDIF
[1106]	475	CALL cpu_log( log_point_s(7), 'fft_y_1d', 'pause' )
[1]	476
	477	!
	478	!-- Transpose array
[1111]	479	#if defined( __parallel )
[1]	480	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
[622]	481	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
[1]	482	CALL MPI_ALLTOALL( work(nxl,1,0), sendrecvcount_xy, MPI_REAL, &
	483	f_out(1,1,nys_x,1), sendrecvcount_xy, MPI_REAL, &
	484	comm1dx, ierr )
	485	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
[1111]	486	#endif
[1]	487
	488	END SUBROUTINE ffty_tr_yx
	489
	490
	491	SUBROUTINE tr_xy_ffty( f_in, work, f_out )
	492
	493	!------------------------------------------------------------------------------!
	494	! Transposition x --> y with a subsequent backward Fourier transformation for
	495	! a 1d-decomposition along x
	496	!------------------------------------------------------------------------------!
	497
	498	USE control_parameters
	499	USE cpulog
	500	USE indices
	501	USE interfaces
	502	USE pegrid
	503	USE transpose_indices
	504
	505	IMPLICIT NONE
	506
	507	INTEGER :: i, iend, iouter, ir, j, k
	508	INTEGER, PARAMETER :: stridex = 4
	509
	510	REAL, DIMENSION(0:ny,stridex) :: work_ffty
	511	#if defined( __nec )
	512	REAL, DIMENSION(0:ny+1,1:nz,nxl:nxr) :: work_ffty_vec
	513	#endif
[1003]	514	REAL, DIMENSION(nnx,1:nz,nys_x:nyn_x,pdims(1)) :: f_in
	515	REAL, DIMENSION(1:nz,0:ny,nxl:nxr) :: f_out
	516	REAL, DIMENSION(nxl:nxr,1:nz,0:ny) :: work
[1]	517
	518	!
	519	!-- Transpose array
[1111]	520	#if defined( __parallel )
[1]	521	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
[622]	522	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
[1]	523	CALL MPI_ALLTOALL( f_in(1,1,nys_x,1), sendrecvcount_xy, MPI_REAL, &
	524	work(nxl,1,0), sendrecvcount_xy, MPI_REAL, &
	525	comm1dx, ierr )
	526	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
[1111]	527	#endif
[1]	528
	529	!
	530	!-- Resort the data in a way that y becomes the first index and carry out the
	531	!-- backward fft along y.
[1106]	532	CALL cpu_log( log_point_s(7), 'fft_y_1d', 'continue' )
[1]	533
	534	IF ( host(1:3) == 'nec' ) THEN
	535	#if defined( __nec )
	536	!
	537	!-- Code optimized for vector processors
[85]	538	!$OMP PARALLEL PRIVATE ( i, j, k )
[1]	539	!$OMP DO
	540	DO k = 1, nz
	541	DO j = 0, ny
	542	DO i = nxl, nxr
	543	work_ffty_vec(j,k,i) = work(i,k,j)
	544	ENDDO
	545	ENDDO
	546	ENDDO
	547
	548	!$OMP DO
	549	DO i = nxl, nxr
	550
	551	CALL fft_y_m( work_ffty_vec(:,:,i), ny+1, 'backward' )
	552
	553	DO j = 0, ny
	554	DO k = 1, nz
	555	f_out(k,j,i) = work_ffty_vec(j,k,i)
	556	ENDDO
	557	ENDDO
	558
	559	ENDDO
	560	!$OMP END PARALLEL
	561	#endif
	562
	563	ELSE
	564
	565	!
	566	!-- Cache optimized code.
	567	!-- The i-(x-)direction is split into a strided outer loop and an inner
	568	!-- loop for better cache performance
	569	!$OMP PARALLEL PRIVATE ( i, iend, iouter, ir, j, k, work_ffty )
	570	!$OMP DO
	571	DO iouter = nxl, nxr, stridex
	572
	573	iend = MIN( iouter+stridex-1, nxr ) ! Upper bound for inner i loop
	574
	575	DO k = 1, nz
	576	!
	577	!-- Resort
	578	DO j = 0, ny
	579	DO i = iouter, iend
	580	work_ffty(j,i-iouter+1) = work(i,k,j)
	581	ENDDO
	582	ENDDO
	583
	584	DO i = iouter, iend
	585
	586	!
	587	!-- FFT along y
	588	ir = i-iouter+1 ! counter within a stride
[1106]	589	CALL fft_y_1d( work_ffty(:,ir), 'backward' )
[1]	590
	591	DO j = 0, ny
	592	f_out(k,j,i) = work_ffty(j,ir)
	593	ENDDO
	594	ENDDO
	595
	596	ENDDO
	597
	598	ENDDO
	599	!$OMP END PARALLEL
	600
	601	ENDIF
	602
[1106]	603	CALL cpu_log( log_point_s(7), 'fft_y_1d', 'stop' )
[1]	604
	605	END SUBROUTINE tr_xy_ffty
	606
	607
	608	SUBROUTINE fftx_tri_fftx( ar )
	609
	610	!------------------------------------------------------------------------------!
	611	! FFT along x, solution of the tridiagonal system and backward FFT for
	612	! a 1d-decomposition along x
	613	!
	614	! WARNING: this subroutine may still not work for hybrid parallelization
	615	! with OpenMP (for possible necessary changes see the original
	616	! routine poisfft_hybrid, developed by Klaus Ketelsen, May 2002)
	617	!------------------------------------------------------------------------------!
	618
	619	USE control_parameters
	620	USE cpulog
	621	USE grid_variables
	622	USE indices
	623	USE interfaces
	624	USE pegrid
	625	USE transpose_indices
	626
	627	IMPLICIT NONE
	628
	629	INTEGER :: i, j, k, m, n, omp_get_thread_num, tn
	630
[1003]	631	REAL, DIMENSION(0:nx) :: work_fftx
	632	REAL, DIMENSION(0:nx,1:nz) :: work_trix
	633	REAL, DIMENSION(nnx,1:nz,nys_x:nyn_x,pdims(1)) :: ar
	634	REAL, DIMENSION(:,:,:,:), ALLOCATABLE :: tri
[1]	635
	636
[1106]	637	CALL cpu_log( log_point_s(33), 'fft_x_1d + tridia', 'start' )
[1]	638
	639	ALLOCATE( tri(5,0:nx,0:nz-1,0:threads_per_task-1) )
	640
	641	tn = 0 ! Default thread number in case of one thread
	642	!$OMP PARALLEL DO PRIVATE ( i, j, k, m, n, tn, work_fftx, work_trix )
	643	DO j = nys_x, nyn_x
	644
	645	!$ tn = omp_get_thread_num()
	646
	647	IF ( host(1:3) == 'nec' ) THEN
	648	!
	649	!-- Code optimized for vector processors
	650	DO k = 1, nz
	651
	652	m = 0
	653	DO n = 1, pdims(1)
[1003]	654	DO i = 1, nnx
[1]	655	work_trix(m,k) = ar(i,k,j,n)
	656	m = m + 1
	657	ENDDO
	658	ENDDO
	659
	660	ENDDO
	661
	662	CALL fft_x_m( work_trix, 'forward' )
	663
	664	ELSE
	665	!
	666	!-- Cache optimized code
	667	DO k = 1, nz
	668
	669	m = 0
	670	DO n = 1, pdims(1)
[1003]	671	DO i = 1, nnx
[1]	672	work_fftx(m) = ar(i,k,j,n)
	673	m = m + 1
	674	ENDDO
	675	ENDDO
	676
[1106]	677	CALL fft_x_1d( work_fftx, 'forward' )
[1]	678
	679	DO i = 0, nx
	680	work_trix(i,k) = work_fftx(i)
	681	ENDDO
	682
	683	ENDDO
	684
	685	ENDIF
	686
	687	!
	688	!-- Solve the linear equation system
	689	CALL tridia_1dd( ddx2, ddy2, nx, ny, j, work_trix, tri(:,:,:,tn) )
	690
	691	IF ( host(1:3) == 'nec' ) THEN
	692	!
	693	!-- Code optimized for vector processors
	694	CALL fft_x_m( work_trix, 'backward' )
	695
	696	DO k = 1, nz
	697
	698	m = 0
	699	DO n = 1, pdims(1)
[1003]	700	DO i = 1, nnx
[1]	701	ar(i,k,j,n) = work_trix(m,k)
	702	m = m + 1
	703	ENDDO
	704	ENDDO
	705
	706	ENDDO
	707
	708	ELSE
	709	!
	710	!-- Cache optimized code
	711	DO k = 1, nz
	712
	713	DO i = 0, nx
	714	work_fftx(i) = work_trix(i,k)
	715	ENDDO
	716
[1106]	717	CALL fft_x_1d( work_fftx, 'backward' )
[1]	718
	719	m = 0
	720	DO n = 1, pdims(1)
[1003]	721	DO i = 1, nnx
[1]	722	ar(i,k,j,n) = work_fftx(m)
	723	m = m + 1
	724	ENDDO
	725	ENDDO
	726
	727	ENDDO
	728
	729	ENDIF
	730
	731	ENDDO
	732
	733	DEALLOCATE( tri )
	734
[1106]	735	CALL cpu_log( log_point_s(33), 'fft_x_1d + tridia', 'stop' )
[1]	736
	737	END SUBROUTINE fftx_tri_fftx
	738
	739
	740	SUBROUTINE fftx_tr_xy( f_in, work, f_out )
	741
	742	!------------------------------------------------------------------------------!
	743	! Fourier-transformation along x with subsequent transposition x --> y for
	744	! a 1d-decomposition along y
	745	!
	746	! ATTENTION: The NEC-branch of this routine may significantly profit from
	747	! further optimizations. So far, performance is much worse than
	748	! for routine ffty_tr_yx (more than three times slower).
	749	!------------------------------------------------------------------------------!
	750
	751	USE control_parameters
	752	USE cpulog
	753	USE indices
	754	USE interfaces
	755	USE pegrid
	756	USE transpose_indices
	757
	758	IMPLICIT NONE
	759
	760	INTEGER :: i, j, k
	761
[1003]	762	REAL, DIMENSION(0:nx,1:nz,nys:nyn) :: work_fftx
	763	REAL, DIMENSION(1:nz,nys:nyn,0:nx) :: f_in
	764	REAL, DIMENSION(nny,1:nz,nxl_y:nxr_y,pdims(2)) :: f_out
	765	REAL, DIMENSION(nys:nyn,1:nz,0:nx) :: work
[1]	766
	767	!
	768	!-- Carry out the FFT along x, where all data are present due to the
	769	!-- 1d-decomposition along y. Resort the data in a way that y becomes
	770	!-- the first index.
[1106]	771	CALL cpu_log( log_point_s(4), 'fft_x_1d', 'start' )
[1]	772
	773	IF ( host(1:3) == 'nec' ) THEN
	774	!
	775	!-- Code for vector processors
[85]	776	!$OMP PARALLEL PRIVATE ( i, j, k )
[1]	777	!$OMP DO
	778	DO i = 0, nx
	779
	780	DO j = nys, nyn
	781	DO k = 1, nz
	782	work_fftx(i,k,j) = f_in(k,j,i)
	783	ENDDO
	784	ENDDO
	785
	786	ENDDO
	787
	788	!$OMP DO
	789	DO j = nys, nyn
	790
	791	CALL fft_x_m( work_fftx(:,:,j), 'forward' )
	792
	793	DO k = 1, nz
	794	DO i = 0, nx
	795	work(j,k,i) = work_fftx(i,k,j)
	796	ENDDO
	797	ENDDO
	798
	799	ENDDO
	800	!$OMP END PARALLEL
	801
	802	ELSE
	803
	804	!
	805	!-- Cache optimized code (there might be still a potential for better
	806	!-- optimization).
[696]	807	!$OMP PARALLEL PRIVATE (i,j,k)
[1]	808	!$OMP DO
	809	DO i = 0, nx
	810
	811	DO j = nys, nyn
	812	DO k = 1, nz
	813	work_fftx(i,k,j) = f_in(k,j,i)
	814	ENDDO
	815	ENDDO
	816
	817	ENDDO
	818
	819	!$OMP DO
	820	DO j = nys, nyn
	821	DO k = 1, nz
	822
[1106]	823	CALL fft_x_1d( work_fftx(0:nx,k,j), 'forward' )
[1]	824
	825	DO i = 0, nx
	826	work(j,k,i) = work_fftx(i,k,j)
	827	ENDDO
	828	ENDDO
	829
	830	ENDDO
	831	!$OMP END PARALLEL
	832
	833	ENDIF
[1106]	834	CALL cpu_log( log_point_s(4), 'fft_x_1d', 'pause' )
[1]	835
	836	!
	837	!-- Transpose array
[1111]	838	#if defined( __parallel )
[1]	839	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
[622]	840	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
[1]	841	CALL MPI_ALLTOALL( work(nys,1,0), sendrecvcount_xy, MPI_REAL, &
	842	f_out(1,1,nxl_y,1), sendrecvcount_xy, MPI_REAL, &
	843	comm1dy, ierr )
	844	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
[1111]	845	#endif
[1]	846
	847	END SUBROUTINE fftx_tr_xy
	848
	849
	850	SUBROUTINE tr_yx_fftx( f_in, work, f_out )
	851
	852	!------------------------------------------------------------------------------!
	853	! Transposition y --> x with a subsequent backward Fourier transformation for
	854	! a 1d-decomposition along x
	855	!------------------------------------------------------------------------------!
	856
	857	USE control_parameters
	858	USE cpulog
	859	USE indices
	860	USE interfaces
	861	USE pegrid
	862	USE transpose_indices
	863
	864	IMPLICIT NONE
	865
	866	INTEGER :: i, j, k
	867
[1003]	868	REAL, DIMENSION(0:nx,1:nz,nys:nyn) :: work_fftx
	869	REAL, DIMENSION(nny,1:nz,nxl_y:nxr_y,pdims(2)) :: f_in
	870	REAL, DIMENSION(1:nz,nys:nyn,0:nx) :: f_out
	871	REAL, DIMENSION(nys:nyn,1:nz,0:nx) :: work
[1]	872
	873	!
	874	!-- Transpose array
[1111]	875	#if defined( __parallel )
[1]	876	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
[622]	877	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
[1]	878	CALL MPI_ALLTOALL( f_in(1,1,nxl_y,1), sendrecvcount_xy, MPI_REAL, &
	879	work(nys,1,0), sendrecvcount_xy, MPI_REAL, &
	880	comm1dy, ierr )
	881	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
[1111]	882	#endif
[1]	883
	884	!
	885	!-- Carry out the FFT along x, where all data are present due to the
	886	!-- 1d-decomposition along y. Resort the data in a way that y becomes
	887	!-- the first index.
[1106]	888	CALL cpu_log( log_point_s(4), 'fft_x_1d', 'continue' )
[1]	889
	890	IF ( host(1:3) == 'nec' ) THEN
	891	!
	892	!-- Code optimized for vector processors
[85]	893	!$OMP PARALLEL PRIVATE ( i, j, k )
[1]	894	!$OMP DO
	895	DO j = nys, nyn
	896
	897	DO k = 1, nz
	898	DO i = 0, nx
	899	work_fftx(i,k,j) = work(j,k,i)
	900	ENDDO
	901	ENDDO
	902
	903	CALL fft_x_m( work_fftx(:,:,j), 'backward' )
	904
	905	ENDDO
	906
	907	!$OMP DO
	908	DO i = 0, nx
	909	DO j = nys, nyn
	910	DO k = 1, nz
	911	f_out(k,j,i) = work_fftx(i,k,j)
	912	ENDDO
	913	ENDDO
	914	ENDDO
	915	!$OMP END PARALLEL
	916
	917	ELSE
	918
	919	!
	920	!-- Cache optimized code (there might be still a potential for better
	921	!-- optimization).
[696]	922	!$OMP PARALLEL PRIVATE (i,j,k)
[1]	923	!$OMP DO
	924	DO j = nys, nyn
	925	DO k = 1, nz
	926
	927	DO i = 0, nx
	928	work_fftx(i,k,j) = work(j,k,i)
	929	ENDDO
	930
[1106]	931	CALL fft_x_1d( work_fftx(0:nx,k,j), 'backward' )
[1]	932
	933	ENDDO
	934	ENDDO
	935
	936	!$OMP DO
	937	DO i = 0, nx
	938	DO j = nys, nyn
	939	DO k = 1, nz
	940	f_out(k,j,i) = work_fftx(i,k,j)
	941	ENDDO
	942	ENDDO
	943	ENDDO
	944	!$OMP END PARALLEL
	945
	946	ENDIF
[1106]	947	CALL cpu_log( log_point_s(4), 'fft_x_1d', 'stop' )
[1]	948
	949	END SUBROUTINE tr_yx_fftx
	950
	951
	952	SUBROUTINE ffty_tri_ffty( ar )
	953
	954	!------------------------------------------------------------------------------!
	955	! FFT along y, solution of the tridiagonal system and backward FFT for
	956	! a 1d-decomposition along y
	957	!
	958	! WARNING: this subroutine may still not work for hybrid parallelization
	959	! with OpenMP (for possible necessary changes see the original
	960	! routine poisfft_hybrid, developed by Klaus Ketelsen, May 2002)
	961	!------------------------------------------------------------------------------!
	962
	963	USE control_parameters
	964	USE cpulog
	965	USE grid_variables
	966	USE indices
	967	USE interfaces
	968	USE pegrid
	969	USE transpose_indices
	970
	971	IMPLICIT NONE
	972
	973	INTEGER :: i, j, k, m, n, omp_get_thread_num, tn
	974
[1003]	975	REAL, DIMENSION(0:ny) :: work_ffty
	976	REAL, DIMENSION(0:ny,1:nz) :: work_triy
	977	REAL, DIMENSION(nny,1:nz,nxl_y:nxr_y,pdims(2)) :: ar
	978	REAL, DIMENSION(:,:,:,:), ALLOCATABLE :: tri
[1]	979
	980
[1106]	981	CALL cpu_log( log_point_s(39), 'fft_y_1d + tridia', 'start' )
[1]	982
	983	ALLOCATE( tri(5,0:ny,0:nz-1,0:threads_per_task-1) )
	984
	985	tn = 0 ! Default thread number in case of one thread
[696]	986	!$OMP PARALLEL DO PRIVATE ( i, j, k, m, n, tn, work_ffty, work_triy )
[1]	987	DO i = nxl_y, nxr_y
	988
	989	!$ tn = omp_get_thread_num()
	990
	991	IF ( host(1:3) == 'nec' ) THEN
	992	!
	993	!-- Code optimized for vector processors
	994	DO k = 1, nz
	995
	996	m = 0
	997	DO n = 1, pdims(2)
[1003]	998	DO j = 1, nny
[1]	999	work_triy(m,k) = ar(j,k,i,n)
	1000	m = m + 1
	1001	ENDDO
	1002	ENDDO
	1003
	1004	ENDDO
	1005
	1006	CALL fft_y_m( work_triy, ny, 'forward' )
	1007
	1008	ELSE
	1009	!
	1010	!-- Cache optimized code
	1011	DO k = 1, nz
	1012
	1013	m = 0
	1014	DO n = 1, pdims(2)
[1003]	1015	DO j = 1, nny
[1]	1016	work_ffty(m) = ar(j,k,i,n)
	1017	m = m + 1
	1018	ENDDO
	1019	ENDDO
	1020
[1106]	1021	CALL fft_y_1d( work_ffty, 'forward' )
[1]	1022
	1023	DO j = 0, ny
	1024	work_triy(j,k) = work_ffty(j)
	1025	ENDDO
	1026
	1027	ENDDO
	1028
	1029	ENDIF
	1030
	1031	!
	1032	!-- Solve the linear equation system
	1033	CALL tridia_1dd( ddy2, ddx2, ny, nx, i, work_triy, tri(:,:,:,tn) )
	1034
	1035	IF ( host(1:3) == 'nec' ) THEN
	1036	!
	1037	!-- Code optimized for vector processors
	1038	CALL fft_y_m( work_triy, ny, 'backward' )
	1039
	1040	DO k = 1, nz
	1041
	1042	m = 0
	1043	DO n = 1, pdims(2)
[1003]	1044	DO j = 1, nny
[1]	1045	ar(j,k,i,n) = work_triy(m,k)
	1046	m = m + 1
	1047	ENDDO
	1048	ENDDO
	1049
	1050	ENDDO
	1051
	1052	ELSE
	1053	!
	1054	!-- Cache optimized code
	1055	DO k = 1, nz
	1056
	1057	DO j = 0, ny
	1058	work_ffty(j) = work_triy(j,k)
	1059	ENDDO
	1060
[1106]	1061	CALL fft_y_1d( work_ffty, 'backward' )
[1]	1062
	1063	m = 0
	1064	DO n = 1, pdims(2)
[1003]	1065	DO j = 1, nny
[1]	1066	ar(j,k,i,n) = work_ffty(m)
	1067	m = m + 1
	1068	ENDDO
	1069	ENDDO
	1070
	1071	ENDDO
	1072
	1073	ENDIF
	1074
	1075	ENDDO
	1076
	1077	DEALLOCATE( tri )
	1078
[1106]	1079	CALL cpu_log( log_point_s(39), 'fft_y_1d + tridia', 'stop' )
[1]	1080
	1081	END SUBROUTINE ffty_tri_ffty
	1082
	1083	#endif
	1084
	1085	END MODULE poisfft_mod

Note: See TracBrowser for help on using the repository browser.

Download in other formats:

| Impressum | ©Leibniz Universität Hannover |