Home

Context Navigation

source: palm/trunk/SOURCE/poisfft.f90 @ 1109

Last change on this file since 1109 was 1107, checked in by raasch, 11 years ago
last commit documented
Property svn:keywords set to `Id`
File size: 45.2 KB

Rev	Line
[1]	1	MODULE poisfft_mod
	2
[1036]	3	!--------------------------------------------------------------------------------!
	4	! This file is part of PALM.
	5	!
	6	! PALM is free software: you can redistribute it and/or modify it under the terms
	7	! of the GNU General Public License as published by the Free Software Foundation,
	8	! either version 3 of the License, or (at your option) any later version.
	9	!
	10	! PALM is distributed in the hope that it will be useful, but WITHOUT ANY
	11	! WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
	12	! A PARTICULAR PURPOSE. See the GNU General Public License for more details.
	13	!
	14	! You should have received a copy of the GNU General Public License along with
	15	! PALM. If not, see <http://www.gnu.org/licenses/>.
	16	!
	17	! Copyright 1997-2012 Leibniz University Hannover
	18	!--------------------------------------------------------------------------------!
	19	!
[484]	20	! Current revisions:
[1]	21	! -----------------
[1107]	22	!
[1]	23	!
	24	! Former revisions:
	25	! -----------------
[3]	26	! $Id: poisfft.f90 1107 2013-03-04 06:23:14Z raasch $
[77]	27	!
[1107]	28	! 1106 2013-03-04 05:31:38Z raasch
	29	! routines fftx, ffty, fftxp, fftyp removed, calls replaced by fft_x, fft_y,
	30	! in the 1D-decomposition routines fft_x, ffty are replaced by fft_x_1d,
	31	! fft_y_1d
	32	!
[1104]	33	! 1103 2013-02-20 02:15:53Z raasch
	34	! tri, ar, and ar1 arguments in tridia-routines (2d) are removed because they
	35	! sometimes cause segmentation faults with intel 12.1 compiler
	36	!
[1093]	37	! 1092 2013-02-02 11:24:22Z raasch
	38	! unused variables removed
	39	!
[1037]	40	! 1036 2012-10-22 13:43:42Z raasch
	41	! code put under GPL (PALM 3.9)
	42	!
[1014]	43	! 2012-09-21 07:03:55Z raasch
	44	! FLOAT type conversion replaced by REAL
	45	!
[1004]	46	! 1003 2012-09-14 14:35:53Z raasch
	47	! indices nxa, nya, etc. replaced by nx, ny, etc.
	48	!
[941]	49	! 940 2012-07-09 14:31:00Z raasch
	50	! special handling of tri-array as an argument in tridia_1dd routines switched
	51	! off because it caused segmentation faults with intel 12.1 compiler
	52	!
[878]	53	! 877 2012-04-03 11:21:44Z suehring
	54	! Bugfix: Avoid divisions by zero in case of using a 'neumann' bc for the
	55	! pressure at the top of the model domain.
	56	!
[810]	57	! 809 2012-01-30 13:32:58Z maronga
	58	! Bugfix: replaced .AND. and .NOT. with && and ! in the preprocessor directives
	59	!
[808]	60	! 807 2012-01-25 11:53:51Z maronga
	61	! New cpp directive "__check" implemented which is used by check_namelist_files
	62	! (most of the code is unneeded by check_namelist_files).
	63	!
[764]	64	! 763 2011-10-06 09:32:09Z suehring
	65	! Comment added concerning the last change.
	66	!
[762]	67	! 761 2011-10-05 17:58:52Z suehring
	68	! Bugfix: Avoid divisions by zero in case of using a 'neumann' bc for the
	69	! pressure at the top of the model domain.
	70	!
[697]	71	! 696 2011-03-18 07:03:49Z raasch
	72	! work_fftx removed from PRIVATE clauses in fftx_tr_xy and tr_yx_fftx
	73	!
[684]	74	! 683 2011-02-09 14:25:15Z raasch
	75	! openMP parallelization for 2d-domain-decomposition
	76	!
[668]	77	! 667 2010-12-23 12:06:00Z suehring/gryschka
	78	! ddzu replaced by ddzu_pres due to changes in zu(0)
	79	!
[623]	80	! 622 2010-12-10 08:08:13Z raasch
	81	! optional barriers included in order to speed up collective operations
	82	!
[392]	83	! 377 2009-09-04 11:09:00Z raasch
	84	! __lcmuk changed to __lc to avoid problems with Intel compiler on sgi-ice
	85	!
[198]	86	! 164 2008-05-15 08:46:15Z raasch
	87	! Arguments removed from transpose routines
	88	!
[139]	89	! 128 2007-10-26 13:11:14Z raasch
	90	! Bugfix: wavenumber calculation for even nx in routines maketri
	91	!
[90]	92	! 85 2007-05-11 09:35:14Z raasch
	93	! Bugfix: work_fft*_vec removed from some PRIVATE-declarations
	94	!
[77]	95	! 76 2007-03-29 00:58:32Z raasch
	96	! Tridiagonal coefficients adjusted for Neumann boundary conditions both at
	97	! the bottom and the top.
	98	!
[3]	99	! RCS Log replace by Id keyword, revision history cleaned up
	100	!
[1]	101	! Revision 1.24 2006/08/04 15:00:24 raasch
	102	! Default setting of the thread number tn in case of not using OpenMP
	103	!
	104	! Revision 1.23 2006/02/23 12:48:38 raasch
	105	! Additional compiler directive in routine tridia_1dd for preventing loop
	106	! exchange on NEC-SX6
	107	!
	108	! Revision 1.20 2004/04/30 12:38:09 raasch
	109	! Parts of former poisfft_hybrid moved to this subroutine,
	110	! former subroutine changed to a module, renaming of FFT-subroutines and
	111	! -module, FFTs completely substituted by calls of fft_x and fft_y,
	112	! NAG fft used in the non-parallel case completely removed, l in maketri
	113	! is now a 1d-array, variables passed by modules instead of using parameter
	114	! lists, enlarged transposition arrays introduced
	115	!
	116	! Revision 1.1 1997/07/24 11:24:14 raasch
	117	! Initial revision
	118	!
	119	!
	120	! Description:
	121	! ------------
	122	! See below.
	123	!------------------------------------------------------------------------------!
	124
	125	!--------------------------------------------------------------------------!
	126	! poisfft !
	127	! !
	128	! Original version: Stephan Siano (pois3d) !
	129	! !
	130	! Institute of Meteorology and Climatology, University of Hannover !
	131	! Germany !
	132	! !
	133	! Version as of July 23,1996 !
	134	! !
	135	! !
	136	! Version for parallel computers: Siegfried Raasch !
	137	! !
	138	! Version as of July 03,1997 !
	139	! !
	140	! Solves the Poisson equation with a 2D spectral method !
	141	! d^2 p / dx^2 + d^2 p / dy^2 + d^2 p / dz^2 = s !
	142	! !
	143	! Input: !
	144	! real ar contains in the (nnx,nny,nnz) elements, !
	145	! starting from the element (1,nys,nxl), the !
	146	! values for s !
	147	! real work Temporary array !
	148	! !
	149	! Output: !
	150	! real ar contains the solution for p !
	151	!--------------------------------------------------------------------------!
	152
	153	USE fft_xy
	154	USE indices
	155	USE transpose_indices
	156
	157	IMPLICIT NONE
	158
	159	PRIVATE
[807]	160
[809]	161	#if ! defined ( __check )
[1]	162	PUBLIC poisfft, poisfft_init
	163
	164	INTERFACE poisfft
	165	MODULE PROCEDURE poisfft
	166	END INTERFACE poisfft
	167
	168	INTERFACE poisfft_init
	169	MODULE PROCEDURE poisfft_init
	170	END INTERFACE poisfft_init
[807]	171	#else
	172	PUBLIC poisfft_init
[1]	173
[807]	174	INTERFACE poisfft_init
	175	MODULE PROCEDURE poisfft_init
	176	END INTERFACE poisfft_init
	177	#endif
	178
[1]	179	CONTAINS
	180
	181	SUBROUTINE poisfft_init
	182
	183	CALL fft_init
	184
	185	END SUBROUTINE poisfft_init
	186
[809]	187	#if ! defined ( __check )
[1]	188	SUBROUTINE poisfft( ar, work )
	189
	190	USE cpulog
	191	USE interfaces
	192	USE pegrid
	193
	194	IMPLICIT NONE
	195
[1003]	196	REAL, DIMENSION(1:nz,nys:nyn,nxl:nxr) :: ar, work
[1]	197
	198
	199	CALL cpu_log( log_point_s(3), 'poisfft', 'start' )
	200
	201	!
	202	!-- Two-dimensional Fourier Transformation in x- and y-direction.
	203	#if defined( __parallel )
	204	IF ( pdims(2) == 1 ) THEN
	205
	206	!
	207	!-- 1d-domain-decomposition along x:
	208	!-- FFT along y and transposition y --> x
	209	CALL ffty_tr_yx( ar, work, ar )
	210
	211	!
	212	!-- FFT along x, solving the tridiagonal system and backward FFT
	213	CALL fftx_tri_fftx( ar )
	214
	215	!
	216	!-- Transposition x --> y and backward FFT along y
	217	CALL tr_xy_ffty( ar, work, ar )
	218
	219	ELSEIF ( pdims(1) == 1 ) THEN
	220
	221	!
	222	!-- 1d-domain-decomposition along y:
	223	!-- FFT along x and transposition x --> y
	224	CALL fftx_tr_xy( ar, work, ar )
	225
	226	!
	227	!-- FFT along y, solving the tridiagonal system and backward FFT
	228	CALL ffty_tri_ffty( ar )
	229
	230	!
	231	!-- Transposition y --> x and backward FFT along x
	232	CALL tr_yx_fftx( ar, work, ar )
	233
	234	ELSE
	235
	236	!
	237	!-- 2d-domain-decomposition
	238	!-- Transposition z --> x
	239	CALL cpu_log( log_point_s(5), 'transpo forward', 'start' )
[164]	240	CALL transpose_zx( ar, work, ar )
[1]	241	CALL cpu_log( log_point_s(5), 'transpo forward', 'pause' )
	242
	243	CALL cpu_log( log_point_s(4), 'fft_x', 'start' )
[1106]	244	CALL fft_x( ar, 'forward' )
[1]	245	CALL cpu_log( log_point_s(4), 'fft_x', 'pause' )
	246
	247	!
	248	!-- Transposition x --> y
	249	CALL cpu_log( log_point_s(5), 'transpo forward', 'continue' )
[164]	250	CALL transpose_xy( ar, work, ar )
[1]	251	CALL cpu_log( log_point_s(5), 'transpo forward', 'pause' )
	252
	253	CALL cpu_log( log_point_s(7), 'fft_y', 'start' )
[1106]	254	CALL fft_y( ar, 'forward' )
[1]	255	CALL cpu_log( log_point_s(7), 'fft_y', 'pause' )
	256
	257	!
	258	!-- Transposition y --> z
	259	CALL cpu_log( log_point_s(5), 'transpo forward', 'continue' )
[164]	260	CALL transpose_yz( ar, work, ar )
[1]	261	CALL cpu_log( log_point_s(5), 'transpo forward', 'stop' )
	262
	263	!
[1106]	264	!-- Solve the tridiagonal equation system along z
[1]	265	CALL cpu_log( log_point_s(6), 'tridia', 'start' )
	266	CALL tridia( ar )
	267	CALL cpu_log( log_point_s(6), 'tridia', 'stop' )
	268
	269	!
	270	!-- Inverse Fourier Transformation
	271	!-- Transposition z --> y
	272	CALL cpu_log( log_point_s(8), 'transpo invers', 'start' )
[164]	273	CALL transpose_zy( ar, work, ar )
[1]	274	CALL cpu_log( log_point_s(8), 'transpo invers', 'pause' )
	275
	276	CALL cpu_log( log_point_s(7), 'fft_y', 'continue' )
[1106]	277	CALL fft_y( ar, 'backward' )
[1]	278	CALL cpu_log( log_point_s(7), 'fft_y', 'stop' )
	279
	280	!
	281	!-- Transposition y --> x
	282	CALL cpu_log( log_point_s(8), 'transpo invers', 'continue' )
[164]	283	CALL transpose_yx( ar, work, ar )
[1]	284	CALL cpu_log( log_point_s(8), 'transpo invers', 'pause' )
	285
	286	CALL cpu_log( log_point_s(4), 'fft_x', 'continue' )
[1106]	287	CALL fft_x( ar, 'backward' )
[1]	288	CALL cpu_log( log_point_s(4), 'fft_x', 'stop' )
	289
	290	!
	291	!-- Transposition x --> z
	292	CALL cpu_log( log_point_s(8), 'transpo invers', 'continue' )
[164]	293	CALL transpose_xz( ar, work, ar )
[1]	294	CALL cpu_log( log_point_s(8), 'transpo invers', 'stop' )
	295
	296	ENDIF
	297
	298	#else
	299
	300	!
	301	!-- Two-dimensional Fourier Transformation along x- and y-direction.
[1106]	302	CALL cpu_log( log_point_s(5), 'transpo forward', 'start' )
	303	!$acc data copyin( ar, work )
	304	CALL transpose_zx( ar, work, ar )
	305	!$acc update host( ar )
	306	CALL cpu_log( log_point_s(5), 'transpo forward', 'pause' )
	307
	308
[1]	309	CALL cpu_log( log_point_s(4), 'fft_x', 'start' )
[1106]	310	CALL fft_x( ar, 'forward' )
[1]	311	CALL cpu_log( log_point_s(4), 'fft_x', 'pause' )
[1106]	312
	313	CALL cpu_log( log_point_s(5), 'transpo forward', 'continue' )
	314	CALL transpose_xy( ar, work, ar )
	315	CALL cpu_log( log_point_s(5), 'transpo forward', 'pause' )
	316
[1]	317	CALL cpu_log( log_point_s(7), 'fft_y', 'start' )
[1106]	318	CALL fft_y( ar, 'forward' )
[1]	319	CALL cpu_log( log_point_s(7), 'fft_y', 'pause' )
	320
	321	!
[1106]	322	!-- Solve the tridiagonal equation system along z
	323	CALL cpu_log( log_point_s(5), 'transpo forward', 'continue' )
	324	CALL transpose_yz( ar, work, ar )
	325	CALL cpu_log( log_point_s(5), 'transpo forward', 'stop' )
	326
[1]	327	CALL cpu_log( log_point_s(6), 'tridia', 'start' )
	328	CALL tridia( ar )
	329	CALL cpu_log( log_point_s(6), 'tridia', 'stop' )
	330
[1106]	331	CALL cpu_log( log_point_s(8), 'transpo invers', 'start' )
	332	CALL transpose_zy( ar, work, ar )
	333	CALL cpu_log( log_point_s(8), 'transpo invers', 'pause' )
	334
[1]	335	!
	336	!-- Inverse Fourier Transformation.
	337	CALL cpu_log( log_point_s(7), 'fft_y', 'continue' )
[1106]	338	CALL fft_y( ar, 'backward' )
[1]	339	CALL cpu_log( log_point_s(7), 'fft_y', 'stop' )
[1106]	340
	341	CALL cpu_log( log_point_s(8), 'transpo invers', 'continue' )
	342	CALL transpose_yx( ar, work, ar )
	343	CALL cpu_log( log_point_s(8), 'transpo invers', 'pause' )
	344
[1]	345	CALL cpu_log( log_point_s(4), 'fft_x', 'continue' )
[1106]	346	CALL fft_x( ar, 'backward' )
[1]	347	CALL cpu_log( log_point_s(4), 'fft_x', 'stop' )
	348
[1106]	349	CALL cpu_log( log_point_s(8), 'transpo invers', 'continue' )
	350	CALL transpose_xz( ar, work, ar )
	351	CALL cpu_log( log_point_s(8), 'transpo invers', 'stop' )
	352
	353	!$acc end data
	354
[1]	355	#endif
	356
	357	CALL cpu_log( log_point_s(3), 'poisfft', 'stop' )
	358
	359	END SUBROUTINE poisfft
	360
	361
	362
	363	SUBROUTINE tridia( ar )
	364
	365	!------------------------------------------------------------------------------!
	366	! solves the linear system of equations:
	367	!
	368	! -(4 pi^2(i^2/(dx^2nnx^2)+j^2/(dy^2nny^2))+
	369	! 1/(dzu(k)dzw(k))+1/(dzu(k-1)dzw(k)))*p(i,j,k)+
	370	! 1/(dzu(k)dzw(k))p(i,j,k+1)+1/(dzu(k-1)dzw(k))p(i,j,k-1)=d(i,j,k)
	371	!
	372	! by using the Thomas algorithm
	373	!------------------------------------------------------------------------------!
	374
	375	USE arrays_3d
	376
	377	IMPLICIT NONE
	378
	379	INTEGER :: i, j, k, nnyh
	380
	381	REAL, DIMENSION(nxl_z:nxr_z,0:nz-1) :: ar1
	382	REAL, DIMENSION(5,nxl_z:nxr_z,0:nz-1) :: tri
	383
[1003]	384	REAL :: ar(nxl_z:nxr_z,nys_z:nyn_z,1:nz)
[1]	385
	386
	387	nnyh = (ny+1) / 2
	388
	389	!
	390	!-- Define constant elements of the tridiagonal matrix.
[683]	391	!$OMP PARALLEL PRIVATE ( k, i )
	392	!$OMP DO
[1]	393	DO k = 0, nz-1
	394	DO i = nxl_z, nxr_z
[667]	395	tri(2,i,k) = ddzu_pres(k+1) * ddzw(k+1)
	396	tri(3,i,k) = ddzu_pres(k+2) * ddzw(k+1)
[1]	397	ENDDO
	398	ENDDO
[683]	399	!$OMP END PARALLEL
[1]	400
	401	#if defined( __parallel )
	402	!
	403	!-- Repeat for all y-levels.
[683]	404	!$OMP PARALLEL FIRSTPRIVATE( tri ) PRIVATE ( ar1, j )
	405	!$OMP DO
[1]	406	DO j = nys_z, nyn_z
	407	IF ( j <= nnyh ) THEN
[1103]	408	CALL maketri( j )
[1]	409	ELSE
[1103]	410	CALL maketri( ny+1-j )
[1]	411	ENDIF
[1103]	412	CALL split
	413	CALL substi( j )
[1]	414	ENDDO
[683]	415	!$OMP END PARALLEL
[1]	416	#else
	417	!
	418	!-- First y-level.
[1103]	419	CALL maketri( nys_z )
	420	CALL split
	421	CALL substi( 0 )
[1]	422
	423	!
	424	!-- Further y-levels.
	425	DO j = 1, nnyh - 1
[1103]	426	CALL maketri( j )
	427	CALL split
	428	CALL substi( j )
	429	CALL substi( ny+1-j )
[1]	430	ENDDO
[1103]	431	CALL maketri( nnyh )
	432	CALL split
	433	CALL substi( nnyh+nys )
[1]	434	#endif
	435
	436	CONTAINS
	437
[1103]	438	SUBROUTINE maketri( j )
[1]	439
	440	!------------------------------------------------------------------------------!
	441	! Computes the i- and j-dependent component of the matrix
	442	!------------------------------------------------------------------------------!
	443
	444	USE arrays_3d
	445	USE constants
	446	USE control_parameters
	447	USE grid_variables
	448
	449	IMPLICIT NONE
	450
	451	INTEGER :: i, j, k, nnxh
	452	REAL :: a, c
	453	REAL :: ll(nxl_z:nxr_z)
	454
	455
	456	nnxh = ( nx + 1 ) / 2
	457
	458	!
	459	!-- Provide the tridiagonal matrix for solution of the Poisson equation in
	460	!-- Fourier space. The coefficients are computed following the method of
	461	!-- Schmidt et al. (DFVLR-Mitteilung 84-15), which departs from Stephan
	462	!-- Siano's original version by discretizing the Poisson equation,
	463	!-- before it is Fourier-transformed
	464	#if defined( __parallel )
	465	DO i = nxl_z, nxr_z
[128]	466	IF ( i >= 0 .AND. i <= nnxh ) THEN
[1]	467	ll(i) = 2.0 * ( 1.0 - COS( ( 2.0 * pi * i ) / &
[1013]	468	REAL( nx+1 ) ) ) / ( dx * dx ) + &
[1]	469	2.0 * ( 1.0 - COS( ( 2.0 * pi * j ) / &
[1013]	470	REAL( ny+1 ) ) ) / ( dy * dy )
[1]	471	ELSE
	472	ll(i) = 2.0 * ( 1.0 - COS( ( 2.0 * pi * ( nx+1-i ) ) / &
[1013]	473	REAL( nx+1 ) ) ) / ( dx * dx ) + &
[1]	474	2.0 * ( 1.0 - COS( ( 2.0 * pi * j ) / &
[1013]	475	REAL( ny+1 ) ) ) / ( dy * dy )
[1]	476	ENDIF
	477	DO k = 0,nz-1
[667]	478	a = -1.0 * ddzu_pres(k+2) * ddzw(k+1)
	479	c = -1.0 * ddzu_pres(k+1) * ddzw(k+1)
[1]	480	tri(1,i,k) = a + c - ll(i)
	481	ENDDO
	482	ENDDO
	483	#else
	484	DO i = 0, nnxh
[1013]	485	ll(i) = 2.0 * ( 1.0 - COS( ( 2.0 * pi * i ) / REAL( nx+1 ) ) ) / &
[1]	486	( dx * dx ) + &
[1013]	487	2.0 * ( 1.0 - COS( ( 2.0 * pi * j ) / REAL( ny+1 ) ) ) / &
[1]	488	( dy * dy )
	489	DO k = 0, nz-1
[667]	490	a = -1.0 * ddzu_pres(k+2) * ddzw(k+1)
	491	c = -1.0 * ddzu_pres(k+1) * ddzw(k+1)
[1]	492	tri(1,i,k) = a + c - ll(i)
	493	IF ( i >= 1 .and. i < nnxh ) THEN
	494	tri(1,nx+1-i,k) = tri(1,i,k)
	495	ENDIF
	496	ENDDO
	497	ENDDO
	498	#endif
	499	IF ( ibc_p_b == 1 .OR. ibc_p_b == 2 ) THEN
	500	DO i = nxl_z, nxr_z
	501	tri(1,i,0) = tri(1,i,0) + tri(2,i,0)
	502	ENDDO
	503	ENDIF
	504	IF ( ibc_p_t == 1 ) THEN
	505	DO i = nxl_z, nxr_z
	506	tri(1,i,nz-1) = tri(1,i,nz-1) + tri(3,i,nz-1)
	507	ENDDO
	508	ENDIF
	509
	510	END SUBROUTINE maketri
	511
	512
[1103]	513	SUBROUTINE substi( j )
[1]	514
	515	!------------------------------------------------------------------------------!
	516	! Substitution (Forward and Backward) (Thomas algorithm)
	517	!------------------------------------------------------------------------------!
	518
[76]	519	USE control_parameters
	520
[1]	521	IMPLICIT NONE
	522
	523	INTEGER :: i, j, k
	524
	525	!
	526	!-- Forward substitution.
	527	DO i = nxl_z, nxr_z
	528	ar1(i,0) = ar(i,j,1)
	529	ENDDO
	530	DO k = 1, nz - 1
	531	DO i = nxl_z, nxr_z
	532	ar1(i,k) = ar(i,j,k+1) - tri(5,i,k) * ar1(i,k-1)
	533	ENDDO
	534	ENDDO
	535
	536	!
[877]	537	!-- Backward substitution
	538	!-- Note, the 1.0E-20 in the denominator is due to avoid divisions
	539	!-- by zero appearing if the pressure bc is set to neumann at the top of
	540	!-- the model domain.
[1]	541	DO i = nxl_z, nxr_z
[877]	542	ar(i,j,nz) = ar1(i,nz-1) / ( tri(4,i,nz-1) + 1.0E-20 )
[1]	543	ENDDO
	544	DO k = nz-2, 0, -1
	545	DO i = nxl_z, nxr_z
	546	ar(i,j,k+1) = ( ar1(i,k) - tri(3,i,k) * ar(i,j,k+2) ) &
	547	/ tri(4,i,k)
	548	ENDDO
	549	ENDDO
	550
[76]	551	!
	552	!-- Indices i=0, j=0 correspond to horizontally averaged pressure.
	553	!-- The respective values of ar should be zero at all k-levels if
	554	!-- acceleration of horizontally averaged vertical velocity is zero.
	555	IF ( ibc_p_b == 1 .AND. ibc_p_t == 1 ) THEN
	556	IF ( j == 0 .AND. nxl_z == 0 ) THEN
	557	DO k = 1, nz
	558	ar(nxl_z,j,k) = 0.0
	559	ENDDO
	560	ENDIF
	561	ENDIF
	562
[1]	563	END SUBROUTINE substi
	564
	565
[1103]	566	SUBROUTINE split
[1]	567
	568	!------------------------------------------------------------------------------!
	569	! Splitting of the tridiagonal matrix (Thomas algorithm)
	570	!------------------------------------------------------------------------------!
	571
	572	IMPLICIT NONE
	573
	574	INTEGER :: i, k
	575
	576	!
	577	!-- Splitting.
	578	DO i = nxl_z, nxr_z
	579	tri(4,i,0) = tri(1,i,0)
	580	ENDDO
	581	DO k = 1, nz-1
	582	DO i = nxl_z, nxr_z
	583	tri(5,i,k) = tri(2,i,k) / tri(4,i,k-1)
	584	tri(4,i,k) = tri(1,i,k) - tri(3,i,k-1) * tri(5,i,k)
	585	ENDDO
	586	ENDDO
	587
	588	END SUBROUTINE split
	589
	590	END SUBROUTINE tridia
	591
	592
	593	#if defined( __parallel )
	594	SUBROUTINE ffty_tr_yx( f_in, work, f_out )
	595
	596	!------------------------------------------------------------------------------!
	597	! Fourier-transformation along y with subsequent transposition y --> x for
	598	! a 1d-decomposition along x
	599	!
	600	! ATTENTION: The performance of this routine is much faster on the NEC-SX6,
	601	! if the first index of work_ffty_vec is odd. Otherwise
	602	! memory bank conflicts may occur (especially if the index is a
	603	! multiple of 128). That's why work_ffty_vec is dimensioned as
	604	! 0:ny+1.
	605	! Of course, this will not work if users are using an odd number
	606	! of gridpoints along y.
	607	!------------------------------------------------------------------------------!
	608
	609	USE control_parameters
	610	USE cpulog
	611	USE indices
	612	USE interfaces
	613	USE pegrid
	614	USE transpose_indices
	615
	616	IMPLICIT NONE
	617
	618	INTEGER :: i, iend, iouter, ir, j, k
	619	INTEGER, PARAMETER :: stridex = 4
	620
	621	REAL, DIMENSION(0:ny,stridex) :: work_ffty
	622	#if defined( __nec )
	623	REAL, DIMENSION(0:ny+1,1:nz,nxl:nxr) :: work_ffty_vec
	624	#endif
[1003]	625	REAL, DIMENSION(1:nz,0:ny,nxl:nxr) :: f_in
	626	REAL, DIMENSION(nnx,1:nz,nys_x:nyn_x,pdims(1)) :: f_out
	627	REAL, DIMENSION(nxl:nxr,1:nz,0:ny) :: work
[1]	628
	629	!
	630	!-- Carry out the FFT along y, where all data are present due to the
	631	!-- 1d-decomposition along x. Resort the data in a way that x becomes
	632	!-- the first index.
[1106]	633	CALL cpu_log( log_point_s(7), 'fft_y_1d', 'start' )
[1]	634
	635	IF ( host(1:3) == 'nec' ) THEN
	636	#if defined( __nec )
	637	!
	638	!-- Code optimized for vector processors
[85]	639	!$OMP PARALLEL PRIVATE ( i, j, k )
[1]	640	!$OMP DO
	641	DO i = nxl, nxr
	642
	643	DO j = 0, ny
	644	DO k = 1, nz
	645	work_ffty_vec(j,k,i) = f_in(k,j,i)
	646	ENDDO
	647	ENDDO
	648
	649	CALL fft_y_m( work_ffty_vec(:,:,i), ny+1, 'forward' )
	650
	651	ENDDO
	652
	653	!$OMP DO
	654	DO k = 1, nz
	655	DO j = 0, ny
	656	DO i = nxl, nxr
	657	work(i,k,j) = work_ffty_vec(j,k,i)
	658	ENDDO
	659	ENDDO
	660	ENDDO
	661	!$OMP END PARALLEL
	662	#endif
	663
	664	ELSE
	665
	666	!
	667	!-- Cache optimized code.
	668	!-- The i-(x-)direction is split into a strided outer loop and an inner
	669	!-- loop for better cache performance
	670	!$OMP PARALLEL PRIVATE (i,iend,iouter,ir,j,k,work_ffty)
	671	!$OMP DO
	672	DO iouter = nxl, nxr, stridex
	673
	674	iend = MIN( iouter+stridex-1, nxr ) ! Upper bound for inner i loop
	675
	676	DO k = 1, nz
	677
	678	DO i = iouter, iend
	679
	680	ir = i-iouter+1 ! counter within a stride
	681	DO j = 0, ny
	682	work_ffty(j,ir) = f_in(k,j,i)
	683	ENDDO
	684	!
	685	!-- FFT along y
[1106]	686	CALL fft_y_1d( work_ffty(:,ir), 'forward' )
[1]	687
	688	ENDDO
	689
	690	!
	691	!-- Resort
	692	DO j = 0, ny
	693	DO i = iouter, iend
	694	work(i,k,j) = work_ffty(j,i-iouter+1)
	695	ENDDO
	696	ENDDO
	697
	698	ENDDO
	699
	700	ENDDO
	701	!$OMP END PARALLEL
	702
	703	ENDIF
[1106]	704	CALL cpu_log( log_point_s(7), 'fft_y_1d', 'pause' )
[1]	705
	706	!
	707	!-- Transpose array
	708	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
[622]	709	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
[1]	710	CALL MPI_ALLTOALL( work(nxl,1,0), sendrecvcount_xy, MPI_REAL, &
	711	f_out(1,1,nys_x,1), sendrecvcount_xy, MPI_REAL, &
	712	comm1dx, ierr )
	713	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
	714
	715	END SUBROUTINE ffty_tr_yx
	716
	717
	718	SUBROUTINE tr_xy_ffty( f_in, work, f_out )
	719
	720	!------------------------------------------------------------------------------!
	721	! Transposition x --> y with a subsequent backward Fourier transformation for
	722	! a 1d-decomposition along x
	723	!------------------------------------------------------------------------------!
	724
	725	USE control_parameters
	726	USE cpulog
	727	USE indices
	728	USE interfaces
	729	USE pegrid
	730	USE transpose_indices
	731
	732	IMPLICIT NONE
	733
	734	INTEGER :: i, iend, iouter, ir, j, k
	735	INTEGER, PARAMETER :: stridex = 4
	736
	737	REAL, DIMENSION(0:ny,stridex) :: work_ffty
	738	#if defined( __nec )
	739	REAL, DIMENSION(0:ny+1,1:nz,nxl:nxr) :: work_ffty_vec
	740	#endif
[1003]	741	REAL, DIMENSION(nnx,1:nz,nys_x:nyn_x,pdims(1)) :: f_in
	742	REAL, DIMENSION(1:nz,0:ny,nxl:nxr) :: f_out
	743	REAL, DIMENSION(nxl:nxr,1:nz,0:ny) :: work
[1]	744
	745	!
	746	!-- Transpose array
	747	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
[622]	748	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
[1]	749	CALL MPI_ALLTOALL( f_in(1,1,nys_x,1), sendrecvcount_xy, MPI_REAL, &
	750	work(nxl,1,0), sendrecvcount_xy, MPI_REAL, &
	751	comm1dx, ierr )
	752	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
	753
	754	!
	755	!-- Resort the data in a way that y becomes the first index and carry out the
	756	!-- backward fft along y.
[1106]	757	CALL cpu_log( log_point_s(7), 'fft_y_1d', 'continue' )
[1]	758
	759	IF ( host(1:3) == 'nec' ) THEN
	760	#if defined( __nec )
	761	!
	762	!-- Code optimized for vector processors
[85]	763	!$OMP PARALLEL PRIVATE ( i, j, k )
[1]	764	!$OMP DO
	765	DO k = 1, nz
	766	DO j = 0, ny
	767	DO i = nxl, nxr
	768	work_ffty_vec(j,k,i) = work(i,k,j)
	769	ENDDO
	770	ENDDO
	771	ENDDO
	772
	773	!$OMP DO
	774	DO i = nxl, nxr
	775
	776	CALL fft_y_m( work_ffty_vec(:,:,i), ny+1, 'backward' )
	777
	778	DO j = 0, ny
	779	DO k = 1, nz
	780	f_out(k,j,i) = work_ffty_vec(j,k,i)
	781	ENDDO
	782	ENDDO
	783
	784	ENDDO
	785	!$OMP END PARALLEL
	786	#endif
	787
	788	ELSE
	789
	790	!
	791	!-- Cache optimized code.
	792	!-- The i-(x-)direction is split into a strided outer loop and an inner
	793	!-- loop for better cache performance
	794	!$OMP PARALLEL PRIVATE ( i, iend, iouter, ir, j, k, work_ffty )
	795	!$OMP DO
	796	DO iouter = nxl, nxr, stridex
	797
	798	iend = MIN( iouter+stridex-1, nxr ) ! Upper bound for inner i loop
	799
	800	DO k = 1, nz
	801	!
	802	!-- Resort
	803	DO j = 0, ny
	804	DO i = iouter, iend
	805	work_ffty(j,i-iouter+1) = work(i,k,j)
	806	ENDDO
	807	ENDDO
	808
	809	DO i = iouter, iend
	810
	811	!
	812	!-- FFT along y
	813	ir = i-iouter+1 ! counter within a stride
[1106]	814	CALL fft_y_1d( work_ffty(:,ir), 'backward' )
[1]	815
	816	DO j = 0, ny
	817	f_out(k,j,i) = work_ffty(j,ir)
	818	ENDDO
	819	ENDDO
	820
	821	ENDDO
	822
	823	ENDDO
	824	!$OMP END PARALLEL
	825
	826	ENDIF
	827
[1106]	828	CALL cpu_log( log_point_s(7), 'fft_y_1d', 'stop' )
[1]	829
	830	END SUBROUTINE tr_xy_ffty
	831
	832
	833	SUBROUTINE fftx_tri_fftx( ar )
	834
	835	!------------------------------------------------------------------------------!
	836	! FFT along x, solution of the tridiagonal system and backward FFT for
	837	! a 1d-decomposition along x
	838	!
	839	! WARNING: this subroutine may still not work for hybrid parallelization
	840	! with OpenMP (for possible necessary changes see the original
	841	! routine poisfft_hybrid, developed by Klaus Ketelsen, May 2002)
	842	!------------------------------------------------------------------------------!
	843
	844	USE control_parameters
	845	USE cpulog
	846	USE grid_variables
	847	USE indices
	848	USE interfaces
	849	USE pegrid
	850	USE transpose_indices
	851
	852	IMPLICIT NONE
	853
	854	INTEGER :: i, j, k, m, n, omp_get_thread_num, tn
	855
[1003]	856	REAL, DIMENSION(0:nx) :: work_fftx
	857	REAL, DIMENSION(0:nx,1:nz) :: work_trix
	858	REAL, DIMENSION(nnx,1:nz,nys_x:nyn_x,pdims(1)) :: ar
	859	REAL, DIMENSION(:,:,:,:), ALLOCATABLE :: tri
[1]	860
	861
[1106]	862	CALL cpu_log( log_point_s(33), 'fft_x_1d + tridia', 'start' )
[1]	863
	864	ALLOCATE( tri(5,0:nx,0:nz-1,0:threads_per_task-1) )
	865
	866	tn = 0 ! Default thread number in case of one thread
	867	!$OMP PARALLEL DO PRIVATE ( i, j, k, m, n, tn, work_fftx, work_trix )
	868	DO j = nys_x, nyn_x
	869
	870	!$ tn = omp_get_thread_num()
	871
	872	IF ( host(1:3) == 'nec' ) THEN
	873	!
	874	!-- Code optimized for vector processors
	875	DO k = 1, nz
	876
	877	m = 0
	878	DO n = 1, pdims(1)
[1003]	879	DO i = 1, nnx
[1]	880	work_trix(m,k) = ar(i,k,j,n)
	881	m = m + 1
	882	ENDDO
	883	ENDDO
	884
	885	ENDDO
	886
	887	CALL fft_x_m( work_trix, 'forward' )
	888
	889	ELSE
	890	!
	891	!-- Cache optimized code
	892	DO k = 1, nz
	893
	894	m = 0
	895	DO n = 1, pdims(1)
[1003]	896	DO i = 1, nnx
[1]	897	work_fftx(m) = ar(i,k,j,n)
	898	m = m + 1
	899	ENDDO
	900	ENDDO
	901
[1106]	902	CALL fft_x_1d( work_fftx, 'forward' )
[1]	903
	904	DO i = 0, nx
	905	work_trix(i,k) = work_fftx(i)
	906	ENDDO
	907
	908	ENDDO
	909
	910	ENDIF
	911
	912	!
	913	!-- Solve the linear equation system
	914	CALL tridia_1dd( ddx2, ddy2, nx, ny, j, work_trix, tri(:,:,:,tn) )
	915
	916	IF ( host(1:3) == 'nec' ) THEN
	917	!
	918	!-- Code optimized for vector processors
	919	CALL fft_x_m( work_trix, 'backward' )
	920
	921	DO k = 1, nz
	922
	923	m = 0
	924	DO n = 1, pdims(1)
[1003]	925	DO i = 1, nnx
[1]	926	ar(i,k,j,n) = work_trix(m,k)
	927	m = m + 1
	928	ENDDO
	929	ENDDO
	930
	931	ENDDO
	932
	933	ELSE
	934	!
	935	!-- Cache optimized code
	936	DO k = 1, nz
	937
	938	DO i = 0, nx
	939	work_fftx(i) = work_trix(i,k)
	940	ENDDO
	941
[1106]	942	CALL fft_x_1d( work_fftx, 'backward' )
[1]	943
	944	m = 0
	945	DO n = 1, pdims(1)
[1003]	946	DO i = 1, nnx
[1]	947	ar(i,k,j,n) = work_fftx(m)
	948	m = m + 1
	949	ENDDO
	950	ENDDO
	951
	952	ENDDO
	953
	954	ENDIF
	955
	956	ENDDO
	957
	958	DEALLOCATE( tri )
	959
[1106]	960	CALL cpu_log( log_point_s(33), 'fft_x_1d + tridia', 'stop' )
[1]	961
	962	END SUBROUTINE fftx_tri_fftx
	963
	964
	965	SUBROUTINE fftx_tr_xy( f_in, work, f_out )
	966
	967	!------------------------------------------------------------------------------!
	968	! Fourier-transformation along x with subsequent transposition x --> y for
	969	! a 1d-decomposition along y
	970	!
	971	! ATTENTION: The NEC-branch of this routine may significantly profit from
	972	! further optimizations. So far, performance is much worse than
	973	! for routine ffty_tr_yx (more than three times slower).
	974	!------------------------------------------------------------------------------!
	975
	976	USE control_parameters
	977	USE cpulog
	978	USE indices
	979	USE interfaces
	980	USE pegrid
	981	USE transpose_indices
	982
	983	IMPLICIT NONE
	984
	985	INTEGER :: i, j, k
	986
[1003]	987	REAL, DIMENSION(0:nx,1:nz,nys:nyn) :: work_fftx
	988	REAL, DIMENSION(1:nz,nys:nyn,0:nx) :: f_in
	989	REAL, DIMENSION(nny,1:nz,nxl_y:nxr_y,pdims(2)) :: f_out
	990	REAL, DIMENSION(nys:nyn,1:nz,0:nx) :: work
[1]	991
	992	!
	993	!-- Carry out the FFT along x, where all data are present due to the
	994	!-- 1d-decomposition along y. Resort the data in a way that y becomes
	995	!-- the first index.
[1106]	996	CALL cpu_log( log_point_s(4), 'fft_x_1d', 'start' )
[1]	997
	998	IF ( host(1:3) == 'nec' ) THEN
	999	!
	1000	!-- Code for vector processors
[85]	1001	!$OMP PARALLEL PRIVATE ( i, j, k )
[1]	1002	!$OMP DO
	1003	DO i = 0, nx
	1004
	1005	DO j = nys, nyn
	1006	DO k = 1, nz
	1007	work_fftx(i,k,j) = f_in(k,j,i)
	1008	ENDDO
	1009	ENDDO
	1010
	1011	ENDDO
	1012
	1013	!$OMP DO
	1014	DO j = nys, nyn
	1015
	1016	CALL fft_x_m( work_fftx(:,:,j), 'forward' )
	1017
	1018	DO k = 1, nz
	1019	DO i = 0, nx
	1020	work(j,k,i) = work_fftx(i,k,j)
	1021	ENDDO
	1022	ENDDO
	1023
	1024	ENDDO
	1025	!$OMP END PARALLEL
	1026
	1027	ELSE
	1028
	1029	!
	1030	!-- Cache optimized code (there might be still a potential for better
	1031	!-- optimization).
[696]	1032	!$OMP PARALLEL PRIVATE (i,j,k)
[1]	1033	!$OMP DO
	1034	DO i = 0, nx
	1035
	1036	DO j = nys, nyn
	1037	DO k = 1, nz
	1038	work_fftx(i,k,j) = f_in(k,j,i)
	1039	ENDDO
	1040	ENDDO
	1041
	1042	ENDDO
	1043
	1044	!$OMP DO
	1045	DO j = nys, nyn
	1046	DO k = 1, nz
	1047
[1106]	1048	CALL fft_x_1d( work_fftx(0:nx,k,j), 'forward' )
[1]	1049
	1050	DO i = 0, nx
	1051	work(j,k,i) = work_fftx(i,k,j)
	1052	ENDDO
	1053	ENDDO
	1054
	1055	ENDDO
	1056	!$OMP END PARALLEL
	1057
	1058	ENDIF
[1106]	1059	CALL cpu_log( log_point_s(4), 'fft_x_1d', 'pause' )
[1]	1060
	1061	!
	1062	!-- Transpose array
	1063	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
[622]	1064	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
[1]	1065	CALL MPI_ALLTOALL( work(nys,1,0), sendrecvcount_xy, MPI_REAL, &
	1066	f_out(1,1,nxl_y,1), sendrecvcount_xy, MPI_REAL, &
	1067	comm1dy, ierr )
	1068	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
	1069
	1070	END SUBROUTINE fftx_tr_xy
	1071
	1072
	1073	SUBROUTINE tr_yx_fftx( f_in, work, f_out )
	1074
	1075	!------------------------------------------------------------------------------!
	1076	! Transposition y --> x with a subsequent backward Fourier transformation for
	1077	! a 1d-decomposition along x
	1078	!------------------------------------------------------------------------------!
	1079
	1080	USE control_parameters
	1081	USE cpulog
	1082	USE indices
	1083	USE interfaces
	1084	USE pegrid
	1085	USE transpose_indices
	1086
	1087	IMPLICIT NONE
	1088
	1089	INTEGER :: i, j, k
	1090
[1003]	1091	REAL, DIMENSION(0:nx,1:nz,nys:nyn) :: work_fftx
	1092	REAL, DIMENSION(nny,1:nz,nxl_y:nxr_y,pdims(2)) :: f_in
	1093	REAL, DIMENSION(1:nz,nys:nyn,0:nx) :: f_out
	1094	REAL, DIMENSION(nys:nyn,1:nz,0:nx) :: work
[1]	1095
	1096	!
	1097	!-- Transpose array
	1098	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
[622]	1099	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
[1]	1100	CALL MPI_ALLTOALL( f_in(1,1,nxl_y,1), sendrecvcount_xy, MPI_REAL, &
	1101	work(nys,1,0), sendrecvcount_xy, MPI_REAL, &
	1102	comm1dy, ierr )
	1103	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
	1104
	1105	!
	1106	!-- Carry out the FFT along x, where all data are present due to the
	1107	!-- 1d-decomposition along y. Resort the data in a way that y becomes
	1108	!-- the first index.
[1106]	1109	CALL cpu_log( log_point_s(4), 'fft_x_1d', 'continue' )
[1]	1110
	1111	IF ( host(1:3) == 'nec' ) THEN
	1112	!
	1113	!-- Code optimized for vector processors
[85]	1114	!$OMP PARALLEL PRIVATE ( i, j, k )
[1]	1115	!$OMP DO
	1116	DO j = nys, nyn
	1117
	1118	DO k = 1, nz
	1119	DO i = 0, nx
	1120	work_fftx(i,k,j) = work(j,k,i)
	1121	ENDDO
	1122	ENDDO
	1123
	1124	CALL fft_x_m( work_fftx(:,:,j), 'backward' )
	1125
	1126	ENDDO
	1127
	1128	!$OMP DO
	1129	DO i = 0, nx
	1130	DO j = nys, nyn
	1131	DO k = 1, nz
	1132	f_out(k,j,i) = work_fftx(i,k,j)
	1133	ENDDO
	1134	ENDDO
	1135	ENDDO
	1136	!$OMP END PARALLEL
	1137
	1138	ELSE
	1139
	1140	!
	1141	!-- Cache optimized code (there might be still a potential for better
	1142	!-- optimization).
[696]	1143	!$OMP PARALLEL PRIVATE (i,j,k)
[1]	1144	!$OMP DO
	1145	DO j = nys, nyn
	1146	DO k = 1, nz
	1147
	1148	DO i = 0, nx
	1149	work_fftx(i,k,j) = work(j,k,i)
	1150	ENDDO
	1151
[1106]	1152	CALL fft_x_1d( work_fftx(0:nx,k,j), 'backward' )
[1]	1153
	1154	ENDDO
	1155	ENDDO
	1156
	1157	!$OMP DO
	1158	DO i = 0, nx
	1159	DO j = nys, nyn
	1160	DO k = 1, nz
	1161	f_out(k,j,i) = work_fftx(i,k,j)
	1162	ENDDO
	1163	ENDDO
	1164	ENDDO
	1165	!$OMP END PARALLEL
	1166
	1167	ENDIF
[1106]	1168	CALL cpu_log( log_point_s(4), 'fft_x_1d', 'stop' )
[1]	1169
	1170	END SUBROUTINE tr_yx_fftx
	1171
	1172
	1173	SUBROUTINE ffty_tri_ffty( ar )
	1174
	1175	!------------------------------------------------------------------------------!
	1176	! FFT along y, solution of the tridiagonal system and backward FFT for
	1177	! a 1d-decomposition along y
	1178	!
	1179	! WARNING: this subroutine may still not work for hybrid parallelization
	1180	! with OpenMP (for possible necessary changes see the original
	1181	! routine poisfft_hybrid, developed by Klaus Ketelsen, May 2002)
	1182	!------------------------------------------------------------------------------!
	1183
	1184	USE control_parameters
	1185	USE cpulog
	1186	USE grid_variables
	1187	USE indices
	1188	USE interfaces
	1189	USE pegrid
	1190	USE transpose_indices
	1191
	1192	IMPLICIT NONE
	1193
	1194	INTEGER :: i, j, k, m, n, omp_get_thread_num, tn
	1195
[1003]	1196	REAL, DIMENSION(0:ny) :: work_ffty
	1197	REAL, DIMENSION(0:ny,1:nz) :: work_triy
	1198	REAL, DIMENSION(nny,1:nz,nxl_y:nxr_y,pdims(2)) :: ar
	1199	REAL, DIMENSION(:,:,:,:), ALLOCATABLE :: tri
[1]	1200
	1201
[1106]	1202	CALL cpu_log( log_point_s(39), 'fft_y_1d + tridia', 'start' )
[1]	1203
	1204	ALLOCATE( tri(5,0:ny,0:nz-1,0:threads_per_task-1) )
	1205
	1206	tn = 0 ! Default thread number in case of one thread
[696]	1207	!$OMP PARALLEL DO PRIVATE ( i, j, k, m, n, tn, work_ffty, work_triy )
[1]	1208	DO i = nxl_y, nxr_y
	1209
	1210	!$ tn = omp_get_thread_num()
	1211
	1212	IF ( host(1:3) == 'nec' ) THEN
	1213	!
	1214	!-- Code optimized for vector processors
	1215	DO k = 1, nz
	1216
	1217	m = 0
	1218	DO n = 1, pdims(2)
[1003]	1219	DO j = 1, nny
[1]	1220	work_triy(m,k) = ar(j,k,i,n)
	1221	m = m + 1
	1222	ENDDO
	1223	ENDDO
	1224
	1225	ENDDO
	1226
	1227	CALL fft_y_m( work_triy, ny, 'forward' )
	1228
	1229	ELSE
	1230	!
	1231	!-- Cache optimized code
	1232	DO k = 1, nz
	1233
	1234	m = 0
	1235	DO n = 1, pdims(2)
[1003]	1236	DO j = 1, nny
[1]	1237	work_ffty(m) = ar(j,k,i,n)
	1238	m = m + 1
	1239	ENDDO
	1240	ENDDO
	1241
[1106]	1242	CALL fft_y_1d( work_ffty, 'forward' )
[1]	1243
	1244	DO j = 0, ny
	1245	work_triy(j,k) = work_ffty(j)
	1246	ENDDO
	1247
	1248	ENDDO
	1249
	1250	ENDIF
	1251
	1252	!
	1253	!-- Solve the linear equation system
	1254	CALL tridia_1dd( ddy2, ddx2, ny, nx, i, work_triy, tri(:,:,:,tn) )
	1255
	1256	IF ( host(1:3) == 'nec' ) THEN
	1257	!
	1258	!-- Code optimized for vector processors
	1259	CALL fft_y_m( work_triy, ny, 'backward' )
	1260
	1261	DO k = 1, nz
	1262
	1263	m = 0
	1264	DO n = 1, pdims(2)
[1003]	1265	DO j = 1, nny
[1]	1266	ar(j,k,i,n) = work_triy(m,k)
	1267	m = m + 1
	1268	ENDDO
	1269	ENDDO
	1270
	1271	ENDDO
	1272
	1273	ELSE
	1274	!
	1275	!-- Cache optimized code
	1276	DO k = 1, nz
	1277
	1278	DO j = 0, ny
	1279	work_ffty(j) = work_triy(j,k)
	1280	ENDDO
	1281
[1106]	1282	CALL fft_y_1d( work_ffty, 'backward' )
[1]	1283
	1284	m = 0
	1285	DO n = 1, pdims(2)
[1003]	1286	DO j = 1, nny
[1]	1287	ar(j,k,i,n) = work_ffty(m)
	1288	m = m + 1
	1289	ENDDO
	1290	ENDDO
	1291
	1292	ENDDO
	1293
	1294	ENDIF
	1295
	1296	ENDDO
	1297
	1298	DEALLOCATE( tri )
	1299
[1106]	1300	CALL cpu_log( log_point_s(39), 'fft_y_1d + tridia', 'stop' )
[1]	1301
	1302	END SUBROUTINE ffty_tri_ffty
	1303
	1304
	1305	SUBROUTINE tridia_1dd( ddx2, ddy2, nx, ny, j, ar, tri )
	1306
	1307	!------------------------------------------------------------------------------!
	1308	! Solves the linear system of equations for a 1d-decomposition along x (see
	1309	! tridia)
	1310	!
[940]	1311	! Attention: when using the intel compilers older than 12.0, array tri must
	1312	! be passed as an argument to the contained subroutines. Otherwise
	1313	! addres faults will occur. This feature can be activated with
	1314	! cpp-switch __intel11
[1]	1315	! On NEC, tri should not be passed (except for routine substi_1dd)
	1316	! because this causes very bad performance.
	1317	!------------------------------------------------------------------------------!
	1318
	1319	USE arrays_3d
	1320	USE control_parameters
	1321
	1322	USE pegrid
	1323
	1324	IMPLICIT NONE
	1325
	1326	INTEGER :: i, j, k, nnyh, nx, ny, omp_get_thread_num, tn
	1327
	1328	REAL :: ddx2, ddy2
	1329
	1330	REAL, DIMENSION(0:nx,1:nz) :: ar
	1331	REAL, DIMENSION(5,0:nx,0:nz-1) :: tri
	1332
	1333
	1334	nnyh = ( ny + 1 ) / 2
	1335
	1336	!
	1337	!-- Define constant elements of the tridiagonal matrix.
	1338	!-- The compiler on SX6 does loop exchange. If 0:nx is a high power of 2,
	1339	!-- the exchanged loops create bank conflicts. The following directive
	1340	!-- prohibits loop exchange and the loops perform much better.
	1341	! tn = omp_get_thread_num()
	1342	! WRITE( 120+tn, * ) '+++ id=',myid,' nx=',nx,' thread=', omp_get_thread_num()
[82]	1343	! CALL local_flush( 120+tn )
[1]	1344	!CDIR NOLOOPCHG
	1345	DO k = 0, nz-1
	1346	DO i = 0,nx
[667]	1347	tri(2,i,k) = ddzu_pres(k+1) * ddzw(k+1)
	1348	tri(3,i,k) = ddzu_pres(k+2) * ddzw(k+1)
[1]	1349	ENDDO
	1350	ENDDO
	1351	! WRITE( 120+tn, * ) '+++ id=',myid,' end of first tridia loop thread=', omp_get_thread_num()
[82]	1352	! CALL local_flush( 120+tn )
[1]	1353
	1354	IF ( j <= nnyh ) THEN
[940]	1355	#if defined( __intel11 )
[1]	1356	CALL maketri_1dd( j, tri )
	1357	#else
	1358	CALL maketri_1dd( j )
	1359	#endif
	1360	ELSE
[940]	1361	#if defined( __intel11 )
[1]	1362	CALL maketri_1dd( ny+1-j, tri )
	1363	#else
	1364	CALL maketri_1dd( ny+1-j )
	1365	#endif
	1366	ENDIF
[940]	1367	#if defined( __intel11 )
[1]	1368	CALL split_1dd( tri )
	1369	#else
	1370	CALL split_1dd
	1371	#endif
	1372	CALL substi_1dd( ar, tri )
	1373
	1374	CONTAINS
	1375
[940]	1376	#if defined( __intel11 )
[1]	1377	SUBROUTINE maketri_1dd( j, tri )
	1378	#else
	1379	SUBROUTINE maketri_1dd( j )
	1380	#endif
	1381
	1382	!------------------------------------------------------------------------------!
	1383	! computes the i- and j-dependent component of the matrix
	1384	!------------------------------------------------------------------------------!
	1385
	1386	USE constants
	1387
	1388	IMPLICIT NONE
	1389
	1390	INTEGER :: i, j, k, nnxh
	1391	REAL :: a, c
	1392
	1393	REAL, DIMENSION(0:nx) :: l
	1394
[940]	1395	#if defined( __intel11 )
[1]	1396	REAL, DIMENSION(5,0:nx,0:nz-1) :: tri
	1397	#endif
	1398
	1399
	1400	nnxh = ( nx + 1 ) / 2
	1401	!
	1402	!-- Provide the tridiagonal matrix for solution of the Poisson equation in
	1403	!-- Fourier space. The coefficients are computed following the method of
	1404	!-- Schmidt et al. (DFVLR-Mitteilung 84-15), which departs from Stephan
	1405	!-- Siano's original version by discretizing the Poisson equation,
	1406	!-- before it is Fourier-transformed
	1407	DO i = 0, nx
[128]	1408	IF ( i >= 0 .AND. i <= nnxh ) THEN
[1]	1409	l(i) = 2.0 * ( 1.0 - COS( ( 2.0 * pi * i ) / &
[1013]	1410	REAL( nx+1 ) ) ) * ddx2 + &
[1]	1411	2.0 * ( 1.0 - COS( ( 2.0 * pi * j ) / &
[1013]	1412	REAL( ny+1 ) ) ) * ddy2
[1]	1413	ELSE
	1414	l(i) = 2.0 * ( 1.0 - COS( ( 2.0 * pi * ( nx+1-i ) ) / &
[1013]	1415	REAL( nx+1 ) ) ) * ddx2 + &
[1]	1416	2.0 * ( 1.0 - COS( ( 2.0 * pi * j ) / &
[1013]	1417	REAL( ny+1 ) ) ) * ddy2
[1]	1418	ENDIF
	1419	ENDDO
	1420
	1421	DO k = 0, nz-1
	1422	DO i = 0, nx
[667]	1423	a = -1.0 * ddzu_pres(k+2) * ddzw(k+1)
	1424	c = -1.0 * ddzu_pres(k+1) * ddzw(k+1)
[1]	1425	tri(1,i,k) = a + c - l(i)
	1426	ENDDO
	1427	ENDDO
	1428	IF ( ibc_p_b == 1 .OR. ibc_p_b == 2 ) THEN
	1429	DO i = 0, nx
	1430	tri(1,i,0) = tri(1,i,0) + tri(2,i,0)
	1431	ENDDO
	1432	ENDIF
	1433	IF ( ibc_p_t == 1 ) THEN
	1434	DO i = 0, nx
	1435	tri(1,i,nz-1) = tri(1,i,nz-1) + tri(3,i,nz-1)
	1436	ENDDO
	1437	ENDIF
	1438
	1439	END SUBROUTINE maketri_1dd
	1440
	1441
[940]	1442	#if defined( __intel11 )
[1]	1443	SUBROUTINE split_1dd( tri )
	1444	#else
	1445	SUBROUTINE split_1dd
	1446	#endif
	1447
	1448	!------------------------------------------------------------------------------!
	1449	! Splitting of the tridiagonal matrix (Thomas algorithm)
	1450	!------------------------------------------------------------------------------!
	1451
	1452	IMPLICIT NONE
	1453
	1454	INTEGER :: i, k
	1455
[940]	1456	#if defined( __intel11 )
[1]	1457	REAL, DIMENSION(5,0:nx,0:nz-1) :: tri
	1458	#endif
	1459
	1460
	1461	!
	1462	!-- Splitting
	1463	DO i = 0, nx
	1464	tri(4,i,0) = tri(1,i,0)
	1465	ENDDO
	1466	DO k = 1, nz-1
	1467	DO i = 0, nx
	1468	tri(5,i,k) = tri(2,i,k) / tri(4,i,k-1)
	1469	tri(4,i,k) = tri(1,i,k) - tri(3,i,k-1) * tri(5,i,k)
	1470	ENDDO
	1471	ENDDO
	1472
	1473	END SUBROUTINE split_1dd
	1474
	1475
	1476	SUBROUTINE substi_1dd( ar, tri )
	1477
	1478	!------------------------------------------------------------------------------!
	1479	! Substitution (Forward and Backward) (Thomas algorithm)
	1480	!------------------------------------------------------------------------------!
	1481
	1482	IMPLICIT NONE
	1483
[76]	1484	INTEGER :: i, k
[1]	1485
	1486	REAL, DIMENSION(0:nx,nz) :: ar
	1487	REAL, DIMENSION(0:nx,0:nz-1) :: ar1
	1488	REAL, DIMENSION(5,0:nx,0:nz-1) :: tri
	1489
	1490	!
	1491	!-- Forward substitution
	1492	DO i = 0, nx
	1493	ar1(i,0) = ar(i,1)
	1494	ENDDO
	1495	DO k = 1, nz-1
	1496	DO i = 0, nx
	1497	ar1(i,k) = ar(i,k+1) - tri(5,i,k) * ar1(i,k-1)
	1498	ENDDO
	1499	ENDDO
	1500
	1501	!
	1502	!-- Backward substitution
[763]	1503	!-- Note, the add of 1.0E-20 in the denominator is due to avoid divisions
	1504	!-- by zero appearing if the pressure bc is set to neumann at the top of
	1505	!-- the model domain.
[1]	1506	DO i = 0, nx
[761]	1507	ar(i,nz) = ar1(i,nz-1) / ( tri(4,i,nz-1) + 1.0E-20 )
[1]	1508	ENDDO
	1509	DO k = nz-2, 0, -1
	1510	DO i = 0, nx
	1511	ar(i,k+1) = ( ar1(i,k) - tri(3,i,k) * ar(i,k+2) ) &
	1512	/ tri(4,i,k)
	1513	ENDDO
	1514	ENDDO
	1515
[76]	1516	!
	1517	!-- Indices i=0, j=0 correspond to horizontally averaged pressure.
	1518	!-- The respective values of ar should be zero at all k-levels if
	1519	!-- acceleration of horizontally averaged vertical velocity is zero.
	1520	IF ( ibc_p_b == 1 .AND. ibc_p_t == 1 ) THEN
	1521	IF ( j == 0 ) THEN
	1522	DO k = 1, nz
	1523	ar(0,k) = 0.0
	1524	ENDDO
	1525	ENDIF
	1526	ENDIF
	1527
[1]	1528	END SUBROUTINE substi_1dd
	1529
	1530	END SUBROUTINE tridia_1dd
	1531
	1532	#endif
[807]	1533	#endif
[1]	1534	END MODULE poisfft_mod

Note: See TracBrowser for help on using the repository browser.

Download in other formats:

| Impressum | ©Leibniz Universität Hannover |