Home

Context Navigation

source: palm/trunk/SOURCE/poisfft.f90 @ 1104

Last change on this file since 1104 was 1104, checked in by raasch, 11 years ago
last commit documented
Property svn:keywords set to `Id`
File size: 47.4 KB

Line
1	MODULE poisfft_mod
2
3	!--------------------------------------------------------------------------------!
4	! This file is part of PALM.
5	!
6	! PALM is free software: you can redistribute it and/or modify it under the terms
7	! of the GNU General Public License as published by the Free Software Foundation,
8	! either version 3 of the License, or (at your option) any later version.
9	!
10	! PALM is distributed in the hope that it will be useful, but WITHOUT ANY
11	! WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
12	! A PARTICULAR PURPOSE. See the GNU General Public License for more details.
13	!
14	! You should have received a copy of the GNU General Public License along with
15	! PALM. If not, see <http://www.gnu.org/licenses/>.
16	!
17	! Copyright 1997-2012 Leibniz University Hannover
18	!--------------------------------------------------------------------------------!
19	!
20	! Current revisions:
21	! -----------------
22	!
23	!
24	! Former revisions:
25	! -----------------
26	! $Id: poisfft.f90 1104 2013-02-20 02:37:12Z raasch $
27	!
28	! 1103 2013-02-20 02:15:53Z raasch
29	! tri, ar, and ar1 arguments in tridia-routines (2d) are removed because they
30	! sometimes cause segmentation faults with intel 12.1 compiler
31	!
32	! 1092 2013-02-02 11:24:22Z raasch
33	! unused variables removed
34	!
35	! 1036 2012-10-22 13:43:42Z raasch
36	! code put under GPL (PALM 3.9)
37	!
38	! 2012-09-21 07:03:55Z raasch
39	! FLOAT type conversion replaced by REAL
40	!
41	! 1003 2012-09-14 14:35:53Z raasch
42	! indices nxa, nya, etc. replaced by nx, ny, etc.
43	!
44	! 940 2012-07-09 14:31:00Z raasch
45	! special handling of tri-array as an argument in tridia_1dd routines switched
46	! off because it caused segmentation faults with intel 12.1 compiler
47	!
48	! 877 2012-04-03 11:21:44Z suehring
49	! Bugfix: Avoid divisions by zero in case of using a 'neumann' bc for the
50	! pressure at the top of the model domain.
51	!
52	! 809 2012-01-30 13:32:58Z maronga
53	! Bugfix: replaced .AND. and .NOT. with && and ! in the preprocessor directives
54	!
55	! 807 2012-01-25 11:53:51Z maronga
56	! New cpp directive "__check" implemented which is used by check_namelist_files
57	! (most of the code is unneeded by check_namelist_files).
58	!
59	! 763 2011-10-06 09:32:09Z suehring
60	! Comment added concerning the last change.
61	!
62	! 761 2011-10-05 17:58:52Z suehring
63	! Bugfix: Avoid divisions by zero in case of using a 'neumann' bc for the
64	! pressure at the top of the model domain.
65	!
66	! 696 2011-03-18 07:03:49Z raasch
67	! work_fftx removed from PRIVATE clauses in fftx_tr_xy and tr_yx_fftx
68	!
69	! 683 2011-02-09 14:25:15Z raasch
70	! openMP parallelization for 2d-domain-decomposition
71	!
72	! 667 2010-12-23 12:06:00Z suehring/gryschka
73	! ddzu replaced by ddzu_pres due to changes in zu(0)
74	!
75	! 622 2010-12-10 08:08:13Z raasch
76	! optional barriers included in order to speed up collective operations
77	!
78	! 377 2009-09-04 11:09:00Z raasch
79	! __lcmuk changed to __lc to avoid problems with Intel compiler on sgi-ice
80	!
81	! 164 2008-05-15 08:46:15Z raasch
82	! Arguments removed from transpose routines
83	!
84	! 128 2007-10-26 13:11:14Z raasch
85	! Bugfix: wavenumber calculation for even nx in routines maketri
86	!
87	! 85 2007-05-11 09:35:14Z raasch
88	! Bugfix: work_fft*_vec removed from some PRIVATE-declarations
89	!
90	! 76 2007-03-29 00:58:32Z raasch
91	! Tridiagonal coefficients adjusted for Neumann boundary conditions both at
92	! the bottom and the top.
93	!
94	! RCS Log replace by Id keyword, revision history cleaned up
95	!
96	! Revision 1.24 2006/08/04 15:00:24 raasch
97	! Default setting of the thread number tn in case of not using OpenMP
98	!
99	! Revision 1.23 2006/02/23 12:48:38 raasch
100	! Additional compiler directive in routine tridia_1dd for preventing loop
101	! exchange on NEC-SX6
102	!
103	! Revision 1.20 2004/04/30 12:38:09 raasch
104	! Parts of former poisfft_hybrid moved to this subroutine,
105	! former subroutine changed to a module, renaming of FFT-subroutines and
106	! -module, FFTs completely substituted by calls of fft_x and fft_y,
107	! NAG fft used in the non-parallel case completely removed, l in maketri
108	! is now a 1d-array, variables passed by modules instead of using parameter
109	! lists, enlarged transposition arrays introduced
110	!
111	! Revision 1.1 1997/07/24 11:24:14 raasch
112	! Initial revision
113	!
114	!
115	! Description:
116	! ------------
117	! See below.
118	!------------------------------------------------------------------------------!
119
120	!--------------------------------------------------------------------------!
121	! poisfft !
122	! !
123	! Original version: Stephan Siano (pois3d) !
124	! !
125	! Institute of Meteorology and Climatology, University of Hannover !
126	! Germany !
127	! !
128	! Version as of July 23,1996 !
129	! !
130	! !
131	! Version for parallel computers: Siegfried Raasch !
132	! !
133	! Version as of July 03,1997 !
134	! !
135	! Solves the Poisson equation with a 2D spectral method !
136	! d^2 p / dx^2 + d^2 p / dy^2 + d^2 p / dz^2 = s !
137	! !
138	! Input: !
139	! real ar contains in the (nnx,nny,nnz) elements, !
140	! starting from the element (1,nys,nxl), the !
141	! values for s !
142	! real work Temporary array !
143	! !
144	! Output: !
145	! real ar contains the solution for p !
146	!--------------------------------------------------------------------------!
147
148	USE fft_xy
149	USE indices
150	USE transpose_indices
151
152	IMPLICIT NONE
153
154	PRIVATE
155
156	#if ! defined ( __check )
157	PUBLIC poisfft, poisfft_init
158
159	INTERFACE poisfft
160	MODULE PROCEDURE poisfft
161	END INTERFACE poisfft
162
163	INTERFACE poisfft_init
164	MODULE PROCEDURE poisfft_init
165	END INTERFACE poisfft_init
166	#else
167	PUBLIC poisfft_init
168
169	INTERFACE poisfft_init
170	MODULE PROCEDURE poisfft_init
171	END INTERFACE poisfft_init
172	#endif
173
174	CONTAINS
175
176	SUBROUTINE poisfft_init
177
178	CALL fft_init
179
180	END SUBROUTINE poisfft_init
181
182	#if ! defined ( __check )
183	SUBROUTINE poisfft( ar, work )
184
185	USE cpulog
186	USE interfaces
187	USE pegrid
188
189	IMPLICIT NONE
190
191	REAL, DIMENSION(1:nz,nys:nyn,nxl:nxr) :: ar, work
192
193
194	CALL cpu_log( log_point_s(3), 'poisfft', 'start' )
195
196	!
197	!-- Two-dimensional Fourier Transformation in x- and y-direction.
198	#if defined( __parallel )
199	IF ( pdims(2) == 1 ) THEN
200
201	!
202	!-- 1d-domain-decomposition along x:
203	!-- FFT along y and transposition y --> x
204	CALL ffty_tr_yx( ar, work, ar )
205
206	!
207	!-- FFT along x, solving the tridiagonal system and backward FFT
208	CALL fftx_tri_fftx( ar )
209
210	!
211	!-- Transposition x --> y and backward FFT along y
212	CALL tr_xy_ffty( ar, work, ar )
213
214	ELSEIF ( pdims(1) == 1 ) THEN
215
216	!
217	!-- 1d-domain-decomposition along y:
218	!-- FFT along x and transposition x --> y
219	CALL fftx_tr_xy( ar, work, ar )
220
221	!
222	!-- FFT along y, solving the tridiagonal system and backward FFT
223	CALL ffty_tri_ffty( ar )
224
225	!
226	!-- Transposition y --> x and backward FFT along x
227	CALL tr_yx_fftx( ar, work, ar )
228
229	ELSE
230
231	!
232	!-- 2d-domain-decomposition
233	!-- Transposition z --> x
234	CALL cpu_log( log_point_s(5), 'transpo forward', 'start' )
235	CALL transpose_zx( ar, work, ar )
236	CALL cpu_log( log_point_s(5), 'transpo forward', 'pause' )
237
238	CALL cpu_log( log_point_s(4), 'fft_x', 'start' )
239	CALL fftxp( ar, 'forward' )
240	CALL cpu_log( log_point_s(4), 'fft_x', 'pause' )
241
242	!
243	!-- Transposition x --> y
244	CALL cpu_log( log_point_s(5), 'transpo forward', 'continue' )
245	CALL transpose_xy( ar, work, ar )
246	CALL cpu_log( log_point_s(5), 'transpo forward', 'pause' )
247
248	CALL cpu_log( log_point_s(7), 'fft_y', 'start' )
249	CALL fftyp( ar, 'forward' )
250	CALL cpu_log( log_point_s(7), 'fft_y', 'pause' )
251
252	!
253	!-- Transposition y --> z
254	CALL cpu_log( log_point_s(5), 'transpo forward', 'continue' )
255	CALL transpose_yz( ar, work, ar )
256	CALL cpu_log( log_point_s(5), 'transpo forward', 'stop' )
257
258	!
259	!-- Solve the Poisson equation in z-direction in cartesian space.
260	CALL cpu_log( log_point_s(6), 'tridia', 'start' )
261	CALL tridia( ar )
262	CALL cpu_log( log_point_s(6), 'tridia', 'stop' )
263
264	!
265	!-- Inverse Fourier Transformation
266	!-- Transposition z --> y
267	CALL cpu_log( log_point_s(8), 'transpo invers', 'start' )
268	CALL transpose_zy( ar, work, ar )
269	CALL cpu_log( log_point_s(8), 'transpo invers', 'pause' )
270
271	CALL cpu_log( log_point_s(7), 'fft_y', 'continue' )
272	CALL fftyp( ar, 'backward' )
273	CALL cpu_log( log_point_s(7), 'fft_y', 'stop' )
274
275	!
276	!-- Transposition y --> x
277	CALL cpu_log( log_point_s(8), 'transpo invers', 'continue' )
278	CALL transpose_yx( ar, work, ar )
279	CALL cpu_log( log_point_s(8), 'transpo invers', 'pause' )
280
281	CALL cpu_log( log_point_s(4), 'fft_x', 'continue' )
282	CALL fftxp( ar, 'backward' )
283	CALL cpu_log( log_point_s(4), 'fft_x', 'stop' )
284
285	!
286	!-- Transposition x --> z
287	CALL cpu_log( log_point_s(8), 'transpo invers', 'continue' )
288	CALL transpose_xz( ar, work, ar )
289	CALL cpu_log( log_point_s(8), 'transpo invers', 'stop' )
290
291	ENDIF
292
293	#else
294
295	!
296	!-- Two-dimensional Fourier Transformation along x- and y-direction.
297	CALL cpu_log( log_point_s(4), 'fft_x', 'start' )
298	CALL fftx( ar, 'forward' )
299	CALL cpu_log( log_point_s(4), 'fft_x', 'pause' )
300	CALL cpu_log( log_point_s(7), 'fft_y', 'start' )
301	CALL ffty( ar, 'forward' )
302	CALL cpu_log( log_point_s(7), 'fft_y', 'pause' )
303
304	!
305	!-- Solve the Poisson equation in z-direction in cartesian space.
306	CALL cpu_log( log_point_s(6), 'tridia', 'start' )
307	CALL tridia( ar )
308	CALL cpu_log( log_point_s(6), 'tridia', 'stop' )
309
310	!
311	!-- Inverse Fourier Transformation.
312	CALL cpu_log( log_point_s(7), 'fft_y', 'continue' )
313	CALL ffty( ar, 'backward' )
314	CALL cpu_log( log_point_s(7), 'fft_y', 'stop' )
315	CALL cpu_log( log_point_s(4), 'fft_x', 'continue' )
316	CALL fftx( ar, 'backward' )
317	CALL cpu_log( log_point_s(4), 'fft_x', 'stop' )
318
319	#endif
320
321	CALL cpu_log( log_point_s(3), 'poisfft', 'stop' )
322
323	END SUBROUTINE poisfft
324
325
326
327	SUBROUTINE tridia( ar )
328
329	!------------------------------------------------------------------------------!
330	! solves the linear system of equations:
331	!
332	! -(4 pi^2(i^2/(dx^2nnx^2)+j^2/(dy^2nny^2))+
333	! 1/(dzu(k)dzw(k))+1/(dzu(k-1)dzw(k)))*p(i,j,k)+
334	! 1/(dzu(k)dzw(k))p(i,j,k+1)+1/(dzu(k-1)dzw(k))p(i,j,k-1)=d(i,j,k)
335	!
336	! by using the Thomas algorithm
337	!------------------------------------------------------------------------------!
338
339	USE arrays_3d
340
341	IMPLICIT NONE
342
343	INTEGER :: i, j, k, nnyh
344
345	REAL, DIMENSION(nxl_z:nxr_z,0:nz-1) :: ar1
346	REAL, DIMENSION(5,nxl_z:nxr_z,0:nz-1) :: tri
347
348	#if defined( __parallel )
349	REAL :: ar(nxl_z:nxr_z,nys_z:nyn_z,1:nz)
350	#else
351	REAL :: ar(1:nz,nys_z:nyn_z,nxl_z:nxr_z)
352	#endif
353
354
355	nnyh = (ny+1) / 2
356
357	!
358	!-- Define constant elements of the tridiagonal matrix.
359	!$OMP PARALLEL PRIVATE ( k, i )
360	!$OMP DO
361	DO k = 0, nz-1
362	DO i = nxl_z, nxr_z
363	tri(2,i,k) = ddzu_pres(k+1) * ddzw(k+1)
364	tri(3,i,k) = ddzu_pres(k+2) * ddzw(k+1)
365	ENDDO
366	ENDDO
367	!$OMP END PARALLEL
368
369	#if defined( __parallel )
370	!
371	!-- Repeat for all y-levels.
372	!$OMP PARALLEL FIRSTPRIVATE( tri ) PRIVATE ( ar1, j )
373	!$OMP DO
374	DO j = nys_z, nyn_z
375	IF ( j <= nnyh ) THEN
376	CALL maketri( j )
377	ELSE
378	CALL maketri( ny+1-j )
379	ENDIF
380	CALL split
381	CALL substi( j )
382	ENDDO
383	!$OMP END PARALLEL
384	#else
385	!
386	!-- First y-level.
387	CALL maketri( nys_z )
388	CALL split
389	CALL substi( 0 )
390
391	!
392	!-- Further y-levels.
393	DO j = 1, nnyh - 1
394	CALL maketri( j )
395	CALL split
396	CALL substi( j )
397	CALL substi( ny+1-j )
398	ENDDO
399	CALL maketri( nnyh )
400	CALL split
401	CALL substi( nnyh+nys )
402	#endif
403
404	CONTAINS
405
406	SUBROUTINE maketri( j )
407
408	!------------------------------------------------------------------------------!
409	! Computes the i- and j-dependent component of the matrix
410	!------------------------------------------------------------------------------!
411
412	USE arrays_3d
413	USE constants
414	USE control_parameters
415	USE grid_variables
416
417	IMPLICIT NONE
418
419	INTEGER :: i, j, k, nnxh
420	REAL :: a, c
421	REAL :: ll(nxl_z:nxr_z)
422
423
424	nnxh = ( nx + 1 ) / 2
425
426	!
427	!-- Provide the tridiagonal matrix for solution of the Poisson equation in
428	!-- Fourier space. The coefficients are computed following the method of
429	!-- Schmidt et al. (DFVLR-Mitteilung 84-15), which departs from Stephan
430	!-- Siano's original version by discretizing the Poisson equation,
431	!-- before it is Fourier-transformed
432	#if defined( __parallel )
433	DO i = nxl_z, nxr_z
434	IF ( i >= 0 .AND. i <= nnxh ) THEN
435	ll(i) = 2.0 * ( 1.0 - COS( ( 2.0 * pi * i ) / &
436	REAL( nx+1 ) ) ) / ( dx * dx ) + &
437	2.0 * ( 1.0 - COS( ( 2.0 * pi * j ) / &
438	REAL( ny+1 ) ) ) / ( dy * dy )
439	ELSE
440	ll(i) = 2.0 * ( 1.0 - COS( ( 2.0 * pi * ( nx+1-i ) ) / &
441	REAL( nx+1 ) ) ) / ( dx * dx ) + &
442	2.0 * ( 1.0 - COS( ( 2.0 * pi * j ) / &
443	REAL( ny+1 ) ) ) / ( dy * dy )
444	ENDIF
445	DO k = 0,nz-1
446	a = -1.0 * ddzu_pres(k+2) * ddzw(k+1)
447	c = -1.0 * ddzu_pres(k+1) * ddzw(k+1)
448	tri(1,i,k) = a + c - ll(i)
449	ENDDO
450	ENDDO
451	#else
452	DO i = 0, nnxh
453	ll(i) = 2.0 * ( 1.0 - COS( ( 2.0 * pi * i ) / REAL( nx+1 ) ) ) / &
454	( dx * dx ) + &
455	2.0 * ( 1.0 - COS( ( 2.0 * pi * j ) / REAL( ny+1 ) ) ) / &
456	( dy * dy )
457	DO k = 0, nz-1
458	a = -1.0 * ddzu_pres(k+2) * ddzw(k+1)
459	c = -1.0 * ddzu_pres(k+1) * ddzw(k+1)
460	tri(1,i,k) = a + c - ll(i)
461	IF ( i >= 1 .and. i < nnxh ) THEN
462	tri(1,nx+1-i,k) = tri(1,i,k)
463	ENDIF
464	ENDDO
465	ENDDO
466	#endif
467	IF ( ibc_p_b == 1 .OR. ibc_p_b == 2 ) THEN
468	DO i = nxl_z, nxr_z
469	tri(1,i,0) = tri(1,i,0) + tri(2,i,0)
470	ENDDO
471	ENDIF
472	IF ( ibc_p_t == 1 ) THEN
473	DO i = nxl_z, nxr_z
474	tri(1,i,nz-1) = tri(1,i,nz-1) + tri(3,i,nz-1)
475	ENDDO
476	ENDIF
477
478	END SUBROUTINE maketri
479
480
481	SUBROUTINE substi( j )
482
483	!------------------------------------------------------------------------------!
484	! Substitution (Forward and Backward) (Thomas algorithm)
485	!------------------------------------------------------------------------------!
486
487	USE control_parameters
488
489	IMPLICIT NONE
490
491	INTEGER :: i, j, k
492
493	!
494	!-- Forward substitution.
495	DO i = nxl_z, nxr_z
496	#if defined( __parallel )
497	ar1(i,0) = ar(i,j,1)
498	#else
499	ar1(i,0) = ar(1,j,i)
500	#endif
501	ENDDO
502	DO k = 1, nz - 1
503	DO i = nxl_z, nxr_z
504	#if defined( __parallel )
505	ar1(i,k) = ar(i,j,k+1) - tri(5,i,k) * ar1(i,k-1)
506	#else
507	ar1(i,k) = ar(k+1,j,i) - tri(5,i,k) * ar1(i,k-1)
508	#endif
509	ENDDO
510	ENDDO
511
512	!
513	!-- Backward substitution
514	!-- Note, the 1.0E-20 in the denominator is due to avoid divisions
515	!-- by zero appearing if the pressure bc is set to neumann at the top of
516	!-- the model domain.
517	DO i = nxl_z, nxr_z
518	#if defined( __parallel )
519	ar(i,j,nz) = ar1(i,nz-1) / ( tri(4,i,nz-1) + 1.0E-20 )
520	#else
521	ar(nz,j,i) = ar1(i,nz-1) / ( tri(4,i,nz-1) + 1.0E-20 )
522	#endif
523	ENDDO
524	DO k = nz-2, 0, -1
525	DO i = nxl_z, nxr_z
526	#if defined( __parallel )
527	ar(i,j,k+1) = ( ar1(i,k) - tri(3,i,k) * ar(i,j,k+2) ) &
528	/ tri(4,i,k)
529	#else
530	ar(k+1,j,i) = ( ar1(i,k) - tri(3,i,k) * ar(k+2,j,i) ) &
531	/ tri(4,i,k)
532	#endif
533	ENDDO
534	ENDDO
535
536	!
537	!-- Indices i=0, j=0 correspond to horizontally averaged pressure.
538	!-- The respective values of ar should be zero at all k-levels if
539	!-- acceleration of horizontally averaged vertical velocity is zero.
540	IF ( ibc_p_b == 1 .AND. ibc_p_t == 1 ) THEN
541	IF ( j == 0 .AND. nxl_z == 0 ) THEN
542	#if defined( __parallel )
543	DO k = 1, nz
544	ar(nxl_z,j,k) = 0.0
545	ENDDO
546	#else
547	DO k = 1, nz
548	ar(k,j,nxl_z) = 0.0
549	ENDDO
550	#endif
551	ENDIF
552	ENDIF
553
554	END SUBROUTINE substi
555
556
557	SUBROUTINE split
558
559	!------------------------------------------------------------------------------!
560	! Splitting of the tridiagonal matrix (Thomas algorithm)
561	!------------------------------------------------------------------------------!
562
563	IMPLICIT NONE
564
565	INTEGER :: i, k
566
567	!
568	!-- Splitting.
569	DO i = nxl_z, nxr_z
570	tri(4,i,0) = tri(1,i,0)
571	ENDDO
572	DO k = 1, nz-1
573	DO i = nxl_z, nxr_z
574	tri(5,i,k) = tri(2,i,k) / tri(4,i,k-1)
575	tri(4,i,k) = tri(1,i,k) - tri(3,i,k-1) * tri(5,i,k)
576	ENDDO
577	ENDDO
578
579	END SUBROUTINE split
580
581	END SUBROUTINE tridia
582
583
584	#if defined( __parallel )
585	SUBROUTINE fftxp( ar, direction )
586
587	!------------------------------------------------------------------------------!
588	! Fourier-transformation along x-direction Parallelized version
589	!------------------------------------------------------------------------------!
590
591	IMPLICIT NONE
592
593	CHARACTER (LEN=*) :: direction
594	INTEGER :: j, k
595	REAL :: ar(0:nx,nys_x:nyn_x,nzb_x:nzt_x)
596
597	!
598	!-- Performing the fft with one of the methods implemented
599	!$OMP PARALLEL PRIVATE ( j, k )
600	!$OMP DO
601	DO k = nzb_x, nzt_x
602	DO j = nys_x, nyn_x
603	CALL fft_x( ar(0:nx,j,k), direction )
604	ENDDO
605	ENDDO
606	!$OMP END PARALLEL
607
608	END SUBROUTINE fftxp
609
610	#else
611	SUBROUTINE fftx( ar, direction )
612
613	!------------------------------------------------------------------------------!
614	! Fourier-transformation along x-direction Non parallel version
615	!------------------------------------------------------------------------------!
616
617	IMPLICIT NONE
618
619	CHARACTER (LEN=*) :: direction
620	INTEGER :: i, j, k
621	REAL :: ar(1:nz,0:ny,0:nx)
622
623	!
624	!-- Performing the fft with one of the methods implemented
625	!$OMP PARALLEL PRIVATE ( j, k )
626	!$OMP DO
627	DO k = 1, nz
628	DO j = 0, ny
629	CALL fft_x( ar(k,j,0:nx), direction )
630	ENDDO
631	ENDDO
632	!$OMP END PARALLEL
633
634	END SUBROUTINE fftx
635	#endif
636
637
638	#if defined( __parallel )
639	SUBROUTINE fftyp( ar, direction )
640
641	!------------------------------------------------------------------------------!
642	! Fourier-transformation along y-direction Parallelized version
643	!------------------------------------------------------------------------------!
644
645	IMPLICIT NONE
646
647	CHARACTER (LEN=*) :: direction
648	INTEGER :: i, k
649	REAL :: ar(0:ny,nxl_y:nxr_y,nzb_y:nzt_y)
650
651	!
652	!-- Performing the fft with one of the methods implemented
653	!$OMP PARALLEL PRIVATE ( i, k )
654	!$OMP DO
655	DO k = nzb_y, nzt_y
656	DO i = nxl_y, nxr_y
657	CALL fft_y( ar(0:ny,i,k), direction )
658	ENDDO
659	ENDDO
660	!$OMP END PARALLEL
661
662	END SUBROUTINE fftyp
663
664	#else
665	SUBROUTINE ffty( ar, direction )
666
667	!------------------------------------------------------------------------------!
668	! Fourier-transformation along y-direction Non parallel version
669	!------------------------------------------------------------------------------!
670
671	IMPLICIT NONE
672
673	CHARACTER (LEN=*) :: direction
674	INTEGER :: i, k
675	REAL :: ar(1:nz,0:ny,0:nx)
676
677	!
678	!-- Performing the fft with one of the methods implemented
679	!$OMP PARALLEL PRIVATE ( i, k )
680	!$OMP DO
681	DO k = 1, nz
682	DO i = 0, nx
683	CALL fft_y( ar(k,0:ny,i), direction )
684	ENDDO
685	ENDDO
686	!$OMP END PARALLEL
687
688	END SUBROUTINE ffty
689	#endif
690
691	#if defined( __parallel )
692	SUBROUTINE ffty_tr_yx( f_in, work, f_out )
693
694	!------------------------------------------------------------------------------!
695	! Fourier-transformation along y with subsequent transposition y --> x for
696	! a 1d-decomposition along x
697	!
698	! ATTENTION: The performance of this routine is much faster on the NEC-SX6,
699	! if the first index of work_ffty_vec is odd. Otherwise
700	! memory bank conflicts may occur (especially if the index is a
701	! multiple of 128). That's why work_ffty_vec is dimensioned as
702	! 0:ny+1.
703	! Of course, this will not work if users are using an odd number
704	! of gridpoints along y.
705	!------------------------------------------------------------------------------!
706
707	USE control_parameters
708	USE cpulog
709	USE indices
710	USE interfaces
711	USE pegrid
712	USE transpose_indices
713
714	IMPLICIT NONE
715
716	INTEGER :: i, iend, iouter, ir, j, k
717	INTEGER, PARAMETER :: stridex = 4
718
719	REAL, DIMENSION(0:ny,stridex) :: work_ffty
720	#if defined( __nec )
721	REAL, DIMENSION(0:ny+1,1:nz,nxl:nxr) :: work_ffty_vec
722	#endif
723	REAL, DIMENSION(1:nz,0:ny,nxl:nxr) :: f_in
724	REAL, DIMENSION(nnx,1:nz,nys_x:nyn_x,pdims(1)) :: f_out
725	REAL, DIMENSION(nxl:nxr,1:nz,0:ny) :: work
726
727	!
728	!-- Carry out the FFT along y, where all data are present due to the
729	!-- 1d-decomposition along x. Resort the data in a way that x becomes
730	!-- the first index.
731	CALL cpu_log( log_point_s(7), 'fft_y', 'start' )
732
733	IF ( host(1:3) == 'nec' ) THEN
734	#if defined( __nec )
735	!
736	!-- Code optimized for vector processors
737	!$OMP PARALLEL PRIVATE ( i, j, k )
738	!$OMP DO
739	DO i = nxl, nxr
740
741	DO j = 0, ny
742	DO k = 1, nz
743	work_ffty_vec(j,k,i) = f_in(k,j,i)
744	ENDDO
745	ENDDO
746
747	CALL fft_y_m( work_ffty_vec(:,:,i), ny+1, 'forward' )
748
749	ENDDO
750
751	!$OMP DO
752	DO k = 1, nz
753	DO j = 0, ny
754	DO i = nxl, nxr
755	work(i,k,j) = work_ffty_vec(j,k,i)
756	ENDDO
757	ENDDO
758	ENDDO
759	!$OMP END PARALLEL
760	#endif
761
762	ELSE
763
764	!
765	!-- Cache optimized code.
766	!-- The i-(x-)direction is split into a strided outer loop and an inner
767	!-- loop for better cache performance
768	!$OMP PARALLEL PRIVATE (i,iend,iouter,ir,j,k,work_ffty)
769	!$OMP DO
770	DO iouter = nxl, nxr, stridex
771
772	iend = MIN( iouter+stridex-1, nxr ) ! Upper bound for inner i loop
773
774	DO k = 1, nz
775
776	DO i = iouter, iend
777
778	ir = i-iouter+1 ! counter within a stride
779	DO j = 0, ny
780	work_ffty(j,ir) = f_in(k,j,i)
781	ENDDO
782	!
783	!-- FFT along y
784	CALL fft_y( work_ffty(:,ir), 'forward' )
785
786	ENDDO
787
788	!
789	!-- Resort
790	DO j = 0, ny
791	DO i = iouter, iend
792	work(i,k,j) = work_ffty(j,i-iouter+1)
793	ENDDO
794	ENDDO
795
796	ENDDO
797
798	ENDDO
799	!$OMP END PARALLEL
800
801	ENDIF
802	CALL cpu_log( log_point_s(7), 'fft_y', 'pause' )
803
804	!
805	!-- Transpose array
806	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
807	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
808	CALL MPI_ALLTOALL( work(nxl,1,0), sendrecvcount_xy, MPI_REAL, &
809	f_out(1,1,nys_x,1), sendrecvcount_xy, MPI_REAL, &
810	comm1dx, ierr )
811	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
812
813	END SUBROUTINE ffty_tr_yx
814
815
816	SUBROUTINE tr_xy_ffty( f_in, work, f_out )
817
818	!------------------------------------------------------------------------------!
819	! Transposition x --> y with a subsequent backward Fourier transformation for
820	! a 1d-decomposition along x
821	!------------------------------------------------------------------------------!
822
823	USE control_parameters
824	USE cpulog
825	USE indices
826	USE interfaces
827	USE pegrid
828	USE transpose_indices
829
830	IMPLICIT NONE
831
832	INTEGER :: i, iend, iouter, ir, j, k
833	INTEGER, PARAMETER :: stridex = 4
834
835	REAL, DIMENSION(0:ny,stridex) :: work_ffty
836	#if defined( __nec )
837	REAL, DIMENSION(0:ny+1,1:nz,nxl:nxr) :: work_ffty_vec
838	#endif
839	REAL, DIMENSION(nnx,1:nz,nys_x:nyn_x,pdims(1)) :: f_in
840	REAL, DIMENSION(1:nz,0:ny,nxl:nxr) :: f_out
841	REAL, DIMENSION(nxl:nxr,1:nz,0:ny) :: work
842
843	!
844	!-- Transpose array
845	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
846	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
847	CALL MPI_ALLTOALL( f_in(1,1,nys_x,1), sendrecvcount_xy, MPI_REAL, &
848	work(nxl,1,0), sendrecvcount_xy, MPI_REAL, &
849	comm1dx, ierr )
850	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
851
852	!
853	!-- Resort the data in a way that y becomes the first index and carry out the
854	!-- backward fft along y.
855	CALL cpu_log( log_point_s(7), 'fft_y', 'continue' )
856
857	IF ( host(1:3) == 'nec' ) THEN
858	#if defined( __nec )
859	!
860	!-- Code optimized for vector processors
861	!$OMP PARALLEL PRIVATE ( i, j, k )
862	!$OMP DO
863	DO k = 1, nz
864	DO j = 0, ny
865	DO i = nxl, nxr
866	work_ffty_vec(j,k,i) = work(i,k,j)
867	ENDDO
868	ENDDO
869	ENDDO
870
871	!$OMP DO
872	DO i = nxl, nxr
873
874	CALL fft_y_m( work_ffty_vec(:,:,i), ny+1, 'backward' )
875
876	DO j = 0, ny
877	DO k = 1, nz
878	f_out(k,j,i) = work_ffty_vec(j,k,i)
879	ENDDO
880	ENDDO
881
882	ENDDO
883	!$OMP END PARALLEL
884	#endif
885
886	ELSE
887
888	!
889	!-- Cache optimized code.
890	!-- The i-(x-)direction is split into a strided outer loop and an inner
891	!-- loop for better cache performance
892	!$OMP PARALLEL PRIVATE ( i, iend, iouter, ir, j, k, work_ffty )
893	!$OMP DO
894	DO iouter = nxl, nxr, stridex
895
896	iend = MIN( iouter+stridex-1, nxr ) ! Upper bound for inner i loop
897
898	DO k = 1, nz
899	!
900	!-- Resort
901	DO j = 0, ny
902	DO i = iouter, iend
903	work_ffty(j,i-iouter+1) = work(i,k,j)
904	ENDDO
905	ENDDO
906
907	DO i = iouter, iend
908
909	!
910	!-- FFT along y
911	ir = i-iouter+1 ! counter within a stride
912	CALL fft_y( work_ffty(:,ir), 'backward' )
913
914	DO j = 0, ny
915	f_out(k,j,i) = work_ffty(j,ir)
916	ENDDO
917	ENDDO
918
919	ENDDO
920
921	ENDDO
922	!$OMP END PARALLEL
923
924	ENDIF
925
926	CALL cpu_log( log_point_s(7), 'fft_y', 'stop' )
927
928	END SUBROUTINE tr_xy_ffty
929
930
931	SUBROUTINE fftx_tri_fftx( ar )
932
933	!------------------------------------------------------------------------------!
934	! FFT along x, solution of the tridiagonal system and backward FFT for
935	! a 1d-decomposition along x
936	!
937	! WARNING: this subroutine may still not work for hybrid parallelization
938	! with OpenMP (for possible necessary changes see the original
939	! routine poisfft_hybrid, developed by Klaus Ketelsen, May 2002)
940	!------------------------------------------------------------------------------!
941
942	USE control_parameters
943	USE cpulog
944	USE grid_variables
945	USE indices
946	USE interfaces
947	USE pegrid
948	USE transpose_indices
949
950	IMPLICIT NONE
951
952	INTEGER :: i, j, k, m, n, omp_get_thread_num, tn
953
954	REAL, DIMENSION(0:nx) :: work_fftx
955	REAL, DIMENSION(0:nx,1:nz) :: work_trix
956	REAL, DIMENSION(nnx,1:nz,nys_x:nyn_x,pdims(1)) :: ar
957	REAL, DIMENSION(:,:,:,:), ALLOCATABLE :: tri
958
959
960	CALL cpu_log( log_point_s(33), 'fft_x + tridia', 'start' )
961
962	ALLOCATE( tri(5,0:nx,0:nz-1,0:threads_per_task-1) )
963
964	tn = 0 ! Default thread number in case of one thread
965	!$OMP PARALLEL DO PRIVATE ( i, j, k, m, n, tn, work_fftx, work_trix )
966	DO j = nys_x, nyn_x
967
968	!$ tn = omp_get_thread_num()
969
970	IF ( host(1:3) == 'nec' ) THEN
971	!
972	!-- Code optimized for vector processors
973	DO k = 1, nz
974
975	m = 0
976	DO n = 1, pdims(1)
977	DO i = 1, nnx
978	work_trix(m,k) = ar(i,k,j,n)
979	m = m + 1
980	ENDDO
981	ENDDO
982
983	ENDDO
984
985	CALL fft_x_m( work_trix, 'forward' )
986
987	ELSE
988	!
989	!-- Cache optimized code
990	DO k = 1, nz
991
992	m = 0
993	DO n = 1, pdims(1)
994	DO i = 1, nnx
995	work_fftx(m) = ar(i,k,j,n)
996	m = m + 1
997	ENDDO
998	ENDDO
999
1000	CALL fft_x( work_fftx, 'forward' )
1001
1002	DO i = 0, nx
1003	work_trix(i,k) = work_fftx(i)
1004	ENDDO
1005
1006	ENDDO
1007
1008	ENDIF
1009
1010	!
1011	!-- Solve the linear equation system
1012	CALL tridia_1dd( ddx2, ddy2, nx, ny, j, work_trix, tri(:,:,:,tn) )
1013
1014	IF ( host(1:3) == 'nec' ) THEN
1015	!
1016	!-- Code optimized for vector processors
1017	CALL fft_x_m( work_trix, 'backward' )
1018
1019	DO k = 1, nz
1020
1021	m = 0
1022	DO n = 1, pdims(1)
1023	DO i = 1, nnx
1024	ar(i,k,j,n) = work_trix(m,k)
1025	m = m + 1
1026	ENDDO
1027	ENDDO
1028
1029	ENDDO
1030
1031	ELSE
1032	!
1033	!-- Cache optimized code
1034	DO k = 1, nz
1035
1036	DO i = 0, nx
1037	work_fftx(i) = work_trix(i,k)
1038	ENDDO
1039
1040	CALL fft_x( work_fftx, 'backward' )
1041
1042	m = 0
1043	DO n = 1, pdims(1)
1044	DO i = 1, nnx
1045	ar(i,k,j,n) = work_fftx(m)
1046	m = m + 1
1047	ENDDO
1048	ENDDO
1049
1050	ENDDO
1051
1052	ENDIF
1053
1054	ENDDO
1055
1056	DEALLOCATE( tri )
1057
1058	CALL cpu_log( log_point_s(33), 'fft_x + tridia', 'stop' )
1059
1060	END SUBROUTINE fftx_tri_fftx
1061
1062
1063	SUBROUTINE fftx_tr_xy( f_in, work, f_out )
1064
1065	!------------------------------------------------------------------------------!
1066	! Fourier-transformation along x with subsequent transposition x --> y for
1067	! a 1d-decomposition along y
1068	!
1069	! ATTENTION: The NEC-branch of this routine may significantly profit from
1070	! further optimizations. So far, performance is much worse than
1071	! for routine ffty_tr_yx (more than three times slower).
1072	!------------------------------------------------------------------------------!
1073
1074	USE control_parameters
1075	USE cpulog
1076	USE indices
1077	USE interfaces
1078	USE pegrid
1079	USE transpose_indices
1080
1081	IMPLICIT NONE
1082
1083	INTEGER :: i, j, k
1084
1085	REAL, DIMENSION(0:nx,1:nz,nys:nyn) :: work_fftx
1086	REAL, DIMENSION(1:nz,nys:nyn,0:nx) :: f_in
1087	REAL, DIMENSION(nny,1:nz,nxl_y:nxr_y,pdims(2)) :: f_out
1088	REAL, DIMENSION(nys:nyn,1:nz,0:nx) :: work
1089
1090	!
1091	!-- Carry out the FFT along x, where all data are present due to the
1092	!-- 1d-decomposition along y. Resort the data in a way that y becomes
1093	!-- the first index.
1094	CALL cpu_log( log_point_s(4), 'fft_x', 'start' )
1095
1096	IF ( host(1:3) == 'nec' ) THEN
1097	!
1098	!-- Code for vector processors
1099	!$OMP PARALLEL PRIVATE ( i, j, k )
1100	!$OMP DO
1101	DO i = 0, nx
1102
1103	DO j = nys, nyn
1104	DO k = 1, nz
1105	work_fftx(i,k,j) = f_in(k,j,i)
1106	ENDDO
1107	ENDDO
1108
1109	ENDDO
1110
1111	!$OMP DO
1112	DO j = nys, nyn
1113
1114	CALL fft_x_m( work_fftx(:,:,j), 'forward' )
1115
1116	DO k = 1, nz
1117	DO i = 0, nx
1118	work(j,k,i) = work_fftx(i,k,j)
1119	ENDDO
1120	ENDDO
1121
1122	ENDDO
1123	!$OMP END PARALLEL
1124
1125	ELSE
1126
1127	!
1128	!-- Cache optimized code (there might be still a potential for better
1129	!-- optimization).
1130	!$OMP PARALLEL PRIVATE (i,j,k)
1131	!$OMP DO
1132	DO i = 0, nx
1133
1134	DO j = nys, nyn
1135	DO k = 1, nz
1136	work_fftx(i,k,j) = f_in(k,j,i)
1137	ENDDO
1138	ENDDO
1139
1140	ENDDO
1141
1142	!$OMP DO
1143	DO j = nys, nyn
1144	DO k = 1, nz
1145
1146	CALL fft_x( work_fftx(0:nx,k,j), 'forward' )
1147
1148	DO i = 0, nx
1149	work(j,k,i) = work_fftx(i,k,j)
1150	ENDDO
1151	ENDDO
1152
1153	ENDDO
1154	!$OMP END PARALLEL
1155
1156	ENDIF
1157	CALL cpu_log( log_point_s(4), 'fft_x', 'pause' )
1158
1159	!
1160	!-- Transpose array
1161	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
1162	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
1163	CALL MPI_ALLTOALL( work(nys,1,0), sendrecvcount_xy, MPI_REAL, &
1164	f_out(1,1,nxl_y,1), sendrecvcount_xy, MPI_REAL, &
1165	comm1dy, ierr )
1166	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
1167
1168	END SUBROUTINE fftx_tr_xy
1169
1170
1171	SUBROUTINE tr_yx_fftx( f_in, work, f_out )
1172
1173	!------------------------------------------------------------------------------!
1174	! Transposition y --> x with a subsequent backward Fourier transformation for
1175	! a 1d-decomposition along x
1176	!------------------------------------------------------------------------------!
1177
1178	USE control_parameters
1179	USE cpulog
1180	USE indices
1181	USE interfaces
1182	USE pegrid
1183	USE transpose_indices
1184
1185	IMPLICIT NONE
1186
1187	INTEGER :: i, j, k
1188
1189	REAL, DIMENSION(0:nx,1:nz,nys:nyn) :: work_fftx
1190	REAL, DIMENSION(nny,1:nz,nxl_y:nxr_y,pdims(2)) :: f_in
1191	REAL, DIMENSION(1:nz,nys:nyn,0:nx) :: f_out
1192	REAL, DIMENSION(nys:nyn,1:nz,0:nx) :: work
1193
1194	!
1195	!-- Transpose array
1196	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
1197	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
1198	CALL MPI_ALLTOALL( f_in(1,1,nxl_y,1), sendrecvcount_xy, MPI_REAL, &
1199	work(nys,1,0), sendrecvcount_xy, MPI_REAL, &
1200	comm1dy, ierr )
1201	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
1202
1203	!
1204	!-- Carry out the FFT along x, where all data are present due to the
1205	!-- 1d-decomposition along y. Resort the data in a way that y becomes
1206	!-- the first index.
1207	CALL cpu_log( log_point_s(4), 'fft_x', 'continue' )
1208
1209	IF ( host(1:3) == 'nec' ) THEN
1210	!
1211	!-- Code optimized for vector processors
1212	!$OMP PARALLEL PRIVATE ( i, j, k )
1213	!$OMP DO
1214	DO j = nys, nyn
1215
1216	DO k = 1, nz
1217	DO i = 0, nx
1218	work_fftx(i,k,j) = work(j,k,i)
1219	ENDDO
1220	ENDDO
1221
1222	CALL fft_x_m( work_fftx(:,:,j), 'backward' )
1223
1224	ENDDO
1225
1226	!$OMP DO
1227	DO i = 0, nx
1228	DO j = nys, nyn
1229	DO k = 1, nz
1230	f_out(k,j,i) = work_fftx(i,k,j)
1231	ENDDO
1232	ENDDO
1233	ENDDO
1234	!$OMP END PARALLEL
1235
1236	ELSE
1237
1238	!
1239	!-- Cache optimized code (there might be still a potential for better
1240	!-- optimization).
1241	!$OMP PARALLEL PRIVATE (i,j,k)
1242	!$OMP DO
1243	DO j = nys, nyn
1244	DO k = 1, nz
1245
1246	DO i = 0, nx
1247	work_fftx(i,k,j) = work(j,k,i)
1248	ENDDO
1249
1250	CALL fft_x( work_fftx(0:nx,k,j), 'backward' )
1251
1252	ENDDO
1253	ENDDO
1254
1255	!$OMP DO
1256	DO i = 0, nx
1257	DO j = nys, nyn
1258	DO k = 1, nz
1259	f_out(k,j,i) = work_fftx(i,k,j)
1260	ENDDO
1261	ENDDO
1262	ENDDO
1263	!$OMP END PARALLEL
1264
1265	ENDIF
1266	CALL cpu_log( log_point_s(4), 'fft_x', 'stop' )
1267
1268	END SUBROUTINE tr_yx_fftx
1269
1270
1271	SUBROUTINE ffty_tri_ffty( ar )
1272
1273	!------------------------------------------------------------------------------!
1274	! FFT along y, solution of the tridiagonal system and backward FFT for
1275	! a 1d-decomposition along y
1276	!
1277	! WARNING: this subroutine may still not work for hybrid parallelization
1278	! with OpenMP (for possible necessary changes see the original
1279	! routine poisfft_hybrid, developed by Klaus Ketelsen, May 2002)
1280	!------------------------------------------------------------------------------!
1281
1282	USE control_parameters
1283	USE cpulog
1284	USE grid_variables
1285	USE indices
1286	USE interfaces
1287	USE pegrid
1288	USE transpose_indices
1289
1290	IMPLICIT NONE
1291
1292	INTEGER :: i, j, k, m, n, omp_get_thread_num, tn
1293
1294	REAL, DIMENSION(0:ny) :: work_ffty
1295	REAL, DIMENSION(0:ny,1:nz) :: work_triy
1296	REAL, DIMENSION(nny,1:nz,nxl_y:nxr_y,pdims(2)) :: ar
1297	REAL, DIMENSION(:,:,:,:), ALLOCATABLE :: tri
1298
1299
1300	CALL cpu_log( log_point_s(39), 'fft_y + tridia', 'start' )
1301
1302	ALLOCATE( tri(5,0:ny,0:nz-1,0:threads_per_task-1) )
1303
1304	tn = 0 ! Default thread number in case of one thread
1305	!$OMP PARALLEL DO PRIVATE ( i, j, k, m, n, tn, work_ffty, work_triy )
1306	DO i = nxl_y, nxr_y
1307
1308	!$ tn = omp_get_thread_num()
1309
1310	IF ( host(1:3) == 'nec' ) THEN
1311	!
1312	!-- Code optimized for vector processors
1313	DO k = 1, nz
1314
1315	m = 0
1316	DO n = 1, pdims(2)
1317	DO j = 1, nny
1318	work_triy(m,k) = ar(j,k,i,n)
1319	m = m + 1
1320	ENDDO
1321	ENDDO
1322
1323	ENDDO
1324
1325	CALL fft_y_m( work_triy, ny, 'forward' )
1326
1327	ELSE
1328	!
1329	!-- Cache optimized code
1330	DO k = 1, nz
1331
1332	m = 0
1333	DO n = 1, pdims(2)
1334	DO j = 1, nny
1335	work_ffty(m) = ar(j,k,i,n)
1336	m = m + 1
1337	ENDDO
1338	ENDDO
1339
1340	CALL fft_y( work_ffty, 'forward' )
1341
1342	DO j = 0, ny
1343	work_triy(j,k) = work_ffty(j)
1344	ENDDO
1345
1346	ENDDO
1347
1348	ENDIF
1349
1350	!
1351	!-- Solve the linear equation system
1352	CALL tridia_1dd( ddy2, ddx2, ny, nx, i, work_triy, tri(:,:,:,tn) )
1353
1354	IF ( host(1:3) == 'nec' ) THEN
1355	!
1356	!-- Code optimized for vector processors
1357	CALL fft_y_m( work_triy, ny, 'backward' )
1358
1359	DO k = 1, nz
1360
1361	m = 0
1362	DO n = 1, pdims(2)
1363	DO j = 1, nny
1364	ar(j,k,i,n) = work_triy(m,k)
1365	m = m + 1
1366	ENDDO
1367	ENDDO
1368
1369	ENDDO
1370
1371	ELSE
1372	!
1373	!-- Cache optimized code
1374	DO k = 1, nz
1375
1376	DO j = 0, ny
1377	work_ffty(j) = work_triy(j,k)
1378	ENDDO
1379
1380	CALL fft_y( work_ffty, 'backward' )
1381
1382	m = 0
1383	DO n = 1, pdims(2)
1384	DO j = 1, nny
1385	ar(j,k,i,n) = work_ffty(m)
1386	m = m + 1
1387	ENDDO
1388	ENDDO
1389
1390	ENDDO
1391
1392	ENDIF
1393
1394	ENDDO
1395
1396	DEALLOCATE( tri )
1397
1398	CALL cpu_log( log_point_s(39), 'fft_y + tridia', 'stop' )
1399
1400	END SUBROUTINE ffty_tri_ffty
1401
1402
1403	SUBROUTINE tridia_1dd( ddx2, ddy2, nx, ny, j, ar, tri )
1404
1405	!------------------------------------------------------------------------------!
1406	! Solves the linear system of equations for a 1d-decomposition along x (see
1407	! tridia)
1408	!
1409	! Attention: when using the intel compilers older than 12.0, array tri must
1410	! be passed as an argument to the contained subroutines. Otherwise
1411	! addres faults will occur. This feature can be activated with
1412	! cpp-switch __intel11
1413	! On NEC, tri should not be passed (except for routine substi_1dd)
1414	! because this causes very bad performance.
1415	!------------------------------------------------------------------------------!
1416
1417	USE arrays_3d
1418	USE control_parameters
1419
1420	USE pegrid
1421
1422	IMPLICIT NONE
1423
1424	INTEGER :: i, j, k, nnyh, nx, ny, omp_get_thread_num, tn
1425
1426	REAL :: ddx2, ddy2
1427
1428	REAL, DIMENSION(0:nx,1:nz) :: ar
1429	REAL, DIMENSION(5,0:nx,0:nz-1) :: tri
1430
1431
1432	nnyh = ( ny + 1 ) / 2
1433
1434	!
1435	!-- Define constant elements of the tridiagonal matrix.
1436	!-- The compiler on SX6 does loop exchange. If 0:nx is a high power of 2,
1437	!-- the exchanged loops create bank conflicts. The following directive
1438	!-- prohibits loop exchange and the loops perform much better.
1439	! tn = omp_get_thread_num()
1440	! WRITE( 120+tn, * ) '+++ id=',myid,' nx=',nx,' thread=', omp_get_thread_num()
1441	! CALL local_flush( 120+tn )
1442	!CDIR NOLOOPCHG
1443	DO k = 0, nz-1
1444	DO i = 0,nx
1445	tri(2,i,k) = ddzu_pres(k+1) * ddzw(k+1)
1446	tri(3,i,k) = ddzu_pres(k+2) * ddzw(k+1)
1447	ENDDO
1448	ENDDO
1449	! WRITE( 120+tn, * ) '+++ id=',myid,' end of first tridia loop thread=', omp_get_thread_num()
1450	! CALL local_flush( 120+tn )
1451
1452	IF ( j <= nnyh ) THEN
1453	#if defined( __intel11 )
1454	CALL maketri_1dd( j, tri )
1455	#else
1456	CALL maketri_1dd( j )
1457	#endif
1458	ELSE
1459	#if defined( __intel11 )
1460	CALL maketri_1dd( ny+1-j, tri )
1461	#else
1462	CALL maketri_1dd( ny+1-j )
1463	#endif
1464	ENDIF
1465	#if defined( __intel11 )
1466	CALL split_1dd( tri )
1467	#else
1468	CALL split_1dd
1469	#endif
1470	CALL substi_1dd( ar, tri )
1471
1472	CONTAINS
1473
1474	#if defined( __intel11 )
1475	SUBROUTINE maketri_1dd( j, tri )
1476	#else
1477	SUBROUTINE maketri_1dd( j )
1478	#endif
1479
1480	!------------------------------------------------------------------------------!
1481	! computes the i- and j-dependent component of the matrix
1482	!------------------------------------------------------------------------------!
1483
1484	USE constants
1485
1486	IMPLICIT NONE
1487
1488	INTEGER :: i, j, k, nnxh
1489	REAL :: a, c
1490
1491	REAL, DIMENSION(0:nx) :: l
1492
1493	#if defined( __intel11 )
1494	REAL, DIMENSION(5,0:nx,0:nz-1) :: tri
1495	#endif
1496
1497
1498	nnxh = ( nx + 1 ) / 2
1499	!
1500	!-- Provide the tridiagonal matrix for solution of the Poisson equation in
1501	!-- Fourier space. The coefficients are computed following the method of
1502	!-- Schmidt et al. (DFVLR-Mitteilung 84-15), which departs from Stephan
1503	!-- Siano's original version by discretizing the Poisson equation,
1504	!-- before it is Fourier-transformed
1505	DO i = 0, nx
1506	IF ( i >= 0 .AND. i <= nnxh ) THEN
1507	l(i) = 2.0 * ( 1.0 - COS( ( 2.0 * pi * i ) / &
1508	REAL( nx+1 ) ) ) * ddx2 + &
1509	2.0 * ( 1.0 - COS( ( 2.0 * pi * j ) / &
1510	REAL( ny+1 ) ) ) * ddy2
1511	ELSE
1512	l(i) = 2.0 * ( 1.0 - COS( ( 2.0 * pi * ( nx+1-i ) ) / &
1513	REAL( nx+1 ) ) ) * ddx2 + &
1514	2.0 * ( 1.0 - COS( ( 2.0 * pi * j ) / &
1515	REAL( ny+1 ) ) ) * ddy2
1516	ENDIF
1517	ENDDO
1518
1519	DO k = 0, nz-1
1520	DO i = 0, nx
1521	a = -1.0 * ddzu_pres(k+2) * ddzw(k+1)
1522	c = -1.0 * ddzu_pres(k+1) * ddzw(k+1)
1523	tri(1,i,k) = a + c - l(i)
1524	ENDDO
1525	ENDDO
1526	IF ( ibc_p_b == 1 .OR. ibc_p_b == 2 ) THEN
1527	DO i = 0, nx
1528	tri(1,i,0) = tri(1,i,0) + tri(2,i,0)
1529	ENDDO
1530	ENDIF
1531	IF ( ibc_p_t == 1 ) THEN
1532	DO i = 0, nx
1533	tri(1,i,nz-1) = tri(1,i,nz-1) + tri(3,i,nz-1)
1534	ENDDO
1535	ENDIF
1536
1537	END SUBROUTINE maketri_1dd
1538
1539
1540	#if defined( __intel11 )
1541	SUBROUTINE split_1dd( tri )
1542	#else
1543	SUBROUTINE split_1dd
1544	#endif
1545
1546	!------------------------------------------------------------------------------!
1547	! Splitting of the tridiagonal matrix (Thomas algorithm)
1548	!------------------------------------------------------------------------------!
1549
1550	IMPLICIT NONE
1551
1552	INTEGER :: i, k
1553
1554	#if defined( __intel11 )
1555	REAL, DIMENSION(5,0:nx,0:nz-1) :: tri
1556	#endif
1557
1558
1559	!
1560	!-- Splitting
1561	DO i = 0, nx
1562	tri(4,i,0) = tri(1,i,0)
1563	ENDDO
1564	DO k = 1, nz-1
1565	DO i = 0, nx
1566	tri(5,i,k) = tri(2,i,k) / tri(4,i,k-1)
1567	tri(4,i,k) = tri(1,i,k) - tri(3,i,k-1) * tri(5,i,k)
1568	ENDDO
1569	ENDDO
1570
1571	END SUBROUTINE split_1dd
1572
1573
1574	SUBROUTINE substi_1dd( ar, tri )
1575
1576	!------------------------------------------------------------------------------!
1577	! Substitution (Forward and Backward) (Thomas algorithm)
1578	!------------------------------------------------------------------------------!
1579
1580	IMPLICIT NONE
1581
1582	INTEGER :: i, k
1583
1584	REAL, DIMENSION(0:nx,nz) :: ar
1585	REAL, DIMENSION(0:nx,0:nz-1) :: ar1
1586	REAL, DIMENSION(5,0:nx,0:nz-1) :: tri
1587
1588	!
1589	!-- Forward substitution
1590	DO i = 0, nx
1591	ar1(i,0) = ar(i,1)
1592	ENDDO
1593	DO k = 1, nz-1
1594	DO i = 0, nx
1595	ar1(i,k) = ar(i,k+1) - tri(5,i,k) * ar1(i,k-1)
1596	ENDDO
1597	ENDDO
1598
1599	!
1600	!-- Backward substitution
1601	!-- Note, the add of 1.0E-20 in the denominator is due to avoid divisions
1602	!-- by zero appearing if the pressure bc is set to neumann at the top of
1603	!-- the model domain.
1604	DO i = 0, nx
1605	ar(i,nz) = ar1(i,nz-1) / ( tri(4,i,nz-1) + 1.0E-20 )
1606	ENDDO
1607	DO k = nz-2, 0, -1
1608	DO i = 0, nx
1609	ar(i,k+1) = ( ar1(i,k) - tri(3,i,k) * ar(i,k+2) ) &
1610	/ tri(4,i,k)
1611	ENDDO
1612	ENDDO
1613
1614	!
1615	!-- Indices i=0, j=0 correspond to horizontally averaged pressure.
1616	!-- The respective values of ar should be zero at all k-levels if
1617	!-- acceleration of horizontally averaged vertical velocity is zero.
1618	IF ( ibc_p_b == 1 .AND. ibc_p_t == 1 ) THEN
1619	IF ( j == 0 ) THEN
1620	DO k = 1, nz
1621	ar(0,k) = 0.0
1622	ENDDO
1623	ENDIF
1624	ENDIF
1625
1626	END SUBROUTINE substi_1dd
1627
1628	END SUBROUTINE tridia_1dd
1629
1630	#endif
1631	#endif
1632	END MODULE poisfft_mod

Note: See TracBrowser for help on using the repository browser.

Download in other formats:

| Impressum | ©Leibniz Universität Hannover |