Home

Context Navigation

source: palm/trunk/SOURCE/poisfft.f90 @ 1074

Last change on this file since 1074 was 1037, checked in by raasch, 12 years ago
last commit documented
Property svn:keywords set to `Id`
File size: 47.7 KB

Line
1	MODULE poisfft_mod
2
3	!--------------------------------------------------------------------------------!
4	! This file is part of PALM.
5	!
6	! PALM is free software: you can redistribute it and/or modify it under the terms
7	! of the GNU General Public License as published by the Free Software Foundation,
8	! either version 3 of the License, or (at your option) any later version.
9	!
10	! PALM is distributed in the hope that it will be useful, but WITHOUT ANY
11	! WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
12	! A PARTICULAR PURPOSE. See the GNU General Public License for more details.
13	!
14	! You should have received a copy of the GNU General Public License along with
15	! PALM. If not, see <http://www.gnu.org/licenses/>.
16	!
17	! Copyright 1997-2012 Leibniz University Hannover
18	!--------------------------------------------------------------------------------!
19	!
20	! Current revisions:
21	! -----------------
22	!
23	!
24	! Former revisions:
25	! -----------------
26	! $Id: poisfft.f90 1037 2012-10-22 14:10:22Z hoffmann $
27	!
28	! 1036 2012-10-22 13:43:42Z raasch
29	! code put under GPL (PALM 3.9)
30	!
31	! 2012-09-21 07:03:55Z raasch
32	! FLOAT type conversion replaced by REAL
33	!
34	! 1003 2012-09-14 14:35:53Z raasch
35	! indices nxa, nya, etc. replaced by nx, ny, etc.
36	!
37	! 940 2012-07-09 14:31:00Z raasch
38	! special handling of tri-array as an argument in tridia_1dd routines switched
39	! off because it caused segmentation faults with intel 12.1 compiler
40	!
41	! 877 2012-04-03 11:21:44Z suehring
42	! Bugfix: Avoid divisions by zero in case of using a 'neumann' bc for the
43	! pressure at the top of the model domain.
44	!
45	! 809 2012-01-30 13:32:58Z maronga
46	! Bugfix: replaced .AND. and .NOT. with && and ! in the preprocessor directives
47	!
48	! 807 2012-01-25 11:53:51Z maronga
49	! New cpp directive "__check" implemented which is used by check_namelist_files
50	! (most of the code is unneeded by check_namelist_files).
51	!
52	! 763 2011-10-06 09:32:09Z suehring
53	! Comment added concerning the last change.
54	!
55	! 761 2011-10-05 17:58:52Z suehring
56	! Bugfix: Avoid divisions by zero in case of using a 'neumann' bc for the
57	! pressure at the top of the model domain.
58	!
59	! 696 2011-03-18 07:03:49Z raasch
60	! work_fftx removed from PRIVATE clauses in fftx_tr_xy and tr_yx_fftx
61	!
62	! 683 2011-02-09 14:25:15Z raasch
63	! openMP parallelization for 2d-domain-decomposition
64	!
65	! 667 2010-12-23 12:06:00Z suehring/gryschka
66	! ddzu replaced by ddzu_pres due to changes in zu(0)
67	!
68	! 622 2010-12-10 08:08:13Z raasch
69	! optional barriers included in order to speed up collective operations
70	!
71	! 377 2009-09-04 11:09:00Z raasch
72	! __lcmuk changed to __lc to avoid problems with Intel compiler on sgi-ice
73	!
74	! 164 2008-05-15 08:46:15Z raasch
75	! Arguments removed from transpose routines
76	!
77	! 128 2007-10-26 13:11:14Z raasch
78	! Bugfix: wavenumber calculation for even nx in routines maketri
79	!
80	! 85 2007-05-11 09:35:14Z raasch
81	! Bugfix: work_fft*_vec removed from some PRIVATE-declarations
82	!
83	! 76 2007-03-29 00:58:32Z raasch
84	! Tridiagonal coefficients adjusted for Neumann boundary conditions both at
85	! the bottom and the top.
86	!
87	! RCS Log replace by Id keyword, revision history cleaned up
88	!
89	! Revision 1.24 2006/08/04 15:00:24 raasch
90	! Default setting of the thread number tn in case of not using OpenMP
91	!
92	! Revision 1.23 2006/02/23 12:48:38 raasch
93	! Additional compiler directive in routine tridia_1dd for preventing loop
94	! exchange on NEC-SX6
95	!
96	! Revision 1.20 2004/04/30 12:38:09 raasch
97	! Parts of former poisfft_hybrid moved to this subroutine,
98	! former subroutine changed to a module, renaming of FFT-subroutines and
99	! -module, FFTs completely substituted by calls of fft_x and fft_y,
100	! NAG fft used in the non-parallel case completely removed, l in maketri
101	! is now a 1d-array, variables passed by modules instead of using parameter
102	! lists, enlarged transposition arrays introduced
103	!
104	! Revision 1.1 1997/07/24 11:24:14 raasch
105	! Initial revision
106	!
107	!
108	! Description:
109	! ------------
110	! See below.
111	!------------------------------------------------------------------------------!
112
113	!--------------------------------------------------------------------------!
114	! poisfft !
115	! !
116	! Original version: Stephan Siano (pois3d) !
117	! !
118	! Institute of Meteorology and Climatology, University of Hannover !
119	! Germany !
120	! !
121	! Version as of July 23,1996 !
122	! !
123	! !
124	! Version for parallel computers: Siegfried Raasch !
125	! !
126	! Version as of July 03,1997 !
127	! !
128	! Solves the Poisson equation with a 2D spectral method !
129	! d^2 p / dx^2 + d^2 p / dy^2 + d^2 p / dz^2 = s !
130	! !
131	! Input: !
132	! real ar contains in the (nnx,nny,nnz) elements, !
133	! starting from the element (1,nys,nxl), the !
134	! values for s !
135	! real work Temporary array !
136	! !
137	! Output: !
138	! real ar contains the solution for p !
139	!--------------------------------------------------------------------------!
140
141	USE fft_xy
142	USE indices
143	USE transpose_indices
144
145	IMPLICIT NONE
146
147	PRIVATE
148
149	#if ! defined ( __check )
150	PUBLIC poisfft, poisfft_init
151
152	INTERFACE poisfft
153	MODULE PROCEDURE poisfft
154	END INTERFACE poisfft
155
156	INTERFACE poisfft_init
157	MODULE PROCEDURE poisfft_init
158	END INTERFACE poisfft_init
159	#else
160	PUBLIC poisfft_init
161
162	INTERFACE poisfft_init
163	MODULE PROCEDURE poisfft_init
164	END INTERFACE poisfft_init
165	#endif
166
167	CONTAINS
168
169	SUBROUTINE poisfft_init
170
171	CALL fft_init
172
173	END SUBROUTINE poisfft_init
174
175	#if ! defined ( __check )
176	SUBROUTINE poisfft( ar, work )
177
178	USE cpulog
179	USE interfaces
180	USE pegrid
181
182	IMPLICIT NONE
183
184	REAL, DIMENSION(1:nz,nys:nyn,nxl:nxr) :: ar, work
185
186
187	CALL cpu_log( log_point_s(3), 'poisfft', 'start' )
188
189	!
190	!-- Two-dimensional Fourier Transformation in x- and y-direction.
191	#if defined( __parallel )
192	IF ( pdims(2) == 1 ) THEN
193
194	!
195	!-- 1d-domain-decomposition along x:
196	!-- FFT along y and transposition y --> x
197	CALL ffty_tr_yx( ar, work, ar )
198
199	!
200	!-- FFT along x, solving the tridiagonal system and backward FFT
201	CALL fftx_tri_fftx( ar )
202
203	!
204	!-- Transposition x --> y and backward FFT along y
205	CALL tr_xy_ffty( ar, work, ar )
206
207	ELSEIF ( pdims(1) == 1 ) THEN
208
209	!
210	!-- 1d-domain-decomposition along y:
211	!-- FFT along x and transposition x --> y
212	CALL fftx_tr_xy( ar, work, ar )
213
214	!
215	!-- FFT along y, solving the tridiagonal system and backward FFT
216	CALL ffty_tri_ffty( ar )
217
218	!
219	!-- Transposition y --> x and backward FFT along x
220	CALL tr_yx_fftx( ar, work, ar )
221
222	ELSE
223
224	!
225	!-- 2d-domain-decomposition
226	!-- Transposition z --> x
227	CALL cpu_log( log_point_s(5), 'transpo forward', 'start' )
228	CALL transpose_zx( ar, work, ar )
229	CALL cpu_log( log_point_s(5), 'transpo forward', 'pause' )
230
231	CALL cpu_log( log_point_s(4), 'fft_x', 'start' )
232	CALL fftxp( ar, 'forward' )
233	CALL cpu_log( log_point_s(4), 'fft_x', 'pause' )
234
235	!
236	!-- Transposition x --> y
237	CALL cpu_log( log_point_s(5), 'transpo forward', 'continue' )
238	CALL transpose_xy( ar, work, ar )
239	CALL cpu_log( log_point_s(5), 'transpo forward', 'pause' )
240
241	CALL cpu_log( log_point_s(7), 'fft_y', 'start' )
242	CALL fftyp( ar, 'forward' )
243	CALL cpu_log( log_point_s(7), 'fft_y', 'pause' )
244
245	!
246	!-- Transposition y --> z
247	CALL cpu_log( log_point_s(5), 'transpo forward', 'continue' )
248	CALL transpose_yz( ar, work, ar )
249	CALL cpu_log( log_point_s(5), 'transpo forward', 'stop' )
250
251	!
252	!-- Solve the Poisson equation in z-direction in cartesian space.
253	CALL cpu_log( log_point_s(6), 'tridia', 'start' )
254	CALL tridia( ar )
255	CALL cpu_log( log_point_s(6), 'tridia', 'stop' )
256
257	!
258	!-- Inverse Fourier Transformation
259	!-- Transposition z --> y
260	CALL cpu_log( log_point_s(8), 'transpo invers', 'start' )
261	CALL transpose_zy( ar, work, ar )
262	CALL cpu_log( log_point_s(8), 'transpo invers', 'pause' )
263
264	CALL cpu_log( log_point_s(7), 'fft_y', 'continue' )
265	CALL fftyp( ar, 'backward' )
266	CALL cpu_log( log_point_s(7), 'fft_y', 'stop' )
267
268	!
269	!-- Transposition y --> x
270	CALL cpu_log( log_point_s(8), 'transpo invers', 'continue' )
271	CALL transpose_yx( ar, work, ar )
272	CALL cpu_log( log_point_s(8), 'transpo invers', 'pause' )
273
274	CALL cpu_log( log_point_s(4), 'fft_x', 'continue' )
275	CALL fftxp( ar, 'backward' )
276	CALL cpu_log( log_point_s(4), 'fft_x', 'stop' )
277
278	!
279	!-- Transposition x --> z
280	CALL cpu_log( log_point_s(8), 'transpo invers', 'continue' )
281	CALL transpose_xz( ar, work, ar )
282	CALL cpu_log( log_point_s(8), 'transpo invers', 'stop' )
283
284	ENDIF
285
286	#else
287
288	!
289	!-- Two-dimensional Fourier Transformation along x- and y-direction.
290	CALL cpu_log( log_point_s(4), 'fft_x', 'start' )
291	CALL fftx( ar, 'forward' )
292	CALL cpu_log( log_point_s(4), 'fft_x', 'pause' )
293	CALL cpu_log( log_point_s(7), 'fft_y', 'start' )
294	CALL ffty( ar, 'forward' )
295	CALL cpu_log( log_point_s(7), 'fft_y', 'pause' )
296
297	!
298	!-- Solve the Poisson equation in z-direction in cartesian space.
299	CALL cpu_log( log_point_s(6), 'tridia', 'start' )
300	CALL tridia( ar )
301	CALL cpu_log( log_point_s(6), 'tridia', 'stop' )
302
303	!
304	!-- Inverse Fourier Transformation.
305	CALL cpu_log( log_point_s(7), 'fft_y', 'continue' )
306	CALL ffty( ar, 'backward' )
307	CALL cpu_log( log_point_s(7), 'fft_y', 'stop' )
308	CALL cpu_log( log_point_s(4), 'fft_x', 'continue' )
309	CALL fftx( ar, 'backward' )
310	CALL cpu_log( log_point_s(4), 'fft_x', 'stop' )
311
312	#endif
313
314	CALL cpu_log( log_point_s(3), 'poisfft', 'stop' )
315
316	END SUBROUTINE poisfft
317
318
319
320	SUBROUTINE tridia( ar )
321
322	!------------------------------------------------------------------------------!
323	! solves the linear system of equations:
324	!
325	! -(4 pi^2(i^2/(dx^2nnx^2)+j^2/(dy^2nny^2))+
326	! 1/(dzu(k)dzw(k))+1/(dzu(k-1)dzw(k)))*p(i,j,k)+
327	! 1/(dzu(k)dzw(k))p(i,j,k+1)+1/(dzu(k-1)dzw(k))p(i,j,k-1)=d(i,j,k)
328	!
329	! by using the Thomas algorithm
330	!------------------------------------------------------------------------------!
331
332	USE arrays_3d
333
334	IMPLICIT NONE
335
336	INTEGER :: i, j, k, nnyh
337
338	REAL, DIMENSION(nxl_z:nxr_z,0:nz-1) :: ar1
339	REAL, DIMENSION(5,nxl_z:nxr_z,0:nz-1) :: tri
340
341	#if defined( __parallel )
342	REAL :: ar(nxl_z:nxr_z,nys_z:nyn_z,1:nz)
343	#else
344	REAL :: ar(1:nz,nys_z:nyn_z,nxl_z:nxr_z)
345	#endif
346
347
348	nnyh = (ny+1) / 2
349
350	!
351	!-- Define constant elements of the tridiagonal matrix.
352	!$OMP PARALLEL PRIVATE ( k, i )
353	!$OMP DO
354	DO k = 0, nz-1
355	DO i = nxl_z, nxr_z
356	tri(2,i,k) = ddzu_pres(k+1) * ddzw(k+1)
357	tri(3,i,k) = ddzu_pres(k+2) * ddzw(k+1)
358	ENDDO
359	ENDDO
360	!$OMP END PARALLEL
361
362	#if defined( __parallel )
363	!
364	!-- Repeat for all y-levels.
365	!$OMP PARALLEL FIRSTPRIVATE( tri ) PRIVATE ( ar1, j )
366	!$OMP DO
367	DO j = nys_z, nyn_z
368	IF ( j <= nnyh ) THEN
369	CALL maketri( tri, j )
370	ELSE
371	CALL maketri( tri, ny+1-j )
372	ENDIF
373	CALL split( tri )
374	CALL substi( ar, ar1, tri, j )
375	ENDDO
376	!$OMP END PARALLEL
377	#else
378	!
379	!-- First y-level.
380	CALL maketri( tri, nys_z )
381	CALL split( tri )
382	CALL substi( ar, ar1, tri, 0 )
383
384	!
385	!-- Further y-levels.
386	DO j = 1, nnyh - 1
387	CALL maketri( tri, j )
388	CALL split( tri )
389	CALL substi( ar, ar1, tri, j )
390	CALL substi( ar, ar1, tri, ny+1-j )
391	ENDDO
392	CALL maketri( tri, nnyh )
393	CALL split( tri )
394	CALL substi( ar, ar1, tri, nnyh+nys )
395	#endif
396
397	CONTAINS
398
399	SUBROUTINE maketri( tri, j )
400
401	!------------------------------------------------------------------------------!
402	! Computes the i- and j-dependent component of the matrix
403	!------------------------------------------------------------------------------!
404
405	USE arrays_3d
406	USE constants
407	USE control_parameters
408	USE grid_variables
409
410	IMPLICIT NONE
411
412	INTEGER :: i, j, k, nnxh
413	REAL :: a, c
414	REAL :: ll(nxl_z:nxr_z)
415	REAL :: tri(5,nxl_z:nxr_z,0:nz-1)
416
417
418	nnxh = ( nx + 1 ) / 2
419
420	!
421	!-- Provide the tridiagonal matrix for solution of the Poisson equation in
422	!-- Fourier space. The coefficients are computed following the method of
423	!-- Schmidt et al. (DFVLR-Mitteilung 84-15), which departs from Stephan
424	!-- Siano's original version by discretizing the Poisson equation,
425	!-- before it is Fourier-transformed
426	#if defined( __parallel )
427	DO i = nxl_z, nxr_z
428	IF ( i >= 0 .AND. i <= nnxh ) THEN
429	ll(i) = 2.0 * ( 1.0 - COS( ( 2.0 * pi * i ) / &
430	REAL( nx+1 ) ) ) / ( dx * dx ) + &
431	2.0 * ( 1.0 - COS( ( 2.0 * pi * j ) / &
432	REAL( ny+1 ) ) ) / ( dy * dy )
433	ELSE
434	ll(i) = 2.0 * ( 1.0 - COS( ( 2.0 * pi * ( nx+1-i ) ) / &
435	REAL( nx+1 ) ) ) / ( dx * dx ) + &
436	2.0 * ( 1.0 - COS( ( 2.0 * pi * j ) / &
437	REAL( ny+1 ) ) ) / ( dy * dy )
438	ENDIF
439	DO k = 0,nz-1
440	a = -1.0 * ddzu_pres(k+2) * ddzw(k+1)
441	c = -1.0 * ddzu_pres(k+1) * ddzw(k+1)
442	tri(1,i,k) = a + c - ll(i)
443	ENDDO
444	ENDDO
445	#else
446	DO i = 0, nnxh
447	ll(i) = 2.0 * ( 1.0 - COS( ( 2.0 * pi * i ) / REAL( nx+1 ) ) ) / &
448	( dx * dx ) + &
449	2.0 * ( 1.0 - COS( ( 2.0 * pi * j ) / REAL( ny+1 ) ) ) / &
450	( dy * dy )
451	DO k = 0, nz-1
452	a = -1.0 * ddzu_pres(k+2) * ddzw(k+1)
453	c = -1.0 * ddzu_pres(k+1) * ddzw(k+1)
454	tri(1,i,k) = a + c - ll(i)
455	IF ( i >= 1 .and. i < nnxh ) THEN
456	tri(1,nx+1-i,k) = tri(1,i,k)
457	ENDIF
458	ENDDO
459	ENDDO
460	#endif
461	IF ( ibc_p_b == 1 .OR. ibc_p_b == 2 ) THEN
462	DO i = nxl_z, nxr_z
463	tri(1,i,0) = tri(1,i,0) + tri(2,i,0)
464	ENDDO
465	ENDIF
466	IF ( ibc_p_t == 1 ) THEN
467	DO i = nxl_z, nxr_z
468	tri(1,i,nz-1) = tri(1,i,nz-1) + tri(3,i,nz-1)
469	ENDDO
470	ENDIF
471
472	END SUBROUTINE maketri
473
474
475	SUBROUTINE substi( ar, ar1, tri, j )
476
477	!------------------------------------------------------------------------------!
478	! Substitution (Forward and Backward) (Thomas algorithm)
479	!------------------------------------------------------------------------------!
480
481	USE control_parameters
482
483	IMPLICIT NONE
484
485	INTEGER :: i, j, k
486	REAL :: ar1(nxl_z:nxr_z,0:nz-1)
487	REAL :: tri(5,nxl_z:nxr_z,0:nz-1)
488	#if defined( __parallel )
489	REAL :: ar(nxl_z:nxr_z,nys_z:nyn_z,1:nz)
490	#else
491	REAL :: ar(1:nz,nys_z:nyn_z,nxl_z:nxr_z)
492	#endif
493
494	!
495	!-- Forward substitution.
496	DO i = nxl_z, nxr_z
497	#if defined( __parallel )
498	ar1(i,0) = ar(i,j,1)
499	#else
500	ar1(i,0) = ar(1,j,i)
501	#endif
502	ENDDO
503	DO k = 1, nz - 1
504	DO i = nxl_z, nxr_z
505	#if defined( __parallel )
506	ar1(i,k) = ar(i,j,k+1) - tri(5,i,k) * ar1(i,k-1)
507	#else
508	ar1(i,k) = ar(k+1,j,i) - tri(5,i,k) * ar1(i,k-1)
509	#endif
510	ENDDO
511	ENDDO
512
513	!
514	!-- Backward substitution
515	!-- Note, the 1.0E-20 in the denominator is due to avoid divisions
516	!-- by zero appearing if the pressure bc is set to neumann at the top of
517	!-- the model domain.
518	DO i = nxl_z, nxr_z
519	#if defined( __parallel )
520	ar(i,j,nz) = ar1(i,nz-1) / ( tri(4,i,nz-1) + 1.0E-20 )
521	#else
522	ar(nz,j,i) = ar1(i,nz-1) / ( tri(4,i,nz-1) + 1.0E-20 )
523	#endif
524	ENDDO
525	DO k = nz-2, 0, -1
526	DO i = nxl_z, nxr_z
527	#if defined( __parallel )
528	ar(i,j,k+1) = ( ar1(i,k) - tri(3,i,k) * ar(i,j,k+2) ) &
529	/ tri(4,i,k)
530	#else
531	ar(k+1,j,i) = ( ar1(i,k) - tri(3,i,k) * ar(k+2,j,i) ) &
532	/ tri(4,i,k)
533	#endif
534	ENDDO
535	ENDDO
536
537	!
538	!-- Indices i=0, j=0 correspond to horizontally averaged pressure.
539	!-- The respective values of ar should be zero at all k-levels if
540	!-- acceleration of horizontally averaged vertical velocity is zero.
541	IF ( ibc_p_b == 1 .AND. ibc_p_t == 1 ) THEN
542	IF ( j == 0 .AND. nxl_z == 0 ) THEN
543	#if defined( __parallel )
544	DO k = 1, nz
545	ar(nxl_z,j,k) = 0.0
546	ENDDO
547	#else
548	DO k = 1, nz
549	ar(k,j,nxl_z) = 0.0
550	ENDDO
551	#endif
552	ENDIF
553	ENDIF
554
555	END SUBROUTINE substi
556
557
558	SUBROUTINE split( tri )
559
560	!------------------------------------------------------------------------------!
561	! Splitting of the tridiagonal matrix (Thomas algorithm)
562	!------------------------------------------------------------------------------!
563
564	IMPLICIT NONE
565
566	INTEGER :: i, k
567	REAL :: tri(5,nxl_z:nxr_z,0:nz-1)
568
569	!
570	!-- Splitting.
571	DO i = nxl_z, nxr_z
572	tri(4,i,0) = tri(1,i,0)
573	ENDDO
574	DO k = 1, nz-1
575	DO i = nxl_z, nxr_z
576	tri(5,i,k) = tri(2,i,k) / tri(4,i,k-1)
577	tri(4,i,k) = tri(1,i,k) - tri(3,i,k-1) * tri(5,i,k)
578	ENDDO
579	ENDDO
580
581	END SUBROUTINE split
582
583	END SUBROUTINE tridia
584
585
586	#if defined( __parallel )
587	SUBROUTINE fftxp( ar, direction )
588
589	!------------------------------------------------------------------------------!
590	! Fourier-transformation along x-direction Parallelized version
591	!------------------------------------------------------------------------------!
592
593	IMPLICIT NONE
594
595	CHARACTER (LEN=*) :: direction
596	INTEGER :: j, k
597	REAL :: ar(0:nx,nys_x:nyn_x,nzb_x:nzt_x)
598
599	!
600	!-- Performing the fft with one of the methods implemented
601	!$OMP PARALLEL PRIVATE ( j, k )
602	!$OMP DO
603	DO k = nzb_x, nzt_x
604	DO j = nys_x, nyn_x
605	CALL fft_x( ar(0:nx,j,k), direction )
606	ENDDO
607	ENDDO
608	!$OMP END PARALLEL
609
610	END SUBROUTINE fftxp
611
612	#else
613	SUBROUTINE fftx( ar, direction )
614
615	!------------------------------------------------------------------------------!
616	! Fourier-transformation along x-direction Non parallel version
617	!------------------------------------------------------------------------------!
618
619	IMPLICIT NONE
620
621	CHARACTER (LEN=*) :: direction
622	INTEGER :: i, j, k
623	REAL :: ar(1:nz,0:ny,0:nx)
624
625	!
626	!-- Performing the fft with one of the methods implemented
627	!$OMP PARALLEL PRIVATE ( j, k )
628	!$OMP DO
629	DO k = 1, nz
630	DO j = 0, ny
631	CALL fft_x( ar(k,j,0:nx), direction )
632	ENDDO
633	ENDDO
634	!$OMP END PARALLEL
635
636	END SUBROUTINE fftx
637	#endif
638
639
640	#if defined( __parallel )
641	SUBROUTINE fftyp( ar, direction )
642
643	!------------------------------------------------------------------------------!
644	! Fourier-transformation along y-direction Parallelized version
645	!------------------------------------------------------------------------------!
646
647	IMPLICIT NONE
648
649	CHARACTER (LEN=*) :: direction
650	INTEGER :: i, k
651	REAL :: ar(0:ny,nxl_y:nxr_y,nzb_y:nzt_y)
652
653	!
654	!-- Performing the fft with one of the methods implemented
655	!$OMP PARALLEL PRIVATE ( i, k )
656	!$OMP DO
657	DO k = nzb_y, nzt_y
658	DO i = nxl_y, nxr_y
659	CALL fft_y( ar(0:ny,i,k), direction )
660	ENDDO
661	ENDDO
662	!$OMP END PARALLEL
663
664	END SUBROUTINE fftyp
665
666	#else
667	SUBROUTINE ffty( ar, direction )
668
669	!------------------------------------------------------------------------------!
670	! Fourier-transformation along y-direction Non parallel version
671	!------------------------------------------------------------------------------!
672
673	IMPLICIT NONE
674
675	CHARACTER (LEN=*) :: direction
676	INTEGER :: i, k
677	REAL :: ar(1:nz,0:ny,0:nx)
678
679	!
680	!-- Performing the fft with one of the methods implemented
681	!$OMP PARALLEL PRIVATE ( i, k )
682	!$OMP DO
683	DO k = 1, nz
684	DO i = 0, nx
685	CALL fft_y( ar(k,0:ny,i), direction )
686	ENDDO
687	ENDDO
688	!$OMP END PARALLEL
689
690	END SUBROUTINE ffty
691	#endif
692
693	#if defined( __parallel )
694	SUBROUTINE ffty_tr_yx( f_in, work, f_out )
695
696	!------------------------------------------------------------------------------!
697	! Fourier-transformation along y with subsequent transposition y --> x for
698	! a 1d-decomposition along x
699	!
700	! ATTENTION: The performance of this routine is much faster on the NEC-SX6,
701	! if the first index of work_ffty_vec is odd. Otherwise
702	! memory bank conflicts may occur (especially if the index is a
703	! multiple of 128). That's why work_ffty_vec is dimensioned as
704	! 0:ny+1.
705	! Of course, this will not work if users are using an odd number
706	! of gridpoints along y.
707	!------------------------------------------------------------------------------!
708
709	USE control_parameters
710	USE cpulog
711	USE indices
712	USE interfaces
713	USE pegrid
714	USE transpose_indices
715
716	IMPLICIT NONE
717
718	INTEGER :: i, iend, iouter, ir, j, k
719	INTEGER, PARAMETER :: stridex = 4
720
721	REAL, DIMENSION(0:ny,stridex) :: work_ffty
722	#if defined( __nec )
723	REAL, DIMENSION(0:ny+1,1:nz,nxl:nxr) :: work_ffty_vec
724	#endif
725	REAL, DIMENSION(1:nz,0:ny,nxl:nxr) :: f_in
726	REAL, DIMENSION(nnx,1:nz,nys_x:nyn_x,pdims(1)) :: f_out
727	REAL, DIMENSION(nxl:nxr,1:nz,0:ny) :: work
728
729	!
730	!-- Carry out the FFT along y, where all data are present due to the
731	!-- 1d-decomposition along x. Resort the data in a way that x becomes
732	!-- the first index.
733	CALL cpu_log( log_point_s(7), 'fft_y', 'start' )
734
735	IF ( host(1:3) == 'nec' ) THEN
736	#if defined( __nec )
737	!
738	!-- Code optimized for vector processors
739	!$OMP PARALLEL PRIVATE ( i, j, k )
740	!$OMP DO
741	DO i = nxl, nxr
742
743	DO j = 0, ny
744	DO k = 1, nz
745	work_ffty_vec(j,k,i) = f_in(k,j,i)
746	ENDDO
747	ENDDO
748
749	CALL fft_y_m( work_ffty_vec(:,:,i), ny+1, 'forward' )
750
751	ENDDO
752
753	!$OMP DO
754	DO k = 1, nz
755	DO j = 0, ny
756	DO i = nxl, nxr
757	work(i,k,j) = work_ffty_vec(j,k,i)
758	ENDDO
759	ENDDO
760	ENDDO
761	!$OMP END PARALLEL
762	#endif
763
764	ELSE
765
766	!
767	!-- Cache optimized code.
768	!-- The i-(x-)direction is split into a strided outer loop and an inner
769	!-- loop for better cache performance
770	!$OMP PARALLEL PRIVATE (i,iend,iouter,ir,j,k,work_ffty)
771	!$OMP DO
772	DO iouter = nxl, nxr, stridex
773
774	iend = MIN( iouter+stridex-1, nxr ) ! Upper bound for inner i loop
775
776	DO k = 1, nz
777
778	DO i = iouter, iend
779
780	ir = i-iouter+1 ! counter within a stride
781	DO j = 0, ny
782	work_ffty(j,ir) = f_in(k,j,i)
783	ENDDO
784	!
785	!-- FFT along y
786	CALL fft_y( work_ffty(:,ir), 'forward' )
787
788	ENDDO
789
790	!
791	!-- Resort
792	DO j = 0, ny
793	DO i = iouter, iend
794	work(i,k,j) = work_ffty(j,i-iouter+1)
795	ENDDO
796	ENDDO
797
798	ENDDO
799
800	ENDDO
801	!$OMP END PARALLEL
802
803	ENDIF
804	CALL cpu_log( log_point_s(7), 'fft_y', 'pause' )
805
806	!
807	!-- Transpose array
808	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
809	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
810	CALL MPI_ALLTOALL( work(nxl,1,0), sendrecvcount_xy, MPI_REAL, &
811	f_out(1,1,nys_x,1), sendrecvcount_xy, MPI_REAL, &
812	comm1dx, ierr )
813	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
814
815	END SUBROUTINE ffty_tr_yx
816
817
818	SUBROUTINE tr_xy_ffty( f_in, work, f_out )
819
820	!------------------------------------------------------------------------------!
821	! Transposition x --> y with a subsequent backward Fourier transformation for
822	! a 1d-decomposition along x
823	!------------------------------------------------------------------------------!
824
825	USE control_parameters
826	USE cpulog
827	USE indices
828	USE interfaces
829	USE pegrid
830	USE transpose_indices
831
832	IMPLICIT NONE
833
834	INTEGER :: i, iend, iouter, ir, j, k
835	INTEGER, PARAMETER :: stridex = 4
836
837	REAL, DIMENSION(0:ny,stridex) :: work_ffty
838	#if defined( __nec )
839	REAL, DIMENSION(0:ny+1,1:nz,nxl:nxr) :: work_ffty_vec
840	#endif
841	REAL, DIMENSION(nnx,1:nz,nys_x:nyn_x,pdims(1)) :: f_in
842	REAL, DIMENSION(1:nz,0:ny,nxl:nxr) :: f_out
843	REAL, DIMENSION(nxl:nxr,1:nz,0:ny) :: work
844
845	!
846	!-- Transpose array
847	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
848	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
849	CALL MPI_ALLTOALL( f_in(1,1,nys_x,1), sendrecvcount_xy, MPI_REAL, &
850	work(nxl,1,0), sendrecvcount_xy, MPI_REAL, &
851	comm1dx, ierr )
852	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
853
854	!
855	!-- Resort the data in a way that y becomes the first index and carry out the
856	!-- backward fft along y.
857	CALL cpu_log( log_point_s(7), 'fft_y', 'continue' )
858
859	IF ( host(1:3) == 'nec' ) THEN
860	#if defined( __nec )
861	!
862	!-- Code optimized for vector processors
863	!$OMP PARALLEL PRIVATE ( i, j, k )
864	!$OMP DO
865	DO k = 1, nz
866	DO j = 0, ny
867	DO i = nxl, nxr
868	work_ffty_vec(j,k,i) = work(i,k,j)
869	ENDDO
870	ENDDO
871	ENDDO
872
873	!$OMP DO
874	DO i = nxl, nxr
875
876	CALL fft_y_m( work_ffty_vec(:,:,i), ny+1, 'backward' )
877
878	DO j = 0, ny
879	DO k = 1, nz
880	f_out(k,j,i) = work_ffty_vec(j,k,i)
881	ENDDO
882	ENDDO
883
884	ENDDO
885	!$OMP END PARALLEL
886	#endif
887
888	ELSE
889
890	!
891	!-- Cache optimized code.
892	!-- The i-(x-)direction is split into a strided outer loop and an inner
893	!-- loop for better cache performance
894	!$OMP PARALLEL PRIVATE ( i, iend, iouter, ir, j, k, work_ffty )
895	!$OMP DO
896	DO iouter = nxl, nxr, stridex
897
898	iend = MIN( iouter+stridex-1, nxr ) ! Upper bound for inner i loop
899
900	DO k = 1, nz
901	!
902	!-- Resort
903	DO j = 0, ny
904	DO i = iouter, iend
905	work_ffty(j,i-iouter+1) = work(i,k,j)
906	ENDDO
907	ENDDO
908
909	DO i = iouter, iend
910
911	!
912	!-- FFT along y
913	ir = i-iouter+1 ! counter within a stride
914	CALL fft_y( work_ffty(:,ir), 'backward' )
915
916	DO j = 0, ny
917	f_out(k,j,i) = work_ffty(j,ir)
918	ENDDO
919	ENDDO
920
921	ENDDO
922
923	ENDDO
924	!$OMP END PARALLEL
925
926	ENDIF
927
928	CALL cpu_log( log_point_s(7), 'fft_y', 'stop' )
929
930	END SUBROUTINE tr_xy_ffty
931
932
933	SUBROUTINE fftx_tri_fftx( ar )
934
935	!------------------------------------------------------------------------------!
936	! FFT along x, solution of the tridiagonal system and backward FFT for
937	! a 1d-decomposition along x
938	!
939	! WARNING: this subroutine may still not work for hybrid parallelization
940	! with OpenMP (for possible necessary changes see the original
941	! routine poisfft_hybrid, developed by Klaus Ketelsen, May 2002)
942	!------------------------------------------------------------------------------!
943
944	USE control_parameters
945	USE cpulog
946	USE grid_variables
947	USE indices
948	USE interfaces
949	USE pegrid
950	USE transpose_indices
951
952	IMPLICIT NONE
953
954	character(len=3) :: myth_char
955
956	INTEGER :: i, j, k, m, n, omp_get_thread_num, tn
957
958	REAL, DIMENSION(0:nx) :: work_fftx
959	REAL, DIMENSION(0:nx,1:nz) :: work_trix
960	REAL, DIMENSION(nnx,1:nz,nys_x:nyn_x,pdims(1)) :: ar
961	REAL, DIMENSION(:,:,:,:), ALLOCATABLE :: tri
962
963
964	CALL cpu_log( log_point_s(33), 'fft_x + tridia', 'start' )
965
966	ALLOCATE( tri(5,0:nx,0:nz-1,0:threads_per_task-1) )
967
968	tn = 0 ! Default thread number in case of one thread
969	!$OMP PARALLEL DO PRIVATE ( i, j, k, m, n, tn, work_fftx, work_trix )
970	DO j = nys_x, nyn_x
971
972	!$ tn = omp_get_thread_num()
973
974	IF ( host(1:3) == 'nec' ) THEN
975	!
976	!-- Code optimized for vector processors
977	DO k = 1, nz
978
979	m = 0
980	DO n = 1, pdims(1)
981	DO i = 1, nnx
982	work_trix(m,k) = ar(i,k,j,n)
983	m = m + 1
984	ENDDO
985	ENDDO
986
987	ENDDO
988
989	CALL fft_x_m( work_trix, 'forward' )
990
991	ELSE
992	!
993	!-- Cache optimized code
994	DO k = 1, nz
995
996	m = 0
997	DO n = 1, pdims(1)
998	DO i = 1, nnx
999	work_fftx(m) = ar(i,k,j,n)
1000	m = m + 1
1001	ENDDO
1002	ENDDO
1003
1004	CALL fft_x( work_fftx, 'forward' )
1005
1006	DO i = 0, nx
1007	work_trix(i,k) = work_fftx(i)
1008	ENDDO
1009
1010	ENDDO
1011
1012	ENDIF
1013
1014	!
1015	!-- Solve the linear equation system
1016	CALL tridia_1dd( ddx2, ddy2, nx, ny, j, work_trix, tri(:,:,:,tn) )
1017
1018	IF ( host(1:3) == 'nec' ) THEN
1019	!
1020	!-- Code optimized for vector processors
1021	CALL fft_x_m( work_trix, 'backward' )
1022
1023	DO k = 1, nz
1024
1025	m = 0
1026	DO n = 1, pdims(1)
1027	DO i = 1, nnx
1028	ar(i,k,j,n) = work_trix(m,k)
1029	m = m + 1
1030	ENDDO
1031	ENDDO
1032
1033	ENDDO
1034
1035	ELSE
1036	!
1037	!-- Cache optimized code
1038	DO k = 1, nz
1039
1040	DO i = 0, nx
1041	work_fftx(i) = work_trix(i,k)
1042	ENDDO
1043
1044	CALL fft_x( work_fftx, 'backward' )
1045
1046	m = 0
1047	DO n = 1, pdims(1)
1048	DO i = 1, nnx
1049	ar(i,k,j,n) = work_fftx(m)
1050	m = m + 1
1051	ENDDO
1052	ENDDO
1053
1054	ENDDO
1055
1056	ENDIF
1057
1058	ENDDO
1059
1060	DEALLOCATE( tri )
1061
1062	CALL cpu_log( log_point_s(33), 'fft_x + tridia', 'stop' )
1063
1064	END SUBROUTINE fftx_tri_fftx
1065
1066
1067	SUBROUTINE fftx_tr_xy( f_in, work, f_out )
1068
1069	!------------------------------------------------------------------------------!
1070	! Fourier-transformation along x with subsequent transposition x --> y for
1071	! a 1d-decomposition along y
1072	!
1073	! ATTENTION: The NEC-branch of this routine may significantly profit from
1074	! further optimizations. So far, performance is much worse than
1075	! for routine ffty_tr_yx (more than three times slower).
1076	!------------------------------------------------------------------------------!
1077
1078	USE control_parameters
1079	USE cpulog
1080	USE indices
1081	USE interfaces
1082	USE pegrid
1083	USE transpose_indices
1084
1085	IMPLICIT NONE
1086
1087	INTEGER :: i, j, k
1088
1089	REAL, DIMENSION(0:nx,1:nz,nys:nyn) :: work_fftx
1090	REAL, DIMENSION(1:nz,nys:nyn,0:nx) :: f_in
1091	REAL, DIMENSION(nny,1:nz,nxl_y:nxr_y,pdims(2)) :: f_out
1092	REAL, DIMENSION(nys:nyn,1:nz,0:nx) :: work
1093
1094	!
1095	!-- Carry out the FFT along x, where all data are present due to the
1096	!-- 1d-decomposition along y. Resort the data in a way that y becomes
1097	!-- the first index.
1098	CALL cpu_log( log_point_s(4), 'fft_x', 'start' )
1099
1100	IF ( host(1:3) == 'nec' ) THEN
1101	!
1102	!-- Code for vector processors
1103	!$OMP PARALLEL PRIVATE ( i, j, k )
1104	!$OMP DO
1105	DO i = 0, nx
1106
1107	DO j = nys, nyn
1108	DO k = 1, nz
1109	work_fftx(i,k,j) = f_in(k,j,i)
1110	ENDDO
1111	ENDDO
1112
1113	ENDDO
1114
1115	!$OMP DO
1116	DO j = nys, nyn
1117
1118	CALL fft_x_m( work_fftx(:,:,j), 'forward' )
1119
1120	DO k = 1, nz
1121	DO i = 0, nx
1122	work(j,k,i) = work_fftx(i,k,j)
1123	ENDDO
1124	ENDDO
1125
1126	ENDDO
1127	!$OMP END PARALLEL
1128
1129	ELSE
1130
1131	!
1132	!-- Cache optimized code (there might be still a potential for better
1133	!-- optimization).
1134	!$OMP PARALLEL PRIVATE (i,j,k)
1135	!$OMP DO
1136	DO i = 0, nx
1137
1138	DO j = nys, nyn
1139	DO k = 1, nz
1140	work_fftx(i,k,j) = f_in(k,j,i)
1141	ENDDO
1142	ENDDO
1143
1144	ENDDO
1145
1146	!$OMP DO
1147	DO j = nys, nyn
1148	DO k = 1, nz
1149
1150	CALL fft_x( work_fftx(0:nx,k,j), 'forward' )
1151
1152	DO i = 0, nx
1153	work(j,k,i) = work_fftx(i,k,j)
1154	ENDDO
1155	ENDDO
1156
1157	ENDDO
1158	!$OMP END PARALLEL
1159
1160	ENDIF
1161	CALL cpu_log( log_point_s(4), 'fft_x', 'pause' )
1162
1163	!
1164	!-- Transpose array
1165	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
1166	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
1167	CALL MPI_ALLTOALL( work(nys,1,0), sendrecvcount_xy, MPI_REAL, &
1168	f_out(1,1,nxl_y,1), sendrecvcount_xy, MPI_REAL, &
1169	comm1dy, ierr )
1170	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
1171
1172	END SUBROUTINE fftx_tr_xy
1173
1174
1175	SUBROUTINE tr_yx_fftx( f_in, work, f_out )
1176
1177	!------------------------------------------------------------------------------!
1178	! Transposition y --> x with a subsequent backward Fourier transformation for
1179	! a 1d-decomposition along x
1180	!------------------------------------------------------------------------------!
1181
1182	USE control_parameters
1183	USE cpulog
1184	USE indices
1185	USE interfaces
1186	USE pegrid
1187	USE transpose_indices
1188
1189	IMPLICIT NONE
1190
1191	INTEGER :: i, j, k
1192
1193	REAL, DIMENSION(0:nx,1:nz,nys:nyn) :: work_fftx
1194	REAL, DIMENSION(nny,1:nz,nxl_y:nxr_y,pdims(2)) :: f_in
1195	REAL, DIMENSION(1:nz,nys:nyn,0:nx) :: f_out
1196	REAL, DIMENSION(nys:nyn,1:nz,0:nx) :: work
1197
1198	!
1199	!-- Transpose array
1200	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
1201	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
1202	CALL MPI_ALLTOALL( f_in(1,1,nxl_y,1), sendrecvcount_xy, MPI_REAL, &
1203	work(nys,1,0), sendrecvcount_xy, MPI_REAL, &
1204	comm1dy, ierr )
1205	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
1206
1207	!
1208	!-- Carry out the FFT along x, where all data are present due to the
1209	!-- 1d-decomposition along y. Resort the data in a way that y becomes
1210	!-- the first index.
1211	CALL cpu_log( log_point_s(4), 'fft_x', 'continue' )
1212
1213	IF ( host(1:3) == 'nec' ) THEN
1214	!
1215	!-- Code optimized for vector processors
1216	!$OMP PARALLEL PRIVATE ( i, j, k )
1217	!$OMP DO
1218	DO j = nys, nyn
1219
1220	DO k = 1, nz
1221	DO i = 0, nx
1222	work_fftx(i,k,j) = work(j,k,i)
1223	ENDDO
1224	ENDDO
1225
1226	CALL fft_x_m( work_fftx(:,:,j), 'backward' )
1227
1228	ENDDO
1229
1230	!$OMP DO
1231	DO i = 0, nx
1232	DO j = nys, nyn
1233	DO k = 1, nz
1234	f_out(k,j,i) = work_fftx(i,k,j)
1235	ENDDO
1236	ENDDO
1237	ENDDO
1238	!$OMP END PARALLEL
1239
1240	ELSE
1241
1242	!
1243	!-- Cache optimized code (there might be still a potential for better
1244	!-- optimization).
1245	!$OMP PARALLEL PRIVATE (i,j,k)
1246	!$OMP DO
1247	DO j = nys, nyn
1248	DO k = 1, nz
1249
1250	DO i = 0, nx
1251	work_fftx(i,k,j) = work(j,k,i)
1252	ENDDO
1253
1254	CALL fft_x( work_fftx(0:nx,k,j), 'backward' )
1255
1256	ENDDO
1257	ENDDO
1258
1259	!$OMP DO
1260	DO i = 0, nx
1261	DO j = nys, nyn
1262	DO k = 1, nz
1263	f_out(k,j,i) = work_fftx(i,k,j)
1264	ENDDO
1265	ENDDO
1266	ENDDO
1267	!$OMP END PARALLEL
1268
1269	ENDIF
1270	CALL cpu_log( log_point_s(4), 'fft_x', 'stop' )
1271
1272	END SUBROUTINE tr_yx_fftx
1273
1274
1275	SUBROUTINE ffty_tri_ffty( ar )
1276
1277	!------------------------------------------------------------------------------!
1278	! FFT along y, solution of the tridiagonal system and backward FFT for
1279	! a 1d-decomposition along y
1280	!
1281	! WARNING: this subroutine may still not work for hybrid parallelization
1282	! with OpenMP (for possible necessary changes see the original
1283	! routine poisfft_hybrid, developed by Klaus Ketelsen, May 2002)
1284	!------------------------------------------------------------------------------!
1285
1286	USE control_parameters
1287	USE cpulog
1288	USE grid_variables
1289	USE indices
1290	USE interfaces
1291	USE pegrid
1292	USE transpose_indices
1293
1294	IMPLICIT NONE
1295
1296	INTEGER :: i, j, k, m, n, omp_get_thread_num, tn
1297
1298	REAL, DIMENSION(0:ny) :: work_ffty
1299	REAL, DIMENSION(0:ny,1:nz) :: work_triy
1300	REAL, DIMENSION(nny,1:nz,nxl_y:nxr_y,pdims(2)) :: ar
1301	REAL, DIMENSION(:,:,:,:), ALLOCATABLE :: tri
1302
1303
1304	CALL cpu_log( log_point_s(39), 'fft_y + tridia', 'start' )
1305
1306	ALLOCATE( tri(5,0:ny,0:nz-1,0:threads_per_task-1) )
1307
1308	tn = 0 ! Default thread number in case of one thread
1309	!$OMP PARALLEL DO PRIVATE ( i, j, k, m, n, tn, work_ffty, work_triy )
1310	DO i = nxl_y, nxr_y
1311
1312	!$ tn = omp_get_thread_num()
1313
1314	IF ( host(1:3) == 'nec' ) THEN
1315	!
1316	!-- Code optimized for vector processors
1317	DO k = 1, nz
1318
1319	m = 0
1320	DO n = 1, pdims(2)
1321	DO j = 1, nny
1322	work_triy(m,k) = ar(j,k,i,n)
1323	m = m + 1
1324	ENDDO
1325	ENDDO
1326
1327	ENDDO
1328
1329	CALL fft_y_m( work_triy, ny, 'forward' )
1330
1331	ELSE
1332	!
1333	!-- Cache optimized code
1334	DO k = 1, nz
1335
1336	m = 0
1337	DO n = 1, pdims(2)
1338	DO j = 1, nny
1339	work_ffty(m) = ar(j,k,i,n)
1340	m = m + 1
1341	ENDDO
1342	ENDDO
1343
1344	CALL fft_y( work_ffty, 'forward' )
1345
1346	DO j = 0, ny
1347	work_triy(j,k) = work_ffty(j)
1348	ENDDO
1349
1350	ENDDO
1351
1352	ENDIF
1353
1354	!
1355	!-- Solve the linear equation system
1356	CALL tridia_1dd( ddy2, ddx2, ny, nx, i, work_triy, tri(:,:,:,tn) )
1357
1358	IF ( host(1:3) == 'nec' ) THEN
1359	!
1360	!-- Code optimized for vector processors
1361	CALL fft_y_m( work_triy, ny, 'backward' )
1362
1363	DO k = 1, nz
1364
1365	m = 0
1366	DO n = 1, pdims(2)
1367	DO j = 1, nny
1368	ar(j,k,i,n) = work_triy(m,k)
1369	m = m + 1
1370	ENDDO
1371	ENDDO
1372
1373	ENDDO
1374
1375	ELSE
1376	!
1377	!-- Cache optimized code
1378	DO k = 1, nz
1379
1380	DO j = 0, ny
1381	work_ffty(j) = work_triy(j,k)
1382	ENDDO
1383
1384	CALL fft_y( work_ffty, 'backward' )
1385
1386	m = 0
1387	DO n = 1, pdims(2)
1388	DO j = 1, nny
1389	ar(j,k,i,n) = work_ffty(m)
1390	m = m + 1
1391	ENDDO
1392	ENDDO
1393
1394	ENDDO
1395
1396	ENDIF
1397
1398	ENDDO
1399
1400	DEALLOCATE( tri )
1401
1402	CALL cpu_log( log_point_s(39), 'fft_y + tridia', 'stop' )
1403
1404	END SUBROUTINE ffty_tri_ffty
1405
1406
1407	SUBROUTINE tridia_1dd( ddx2, ddy2, nx, ny, j, ar, tri )
1408
1409	!------------------------------------------------------------------------------!
1410	! Solves the linear system of equations for a 1d-decomposition along x (see
1411	! tridia)
1412	!
1413	! Attention: when using the intel compilers older than 12.0, array tri must
1414	! be passed as an argument to the contained subroutines. Otherwise
1415	! addres faults will occur. This feature can be activated with
1416	! cpp-switch __intel11
1417	! On NEC, tri should not be passed (except for routine substi_1dd)
1418	! because this causes very bad performance.
1419	!------------------------------------------------------------------------------!
1420
1421	USE arrays_3d
1422	USE control_parameters
1423
1424	USE pegrid
1425
1426	IMPLICIT NONE
1427
1428	INTEGER :: i, j, k, nnyh, nx, ny, omp_get_thread_num, tn
1429
1430	REAL :: ddx2, ddy2
1431
1432	REAL, DIMENSION(0:nx,1:nz) :: ar
1433	REAL, DIMENSION(0:nx,0:nz-1) :: ar1
1434	REAL, DIMENSION(5,0:nx,0:nz-1) :: tri
1435
1436
1437	nnyh = ( ny + 1 ) / 2
1438
1439	!
1440	!-- Define constant elements of the tridiagonal matrix.
1441	!-- The compiler on SX6 does loop exchange. If 0:nx is a high power of 2,
1442	!-- the exchanged loops create bank conflicts. The following directive
1443	!-- prohibits loop exchange and the loops perform much better.
1444	! tn = omp_get_thread_num()
1445	! WRITE( 120+tn, * ) '+++ id=',myid,' nx=',nx,' thread=', omp_get_thread_num()
1446	! CALL local_flush( 120+tn )
1447	!CDIR NOLOOPCHG
1448	DO k = 0, nz-1
1449	DO i = 0,nx
1450	tri(2,i,k) = ddzu_pres(k+1) * ddzw(k+1)
1451	tri(3,i,k) = ddzu_pres(k+2) * ddzw(k+1)
1452	ENDDO
1453	ENDDO
1454	! WRITE( 120+tn, * ) '+++ id=',myid,' end of first tridia loop thread=', omp_get_thread_num()
1455	! CALL local_flush( 120+tn )
1456
1457	IF ( j <= nnyh ) THEN
1458	#if defined( __intel11 )
1459	CALL maketri_1dd( j, tri )
1460	#else
1461	CALL maketri_1dd( j )
1462	#endif
1463	ELSE
1464	#if defined( __intel11 )
1465	CALL maketri_1dd( ny+1-j, tri )
1466	#else
1467	CALL maketri_1dd( ny+1-j )
1468	#endif
1469	ENDIF
1470	#if defined( __intel11 )
1471	CALL split_1dd( tri )
1472	#else
1473	CALL split_1dd
1474	#endif
1475	CALL substi_1dd( ar, tri )
1476
1477	CONTAINS
1478
1479	#if defined( __intel11 )
1480	SUBROUTINE maketri_1dd( j, tri )
1481	#else
1482	SUBROUTINE maketri_1dd( j )
1483	#endif
1484
1485	!------------------------------------------------------------------------------!
1486	! computes the i- and j-dependent component of the matrix
1487	!------------------------------------------------------------------------------!
1488
1489	USE constants
1490
1491	IMPLICIT NONE
1492
1493	INTEGER :: i, j, k, nnxh
1494	REAL :: a, c
1495
1496	REAL, DIMENSION(0:nx) :: l
1497
1498	#if defined( __intel11 )
1499	REAL, DIMENSION(5,0:nx,0:nz-1) :: tri
1500	#endif
1501
1502
1503	nnxh = ( nx + 1 ) / 2
1504	!
1505	!-- Provide the tridiagonal matrix for solution of the Poisson equation in
1506	!-- Fourier space. The coefficients are computed following the method of
1507	!-- Schmidt et al. (DFVLR-Mitteilung 84-15), which departs from Stephan
1508	!-- Siano's original version by discretizing the Poisson equation,
1509	!-- before it is Fourier-transformed
1510	DO i = 0, nx
1511	IF ( i >= 0 .AND. i <= nnxh ) THEN
1512	l(i) = 2.0 * ( 1.0 - COS( ( 2.0 * pi * i ) / &
1513	REAL( nx+1 ) ) ) * ddx2 + &
1514	2.0 * ( 1.0 - COS( ( 2.0 * pi * j ) / &
1515	REAL( ny+1 ) ) ) * ddy2
1516	ELSE
1517	l(i) = 2.0 * ( 1.0 - COS( ( 2.0 * pi * ( nx+1-i ) ) / &
1518	REAL( nx+1 ) ) ) * ddx2 + &
1519	2.0 * ( 1.0 - COS( ( 2.0 * pi * j ) / &
1520	REAL( ny+1 ) ) ) * ddy2
1521	ENDIF
1522	ENDDO
1523
1524	DO k = 0, nz-1
1525	DO i = 0, nx
1526	a = -1.0 * ddzu_pres(k+2) * ddzw(k+1)
1527	c = -1.0 * ddzu_pres(k+1) * ddzw(k+1)
1528	tri(1,i,k) = a + c - l(i)
1529	ENDDO
1530	ENDDO
1531	IF ( ibc_p_b == 1 .OR. ibc_p_b == 2 ) THEN
1532	DO i = 0, nx
1533	tri(1,i,0) = tri(1,i,0) + tri(2,i,0)
1534	ENDDO
1535	ENDIF
1536	IF ( ibc_p_t == 1 ) THEN
1537	DO i = 0, nx
1538	tri(1,i,nz-1) = tri(1,i,nz-1) + tri(3,i,nz-1)
1539	ENDDO
1540	ENDIF
1541
1542	END SUBROUTINE maketri_1dd
1543
1544
1545	#if defined( __intel11 )
1546	SUBROUTINE split_1dd( tri )
1547	#else
1548	SUBROUTINE split_1dd
1549	#endif
1550
1551	!------------------------------------------------------------------------------!
1552	! Splitting of the tridiagonal matrix (Thomas algorithm)
1553	!------------------------------------------------------------------------------!
1554
1555	IMPLICIT NONE
1556
1557	INTEGER :: i, k
1558
1559	#if defined( __intel11 )
1560	REAL, DIMENSION(5,0:nx,0:nz-1) :: tri
1561	#endif
1562
1563
1564	!
1565	!-- Splitting
1566	DO i = 0, nx
1567	tri(4,i,0) = tri(1,i,0)
1568	ENDDO
1569	DO k = 1, nz-1
1570	DO i = 0, nx
1571	tri(5,i,k) = tri(2,i,k) / tri(4,i,k-1)
1572	tri(4,i,k) = tri(1,i,k) - tri(3,i,k-1) * tri(5,i,k)
1573	ENDDO
1574	ENDDO
1575
1576	END SUBROUTINE split_1dd
1577
1578
1579	SUBROUTINE substi_1dd( ar, tri )
1580
1581	!------------------------------------------------------------------------------!
1582	! Substitution (Forward and Backward) (Thomas algorithm)
1583	!------------------------------------------------------------------------------!
1584
1585	IMPLICIT NONE
1586
1587	INTEGER :: i, k
1588
1589	REAL, DIMENSION(0:nx,nz) :: ar
1590	REAL, DIMENSION(0:nx,0:nz-1) :: ar1
1591	REAL, DIMENSION(5,0:nx,0:nz-1) :: tri
1592
1593	!
1594	!-- Forward substitution
1595	DO i = 0, nx
1596	ar1(i,0) = ar(i,1)
1597	ENDDO
1598	DO k = 1, nz-1
1599	DO i = 0, nx
1600	ar1(i,k) = ar(i,k+1) - tri(5,i,k) * ar1(i,k-1)
1601	ENDDO
1602	ENDDO
1603
1604	!
1605	!-- Backward substitution
1606	!-- Note, the add of 1.0E-20 in the denominator is due to avoid divisions
1607	!-- by zero appearing if the pressure bc is set to neumann at the top of
1608	!-- the model domain.
1609	DO i = 0, nx
1610	ar(i,nz) = ar1(i,nz-1) / ( tri(4,i,nz-1) + 1.0E-20 )
1611	ENDDO
1612	DO k = nz-2, 0, -1
1613	DO i = 0, nx
1614	ar(i,k+1) = ( ar1(i,k) - tri(3,i,k) * ar(i,k+2) ) &
1615	/ tri(4,i,k)
1616	ENDDO
1617	ENDDO
1618
1619	!
1620	!-- Indices i=0, j=0 correspond to horizontally averaged pressure.
1621	!-- The respective values of ar should be zero at all k-levels if
1622	!-- acceleration of horizontally averaged vertical velocity is zero.
1623	IF ( ibc_p_b == 1 .AND. ibc_p_t == 1 ) THEN
1624	IF ( j == 0 ) THEN
1625	DO k = 1, nz
1626	ar(0,k) = 0.0
1627	ENDDO
1628	ENDIF
1629	ENDIF
1630
1631	END SUBROUTINE substi_1dd
1632
1633	END SUBROUTINE tridia_1dd
1634
1635	#endif
1636	#endif
1637	END MODULE poisfft_mod

Note: See TracBrowser for help on using the repository browser.

Download in other formats:

| Impressum | ©Leibniz Universität Hannover |