Home

Context Navigation

source: palm/trunk/SOURCE/poisfft.f90 @ 1103

Last change on this file since 1103 was 1103, checked in by raasch, 11 years ago
small bugfixes; mrun and subjob scripts are made bash compatible; further adjustments for lckyuh
Property svn:keywords set to `Id`
File size: 47.4 KB

Line
1	MODULE poisfft_mod
2
3	!--------------------------------------------------------------------------------!
4	! This file is part of PALM.
5	!
6	! PALM is free software: you can redistribute it and/or modify it under the terms
7	! of the GNU General Public License as published by the Free Software Foundation,
8	! either version 3 of the License, or (at your option) any later version.
9	!
10	! PALM is distributed in the hope that it will be useful, but WITHOUT ANY
11	! WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
12	! A PARTICULAR PURPOSE. See the GNU General Public License for more details.
13	!
14	! You should have received a copy of the GNU General Public License along with
15	! PALM. If not, see <http://www.gnu.org/licenses/>.
16	!
17	! Copyright 1997-2012 Leibniz University Hannover
18	!--------------------------------------------------------------------------------!
19	!
20	! Current revisions:
21	! -----------------
22	! tri, ar, and ar1 arguments in tridia-routines (2d) are removed because they
23	! sometimes cause segmentation faults with intel 12.1 compiler
24	!
25	! Former revisions:
26	! -----------------
27	! $Id: poisfft.f90 1103 2013-02-20 02:15:53Z raasch $
28	!
29	! 1092 2013-02-02 11:24:22Z raasch
30	! unused variables removed
31	!
32	! 1036 2012-10-22 13:43:42Z raasch
33	! code put under GPL (PALM 3.9)
34	!
35	! 2012-09-21 07:03:55Z raasch
36	! FLOAT type conversion replaced by REAL
37	!
38	! 1003 2012-09-14 14:35:53Z raasch
39	! indices nxa, nya, etc. replaced by nx, ny, etc.
40	!
41	! 940 2012-07-09 14:31:00Z raasch
42	! special handling of tri-array as an argument in tridia_1dd routines switched
43	! off because it caused segmentation faults with intel 12.1 compiler
44	!
45	! 877 2012-04-03 11:21:44Z suehring
46	! Bugfix: Avoid divisions by zero in case of using a 'neumann' bc for the
47	! pressure at the top of the model domain.
48	!
49	! 809 2012-01-30 13:32:58Z maronga
50	! Bugfix: replaced .AND. and .NOT. with && and ! in the preprocessor directives
51	!
52	! 807 2012-01-25 11:53:51Z maronga
53	! New cpp directive "__check" implemented which is used by check_namelist_files
54	! (most of the code is unneeded by check_namelist_files).
55	!
56	! 763 2011-10-06 09:32:09Z suehring
57	! Comment added concerning the last change.
58	!
59	! 761 2011-10-05 17:58:52Z suehring
60	! Bugfix: Avoid divisions by zero in case of using a 'neumann' bc for the
61	! pressure at the top of the model domain.
62	!
63	! 696 2011-03-18 07:03:49Z raasch
64	! work_fftx removed from PRIVATE clauses in fftx_tr_xy and tr_yx_fftx
65	!
66	! 683 2011-02-09 14:25:15Z raasch
67	! openMP parallelization for 2d-domain-decomposition
68	!
69	! 667 2010-12-23 12:06:00Z suehring/gryschka
70	! ddzu replaced by ddzu_pres due to changes in zu(0)
71	!
72	! 622 2010-12-10 08:08:13Z raasch
73	! optional barriers included in order to speed up collective operations
74	!
75	! 377 2009-09-04 11:09:00Z raasch
76	! __lcmuk changed to __lc to avoid problems with Intel compiler on sgi-ice
77	!
78	! 164 2008-05-15 08:46:15Z raasch
79	! Arguments removed from transpose routines
80	!
81	! 128 2007-10-26 13:11:14Z raasch
82	! Bugfix: wavenumber calculation for even nx in routines maketri
83	!
84	! 85 2007-05-11 09:35:14Z raasch
85	! Bugfix: work_fft*_vec removed from some PRIVATE-declarations
86	!
87	! 76 2007-03-29 00:58:32Z raasch
88	! Tridiagonal coefficients adjusted for Neumann boundary conditions both at
89	! the bottom and the top.
90	!
91	! RCS Log replace by Id keyword, revision history cleaned up
92	!
93	! Revision 1.24 2006/08/04 15:00:24 raasch
94	! Default setting of the thread number tn in case of not using OpenMP
95	!
96	! Revision 1.23 2006/02/23 12:48:38 raasch
97	! Additional compiler directive in routine tridia_1dd for preventing loop
98	! exchange on NEC-SX6
99	!
100	! Revision 1.20 2004/04/30 12:38:09 raasch
101	! Parts of former poisfft_hybrid moved to this subroutine,
102	! former subroutine changed to a module, renaming of FFT-subroutines and
103	! -module, FFTs completely substituted by calls of fft_x and fft_y,
104	! NAG fft used in the non-parallel case completely removed, l in maketri
105	! is now a 1d-array, variables passed by modules instead of using parameter
106	! lists, enlarged transposition arrays introduced
107	!
108	! Revision 1.1 1997/07/24 11:24:14 raasch
109	! Initial revision
110	!
111	!
112	! Description:
113	! ------------
114	! See below.
115	!------------------------------------------------------------------------------!
116
117	!--------------------------------------------------------------------------!
118	! poisfft !
119	! !
120	! Original version: Stephan Siano (pois3d) !
121	! !
122	! Institute of Meteorology and Climatology, University of Hannover !
123	! Germany !
124	! !
125	! Version as of July 23,1996 !
126	! !
127	! !
128	! Version for parallel computers: Siegfried Raasch !
129	! !
130	! Version as of July 03,1997 !
131	! !
132	! Solves the Poisson equation with a 2D spectral method !
133	! d^2 p / dx^2 + d^2 p / dy^2 + d^2 p / dz^2 = s !
134	! !
135	! Input: !
136	! real ar contains in the (nnx,nny,nnz) elements, !
137	! starting from the element (1,nys,nxl), the !
138	! values for s !
139	! real work Temporary array !
140	! !
141	! Output: !
142	! real ar contains the solution for p !
143	!--------------------------------------------------------------------------!
144
145	USE fft_xy
146	USE indices
147	USE transpose_indices
148
149	IMPLICIT NONE
150
151	PRIVATE
152
153	#if ! defined ( __check )
154	PUBLIC poisfft, poisfft_init
155
156	INTERFACE poisfft
157	MODULE PROCEDURE poisfft
158	END INTERFACE poisfft
159
160	INTERFACE poisfft_init
161	MODULE PROCEDURE poisfft_init
162	END INTERFACE poisfft_init
163	#else
164	PUBLIC poisfft_init
165
166	INTERFACE poisfft_init
167	MODULE PROCEDURE poisfft_init
168	END INTERFACE poisfft_init
169	#endif
170
171	CONTAINS
172
173	SUBROUTINE poisfft_init
174
175	CALL fft_init
176
177	END SUBROUTINE poisfft_init
178
179	#if ! defined ( __check )
180	SUBROUTINE poisfft( ar, work )
181
182	USE cpulog
183	USE interfaces
184	USE pegrid
185
186	IMPLICIT NONE
187
188	REAL, DIMENSION(1:nz,nys:nyn,nxl:nxr) :: ar, work
189
190
191	CALL cpu_log( log_point_s(3), 'poisfft', 'start' )
192
193	!
194	!-- Two-dimensional Fourier Transformation in x- and y-direction.
195	#if defined( __parallel )
196	IF ( pdims(2) == 1 ) THEN
197
198	!
199	!-- 1d-domain-decomposition along x:
200	!-- FFT along y and transposition y --> x
201	CALL ffty_tr_yx( ar, work, ar )
202
203	!
204	!-- FFT along x, solving the tridiagonal system and backward FFT
205	CALL fftx_tri_fftx( ar )
206
207	!
208	!-- Transposition x --> y and backward FFT along y
209	CALL tr_xy_ffty( ar, work, ar )
210
211	ELSEIF ( pdims(1) == 1 ) THEN
212
213	!
214	!-- 1d-domain-decomposition along y:
215	!-- FFT along x and transposition x --> y
216	CALL fftx_tr_xy( ar, work, ar )
217
218	!
219	!-- FFT along y, solving the tridiagonal system and backward FFT
220	CALL ffty_tri_ffty( ar )
221
222	!
223	!-- Transposition y --> x and backward FFT along x
224	CALL tr_yx_fftx( ar, work, ar )
225
226	ELSE
227
228	!
229	!-- 2d-domain-decomposition
230	!-- Transposition z --> x
231	CALL cpu_log( log_point_s(5), 'transpo forward', 'start' )
232	CALL transpose_zx( ar, work, ar )
233	CALL cpu_log( log_point_s(5), 'transpo forward', 'pause' )
234
235	CALL cpu_log( log_point_s(4), 'fft_x', 'start' )
236	CALL fftxp( ar, 'forward' )
237	CALL cpu_log( log_point_s(4), 'fft_x', 'pause' )
238
239	!
240	!-- Transposition x --> y
241	CALL cpu_log( log_point_s(5), 'transpo forward', 'continue' )
242	CALL transpose_xy( ar, work, ar )
243	CALL cpu_log( log_point_s(5), 'transpo forward', 'pause' )
244
245	CALL cpu_log( log_point_s(7), 'fft_y', 'start' )
246	CALL fftyp( ar, 'forward' )
247	CALL cpu_log( log_point_s(7), 'fft_y', 'pause' )
248
249	!
250	!-- Transposition y --> z
251	CALL cpu_log( log_point_s(5), 'transpo forward', 'continue' )
252	CALL transpose_yz( ar, work, ar )
253	CALL cpu_log( log_point_s(5), 'transpo forward', 'stop' )
254
255	!
256	!-- Solve the Poisson equation in z-direction in cartesian space.
257	CALL cpu_log( log_point_s(6), 'tridia', 'start' )
258	CALL tridia( ar )
259	CALL cpu_log( log_point_s(6), 'tridia', 'stop' )
260
261	!
262	!-- Inverse Fourier Transformation
263	!-- Transposition z --> y
264	CALL cpu_log( log_point_s(8), 'transpo invers', 'start' )
265	CALL transpose_zy( ar, work, ar )
266	CALL cpu_log( log_point_s(8), 'transpo invers', 'pause' )
267
268	CALL cpu_log( log_point_s(7), 'fft_y', 'continue' )
269	CALL fftyp( ar, 'backward' )
270	CALL cpu_log( log_point_s(7), 'fft_y', 'stop' )
271
272	!
273	!-- Transposition y --> x
274	CALL cpu_log( log_point_s(8), 'transpo invers', 'continue' )
275	CALL transpose_yx( ar, work, ar )
276	CALL cpu_log( log_point_s(8), 'transpo invers', 'pause' )
277
278	CALL cpu_log( log_point_s(4), 'fft_x', 'continue' )
279	CALL fftxp( ar, 'backward' )
280	CALL cpu_log( log_point_s(4), 'fft_x', 'stop' )
281
282	!
283	!-- Transposition x --> z
284	CALL cpu_log( log_point_s(8), 'transpo invers', 'continue' )
285	CALL transpose_xz( ar, work, ar )
286	CALL cpu_log( log_point_s(8), 'transpo invers', 'stop' )
287
288	ENDIF
289
290	#else
291
292	!
293	!-- Two-dimensional Fourier Transformation along x- and y-direction.
294	CALL cpu_log( log_point_s(4), 'fft_x', 'start' )
295	CALL fftx( ar, 'forward' )
296	CALL cpu_log( log_point_s(4), 'fft_x', 'pause' )
297	CALL cpu_log( log_point_s(7), 'fft_y', 'start' )
298	CALL ffty( ar, 'forward' )
299	CALL cpu_log( log_point_s(7), 'fft_y', 'pause' )
300
301	!
302	!-- Solve the Poisson equation in z-direction in cartesian space.
303	CALL cpu_log( log_point_s(6), 'tridia', 'start' )
304	CALL tridia( ar )
305	CALL cpu_log( log_point_s(6), 'tridia', 'stop' )
306
307	!
308	!-- Inverse Fourier Transformation.
309	CALL cpu_log( log_point_s(7), 'fft_y', 'continue' )
310	CALL ffty( ar, 'backward' )
311	CALL cpu_log( log_point_s(7), 'fft_y', 'stop' )
312	CALL cpu_log( log_point_s(4), 'fft_x', 'continue' )
313	CALL fftx( ar, 'backward' )
314	CALL cpu_log( log_point_s(4), 'fft_x', 'stop' )
315
316	#endif
317
318	CALL cpu_log( log_point_s(3), 'poisfft', 'stop' )
319
320	END SUBROUTINE poisfft
321
322
323
324	SUBROUTINE tridia( ar )
325
326	!------------------------------------------------------------------------------!
327	! solves the linear system of equations:
328	!
329	! -(4 pi^2(i^2/(dx^2nnx^2)+j^2/(dy^2nny^2))+
330	! 1/(dzu(k)dzw(k))+1/(dzu(k-1)dzw(k)))*p(i,j,k)+
331	! 1/(dzu(k)dzw(k))p(i,j,k+1)+1/(dzu(k-1)dzw(k))p(i,j,k-1)=d(i,j,k)
332	!
333	! by using the Thomas algorithm
334	!------------------------------------------------------------------------------!
335
336	USE arrays_3d
337
338	IMPLICIT NONE
339
340	INTEGER :: i, j, k, nnyh
341
342	REAL, DIMENSION(nxl_z:nxr_z,0:nz-1) :: ar1
343	REAL, DIMENSION(5,nxl_z:nxr_z,0:nz-1) :: tri
344
345	#if defined( __parallel )
346	REAL :: ar(nxl_z:nxr_z,nys_z:nyn_z,1:nz)
347	#else
348	REAL :: ar(1:nz,nys_z:nyn_z,nxl_z:nxr_z)
349	#endif
350
351
352	nnyh = (ny+1) / 2
353
354	!
355	!-- Define constant elements of the tridiagonal matrix.
356	!$OMP PARALLEL PRIVATE ( k, i )
357	!$OMP DO
358	DO k = 0, nz-1
359	DO i = nxl_z, nxr_z
360	tri(2,i,k) = ddzu_pres(k+1) * ddzw(k+1)
361	tri(3,i,k) = ddzu_pres(k+2) * ddzw(k+1)
362	ENDDO
363	ENDDO
364	!$OMP END PARALLEL
365
366	#if defined( __parallel )
367	!
368	!-- Repeat for all y-levels.
369	!$OMP PARALLEL FIRSTPRIVATE( tri ) PRIVATE ( ar1, j )
370	!$OMP DO
371	DO j = nys_z, nyn_z
372	IF ( j <= nnyh ) THEN
373	CALL maketri( j )
374	ELSE
375	CALL maketri( ny+1-j )
376	ENDIF
377	CALL split
378	CALL substi( j )
379	ENDDO
380	!$OMP END PARALLEL
381	#else
382	!
383	!-- First y-level.
384	CALL maketri( nys_z )
385	CALL split
386	CALL substi( 0 )
387
388	!
389	!-- Further y-levels.
390	DO j = 1, nnyh - 1
391	CALL maketri( j )
392	CALL split
393	CALL substi( j )
394	CALL substi( ny+1-j )
395	ENDDO
396	CALL maketri( nnyh )
397	CALL split
398	CALL substi( nnyh+nys )
399	#endif
400
401	CONTAINS
402
403	SUBROUTINE maketri( j )
404
405	!------------------------------------------------------------------------------!
406	! Computes the i- and j-dependent component of the matrix
407	!------------------------------------------------------------------------------!
408
409	USE arrays_3d
410	USE constants
411	USE control_parameters
412	USE grid_variables
413
414	IMPLICIT NONE
415
416	INTEGER :: i, j, k, nnxh
417	REAL :: a, c
418	REAL :: ll(nxl_z:nxr_z)
419
420
421	nnxh = ( nx + 1 ) / 2
422
423	!
424	!-- Provide the tridiagonal matrix for solution of the Poisson equation in
425	!-- Fourier space. The coefficients are computed following the method of
426	!-- Schmidt et al. (DFVLR-Mitteilung 84-15), which departs from Stephan
427	!-- Siano's original version by discretizing the Poisson equation,
428	!-- before it is Fourier-transformed
429	#if defined( __parallel )
430	DO i = nxl_z, nxr_z
431	IF ( i >= 0 .AND. i <= nnxh ) THEN
432	ll(i) = 2.0 * ( 1.0 - COS( ( 2.0 * pi * i ) / &
433	REAL( nx+1 ) ) ) / ( dx * dx ) + &
434	2.0 * ( 1.0 - COS( ( 2.0 * pi * j ) / &
435	REAL( ny+1 ) ) ) / ( dy * dy )
436	ELSE
437	ll(i) = 2.0 * ( 1.0 - COS( ( 2.0 * pi * ( nx+1-i ) ) / &
438	REAL( nx+1 ) ) ) / ( dx * dx ) + &
439	2.0 * ( 1.0 - COS( ( 2.0 * pi * j ) / &
440	REAL( ny+1 ) ) ) / ( dy * dy )
441	ENDIF
442	DO k = 0,nz-1
443	a = -1.0 * ddzu_pres(k+2) * ddzw(k+1)
444	c = -1.0 * ddzu_pres(k+1) * ddzw(k+1)
445	tri(1,i,k) = a + c - ll(i)
446	ENDDO
447	ENDDO
448	#else
449	DO i = 0, nnxh
450	ll(i) = 2.0 * ( 1.0 - COS( ( 2.0 * pi * i ) / REAL( nx+1 ) ) ) / &
451	( dx * dx ) + &
452	2.0 * ( 1.0 - COS( ( 2.0 * pi * j ) / REAL( ny+1 ) ) ) / &
453	( dy * dy )
454	DO k = 0, nz-1
455	a = -1.0 * ddzu_pres(k+2) * ddzw(k+1)
456	c = -1.0 * ddzu_pres(k+1) * ddzw(k+1)
457	tri(1,i,k) = a + c - ll(i)
458	IF ( i >= 1 .and. i < nnxh ) THEN
459	tri(1,nx+1-i,k) = tri(1,i,k)
460	ENDIF
461	ENDDO
462	ENDDO
463	#endif
464	IF ( ibc_p_b == 1 .OR. ibc_p_b == 2 ) THEN
465	DO i = nxl_z, nxr_z
466	tri(1,i,0) = tri(1,i,0) + tri(2,i,0)
467	ENDDO
468	ENDIF
469	IF ( ibc_p_t == 1 ) THEN
470	DO i = nxl_z, nxr_z
471	tri(1,i,nz-1) = tri(1,i,nz-1) + tri(3,i,nz-1)
472	ENDDO
473	ENDIF
474
475	END SUBROUTINE maketri
476
477
478	SUBROUTINE substi( j )
479
480	!------------------------------------------------------------------------------!
481	! Substitution (Forward and Backward) (Thomas algorithm)
482	!------------------------------------------------------------------------------!
483
484	USE control_parameters
485
486	IMPLICIT NONE
487
488	INTEGER :: i, j, k
489
490	!
491	!-- Forward substitution.
492	DO i = nxl_z, nxr_z
493	#if defined( __parallel )
494	ar1(i,0) = ar(i,j,1)
495	#else
496	ar1(i,0) = ar(1,j,i)
497	#endif
498	ENDDO
499	DO k = 1, nz - 1
500	DO i = nxl_z, nxr_z
501	#if defined( __parallel )
502	ar1(i,k) = ar(i,j,k+1) - tri(5,i,k) * ar1(i,k-1)
503	#else
504	ar1(i,k) = ar(k+1,j,i) - tri(5,i,k) * ar1(i,k-1)
505	#endif
506	ENDDO
507	ENDDO
508
509	!
510	!-- Backward substitution
511	!-- Note, the 1.0E-20 in the denominator is due to avoid divisions
512	!-- by zero appearing if the pressure bc is set to neumann at the top of
513	!-- the model domain.
514	DO i = nxl_z, nxr_z
515	#if defined( __parallel )
516	ar(i,j,nz) = ar1(i,nz-1) / ( tri(4,i,nz-1) + 1.0E-20 )
517	#else
518	ar(nz,j,i) = ar1(i,nz-1) / ( tri(4,i,nz-1) + 1.0E-20 )
519	#endif
520	ENDDO
521	DO k = nz-2, 0, -1
522	DO i = nxl_z, nxr_z
523	#if defined( __parallel )
524	ar(i,j,k+1) = ( ar1(i,k) - tri(3,i,k) * ar(i,j,k+2) ) &
525	/ tri(4,i,k)
526	#else
527	ar(k+1,j,i) = ( ar1(i,k) - tri(3,i,k) * ar(k+2,j,i) ) &
528	/ tri(4,i,k)
529	#endif
530	ENDDO
531	ENDDO
532
533	!
534	!-- Indices i=0, j=0 correspond to horizontally averaged pressure.
535	!-- The respective values of ar should be zero at all k-levels if
536	!-- acceleration of horizontally averaged vertical velocity is zero.
537	IF ( ibc_p_b == 1 .AND. ibc_p_t == 1 ) THEN
538	IF ( j == 0 .AND. nxl_z == 0 ) THEN
539	#if defined( __parallel )
540	DO k = 1, nz
541	ar(nxl_z,j,k) = 0.0
542	ENDDO
543	#else
544	DO k = 1, nz
545	ar(k,j,nxl_z) = 0.0
546	ENDDO
547	#endif
548	ENDIF
549	ENDIF
550
551	END SUBROUTINE substi
552
553
554	SUBROUTINE split
555
556	!------------------------------------------------------------------------------!
557	! Splitting of the tridiagonal matrix (Thomas algorithm)
558	!------------------------------------------------------------------------------!
559
560	IMPLICIT NONE
561
562	INTEGER :: i, k
563
564	!
565	!-- Splitting.
566	DO i = nxl_z, nxr_z
567	tri(4,i,0) = tri(1,i,0)
568	ENDDO
569	DO k = 1, nz-1
570	DO i = nxl_z, nxr_z
571	tri(5,i,k) = tri(2,i,k) / tri(4,i,k-1)
572	tri(4,i,k) = tri(1,i,k) - tri(3,i,k-1) * tri(5,i,k)
573	ENDDO
574	ENDDO
575
576	END SUBROUTINE split
577
578	END SUBROUTINE tridia
579
580
581	#if defined( __parallel )
582	SUBROUTINE fftxp( ar, direction )
583
584	!------------------------------------------------------------------------------!
585	! Fourier-transformation along x-direction Parallelized version
586	!------------------------------------------------------------------------------!
587
588	IMPLICIT NONE
589
590	CHARACTER (LEN=*) :: direction
591	INTEGER :: j, k
592	REAL :: ar(0:nx,nys_x:nyn_x,nzb_x:nzt_x)
593
594	!
595	!-- Performing the fft with one of the methods implemented
596	!$OMP PARALLEL PRIVATE ( j, k )
597	!$OMP DO
598	DO k = nzb_x, nzt_x
599	DO j = nys_x, nyn_x
600	CALL fft_x( ar(0:nx,j,k), direction )
601	ENDDO
602	ENDDO
603	!$OMP END PARALLEL
604
605	END SUBROUTINE fftxp
606
607	#else
608	SUBROUTINE fftx( ar, direction )
609
610	!------------------------------------------------------------------------------!
611	! Fourier-transformation along x-direction Non parallel version
612	!------------------------------------------------------------------------------!
613
614	IMPLICIT NONE
615
616	CHARACTER (LEN=*) :: direction
617	INTEGER :: i, j, k
618	REAL :: ar(1:nz,0:ny,0:nx)
619
620	!
621	!-- Performing the fft with one of the methods implemented
622	!$OMP PARALLEL PRIVATE ( j, k )
623	!$OMP DO
624	DO k = 1, nz
625	DO j = 0, ny
626	CALL fft_x( ar(k,j,0:nx), direction )
627	ENDDO
628	ENDDO
629	!$OMP END PARALLEL
630
631	END SUBROUTINE fftx
632	#endif
633
634
635	#if defined( __parallel )
636	SUBROUTINE fftyp( ar, direction )
637
638	!------------------------------------------------------------------------------!
639	! Fourier-transformation along y-direction Parallelized version
640	!------------------------------------------------------------------------------!
641
642	IMPLICIT NONE
643
644	CHARACTER (LEN=*) :: direction
645	INTEGER :: i, k
646	REAL :: ar(0:ny,nxl_y:nxr_y,nzb_y:nzt_y)
647
648	!
649	!-- Performing the fft with one of the methods implemented
650	!$OMP PARALLEL PRIVATE ( i, k )
651	!$OMP DO
652	DO k = nzb_y, nzt_y
653	DO i = nxl_y, nxr_y
654	CALL fft_y( ar(0:ny,i,k), direction )
655	ENDDO
656	ENDDO
657	!$OMP END PARALLEL
658
659	END SUBROUTINE fftyp
660
661	#else
662	SUBROUTINE ffty( ar, direction )
663
664	!------------------------------------------------------------------------------!
665	! Fourier-transformation along y-direction Non parallel version
666	!------------------------------------------------------------------------------!
667
668	IMPLICIT NONE
669
670	CHARACTER (LEN=*) :: direction
671	INTEGER :: i, k
672	REAL :: ar(1:nz,0:ny,0:nx)
673
674	!
675	!-- Performing the fft with one of the methods implemented
676	!$OMP PARALLEL PRIVATE ( i, k )
677	!$OMP DO
678	DO k = 1, nz
679	DO i = 0, nx
680	CALL fft_y( ar(k,0:ny,i), direction )
681	ENDDO
682	ENDDO
683	!$OMP END PARALLEL
684
685	END SUBROUTINE ffty
686	#endif
687
688	#if defined( __parallel )
689	SUBROUTINE ffty_tr_yx( f_in, work, f_out )
690
691	!------------------------------------------------------------------------------!
692	! Fourier-transformation along y with subsequent transposition y --> x for
693	! a 1d-decomposition along x
694	!
695	! ATTENTION: The performance of this routine is much faster on the NEC-SX6,
696	! if the first index of work_ffty_vec is odd. Otherwise
697	! memory bank conflicts may occur (especially if the index is a
698	! multiple of 128). That's why work_ffty_vec is dimensioned as
699	! 0:ny+1.
700	! Of course, this will not work if users are using an odd number
701	! of gridpoints along y.
702	!------------------------------------------------------------------------------!
703
704	USE control_parameters
705	USE cpulog
706	USE indices
707	USE interfaces
708	USE pegrid
709	USE transpose_indices
710
711	IMPLICIT NONE
712
713	INTEGER :: i, iend, iouter, ir, j, k
714	INTEGER, PARAMETER :: stridex = 4
715
716	REAL, DIMENSION(0:ny,stridex) :: work_ffty
717	#if defined( __nec )
718	REAL, DIMENSION(0:ny+1,1:nz,nxl:nxr) :: work_ffty_vec
719	#endif
720	REAL, DIMENSION(1:nz,0:ny,nxl:nxr) :: f_in
721	REAL, DIMENSION(nnx,1:nz,nys_x:nyn_x,pdims(1)) :: f_out
722	REAL, DIMENSION(nxl:nxr,1:nz,0:ny) :: work
723
724	!
725	!-- Carry out the FFT along y, where all data are present due to the
726	!-- 1d-decomposition along x. Resort the data in a way that x becomes
727	!-- the first index.
728	CALL cpu_log( log_point_s(7), 'fft_y', 'start' )
729
730	IF ( host(1:3) == 'nec' ) THEN
731	#if defined( __nec )
732	!
733	!-- Code optimized for vector processors
734	!$OMP PARALLEL PRIVATE ( i, j, k )
735	!$OMP DO
736	DO i = nxl, nxr
737
738	DO j = 0, ny
739	DO k = 1, nz
740	work_ffty_vec(j,k,i) = f_in(k,j,i)
741	ENDDO
742	ENDDO
743
744	CALL fft_y_m( work_ffty_vec(:,:,i), ny+1, 'forward' )
745
746	ENDDO
747
748	!$OMP DO
749	DO k = 1, nz
750	DO j = 0, ny
751	DO i = nxl, nxr
752	work(i,k,j) = work_ffty_vec(j,k,i)
753	ENDDO
754	ENDDO
755	ENDDO
756	!$OMP END PARALLEL
757	#endif
758
759	ELSE
760
761	!
762	!-- Cache optimized code.
763	!-- The i-(x-)direction is split into a strided outer loop and an inner
764	!-- loop for better cache performance
765	!$OMP PARALLEL PRIVATE (i,iend,iouter,ir,j,k,work_ffty)
766	!$OMP DO
767	DO iouter = nxl, nxr, stridex
768
769	iend = MIN( iouter+stridex-1, nxr ) ! Upper bound for inner i loop
770
771	DO k = 1, nz
772
773	DO i = iouter, iend
774
775	ir = i-iouter+1 ! counter within a stride
776	DO j = 0, ny
777	work_ffty(j,ir) = f_in(k,j,i)
778	ENDDO
779	!
780	!-- FFT along y
781	CALL fft_y( work_ffty(:,ir), 'forward' )
782
783	ENDDO
784
785	!
786	!-- Resort
787	DO j = 0, ny
788	DO i = iouter, iend
789	work(i,k,j) = work_ffty(j,i-iouter+1)
790	ENDDO
791	ENDDO
792
793	ENDDO
794
795	ENDDO
796	!$OMP END PARALLEL
797
798	ENDIF
799	CALL cpu_log( log_point_s(7), 'fft_y', 'pause' )
800
801	!
802	!-- Transpose array
803	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
804	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
805	CALL MPI_ALLTOALL( work(nxl,1,0), sendrecvcount_xy, MPI_REAL, &
806	f_out(1,1,nys_x,1), sendrecvcount_xy, MPI_REAL, &
807	comm1dx, ierr )
808	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
809
810	END SUBROUTINE ffty_tr_yx
811
812
813	SUBROUTINE tr_xy_ffty( f_in, work, f_out )
814
815	!------------------------------------------------------------------------------!
816	! Transposition x --> y with a subsequent backward Fourier transformation for
817	! a 1d-decomposition along x
818	!------------------------------------------------------------------------------!
819
820	USE control_parameters
821	USE cpulog
822	USE indices
823	USE interfaces
824	USE pegrid
825	USE transpose_indices
826
827	IMPLICIT NONE
828
829	INTEGER :: i, iend, iouter, ir, j, k
830	INTEGER, PARAMETER :: stridex = 4
831
832	REAL, DIMENSION(0:ny,stridex) :: work_ffty
833	#if defined( __nec )
834	REAL, DIMENSION(0:ny+1,1:nz,nxl:nxr) :: work_ffty_vec
835	#endif
836	REAL, DIMENSION(nnx,1:nz,nys_x:nyn_x,pdims(1)) :: f_in
837	REAL, DIMENSION(1:nz,0:ny,nxl:nxr) :: f_out
838	REAL, DIMENSION(nxl:nxr,1:nz,0:ny) :: work
839
840	!
841	!-- Transpose array
842	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
843	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
844	CALL MPI_ALLTOALL( f_in(1,1,nys_x,1), sendrecvcount_xy, MPI_REAL, &
845	work(nxl,1,0), sendrecvcount_xy, MPI_REAL, &
846	comm1dx, ierr )
847	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
848
849	!
850	!-- Resort the data in a way that y becomes the first index and carry out the
851	!-- backward fft along y.
852	CALL cpu_log( log_point_s(7), 'fft_y', 'continue' )
853
854	IF ( host(1:3) == 'nec' ) THEN
855	#if defined( __nec )
856	!
857	!-- Code optimized for vector processors
858	!$OMP PARALLEL PRIVATE ( i, j, k )
859	!$OMP DO
860	DO k = 1, nz
861	DO j = 0, ny
862	DO i = nxl, nxr
863	work_ffty_vec(j,k,i) = work(i,k,j)
864	ENDDO
865	ENDDO
866	ENDDO
867
868	!$OMP DO
869	DO i = nxl, nxr
870
871	CALL fft_y_m( work_ffty_vec(:,:,i), ny+1, 'backward' )
872
873	DO j = 0, ny
874	DO k = 1, nz
875	f_out(k,j,i) = work_ffty_vec(j,k,i)
876	ENDDO
877	ENDDO
878
879	ENDDO
880	!$OMP END PARALLEL
881	#endif
882
883	ELSE
884
885	!
886	!-- Cache optimized code.
887	!-- The i-(x-)direction is split into a strided outer loop and an inner
888	!-- loop for better cache performance
889	!$OMP PARALLEL PRIVATE ( i, iend, iouter, ir, j, k, work_ffty )
890	!$OMP DO
891	DO iouter = nxl, nxr, stridex
892
893	iend = MIN( iouter+stridex-1, nxr ) ! Upper bound for inner i loop
894
895	DO k = 1, nz
896	!
897	!-- Resort
898	DO j = 0, ny
899	DO i = iouter, iend
900	work_ffty(j,i-iouter+1) = work(i,k,j)
901	ENDDO
902	ENDDO
903
904	DO i = iouter, iend
905
906	!
907	!-- FFT along y
908	ir = i-iouter+1 ! counter within a stride
909	CALL fft_y( work_ffty(:,ir), 'backward' )
910
911	DO j = 0, ny
912	f_out(k,j,i) = work_ffty(j,ir)
913	ENDDO
914	ENDDO
915
916	ENDDO
917
918	ENDDO
919	!$OMP END PARALLEL
920
921	ENDIF
922
923	CALL cpu_log( log_point_s(7), 'fft_y', 'stop' )
924
925	END SUBROUTINE tr_xy_ffty
926
927
928	SUBROUTINE fftx_tri_fftx( ar )
929
930	!------------------------------------------------------------------------------!
931	! FFT along x, solution of the tridiagonal system and backward FFT for
932	! a 1d-decomposition along x
933	!
934	! WARNING: this subroutine may still not work for hybrid parallelization
935	! with OpenMP (for possible necessary changes see the original
936	! routine poisfft_hybrid, developed by Klaus Ketelsen, May 2002)
937	!------------------------------------------------------------------------------!
938
939	USE control_parameters
940	USE cpulog
941	USE grid_variables
942	USE indices
943	USE interfaces
944	USE pegrid
945	USE transpose_indices
946
947	IMPLICIT NONE
948
949	INTEGER :: i, j, k, m, n, omp_get_thread_num, tn
950
951	REAL, DIMENSION(0:nx) :: work_fftx
952	REAL, DIMENSION(0:nx,1:nz) :: work_trix
953	REAL, DIMENSION(nnx,1:nz,nys_x:nyn_x,pdims(1)) :: ar
954	REAL, DIMENSION(:,:,:,:), ALLOCATABLE :: tri
955
956
957	CALL cpu_log( log_point_s(33), 'fft_x + tridia', 'start' )
958
959	ALLOCATE( tri(5,0:nx,0:nz-1,0:threads_per_task-1) )
960
961	tn = 0 ! Default thread number in case of one thread
962	!$OMP PARALLEL DO PRIVATE ( i, j, k, m, n, tn, work_fftx, work_trix )
963	DO j = nys_x, nyn_x
964
965	!$ tn = omp_get_thread_num()
966
967	IF ( host(1:3) == 'nec' ) THEN
968	!
969	!-- Code optimized for vector processors
970	DO k = 1, nz
971
972	m = 0
973	DO n = 1, pdims(1)
974	DO i = 1, nnx
975	work_trix(m,k) = ar(i,k,j,n)
976	m = m + 1
977	ENDDO
978	ENDDO
979
980	ENDDO
981
982	CALL fft_x_m( work_trix, 'forward' )
983
984	ELSE
985	!
986	!-- Cache optimized code
987	DO k = 1, nz
988
989	m = 0
990	DO n = 1, pdims(1)
991	DO i = 1, nnx
992	work_fftx(m) = ar(i,k,j,n)
993	m = m + 1
994	ENDDO
995	ENDDO
996
997	CALL fft_x( work_fftx, 'forward' )
998
999	DO i = 0, nx
1000	work_trix(i,k) = work_fftx(i)
1001	ENDDO
1002
1003	ENDDO
1004
1005	ENDIF
1006
1007	!
1008	!-- Solve the linear equation system
1009	CALL tridia_1dd( ddx2, ddy2, nx, ny, j, work_trix, tri(:,:,:,tn) )
1010
1011	IF ( host(1:3) == 'nec' ) THEN
1012	!
1013	!-- Code optimized for vector processors
1014	CALL fft_x_m( work_trix, 'backward' )
1015
1016	DO k = 1, nz
1017
1018	m = 0
1019	DO n = 1, pdims(1)
1020	DO i = 1, nnx
1021	ar(i,k,j,n) = work_trix(m,k)
1022	m = m + 1
1023	ENDDO
1024	ENDDO
1025
1026	ENDDO
1027
1028	ELSE
1029	!
1030	!-- Cache optimized code
1031	DO k = 1, nz
1032
1033	DO i = 0, nx
1034	work_fftx(i) = work_trix(i,k)
1035	ENDDO
1036
1037	CALL fft_x( work_fftx, 'backward' )
1038
1039	m = 0
1040	DO n = 1, pdims(1)
1041	DO i = 1, nnx
1042	ar(i,k,j,n) = work_fftx(m)
1043	m = m + 1
1044	ENDDO
1045	ENDDO
1046
1047	ENDDO
1048
1049	ENDIF
1050
1051	ENDDO
1052
1053	DEALLOCATE( tri )
1054
1055	CALL cpu_log( log_point_s(33), 'fft_x + tridia', 'stop' )
1056
1057	END SUBROUTINE fftx_tri_fftx
1058
1059
1060	SUBROUTINE fftx_tr_xy( f_in, work, f_out )
1061
1062	!------------------------------------------------------------------------------!
1063	! Fourier-transformation along x with subsequent transposition x --> y for
1064	! a 1d-decomposition along y
1065	!
1066	! ATTENTION: The NEC-branch of this routine may significantly profit from
1067	! further optimizations. So far, performance is much worse than
1068	! for routine ffty_tr_yx (more than three times slower).
1069	!------------------------------------------------------------------------------!
1070
1071	USE control_parameters
1072	USE cpulog
1073	USE indices
1074	USE interfaces
1075	USE pegrid
1076	USE transpose_indices
1077
1078	IMPLICIT NONE
1079
1080	INTEGER :: i, j, k
1081
1082	REAL, DIMENSION(0:nx,1:nz,nys:nyn) :: work_fftx
1083	REAL, DIMENSION(1:nz,nys:nyn,0:nx) :: f_in
1084	REAL, DIMENSION(nny,1:nz,nxl_y:nxr_y,pdims(2)) :: f_out
1085	REAL, DIMENSION(nys:nyn,1:nz,0:nx) :: work
1086
1087	!
1088	!-- Carry out the FFT along x, where all data are present due to the
1089	!-- 1d-decomposition along y. Resort the data in a way that y becomes
1090	!-- the first index.
1091	CALL cpu_log( log_point_s(4), 'fft_x', 'start' )
1092
1093	IF ( host(1:3) == 'nec' ) THEN
1094	!
1095	!-- Code for vector processors
1096	!$OMP PARALLEL PRIVATE ( i, j, k )
1097	!$OMP DO
1098	DO i = 0, nx
1099
1100	DO j = nys, nyn
1101	DO k = 1, nz
1102	work_fftx(i,k,j) = f_in(k,j,i)
1103	ENDDO
1104	ENDDO
1105
1106	ENDDO
1107
1108	!$OMP DO
1109	DO j = nys, nyn
1110
1111	CALL fft_x_m( work_fftx(:,:,j), 'forward' )
1112
1113	DO k = 1, nz
1114	DO i = 0, nx
1115	work(j,k,i) = work_fftx(i,k,j)
1116	ENDDO
1117	ENDDO
1118
1119	ENDDO
1120	!$OMP END PARALLEL
1121
1122	ELSE
1123
1124	!
1125	!-- Cache optimized code (there might be still a potential for better
1126	!-- optimization).
1127	!$OMP PARALLEL PRIVATE (i,j,k)
1128	!$OMP DO
1129	DO i = 0, nx
1130
1131	DO j = nys, nyn
1132	DO k = 1, nz
1133	work_fftx(i,k,j) = f_in(k,j,i)
1134	ENDDO
1135	ENDDO
1136
1137	ENDDO
1138
1139	!$OMP DO
1140	DO j = nys, nyn
1141	DO k = 1, nz
1142
1143	CALL fft_x( work_fftx(0:nx,k,j), 'forward' )
1144
1145	DO i = 0, nx
1146	work(j,k,i) = work_fftx(i,k,j)
1147	ENDDO
1148	ENDDO
1149
1150	ENDDO
1151	!$OMP END PARALLEL
1152
1153	ENDIF
1154	CALL cpu_log( log_point_s(4), 'fft_x', 'pause' )
1155
1156	!
1157	!-- Transpose array
1158	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
1159	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
1160	CALL MPI_ALLTOALL( work(nys,1,0), sendrecvcount_xy, MPI_REAL, &
1161	f_out(1,1,nxl_y,1), sendrecvcount_xy, MPI_REAL, &
1162	comm1dy, ierr )
1163	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
1164
1165	END SUBROUTINE fftx_tr_xy
1166
1167
1168	SUBROUTINE tr_yx_fftx( f_in, work, f_out )
1169
1170	!------------------------------------------------------------------------------!
1171	! Transposition y --> x with a subsequent backward Fourier transformation for
1172	! a 1d-decomposition along x
1173	!------------------------------------------------------------------------------!
1174
1175	USE control_parameters
1176	USE cpulog
1177	USE indices
1178	USE interfaces
1179	USE pegrid
1180	USE transpose_indices
1181
1182	IMPLICIT NONE
1183
1184	INTEGER :: i, j, k
1185
1186	REAL, DIMENSION(0:nx,1:nz,nys:nyn) :: work_fftx
1187	REAL, DIMENSION(nny,1:nz,nxl_y:nxr_y,pdims(2)) :: f_in
1188	REAL, DIMENSION(1:nz,nys:nyn,0:nx) :: f_out
1189	REAL, DIMENSION(nys:nyn,1:nz,0:nx) :: work
1190
1191	!
1192	!-- Transpose array
1193	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
1194	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
1195	CALL MPI_ALLTOALL( f_in(1,1,nxl_y,1), sendrecvcount_xy, MPI_REAL, &
1196	work(nys,1,0), sendrecvcount_xy, MPI_REAL, &
1197	comm1dy, ierr )
1198	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
1199
1200	!
1201	!-- Carry out the FFT along x, where all data are present due to the
1202	!-- 1d-decomposition along y. Resort the data in a way that y becomes
1203	!-- the first index.
1204	CALL cpu_log( log_point_s(4), 'fft_x', 'continue' )
1205
1206	IF ( host(1:3) == 'nec' ) THEN
1207	!
1208	!-- Code optimized for vector processors
1209	!$OMP PARALLEL PRIVATE ( i, j, k )
1210	!$OMP DO
1211	DO j = nys, nyn
1212
1213	DO k = 1, nz
1214	DO i = 0, nx
1215	work_fftx(i,k,j) = work(j,k,i)
1216	ENDDO
1217	ENDDO
1218
1219	CALL fft_x_m( work_fftx(:,:,j), 'backward' )
1220
1221	ENDDO
1222
1223	!$OMP DO
1224	DO i = 0, nx
1225	DO j = nys, nyn
1226	DO k = 1, nz
1227	f_out(k,j,i) = work_fftx(i,k,j)
1228	ENDDO
1229	ENDDO
1230	ENDDO
1231	!$OMP END PARALLEL
1232
1233	ELSE
1234
1235	!
1236	!-- Cache optimized code (there might be still a potential for better
1237	!-- optimization).
1238	!$OMP PARALLEL PRIVATE (i,j,k)
1239	!$OMP DO
1240	DO j = nys, nyn
1241	DO k = 1, nz
1242
1243	DO i = 0, nx
1244	work_fftx(i,k,j) = work(j,k,i)
1245	ENDDO
1246
1247	CALL fft_x( work_fftx(0:nx,k,j), 'backward' )
1248
1249	ENDDO
1250	ENDDO
1251
1252	!$OMP DO
1253	DO i = 0, nx
1254	DO j = nys, nyn
1255	DO k = 1, nz
1256	f_out(k,j,i) = work_fftx(i,k,j)
1257	ENDDO
1258	ENDDO
1259	ENDDO
1260	!$OMP END PARALLEL
1261
1262	ENDIF
1263	CALL cpu_log( log_point_s(4), 'fft_x', 'stop' )
1264
1265	END SUBROUTINE tr_yx_fftx
1266
1267
1268	SUBROUTINE ffty_tri_ffty( ar )
1269
1270	!------------------------------------------------------------------------------!
1271	! FFT along y, solution of the tridiagonal system and backward FFT for
1272	! a 1d-decomposition along y
1273	!
1274	! WARNING: this subroutine may still not work for hybrid parallelization
1275	! with OpenMP (for possible necessary changes see the original
1276	! routine poisfft_hybrid, developed by Klaus Ketelsen, May 2002)
1277	!------------------------------------------------------------------------------!
1278
1279	USE control_parameters
1280	USE cpulog
1281	USE grid_variables
1282	USE indices
1283	USE interfaces
1284	USE pegrid
1285	USE transpose_indices
1286
1287	IMPLICIT NONE
1288
1289	INTEGER :: i, j, k, m, n, omp_get_thread_num, tn
1290
1291	REAL, DIMENSION(0:ny) :: work_ffty
1292	REAL, DIMENSION(0:ny,1:nz) :: work_triy
1293	REAL, DIMENSION(nny,1:nz,nxl_y:nxr_y,pdims(2)) :: ar
1294	REAL, DIMENSION(:,:,:,:), ALLOCATABLE :: tri
1295
1296
1297	CALL cpu_log( log_point_s(39), 'fft_y + tridia', 'start' )
1298
1299	ALLOCATE( tri(5,0:ny,0:nz-1,0:threads_per_task-1) )
1300
1301	tn = 0 ! Default thread number in case of one thread
1302	!$OMP PARALLEL DO PRIVATE ( i, j, k, m, n, tn, work_ffty, work_triy )
1303	DO i = nxl_y, nxr_y
1304
1305	!$ tn = omp_get_thread_num()
1306
1307	IF ( host(1:3) == 'nec' ) THEN
1308	!
1309	!-- Code optimized for vector processors
1310	DO k = 1, nz
1311
1312	m = 0
1313	DO n = 1, pdims(2)
1314	DO j = 1, nny
1315	work_triy(m,k) = ar(j,k,i,n)
1316	m = m + 1
1317	ENDDO
1318	ENDDO
1319
1320	ENDDO
1321
1322	CALL fft_y_m( work_triy, ny, 'forward' )
1323
1324	ELSE
1325	!
1326	!-- Cache optimized code
1327	DO k = 1, nz
1328
1329	m = 0
1330	DO n = 1, pdims(2)
1331	DO j = 1, nny
1332	work_ffty(m) = ar(j,k,i,n)
1333	m = m + 1
1334	ENDDO
1335	ENDDO
1336
1337	CALL fft_y( work_ffty, 'forward' )
1338
1339	DO j = 0, ny
1340	work_triy(j,k) = work_ffty(j)
1341	ENDDO
1342
1343	ENDDO
1344
1345	ENDIF
1346
1347	!
1348	!-- Solve the linear equation system
1349	CALL tridia_1dd( ddy2, ddx2, ny, nx, i, work_triy, tri(:,:,:,tn) )
1350
1351	IF ( host(1:3) == 'nec' ) THEN
1352	!
1353	!-- Code optimized for vector processors
1354	CALL fft_y_m( work_triy, ny, 'backward' )
1355
1356	DO k = 1, nz
1357
1358	m = 0
1359	DO n = 1, pdims(2)
1360	DO j = 1, nny
1361	ar(j,k,i,n) = work_triy(m,k)
1362	m = m + 1
1363	ENDDO
1364	ENDDO
1365
1366	ENDDO
1367
1368	ELSE
1369	!
1370	!-- Cache optimized code
1371	DO k = 1, nz
1372
1373	DO j = 0, ny
1374	work_ffty(j) = work_triy(j,k)
1375	ENDDO
1376
1377	CALL fft_y( work_ffty, 'backward' )
1378
1379	m = 0
1380	DO n = 1, pdims(2)
1381	DO j = 1, nny
1382	ar(j,k,i,n) = work_ffty(m)
1383	m = m + 1
1384	ENDDO
1385	ENDDO
1386
1387	ENDDO
1388
1389	ENDIF
1390
1391	ENDDO
1392
1393	DEALLOCATE( tri )
1394
1395	CALL cpu_log( log_point_s(39), 'fft_y + tridia', 'stop' )
1396
1397	END SUBROUTINE ffty_tri_ffty
1398
1399
1400	SUBROUTINE tridia_1dd( ddx2, ddy2, nx, ny, j, ar, tri )
1401
1402	!------------------------------------------------------------------------------!
1403	! Solves the linear system of equations for a 1d-decomposition along x (see
1404	! tridia)
1405	!
1406	! Attention: when using the intel compilers older than 12.0, array tri must
1407	! be passed as an argument to the contained subroutines. Otherwise
1408	! addres faults will occur. This feature can be activated with
1409	! cpp-switch __intel11
1410	! On NEC, tri should not be passed (except for routine substi_1dd)
1411	! because this causes very bad performance.
1412	!------------------------------------------------------------------------------!
1413
1414	USE arrays_3d
1415	USE control_parameters
1416
1417	USE pegrid
1418
1419	IMPLICIT NONE
1420
1421	INTEGER :: i, j, k, nnyh, nx, ny, omp_get_thread_num, tn
1422
1423	REAL :: ddx2, ddy2
1424
1425	REAL, DIMENSION(0:nx,1:nz) :: ar
1426	REAL, DIMENSION(5,0:nx,0:nz-1) :: tri
1427
1428
1429	nnyh = ( ny + 1 ) / 2
1430
1431	!
1432	!-- Define constant elements of the tridiagonal matrix.
1433	!-- The compiler on SX6 does loop exchange. If 0:nx is a high power of 2,
1434	!-- the exchanged loops create bank conflicts. The following directive
1435	!-- prohibits loop exchange and the loops perform much better.
1436	! tn = omp_get_thread_num()
1437	! WRITE( 120+tn, * ) '+++ id=',myid,' nx=',nx,' thread=', omp_get_thread_num()
1438	! CALL local_flush( 120+tn )
1439	!CDIR NOLOOPCHG
1440	DO k = 0, nz-1
1441	DO i = 0,nx
1442	tri(2,i,k) = ddzu_pres(k+1) * ddzw(k+1)
1443	tri(3,i,k) = ddzu_pres(k+2) * ddzw(k+1)
1444	ENDDO
1445	ENDDO
1446	! WRITE( 120+tn, * ) '+++ id=',myid,' end of first tridia loop thread=', omp_get_thread_num()
1447	! CALL local_flush( 120+tn )
1448
1449	IF ( j <= nnyh ) THEN
1450	#if defined( __intel11 )
1451	CALL maketri_1dd( j, tri )
1452	#else
1453	CALL maketri_1dd( j )
1454	#endif
1455	ELSE
1456	#if defined( __intel11 )
1457	CALL maketri_1dd( ny+1-j, tri )
1458	#else
1459	CALL maketri_1dd( ny+1-j )
1460	#endif
1461	ENDIF
1462	#if defined( __intel11 )
1463	CALL split_1dd( tri )
1464	#else
1465	CALL split_1dd
1466	#endif
1467	CALL substi_1dd( ar, tri )
1468
1469	CONTAINS
1470
1471	#if defined( __intel11 )
1472	SUBROUTINE maketri_1dd( j, tri )
1473	#else
1474	SUBROUTINE maketri_1dd( j )
1475	#endif
1476
1477	!------------------------------------------------------------------------------!
1478	! computes the i- and j-dependent component of the matrix
1479	!------------------------------------------------------------------------------!
1480
1481	USE constants
1482
1483	IMPLICIT NONE
1484
1485	INTEGER :: i, j, k, nnxh
1486	REAL :: a, c
1487
1488	REAL, DIMENSION(0:nx) :: l
1489
1490	#if defined( __intel11 )
1491	REAL, DIMENSION(5,0:nx,0:nz-1) :: tri
1492	#endif
1493
1494
1495	nnxh = ( nx + 1 ) / 2
1496	!
1497	!-- Provide the tridiagonal matrix for solution of the Poisson equation in
1498	!-- Fourier space. The coefficients are computed following the method of
1499	!-- Schmidt et al. (DFVLR-Mitteilung 84-15), which departs from Stephan
1500	!-- Siano's original version by discretizing the Poisson equation,
1501	!-- before it is Fourier-transformed
1502	DO i = 0, nx
1503	IF ( i >= 0 .AND. i <= nnxh ) THEN
1504	l(i) = 2.0 * ( 1.0 - COS( ( 2.0 * pi * i ) / &
1505	REAL( nx+1 ) ) ) * ddx2 + &
1506	2.0 * ( 1.0 - COS( ( 2.0 * pi * j ) / &
1507	REAL( ny+1 ) ) ) * ddy2
1508	ELSE
1509	l(i) = 2.0 * ( 1.0 - COS( ( 2.0 * pi * ( nx+1-i ) ) / &
1510	REAL( nx+1 ) ) ) * ddx2 + &
1511	2.0 * ( 1.0 - COS( ( 2.0 * pi * j ) / &
1512	REAL( ny+1 ) ) ) * ddy2
1513	ENDIF
1514	ENDDO
1515
1516	DO k = 0, nz-1
1517	DO i = 0, nx
1518	a = -1.0 * ddzu_pres(k+2) * ddzw(k+1)
1519	c = -1.0 * ddzu_pres(k+1) * ddzw(k+1)
1520	tri(1,i,k) = a + c - l(i)
1521	ENDDO
1522	ENDDO
1523	IF ( ibc_p_b == 1 .OR. ibc_p_b == 2 ) THEN
1524	DO i = 0, nx
1525	tri(1,i,0) = tri(1,i,0) + tri(2,i,0)
1526	ENDDO
1527	ENDIF
1528	IF ( ibc_p_t == 1 ) THEN
1529	DO i = 0, nx
1530	tri(1,i,nz-1) = tri(1,i,nz-1) + tri(3,i,nz-1)
1531	ENDDO
1532	ENDIF
1533
1534	END SUBROUTINE maketri_1dd
1535
1536
1537	#if defined( __intel11 )
1538	SUBROUTINE split_1dd( tri )
1539	#else
1540	SUBROUTINE split_1dd
1541	#endif
1542
1543	!------------------------------------------------------------------------------!
1544	! Splitting of the tridiagonal matrix (Thomas algorithm)
1545	!------------------------------------------------------------------------------!
1546
1547	IMPLICIT NONE
1548
1549	INTEGER :: i, k
1550
1551	#if defined( __intel11 )
1552	REAL, DIMENSION(5,0:nx,0:nz-1) :: tri
1553	#endif
1554
1555
1556	!
1557	!-- Splitting
1558	DO i = 0, nx
1559	tri(4,i,0) = tri(1,i,0)
1560	ENDDO
1561	DO k = 1, nz-1
1562	DO i = 0, nx
1563	tri(5,i,k) = tri(2,i,k) / tri(4,i,k-1)
1564	tri(4,i,k) = tri(1,i,k) - tri(3,i,k-1) * tri(5,i,k)
1565	ENDDO
1566	ENDDO
1567
1568	END SUBROUTINE split_1dd
1569
1570
1571	SUBROUTINE substi_1dd( ar, tri )
1572
1573	!------------------------------------------------------------------------------!
1574	! Substitution (Forward and Backward) (Thomas algorithm)
1575	!------------------------------------------------------------------------------!
1576
1577	IMPLICIT NONE
1578
1579	INTEGER :: i, k
1580
1581	REAL, DIMENSION(0:nx,nz) :: ar
1582	REAL, DIMENSION(0:nx,0:nz-1) :: ar1
1583	REAL, DIMENSION(5,0:nx,0:nz-1) :: tri
1584
1585	!
1586	!-- Forward substitution
1587	DO i = 0, nx
1588	ar1(i,0) = ar(i,1)
1589	ENDDO
1590	DO k = 1, nz-1
1591	DO i = 0, nx
1592	ar1(i,k) = ar(i,k+1) - tri(5,i,k) * ar1(i,k-1)
1593	ENDDO
1594	ENDDO
1595
1596	!
1597	!-- Backward substitution
1598	!-- Note, the add of 1.0E-20 in the denominator is due to avoid divisions
1599	!-- by zero appearing if the pressure bc is set to neumann at the top of
1600	!-- the model domain.
1601	DO i = 0, nx
1602	ar(i,nz) = ar1(i,nz-1) / ( tri(4,i,nz-1) + 1.0E-20 )
1603	ENDDO
1604	DO k = nz-2, 0, -1
1605	DO i = 0, nx
1606	ar(i,k+1) = ( ar1(i,k) - tri(3,i,k) * ar(i,k+2) ) &
1607	/ tri(4,i,k)
1608	ENDDO
1609	ENDDO
1610
1611	!
1612	!-- Indices i=0, j=0 correspond to horizontally averaged pressure.
1613	!-- The respective values of ar should be zero at all k-levels if
1614	!-- acceleration of horizontally averaged vertical velocity is zero.
1615	IF ( ibc_p_b == 1 .AND. ibc_p_t == 1 ) THEN
1616	IF ( j == 0 ) THEN
1617	DO k = 1, nz
1618	ar(0,k) = 0.0
1619	ENDDO
1620	ENDIF
1621	ENDIF
1622
1623	END SUBROUTINE substi_1dd
1624
1625	END SUBROUTINE tridia_1dd
1626
1627	#endif
1628	#endif
1629	END MODULE poisfft_mod

Note: See TracBrowser for help on using the repository browser.

Download in other formats:

| Impressum | ©Leibniz Universität Hannover |