Home

Context Navigation

source: palm/trunk/SOURCE/poisfft.f90 @ 1093

Last change on this file since 1093 was 1093, checked in by raasch, 12 years ago
last commit documented
Property svn:keywords set to `Id`
File size: 47.7 KB

Line
1	MODULE poisfft_mod
2
3	!--------------------------------------------------------------------------------!
4	! This file is part of PALM.
5	!
6	! PALM is free software: you can redistribute it and/or modify it under the terms
7	! of the GNU General Public License as published by the Free Software Foundation,
8	! either version 3 of the License, or (at your option) any later version.
9	!
10	! PALM is distributed in the hope that it will be useful, but WITHOUT ANY
11	! WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
12	! A PARTICULAR PURPOSE. See the GNU General Public License for more details.
13	!
14	! You should have received a copy of the GNU General Public License along with
15	! PALM. If not, see <http://www.gnu.org/licenses/>.
16	!
17	! Copyright 1997-2012 Leibniz University Hannover
18	!--------------------------------------------------------------------------------!
19	!
20	! Current revisions:
21	! -----------------
22	!
23	!
24	! Former revisions:
25	! -----------------
26	! $Id: poisfft.f90 1093 2013-02-02 12:58:49Z raasch $
27	!
28	! 1092 2013-02-02 11:24:22Z raasch
29	! unused variables removed
30	!
31	! 1036 2012-10-22 13:43:42Z raasch
32	! code put under GPL (PALM 3.9)
33	!
34	! 2012-09-21 07:03:55Z raasch
35	! FLOAT type conversion replaced by REAL
36	!
37	! 1003 2012-09-14 14:35:53Z raasch
38	! indices nxa, nya, etc. replaced by nx, ny, etc.
39	!
40	! 940 2012-07-09 14:31:00Z raasch
41	! special handling of tri-array as an argument in tridia_1dd routines switched
42	! off because it caused segmentation faults with intel 12.1 compiler
43	!
44	! 877 2012-04-03 11:21:44Z suehring
45	! Bugfix: Avoid divisions by zero in case of using a 'neumann' bc for the
46	! pressure at the top of the model domain.
47	!
48	! 809 2012-01-30 13:32:58Z maronga
49	! Bugfix: replaced .AND. and .NOT. with && and ! in the preprocessor directives
50	!
51	! 807 2012-01-25 11:53:51Z maronga
52	! New cpp directive "__check" implemented which is used by check_namelist_files
53	! (most of the code is unneeded by check_namelist_files).
54	!
55	! 763 2011-10-06 09:32:09Z suehring
56	! Comment added concerning the last change.
57	!
58	! 761 2011-10-05 17:58:52Z suehring
59	! Bugfix: Avoid divisions by zero in case of using a 'neumann' bc for the
60	! pressure at the top of the model domain.
61	!
62	! 696 2011-03-18 07:03:49Z raasch
63	! work_fftx removed from PRIVATE clauses in fftx_tr_xy and tr_yx_fftx
64	!
65	! 683 2011-02-09 14:25:15Z raasch
66	! openMP parallelization for 2d-domain-decomposition
67	!
68	! 667 2010-12-23 12:06:00Z suehring/gryschka
69	! ddzu replaced by ddzu_pres due to changes in zu(0)
70	!
71	! 622 2010-12-10 08:08:13Z raasch
72	! optional barriers included in order to speed up collective operations
73	!
74	! 377 2009-09-04 11:09:00Z raasch
75	! __lcmuk changed to __lc to avoid problems with Intel compiler on sgi-ice
76	!
77	! 164 2008-05-15 08:46:15Z raasch
78	! Arguments removed from transpose routines
79	!
80	! 128 2007-10-26 13:11:14Z raasch
81	! Bugfix: wavenumber calculation for even nx in routines maketri
82	!
83	! 85 2007-05-11 09:35:14Z raasch
84	! Bugfix: work_fft*_vec removed from some PRIVATE-declarations
85	!
86	! 76 2007-03-29 00:58:32Z raasch
87	! Tridiagonal coefficients adjusted for Neumann boundary conditions both at
88	! the bottom and the top.
89	!
90	! RCS Log replace by Id keyword, revision history cleaned up
91	!
92	! Revision 1.24 2006/08/04 15:00:24 raasch
93	! Default setting of the thread number tn in case of not using OpenMP
94	!
95	! Revision 1.23 2006/02/23 12:48:38 raasch
96	! Additional compiler directive in routine tridia_1dd for preventing loop
97	! exchange on NEC-SX6
98	!
99	! Revision 1.20 2004/04/30 12:38:09 raasch
100	! Parts of former poisfft_hybrid moved to this subroutine,
101	! former subroutine changed to a module, renaming of FFT-subroutines and
102	! -module, FFTs completely substituted by calls of fft_x and fft_y,
103	! NAG fft used in the non-parallel case completely removed, l in maketri
104	! is now a 1d-array, variables passed by modules instead of using parameter
105	! lists, enlarged transposition arrays introduced
106	!
107	! Revision 1.1 1997/07/24 11:24:14 raasch
108	! Initial revision
109	!
110	!
111	! Description:
112	! ------------
113	! See below.
114	!------------------------------------------------------------------------------!
115
116	!--------------------------------------------------------------------------!
117	! poisfft !
118	! !
119	! Original version: Stephan Siano (pois3d) !
120	! !
121	! Institute of Meteorology and Climatology, University of Hannover !
122	! Germany !
123	! !
124	! Version as of July 23,1996 !
125	! !
126	! !
127	! Version for parallel computers: Siegfried Raasch !
128	! !
129	! Version as of July 03,1997 !
130	! !
131	! Solves the Poisson equation with a 2D spectral method !
132	! d^2 p / dx^2 + d^2 p / dy^2 + d^2 p / dz^2 = s !
133	! !
134	! Input: !
135	! real ar contains in the (nnx,nny,nnz) elements, !
136	! starting from the element (1,nys,nxl), the !
137	! values for s !
138	! real work Temporary array !
139	! !
140	! Output: !
141	! real ar contains the solution for p !
142	!--------------------------------------------------------------------------!
143
144	USE fft_xy
145	USE indices
146	USE transpose_indices
147
148	IMPLICIT NONE
149
150	PRIVATE
151
152	#if ! defined ( __check )
153	PUBLIC poisfft, poisfft_init
154
155	INTERFACE poisfft
156	MODULE PROCEDURE poisfft
157	END INTERFACE poisfft
158
159	INTERFACE poisfft_init
160	MODULE PROCEDURE poisfft_init
161	END INTERFACE poisfft_init
162	#else
163	PUBLIC poisfft_init
164
165	INTERFACE poisfft_init
166	MODULE PROCEDURE poisfft_init
167	END INTERFACE poisfft_init
168	#endif
169
170	CONTAINS
171
172	SUBROUTINE poisfft_init
173
174	CALL fft_init
175
176	END SUBROUTINE poisfft_init
177
178	#if ! defined ( __check )
179	SUBROUTINE poisfft( ar, work )
180
181	USE cpulog
182	USE interfaces
183	USE pegrid
184
185	IMPLICIT NONE
186
187	REAL, DIMENSION(1:nz,nys:nyn,nxl:nxr) :: ar, work
188
189
190	CALL cpu_log( log_point_s(3), 'poisfft', 'start' )
191
192	!
193	!-- Two-dimensional Fourier Transformation in x- and y-direction.
194	#if defined( __parallel )
195	IF ( pdims(2) == 1 ) THEN
196
197	!
198	!-- 1d-domain-decomposition along x:
199	!-- FFT along y and transposition y --> x
200	CALL ffty_tr_yx( ar, work, ar )
201
202	!
203	!-- FFT along x, solving the tridiagonal system and backward FFT
204	CALL fftx_tri_fftx( ar )
205
206	!
207	!-- Transposition x --> y and backward FFT along y
208	CALL tr_xy_ffty( ar, work, ar )
209
210	ELSEIF ( pdims(1) == 1 ) THEN
211
212	!
213	!-- 1d-domain-decomposition along y:
214	!-- FFT along x and transposition x --> y
215	CALL fftx_tr_xy( ar, work, ar )
216
217	!
218	!-- FFT along y, solving the tridiagonal system and backward FFT
219	CALL ffty_tri_ffty( ar )
220
221	!
222	!-- Transposition y --> x and backward FFT along x
223	CALL tr_yx_fftx( ar, work, ar )
224
225	ELSE
226
227	!
228	!-- 2d-domain-decomposition
229	!-- Transposition z --> x
230	CALL cpu_log( log_point_s(5), 'transpo forward', 'start' )
231	CALL transpose_zx( ar, work, ar )
232	CALL cpu_log( log_point_s(5), 'transpo forward', 'pause' )
233
234	CALL cpu_log( log_point_s(4), 'fft_x', 'start' )
235	CALL fftxp( ar, 'forward' )
236	CALL cpu_log( log_point_s(4), 'fft_x', 'pause' )
237
238	!
239	!-- Transposition x --> y
240	CALL cpu_log( log_point_s(5), 'transpo forward', 'continue' )
241	CALL transpose_xy( ar, work, ar )
242	CALL cpu_log( log_point_s(5), 'transpo forward', 'pause' )
243
244	CALL cpu_log( log_point_s(7), 'fft_y', 'start' )
245	CALL fftyp( ar, 'forward' )
246	CALL cpu_log( log_point_s(7), 'fft_y', 'pause' )
247
248	!
249	!-- Transposition y --> z
250	CALL cpu_log( log_point_s(5), 'transpo forward', 'continue' )
251	CALL transpose_yz( ar, work, ar )
252	CALL cpu_log( log_point_s(5), 'transpo forward', 'stop' )
253
254	!
255	!-- Solve the Poisson equation in z-direction in cartesian space.
256	CALL cpu_log( log_point_s(6), 'tridia', 'start' )
257	CALL tridia( ar )
258	CALL cpu_log( log_point_s(6), 'tridia', 'stop' )
259
260	!
261	!-- Inverse Fourier Transformation
262	!-- Transposition z --> y
263	CALL cpu_log( log_point_s(8), 'transpo invers', 'start' )
264	CALL transpose_zy( ar, work, ar )
265	CALL cpu_log( log_point_s(8), 'transpo invers', 'pause' )
266
267	CALL cpu_log( log_point_s(7), 'fft_y', 'continue' )
268	CALL fftyp( ar, 'backward' )
269	CALL cpu_log( log_point_s(7), 'fft_y', 'stop' )
270
271	!
272	!-- Transposition y --> x
273	CALL cpu_log( log_point_s(8), 'transpo invers', 'continue' )
274	CALL transpose_yx( ar, work, ar )
275	CALL cpu_log( log_point_s(8), 'transpo invers', 'pause' )
276
277	CALL cpu_log( log_point_s(4), 'fft_x', 'continue' )
278	CALL fftxp( ar, 'backward' )
279	CALL cpu_log( log_point_s(4), 'fft_x', 'stop' )
280
281	!
282	!-- Transposition x --> z
283	CALL cpu_log( log_point_s(8), 'transpo invers', 'continue' )
284	CALL transpose_xz( ar, work, ar )
285	CALL cpu_log( log_point_s(8), 'transpo invers', 'stop' )
286
287	ENDIF
288
289	#else
290
291	!
292	!-- Two-dimensional Fourier Transformation along x- and y-direction.
293	CALL cpu_log( log_point_s(4), 'fft_x', 'start' )
294	CALL fftx( ar, 'forward' )
295	CALL cpu_log( log_point_s(4), 'fft_x', 'pause' )
296	CALL cpu_log( log_point_s(7), 'fft_y', 'start' )
297	CALL ffty( ar, 'forward' )
298	CALL cpu_log( log_point_s(7), 'fft_y', 'pause' )
299
300	!
301	!-- Solve the Poisson equation in z-direction in cartesian space.
302	CALL cpu_log( log_point_s(6), 'tridia', 'start' )
303	CALL tridia( ar )
304	CALL cpu_log( log_point_s(6), 'tridia', 'stop' )
305
306	!
307	!-- Inverse Fourier Transformation.
308	CALL cpu_log( log_point_s(7), 'fft_y', 'continue' )
309	CALL ffty( ar, 'backward' )
310	CALL cpu_log( log_point_s(7), 'fft_y', 'stop' )
311	CALL cpu_log( log_point_s(4), 'fft_x', 'continue' )
312	CALL fftx( ar, 'backward' )
313	CALL cpu_log( log_point_s(4), 'fft_x', 'stop' )
314
315	#endif
316
317	CALL cpu_log( log_point_s(3), 'poisfft', 'stop' )
318
319	END SUBROUTINE poisfft
320
321
322
323	SUBROUTINE tridia( ar )
324
325	!------------------------------------------------------------------------------!
326	! solves the linear system of equations:
327	!
328	! -(4 pi^2(i^2/(dx^2nnx^2)+j^2/(dy^2nny^2))+
329	! 1/(dzu(k)dzw(k))+1/(dzu(k-1)dzw(k)))*p(i,j,k)+
330	! 1/(dzu(k)dzw(k))p(i,j,k+1)+1/(dzu(k-1)dzw(k))p(i,j,k-1)=d(i,j,k)
331	!
332	! by using the Thomas algorithm
333	!------------------------------------------------------------------------------!
334
335	USE arrays_3d
336
337	IMPLICIT NONE
338
339	INTEGER :: i, j, k, nnyh
340
341	REAL, DIMENSION(nxl_z:nxr_z,0:nz-1) :: ar1
342	REAL, DIMENSION(5,nxl_z:nxr_z,0:nz-1) :: tri
343
344	#if defined( __parallel )
345	REAL :: ar(nxl_z:nxr_z,nys_z:nyn_z,1:nz)
346	#else
347	REAL :: ar(1:nz,nys_z:nyn_z,nxl_z:nxr_z)
348	#endif
349
350
351	nnyh = (ny+1) / 2
352
353	!
354	!-- Define constant elements of the tridiagonal matrix.
355	!$OMP PARALLEL PRIVATE ( k, i )
356	!$OMP DO
357	DO k = 0, nz-1
358	DO i = nxl_z, nxr_z
359	tri(2,i,k) = ddzu_pres(k+1) * ddzw(k+1)
360	tri(3,i,k) = ddzu_pres(k+2) * ddzw(k+1)
361	ENDDO
362	ENDDO
363	!$OMP END PARALLEL
364
365	#if defined( __parallel )
366	!
367	!-- Repeat for all y-levels.
368	!$OMP PARALLEL FIRSTPRIVATE( tri ) PRIVATE ( ar1, j )
369	!$OMP DO
370	DO j = nys_z, nyn_z
371	IF ( j <= nnyh ) THEN
372	CALL maketri( tri, j )
373	ELSE
374	CALL maketri( tri, ny+1-j )
375	ENDIF
376	CALL split( tri )
377	CALL substi( ar, ar1, tri, j )
378	ENDDO
379	!$OMP END PARALLEL
380	#else
381	!
382	!-- First y-level.
383	CALL maketri( tri, nys_z )
384	CALL split( tri )
385	CALL substi( ar, ar1, tri, 0 )
386
387	!
388	!-- Further y-levels.
389	DO j = 1, nnyh - 1
390	CALL maketri( tri, j )
391	CALL split( tri )
392	CALL substi( ar, ar1, tri, j )
393	CALL substi( ar, ar1, tri, ny+1-j )
394	ENDDO
395	CALL maketri( tri, nnyh )
396	CALL split( tri )
397	CALL substi( ar, ar1, tri, nnyh+nys )
398	#endif
399
400	CONTAINS
401
402	SUBROUTINE maketri( tri, j )
403
404	!------------------------------------------------------------------------------!
405	! Computes the i- and j-dependent component of the matrix
406	!------------------------------------------------------------------------------!
407
408	USE arrays_3d
409	USE constants
410	USE control_parameters
411	USE grid_variables
412
413	IMPLICIT NONE
414
415	INTEGER :: i, j, k, nnxh
416	REAL :: a, c
417	REAL :: ll(nxl_z:nxr_z)
418	REAL :: tri(5,nxl_z:nxr_z,0:nz-1)
419
420
421	nnxh = ( nx + 1 ) / 2
422
423	!
424	!-- Provide the tridiagonal matrix for solution of the Poisson equation in
425	!-- Fourier space. The coefficients are computed following the method of
426	!-- Schmidt et al. (DFVLR-Mitteilung 84-15), which departs from Stephan
427	!-- Siano's original version by discretizing the Poisson equation,
428	!-- before it is Fourier-transformed
429	#if defined( __parallel )
430	DO i = nxl_z, nxr_z
431	IF ( i >= 0 .AND. i <= nnxh ) THEN
432	ll(i) = 2.0 * ( 1.0 - COS( ( 2.0 * pi * i ) / &
433	REAL( nx+1 ) ) ) / ( dx * dx ) + &
434	2.0 * ( 1.0 - COS( ( 2.0 * pi * j ) / &
435	REAL( ny+1 ) ) ) / ( dy * dy )
436	ELSE
437	ll(i) = 2.0 * ( 1.0 - COS( ( 2.0 * pi * ( nx+1-i ) ) / &
438	REAL( nx+1 ) ) ) / ( dx * dx ) + &
439	2.0 * ( 1.0 - COS( ( 2.0 * pi * j ) / &
440	REAL( ny+1 ) ) ) / ( dy * dy )
441	ENDIF
442	DO k = 0,nz-1
443	a = -1.0 * ddzu_pres(k+2) * ddzw(k+1)
444	c = -1.0 * ddzu_pres(k+1) * ddzw(k+1)
445	tri(1,i,k) = a + c - ll(i)
446	ENDDO
447	ENDDO
448	#else
449	DO i = 0, nnxh
450	ll(i) = 2.0 * ( 1.0 - COS( ( 2.0 * pi * i ) / REAL( nx+1 ) ) ) / &
451	( dx * dx ) + &
452	2.0 * ( 1.0 - COS( ( 2.0 * pi * j ) / REAL( ny+1 ) ) ) / &
453	( dy * dy )
454	DO k = 0, nz-1
455	a = -1.0 * ddzu_pres(k+2) * ddzw(k+1)
456	c = -1.0 * ddzu_pres(k+1) * ddzw(k+1)
457	tri(1,i,k) = a + c - ll(i)
458	IF ( i >= 1 .and. i < nnxh ) THEN
459	tri(1,nx+1-i,k) = tri(1,i,k)
460	ENDIF
461	ENDDO
462	ENDDO
463	#endif
464	IF ( ibc_p_b == 1 .OR. ibc_p_b == 2 ) THEN
465	DO i = nxl_z, nxr_z
466	tri(1,i,0) = tri(1,i,0) + tri(2,i,0)
467	ENDDO
468	ENDIF
469	IF ( ibc_p_t == 1 ) THEN
470	DO i = nxl_z, nxr_z
471	tri(1,i,nz-1) = tri(1,i,nz-1) + tri(3,i,nz-1)
472	ENDDO
473	ENDIF
474
475	END SUBROUTINE maketri
476
477
478	SUBROUTINE substi( ar, ar1, tri, j )
479
480	!------------------------------------------------------------------------------!
481	! Substitution (Forward and Backward) (Thomas algorithm)
482	!------------------------------------------------------------------------------!
483
484	USE control_parameters
485
486	IMPLICIT NONE
487
488	INTEGER :: i, j, k
489	REAL :: ar1(nxl_z:nxr_z,0:nz-1)
490	REAL :: tri(5,nxl_z:nxr_z,0:nz-1)
491	#if defined( __parallel )
492	REAL :: ar(nxl_z:nxr_z,nys_z:nyn_z,1:nz)
493	#else
494	REAL :: ar(1:nz,nys_z:nyn_z,nxl_z:nxr_z)
495	#endif
496
497	!
498	!-- Forward substitution.
499	DO i = nxl_z, nxr_z
500	#if defined( __parallel )
501	ar1(i,0) = ar(i,j,1)
502	#else
503	ar1(i,0) = ar(1,j,i)
504	#endif
505	ENDDO
506	DO k = 1, nz - 1
507	DO i = nxl_z, nxr_z
508	#if defined( __parallel )
509	ar1(i,k) = ar(i,j,k+1) - tri(5,i,k) * ar1(i,k-1)
510	#else
511	ar1(i,k) = ar(k+1,j,i) - tri(5,i,k) * ar1(i,k-1)
512	#endif
513	ENDDO
514	ENDDO
515
516	!
517	!-- Backward substitution
518	!-- Note, the 1.0E-20 in the denominator is due to avoid divisions
519	!-- by zero appearing if the pressure bc is set to neumann at the top of
520	!-- the model domain.
521	DO i = nxl_z, nxr_z
522	#if defined( __parallel )
523	ar(i,j,nz) = ar1(i,nz-1) / ( tri(4,i,nz-1) + 1.0E-20 )
524	#else
525	ar(nz,j,i) = ar1(i,nz-1) / ( tri(4,i,nz-1) + 1.0E-20 )
526	#endif
527	ENDDO
528	DO k = nz-2, 0, -1
529	DO i = nxl_z, nxr_z
530	#if defined( __parallel )
531	ar(i,j,k+1) = ( ar1(i,k) - tri(3,i,k) * ar(i,j,k+2) ) &
532	/ tri(4,i,k)
533	#else
534	ar(k+1,j,i) = ( ar1(i,k) - tri(3,i,k) * ar(k+2,j,i) ) &
535	/ tri(4,i,k)
536	#endif
537	ENDDO
538	ENDDO
539
540	!
541	!-- Indices i=0, j=0 correspond to horizontally averaged pressure.
542	!-- The respective values of ar should be zero at all k-levels if
543	!-- acceleration of horizontally averaged vertical velocity is zero.
544	IF ( ibc_p_b == 1 .AND. ibc_p_t == 1 ) THEN
545	IF ( j == 0 .AND. nxl_z == 0 ) THEN
546	#if defined( __parallel )
547	DO k = 1, nz
548	ar(nxl_z,j,k) = 0.0
549	ENDDO
550	#else
551	DO k = 1, nz
552	ar(k,j,nxl_z) = 0.0
553	ENDDO
554	#endif
555	ENDIF
556	ENDIF
557
558	END SUBROUTINE substi
559
560
561	SUBROUTINE split( tri )
562
563	!------------------------------------------------------------------------------!
564	! Splitting of the tridiagonal matrix (Thomas algorithm)
565	!------------------------------------------------------------------------------!
566
567	IMPLICIT NONE
568
569	INTEGER :: i, k
570	REAL :: tri(5,nxl_z:nxr_z,0:nz-1)
571
572	!
573	!-- Splitting.
574	DO i = nxl_z, nxr_z
575	tri(4,i,0) = tri(1,i,0)
576	ENDDO
577	DO k = 1, nz-1
578	DO i = nxl_z, nxr_z
579	tri(5,i,k) = tri(2,i,k) / tri(4,i,k-1)
580	tri(4,i,k) = tri(1,i,k) - tri(3,i,k-1) * tri(5,i,k)
581	ENDDO
582	ENDDO
583
584	END SUBROUTINE split
585
586	END SUBROUTINE tridia
587
588
589	#if defined( __parallel )
590	SUBROUTINE fftxp( ar, direction )
591
592	!------------------------------------------------------------------------------!
593	! Fourier-transformation along x-direction Parallelized version
594	!------------------------------------------------------------------------------!
595
596	IMPLICIT NONE
597
598	CHARACTER (LEN=*) :: direction
599	INTEGER :: j, k
600	REAL :: ar(0:nx,nys_x:nyn_x,nzb_x:nzt_x)
601
602	!
603	!-- Performing the fft with one of the methods implemented
604	!$OMP PARALLEL PRIVATE ( j, k )
605	!$OMP DO
606	DO k = nzb_x, nzt_x
607	DO j = nys_x, nyn_x
608	CALL fft_x( ar(0:nx,j,k), direction )
609	ENDDO
610	ENDDO
611	!$OMP END PARALLEL
612
613	END SUBROUTINE fftxp
614
615	#else
616	SUBROUTINE fftx( ar, direction )
617
618	!------------------------------------------------------------------------------!
619	! Fourier-transformation along x-direction Non parallel version
620	!------------------------------------------------------------------------------!
621
622	IMPLICIT NONE
623
624	CHARACTER (LEN=*) :: direction
625	INTEGER :: i, j, k
626	REAL :: ar(1:nz,0:ny,0:nx)
627
628	!
629	!-- Performing the fft with one of the methods implemented
630	!$OMP PARALLEL PRIVATE ( j, k )
631	!$OMP DO
632	DO k = 1, nz
633	DO j = 0, ny
634	CALL fft_x( ar(k,j,0:nx), direction )
635	ENDDO
636	ENDDO
637	!$OMP END PARALLEL
638
639	END SUBROUTINE fftx
640	#endif
641
642
643	#if defined( __parallel )
644	SUBROUTINE fftyp( ar, direction )
645
646	!------------------------------------------------------------------------------!
647	! Fourier-transformation along y-direction Parallelized version
648	!------------------------------------------------------------------------------!
649
650	IMPLICIT NONE
651
652	CHARACTER (LEN=*) :: direction
653	INTEGER :: i, k
654	REAL :: ar(0:ny,nxl_y:nxr_y,nzb_y:nzt_y)
655
656	!
657	!-- Performing the fft with one of the methods implemented
658	!$OMP PARALLEL PRIVATE ( i, k )
659	!$OMP DO
660	DO k = nzb_y, nzt_y
661	DO i = nxl_y, nxr_y
662	CALL fft_y( ar(0:ny,i,k), direction )
663	ENDDO
664	ENDDO
665	!$OMP END PARALLEL
666
667	END SUBROUTINE fftyp
668
669	#else
670	SUBROUTINE ffty( ar, direction )
671
672	!------------------------------------------------------------------------------!
673	! Fourier-transformation along y-direction Non parallel version
674	!------------------------------------------------------------------------------!
675
676	IMPLICIT NONE
677
678	CHARACTER (LEN=*) :: direction
679	INTEGER :: i, k
680	REAL :: ar(1:nz,0:ny,0:nx)
681
682	!
683	!-- Performing the fft with one of the methods implemented
684	!$OMP PARALLEL PRIVATE ( i, k )
685	!$OMP DO
686	DO k = 1, nz
687	DO i = 0, nx
688	CALL fft_y( ar(k,0:ny,i), direction )
689	ENDDO
690	ENDDO
691	!$OMP END PARALLEL
692
693	END SUBROUTINE ffty
694	#endif
695
696	#if defined( __parallel )
697	SUBROUTINE ffty_tr_yx( f_in, work, f_out )
698
699	!------------------------------------------------------------------------------!
700	! Fourier-transformation along y with subsequent transposition y --> x for
701	! a 1d-decomposition along x
702	!
703	! ATTENTION: The performance of this routine is much faster on the NEC-SX6,
704	! if the first index of work_ffty_vec is odd. Otherwise
705	! memory bank conflicts may occur (especially if the index is a
706	! multiple of 128). That's why work_ffty_vec is dimensioned as
707	! 0:ny+1.
708	! Of course, this will not work if users are using an odd number
709	! of gridpoints along y.
710	!------------------------------------------------------------------------------!
711
712	USE control_parameters
713	USE cpulog
714	USE indices
715	USE interfaces
716	USE pegrid
717	USE transpose_indices
718
719	IMPLICIT NONE
720
721	INTEGER :: i, iend, iouter, ir, j, k
722	INTEGER, PARAMETER :: stridex = 4
723
724	REAL, DIMENSION(0:ny,stridex) :: work_ffty
725	#if defined( __nec )
726	REAL, DIMENSION(0:ny+1,1:nz,nxl:nxr) :: work_ffty_vec
727	#endif
728	REAL, DIMENSION(1:nz,0:ny,nxl:nxr) :: f_in
729	REAL, DIMENSION(nnx,1:nz,nys_x:nyn_x,pdims(1)) :: f_out
730	REAL, DIMENSION(nxl:nxr,1:nz,0:ny) :: work
731
732	!
733	!-- Carry out the FFT along y, where all data are present due to the
734	!-- 1d-decomposition along x. Resort the data in a way that x becomes
735	!-- the first index.
736	CALL cpu_log( log_point_s(7), 'fft_y', 'start' )
737
738	IF ( host(1:3) == 'nec' ) THEN
739	#if defined( __nec )
740	!
741	!-- Code optimized for vector processors
742	!$OMP PARALLEL PRIVATE ( i, j, k )
743	!$OMP DO
744	DO i = nxl, nxr
745
746	DO j = 0, ny
747	DO k = 1, nz
748	work_ffty_vec(j,k,i) = f_in(k,j,i)
749	ENDDO
750	ENDDO
751
752	CALL fft_y_m( work_ffty_vec(:,:,i), ny+1, 'forward' )
753
754	ENDDO
755
756	!$OMP DO
757	DO k = 1, nz
758	DO j = 0, ny
759	DO i = nxl, nxr
760	work(i,k,j) = work_ffty_vec(j,k,i)
761	ENDDO
762	ENDDO
763	ENDDO
764	!$OMP END PARALLEL
765	#endif
766
767	ELSE
768
769	!
770	!-- Cache optimized code.
771	!-- The i-(x-)direction is split into a strided outer loop and an inner
772	!-- loop for better cache performance
773	!$OMP PARALLEL PRIVATE (i,iend,iouter,ir,j,k,work_ffty)
774	!$OMP DO
775	DO iouter = nxl, nxr, stridex
776
777	iend = MIN( iouter+stridex-1, nxr ) ! Upper bound for inner i loop
778
779	DO k = 1, nz
780
781	DO i = iouter, iend
782
783	ir = i-iouter+1 ! counter within a stride
784	DO j = 0, ny
785	work_ffty(j,ir) = f_in(k,j,i)
786	ENDDO
787	!
788	!-- FFT along y
789	CALL fft_y( work_ffty(:,ir), 'forward' )
790
791	ENDDO
792
793	!
794	!-- Resort
795	DO j = 0, ny
796	DO i = iouter, iend
797	work(i,k,j) = work_ffty(j,i-iouter+1)
798	ENDDO
799	ENDDO
800
801	ENDDO
802
803	ENDDO
804	!$OMP END PARALLEL
805
806	ENDIF
807	CALL cpu_log( log_point_s(7), 'fft_y', 'pause' )
808
809	!
810	!-- Transpose array
811	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
812	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
813	CALL MPI_ALLTOALL( work(nxl,1,0), sendrecvcount_xy, MPI_REAL, &
814	f_out(1,1,nys_x,1), sendrecvcount_xy, MPI_REAL, &
815	comm1dx, ierr )
816	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
817
818	END SUBROUTINE ffty_tr_yx
819
820
821	SUBROUTINE tr_xy_ffty( f_in, work, f_out )
822
823	!------------------------------------------------------------------------------!
824	! Transposition x --> y with a subsequent backward Fourier transformation for
825	! a 1d-decomposition along x
826	!------------------------------------------------------------------------------!
827
828	USE control_parameters
829	USE cpulog
830	USE indices
831	USE interfaces
832	USE pegrid
833	USE transpose_indices
834
835	IMPLICIT NONE
836
837	INTEGER :: i, iend, iouter, ir, j, k
838	INTEGER, PARAMETER :: stridex = 4
839
840	REAL, DIMENSION(0:ny,stridex) :: work_ffty
841	#if defined( __nec )
842	REAL, DIMENSION(0:ny+1,1:nz,nxl:nxr) :: work_ffty_vec
843	#endif
844	REAL, DIMENSION(nnx,1:nz,nys_x:nyn_x,pdims(1)) :: f_in
845	REAL, DIMENSION(1:nz,0:ny,nxl:nxr) :: f_out
846	REAL, DIMENSION(nxl:nxr,1:nz,0:ny) :: work
847
848	!
849	!-- Transpose array
850	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
851	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
852	CALL MPI_ALLTOALL( f_in(1,1,nys_x,1), sendrecvcount_xy, MPI_REAL, &
853	work(nxl,1,0), sendrecvcount_xy, MPI_REAL, &
854	comm1dx, ierr )
855	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
856
857	!
858	!-- Resort the data in a way that y becomes the first index and carry out the
859	!-- backward fft along y.
860	CALL cpu_log( log_point_s(7), 'fft_y', 'continue' )
861
862	IF ( host(1:3) == 'nec' ) THEN
863	#if defined( __nec )
864	!
865	!-- Code optimized for vector processors
866	!$OMP PARALLEL PRIVATE ( i, j, k )
867	!$OMP DO
868	DO k = 1, nz
869	DO j = 0, ny
870	DO i = nxl, nxr
871	work_ffty_vec(j,k,i) = work(i,k,j)
872	ENDDO
873	ENDDO
874	ENDDO
875
876	!$OMP DO
877	DO i = nxl, nxr
878
879	CALL fft_y_m( work_ffty_vec(:,:,i), ny+1, 'backward' )
880
881	DO j = 0, ny
882	DO k = 1, nz
883	f_out(k,j,i) = work_ffty_vec(j,k,i)
884	ENDDO
885	ENDDO
886
887	ENDDO
888	!$OMP END PARALLEL
889	#endif
890
891	ELSE
892
893	!
894	!-- Cache optimized code.
895	!-- The i-(x-)direction is split into a strided outer loop and an inner
896	!-- loop for better cache performance
897	!$OMP PARALLEL PRIVATE ( i, iend, iouter, ir, j, k, work_ffty )
898	!$OMP DO
899	DO iouter = nxl, nxr, stridex
900
901	iend = MIN( iouter+stridex-1, nxr ) ! Upper bound for inner i loop
902
903	DO k = 1, nz
904	!
905	!-- Resort
906	DO j = 0, ny
907	DO i = iouter, iend
908	work_ffty(j,i-iouter+1) = work(i,k,j)
909	ENDDO
910	ENDDO
911
912	DO i = iouter, iend
913
914	!
915	!-- FFT along y
916	ir = i-iouter+1 ! counter within a stride
917	CALL fft_y( work_ffty(:,ir), 'backward' )
918
919	DO j = 0, ny
920	f_out(k,j,i) = work_ffty(j,ir)
921	ENDDO
922	ENDDO
923
924	ENDDO
925
926	ENDDO
927	!$OMP END PARALLEL
928
929	ENDIF
930
931	CALL cpu_log( log_point_s(7), 'fft_y', 'stop' )
932
933	END SUBROUTINE tr_xy_ffty
934
935
936	SUBROUTINE fftx_tri_fftx( ar )
937
938	!------------------------------------------------------------------------------!
939	! FFT along x, solution of the tridiagonal system and backward FFT for
940	! a 1d-decomposition along x
941	!
942	! WARNING: this subroutine may still not work for hybrid parallelization
943	! with OpenMP (for possible necessary changes see the original
944	! routine poisfft_hybrid, developed by Klaus Ketelsen, May 2002)
945	!------------------------------------------------------------------------------!
946
947	USE control_parameters
948	USE cpulog
949	USE grid_variables
950	USE indices
951	USE interfaces
952	USE pegrid
953	USE transpose_indices
954
955	IMPLICIT NONE
956
957	INTEGER :: i, j, k, m, n, omp_get_thread_num, tn
958
959	REAL, DIMENSION(0:nx) :: work_fftx
960	REAL, DIMENSION(0:nx,1:nz) :: work_trix
961	REAL, DIMENSION(nnx,1:nz,nys_x:nyn_x,pdims(1)) :: ar
962	REAL, DIMENSION(:,:,:,:), ALLOCATABLE :: tri
963
964
965	CALL cpu_log( log_point_s(33), 'fft_x + tridia', 'start' )
966
967	ALLOCATE( tri(5,0:nx,0:nz-1,0:threads_per_task-1) )
968
969	tn = 0 ! Default thread number in case of one thread
970	!$OMP PARALLEL DO PRIVATE ( i, j, k, m, n, tn, work_fftx, work_trix )
971	DO j = nys_x, nyn_x
972
973	!$ tn = omp_get_thread_num()
974
975	IF ( host(1:3) == 'nec' ) THEN
976	!
977	!-- Code optimized for vector processors
978	DO k = 1, nz
979
980	m = 0
981	DO n = 1, pdims(1)
982	DO i = 1, nnx
983	work_trix(m,k) = ar(i,k,j,n)
984	m = m + 1
985	ENDDO
986	ENDDO
987
988	ENDDO
989
990	CALL fft_x_m( work_trix, 'forward' )
991
992	ELSE
993	!
994	!-- Cache optimized code
995	DO k = 1, nz
996
997	m = 0
998	DO n = 1, pdims(1)
999	DO i = 1, nnx
1000	work_fftx(m) = ar(i,k,j,n)
1001	m = m + 1
1002	ENDDO
1003	ENDDO
1004
1005	CALL fft_x( work_fftx, 'forward' )
1006
1007	DO i = 0, nx
1008	work_trix(i,k) = work_fftx(i)
1009	ENDDO
1010
1011	ENDDO
1012
1013	ENDIF
1014
1015	!
1016	!-- Solve the linear equation system
1017	CALL tridia_1dd( ddx2, ddy2, nx, ny, j, work_trix, tri(:,:,:,tn) )
1018
1019	IF ( host(1:3) == 'nec' ) THEN
1020	!
1021	!-- Code optimized for vector processors
1022	CALL fft_x_m( work_trix, 'backward' )
1023
1024	DO k = 1, nz
1025
1026	m = 0
1027	DO n = 1, pdims(1)
1028	DO i = 1, nnx
1029	ar(i,k,j,n) = work_trix(m,k)
1030	m = m + 1
1031	ENDDO
1032	ENDDO
1033
1034	ENDDO
1035
1036	ELSE
1037	!
1038	!-- Cache optimized code
1039	DO k = 1, nz
1040
1041	DO i = 0, nx
1042	work_fftx(i) = work_trix(i,k)
1043	ENDDO
1044
1045	CALL fft_x( work_fftx, 'backward' )
1046
1047	m = 0
1048	DO n = 1, pdims(1)
1049	DO i = 1, nnx
1050	ar(i,k,j,n) = work_fftx(m)
1051	m = m + 1
1052	ENDDO
1053	ENDDO
1054
1055	ENDDO
1056
1057	ENDIF
1058
1059	ENDDO
1060
1061	DEALLOCATE( tri )
1062
1063	CALL cpu_log( log_point_s(33), 'fft_x + tridia', 'stop' )
1064
1065	END SUBROUTINE fftx_tri_fftx
1066
1067
1068	SUBROUTINE fftx_tr_xy( f_in, work, f_out )
1069
1070	!------------------------------------------------------------------------------!
1071	! Fourier-transformation along x with subsequent transposition x --> y for
1072	! a 1d-decomposition along y
1073	!
1074	! ATTENTION: The NEC-branch of this routine may significantly profit from
1075	! further optimizations. So far, performance is much worse than
1076	! for routine ffty_tr_yx (more than three times slower).
1077	!------------------------------------------------------------------------------!
1078
1079	USE control_parameters
1080	USE cpulog
1081	USE indices
1082	USE interfaces
1083	USE pegrid
1084	USE transpose_indices
1085
1086	IMPLICIT NONE
1087
1088	INTEGER :: i, j, k
1089
1090	REAL, DIMENSION(0:nx,1:nz,nys:nyn) :: work_fftx
1091	REAL, DIMENSION(1:nz,nys:nyn,0:nx) :: f_in
1092	REAL, DIMENSION(nny,1:nz,nxl_y:nxr_y,pdims(2)) :: f_out
1093	REAL, DIMENSION(nys:nyn,1:nz,0:nx) :: work
1094
1095	!
1096	!-- Carry out the FFT along x, where all data are present due to the
1097	!-- 1d-decomposition along y. Resort the data in a way that y becomes
1098	!-- the first index.
1099	CALL cpu_log( log_point_s(4), 'fft_x', 'start' )
1100
1101	IF ( host(1:3) == 'nec' ) THEN
1102	!
1103	!-- Code for vector processors
1104	!$OMP PARALLEL PRIVATE ( i, j, k )
1105	!$OMP DO
1106	DO i = 0, nx
1107
1108	DO j = nys, nyn
1109	DO k = 1, nz
1110	work_fftx(i,k,j) = f_in(k,j,i)
1111	ENDDO
1112	ENDDO
1113
1114	ENDDO
1115
1116	!$OMP DO
1117	DO j = nys, nyn
1118
1119	CALL fft_x_m( work_fftx(:,:,j), 'forward' )
1120
1121	DO k = 1, nz
1122	DO i = 0, nx
1123	work(j,k,i) = work_fftx(i,k,j)
1124	ENDDO
1125	ENDDO
1126
1127	ENDDO
1128	!$OMP END PARALLEL
1129
1130	ELSE
1131
1132	!
1133	!-- Cache optimized code (there might be still a potential for better
1134	!-- optimization).
1135	!$OMP PARALLEL PRIVATE (i,j,k)
1136	!$OMP DO
1137	DO i = 0, nx
1138
1139	DO j = nys, nyn
1140	DO k = 1, nz
1141	work_fftx(i,k,j) = f_in(k,j,i)
1142	ENDDO
1143	ENDDO
1144
1145	ENDDO
1146
1147	!$OMP DO
1148	DO j = nys, nyn
1149	DO k = 1, nz
1150
1151	CALL fft_x( work_fftx(0:nx,k,j), 'forward' )
1152
1153	DO i = 0, nx
1154	work(j,k,i) = work_fftx(i,k,j)
1155	ENDDO
1156	ENDDO
1157
1158	ENDDO
1159	!$OMP END PARALLEL
1160
1161	ENDIF
1162	CALL cpu_log( log_point_s(4), 'fft_x', 'pause' )
1163
1164	!
1165	!-- Transpose array
1166	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
1167	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
1168	CALL MPI_ALLTOALL( work(nys,1,0), sendrecvcount_xy, MPI_REAL, &
1169	f_out(1,1,nxl_y,1), sendrecvcount_xy, MPI_REAL, &
1170	comm1dy, ierr )
1171	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
1172
1173	END SUBROUTINE fftx_tr_xy
1174
1175
1176	SUBROUTINE tr_yx_fftx( f_in, work, f_out )
1177
1178	!------------------------------------------------------------------------------!
1179	! Transposition y --> x with a subsequent backward Fourier transformation for
1180	! a 1d-decomposition along x
1181	!------------------------------------------------------------------------------!
1182
1183	USE control_parameters
1184	USE cpulog
1185	USE indices
1186	USE interfaces
1187	USE pegrid
1188	USE transpose_indices
1189
1190	IMPLICIT NONE
1191
1192	INTEGER :: i, j, k
1193
1194	REAL, DIMENSION(0:nx,1:nz,nys:nyn) :: work_fftx
1195	REAL, DIMENSION(nny,1:nz,nxl_y:nxr_y,pdims(2)) :: f_in
1196	REAL, DIMENSION(1:nz,nys:nyn,0:nx) :: f_out
1197	REAL, DIMENSION(nys:nyn,1:nz,0:nx) :: work
1198
1199	!
1200	!-- Transpose array
1201	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
1202	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
1203	CALL MPI_ALLTOALL( f_in(1,1,nxl_y,1), sendrecvcount_xy, MPI_REAL, &
1204	work(nys,1,0), sendrecvcount_xy, MPI_REAL, &
1205	comm1dy, ierr )
1206	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
1207
1208	!
1209	!-- Carry out the FFT along x, where all data are present due to the
1210	!-- 1d-decomposition along y. Resort the data in a way that y becomes
1211	!-- the first index.
1212	CALL cpu_log( log_point_s(4), 'fft_x', 'continue' )
1213
1214	IF ( host(1:3) == 'nec' ) THEN
1215	!
1216	!-- Code optimized for vector processors
1217	!$OMP PARALLEL PRIVATE ( i, j, k )
1218	!$OMP DO
1219	DO j = nys, nyn
1220
1221	DO k = 1, nz
1222	DO i = 0, nx
1223	work_fftx(i,k,j) = work(j,k,i)
1224	ENDDO
1225	ENDDO
1226
1227	CALL fft_x_m( work_fftx(:,:,j), 'backward' )
1228
1229	ENDDO
1230
1231	!$OMP DO
1232	DO i = 0, nx
1233	DO j = nys, nyn
1234	DO k = 1, nz
1235	f_out(k,j,i) = work_fftx(i,k,j)
1236	ENDDO
1237	ENDDO
1238	ENDDO
1239	!$OMP END PARALLEL
1240
1241	ELSE
1242
1243	!
1244	!-- Cache optimized code (there might be still a potential for better
1245	!-- optimization).
1246	!$OMP PARALLEL PRIVATE (i,j,k)
1247	!$OMP DO
1248	DO j = nys, nyn
1249	DO k = 1, nz
1250
1251	DO i = 0, nx
1252	work_fftx(i,k,j) = work(j,k,i)
1253	ENDDO
1254
1255	CALL fft_x( work_fftx(0:nx,k,j), 'backward' )
1256
1257	ENDDO
1258	ENDDO
1259
1260	!$OMP DO
1261	DO i = 0, nx
1262	DO j = nys, nyn
1263	DO k = 1, nz
1264	f_out(k,j,i) = work_fftx(i,k,j)
1265	ENDDO
1266	ENDDO
1267	ENDDO
1268	!$OMP END PARALLEL
1269
1270	ENDIF
1271	CALL cpu_log( log_point_s(4), 'fft_x', 'stop' )
1272
1273	END SUBROUTINE tr_yx_fftx
1274
1275
1276	SUBROUTINE ffty_tri_ffty( ar )
1277
1278	!------------------------------------------------------------------------------!
1279	! FFT along y, solution of the tridiagonal system and backward FFT for
1280	! a 1d-decomposition along y
1281	!
1282	! WARNING: this subroutine may still not work for hybrid parallelization
1283	! with OpenMP (for possible necessary changes see the original
1284	! routine poisfft_hybrid, developed by Klaus Ketelsen, May 2002)
1285	!------------------------------------------------------------------------------!
1286
1287	USE control_parameters
1288	USE cpulog
1289	USE grid_variables
1290	USE indices
1291	USE interfaces
1292	USE pegrid
1293	USE transpose_indices
1294
1295	IMPLICIT NONE
1296
1297	INTEGER :: i, j, k, m, n, omp_get_thread_num, tn
1298
1299	REAL, DIMENSION(0:ny) :: work_ffty
1300	REAL, DIMENSION(0:ny,1:nz) :: work_triy
1301	REAL, DIMENSION(nny,1:nz,nxl_y:nxr_y,pdims(2)) :: ar
1302	REAL, DIMENSION(:,:,:,:), ALLOCATABLE :: tri
1303
1304
1305	CALL cpu_log( log_point_s(39), 'fft_y + tridia', 'start' )
1306
1307	ALLOCATE( tri(5,0:ny,0:nz-1,0:threads_per_task-1) )
1308
1309	tn = 0 ! Default thread number in case of one thread
1310	!$OMP PARALLEL DO PRIVATE ( i, j, k, m, n, tn, work_ffty, work_triy )
1311	DO i = nxl_y, nxr_y
1312
1313	!$ tn = omp_get_thread_num()
1314
1315	IF ( host(1:3) == 'nec' ) THEN
1316	!
1317	!-- Code optimized for vector processors
1318	DO k = 1, nz
1319
1320	m = 0
1321	DO n = 1, pdims(2)
1322	DO j = 1, nny
1323	work_triy(m,k) = ar(j,k,i,n)
1324	m = m + 1
1325	ENDDO
1326	ENDDO
1327
1328	ENDDO
1329
1330	CALL fft_y_m( work_triy, ny, 'forward' )
1331
1332	ELSE
1333	!
1334	!-- Cache optimized code
1335	DO k = 1, nz
1336
1337	m = 0
1338	DO n = 1, pdims(2)
1339	DO j = 1, nny
1340	work_ffty(m) = ar(j,k,i,n)
1341	m = m + 1
1342	ENDDO
1343	ENDDO
1344
1345	CALL fft_y( work_ffty, 'forward' )
1346
1347	DO j = 0, ny
1348	work_triy(j,k) = work_ffty(j)
1349	ENDDO
1350
1351	ENDDO
1352
1353	ENDIF
1354
1355	!
1356	!-- Solve the linear equation system
1357	CALL tridia_1dd( ddy2, ddx2, ny, nx, i, work_triy, tri(:,:,:,tn) )
1358
1359	IF ( host(1:3) == 'nec' ) THEN
1360	!
1361	!-- Code optimized for vector processors
1362	CALL fft_y_m( work_triy, ny, 'backward' )
1363
1364	DO k = 1, nz
1365
1366	m = 0
1367	DO n = 1, pdims(2)
1368	DO j = 1, nny
1369	ar(j,k,i,n) = work_triy(m,k)
1370	m = m + 1
1371	ENDDO
1372	ENDDO
1373
1374	ENDDO
1375
1376	ELSE
1377	!
1378	!-- Cache optimized code
1379	DO k = 1, nz
1380
1381	DO j = 0, ny
1382	work_ffty(j) = work_triy(j,k)
1383	ENDDO
1384
1385	CALL fft_y( work_ffty, 'backward' )
1386
1387	m = 0
1388	DO n = 1, pdims(2)
1389	DO j = 1, nny
1390	ar(j,k,i,n) = work_ffty(m)
1391	m = m + 1
1392	ENDDO
1393	ENDDO
1394
1395	ENDDO
1396
1397	ENDIF
1398
1399	ENDDO
1400
1401	DEALLOCATE( tri )
1402
1403	CALL cpu_log( log_point_s(39), 'fft_y + tridia', 'stop' )
1404
1405	END SUBROUTINE ffty_tri_ffty
1406
1407
1408	SUBROUTINE tridia_1dd( ddx2, ddy2, nx, ny, j, ar, tri )
1409
1410	!------------------------------------------------------------------------------!
1411	! Solves the linear system of equations for a 1d-decomposition along x (see
1412	! tridia)
1413	!
1414	! Attention: when using the intel compilers older than 12.0, array tri must
1415	! be passed as an argument to the contained subroutines. Otherwise
1416	! addres faults will occur. This feature can be activated with
1417	! cpp-switch __intel11
1418	! On NEC, tri should not be passed (except for routine substi_1dd)
1419	! because this causes very bad performance.
1420	!------------------------------------------------------------------------------!
1421
1422	USE arrays_3d
1423	USE control_parameters
1424
1425	USE pegrid
1426
1427	IMPLICIT NONE
1428
1429	INTEGER :: i, j, k, nnyh, nx, ny, omp_get_thread_num, tn
1430
1431	REAL :: ddx2, ddy2
1432
1433	REAL, DIMENSION(0:nx,1:nz) :: ar
1434	REAL, DIMENSION(5,0:nx,0:nz-1) :: tri
1435
1436
1437	nnyh = ( ny + 1 ) / 2
1438
1439	!
1440	!-- Define constant elements of the tridiagonal matrix.
1441	!-- The compiler on SX6 does loop exchange. If 0:nx is a high power of 2,
1442	!-- the exchanged loops create bank conflicts. The following directive
1443	!-- prohibits loop exchange and the loops perform much better.
1444	! tn = omp_get_thread_num()
1445	! WRITE( 120+tn, * ) '+++ id=',myid,' nx=',nx,' thread=', omp_get_thread_num()
1446	! CALL local_flush( 120+tn )
1447	!CDIR NOLOOPCHG
1448	DO k = 0, nz-1
1449	DO i = 0,nx
1450	tri(2,i,k) = ddzu_pres(k+1) * ddzw(k+1)
1451	tri(3,i,k) = ddzu_pres(k+2) * ddzw(k+1)
1452	ENDDO
1453	ENDDO
1454	! WRITE( 120+tn, * ) '+++ id=',myid,' end of first tridia loop thread=', omp_get_thread_num()
1455	! CALL local_flush( 120+tn )
1456
1457	IF ( j <= nnyh ) THEN
1458	#if defined( __intel11 )
1459	CALL maketri_1dd( j, tri )
1460	#else
1461	CALL maketri_1dd( j )
1462	#endif
1463	ELSE
1464	#if defined( __intel11 )
1465	CALL maketri_1dd( ny+1-j, tri )
1466	#else
1467	CALL maketri_1dd( ny+1-j )
1468	#endif
1469	ENDIF
1470	#if defined( __intel11 )
1471	CALL split_1dd( tri )
1472	#else
1473	CALL split_1dd
1474	#endif
1475	CALL substi_1dd( ar, tri )
1476
1477	CONTAINS
1478
1479	#if defined( __intel11 )
1480	SUBROUTINE maketri_1dd( j, tri )
1481	#else
1482	SUBROUTINE maketri_1dd( j )
1483	#endif
1484
1485	!------------------------------------------------------------------------------!
1486	! computes the i- and j-dependent component of the matrix
1487	!------------------------------------------------------------------------------!
1488
1489	USE constants
1490
1491	IMPLICIT NONE
1492
1493	INTEGER :: i, j, k, nnxh
1494	REAL :: a, c
1495
1496	REAL, DIMENSION(0:nx) :: l
1497
1498	#if defined( __intel11 )
1499	REAL, DIMENSION(5,0:nx,0:nz-1) :: tri
1500	#endif
1501
1502
1503	nnxh = ( nx + 1 ) / 2
1504	!
1505	!-- Provide the tridiagonal matrix for solution of the Poisson equation in
1506	!-- Fourier space. The coefficients are computed following the method of
1507	!-- Schmidt et al. (DFVLR-Mitteilung 84-15), which departs from Stephan
1508	!-- Siano's original version by discretizing the Poisson equation,
1509	!-- before it is Fourier-transformed
1510	DO i = 0, nx
1511	IF ( i >= 0 .AND. i <= nnxh ) THEN
1512	l(i) = 2.0 * ( 1.0 - COS( ( 2.0 * pi * i ) / &
1513	REAL( nx+1 ) ) ) * ddx2 + &
1514	2.0 * ( 1.0 - COS( ( 2.0 * pi * j ) / &
1515	REAL( ny+1 ) ) ) * ddy2
1516	ELSE
1517	l(i) = 2.0 * ( 1.0 - COS( ( 2.0 * pi * ( nx+1-i ) ) / &
1518	REAL( nx+1 ) ) ) * ddx2 + &
1519	2.0 * ( 1.0 - COS( ( 2.0 * pi * j ) / &
1520	REAL( ny+1 ) ) ) * ddy2
1521	ENDIF
1522	ENDDO
1523
1524	DO k = 0, nz-1
1525	DO i = 0, nx
1526	a = -1.0 * ddzu_pres(k+2) * ddzw(k+1)
1527	c = -1.0 * ddzu_pres(k+1) * ddzw(k+1)
1528	tri(1,i,k) = a + c - l(i)
1529	ENDDO
1530	ENDDO
1531	IF ( ibc_p_b == 1 .OR. ibc_p_b == 2 ) THEN
1532	DO i = 0, nx
1533	tri(1,i,0) = tri(1,i,0) + tri(2,i,0)
1534	ENDDO
1535	ENDIF
1536	IF ( ibc_p_t == 1 ) THEN
1537	DO i = 0, nx
1538	tri(1,i,nz-1) = tri(1,i,nz-1) + tri(3,i,nz-1)
1539	ENDDO
1540	ENDIF
1541
1542	END SUBROUTINE maketri_1dd
1543
1544
1545	#if defined( __intel11 )
1546	SUBROUTINE split_1dd( tri )
1547	#else
1548	SUBROUTINE split_1dd
1549	#endif
1550
1551	!------------------------------------------------------------------------------!
1552	! Splitting of the tridiagonal matrix (Thomas algorithm)
1553	!------------------------------------------------------------------------------!
1554
1555	IMPLICIT NONE
1556
1557	INTEGER :: i, k
1558
1559	#if defined( __intel11 )
1560	REAL, DIMENSION(5,0:nx,0:nz-1) :: tri
1561	#endif
1562
1563
1564	!
1565	!-- Splitting
1566	DO i = 0, nx
1567	tri(4,i,0) = tri(1,i,0)
1568	ENDDO
1569	DO k = 1, nz-1
1570	DO i = 0, nx
1571	tri(5,i,k) = tri(2,i,k) / tri(4,i,k-1)
1572	tri(4,i,k) = tri(1,i,k) - tri(3,i,k-1) * tri(5,i,k)
1573	ENDDO
1574	ENDDO
1575
1576	END SUBROUTINE split_1dd
1577
1578
1579	SUBROUTINE substi_1dd( ar, tri )
1580
1581	!------------------------------------------------------------------------------!
1582	! Substitution (Forward and Backward) (Thomas algorithm)
1583	!------------------------------------------------------------------------------!
1584
1585	IMPLICIT NONE
1586
1587	INTEGER :: i, k
1588
1589	REAL, DIMENSION(0:nx,nz) :: ar
1590	REAL, DIMENSION(0:nx,0:nz-1) :: ar1
1591	REAL, DIMENSION(5,0:nx,0:nz-1) :: tri
1592
1593	!
1594	!-- Forward substitution
1595	DO i = 0, nx
1596	ar1(i,0) = ar(i,1)
1597	ENDDO
1598	DO k = 1, nz-1
1599	DO i = 0, nx
1600	ar1(i,k) = ar(i,k+1) - tri(5,i,k) * ar1(i,k-1)
1601	ENDDO
1602	ENDDO
1603
1604	!
1605	!-- Backward substitution
1606	!-- Note, the add of 1.0E-20 in the denominator is due to avoid divisions
1607	!-- by zero appearing if the pressure bc is set to neumann at the top of
1608	!-- the model domain.
1609	DO i = 0, nx
1610	ar(i,nz) = ar1(i,nz-1) / ( tri(4,i,nz-1) + 1.0E-20 )
1611	ENDDO
1612	DO k = nz-2, 0, -1
1613	DO i = 0, nx
1614	ar(i,k+1) = ( ar1(i,k) - tri(3,i,k) * ar(i,k+2) ) &
1615	/ tri(4,i,k)
1616	ENDDO
1617	ENDDO
1618
1619	!
1620	!-- Indices i=0, j=0 correspond to horizontally averaged pressure.
1621	!-- The respective values of ar should be zero at all k-levels if
1622	!-- acceleration of horizontally averaged vertical velocity is zero.
1623	IF ( ibc_p_b == 1 .AND. ibc_p_t == 1 ) THEN
1624	IF ( j == 0 ) THEN
1625	DO k = 1, nz
1626	ar(0,k) = 0.0
1627	ENDDO
1628	ENDIF
1629	ENDIF
1630
1631	END SUBROUTINE substi_1dd
1632
1633	END SUBROUTINE tridia_1dd
1634
1635	#endif
1636	#endif
1637	END MODULE poisfft_mod

Note: See TracBrowser for help on using the repository browser.

Download in other formats:

| Impressum | ©Leibniz Universität Hannover |