Home

Context Navigation

source: palm/trunk/SOURCE/poisfft.f90 @ 762

Last change on this file since 762 was 762, checked in by suehring, 13 years ago
last commit documented
Property svn:keywords set to `Id`
File size: 45.8 KB

Line
1	MODULE poisfft_mod
2
3	!------------------------------------------------------------------------------!
4	! Current revisions:
5	! -----------------
6	!
7	!
8	! Former revisions:
9	! -----------------
10	! $Id: poisfft.f90 762 2011-10-05 18:00:44Z suehring $
11	!
12	! 761 2011-10-05 17:58:52Z suehring
13	! Bugfix: Avoid divisions by zero in case of using a 'neumann' bc for the
14	! pressure at the top of the model domain.
15	!
16	! 696 2011-03-18 07:03:49Z raasch
17	! work_fftx removed from PRIVATE clauses in fftx_tr_xy and tr_yx_fftx
18	!
19	! 683 2011-02-09 14:25:15Z raasch
20	! openMP parallelization for 2d-domain-decomposition
21	!
22	! 667 2010-12-23 12:06:00Z suehring/gryschka
23	! ddzu replaced by ddzu_pres due to changes in zu(0)
24	!
25	! 622 2010-12-10 08:08:13Z raasch
26	! optional barriers included in order to speed up collective operations
27	!
28	! 377 2009-09-04 11:09:00Z raasch
29	! __lcmuk changed to __lc to avoid problems with Intel compiler on sgi-ice
30	!
31	! 164 2008-05-15 08:46:15Z raasch
32	! Arguments removed from transpose routines
33	!
34	! 128 2007-10-26 13:11:14Z raasch
35	! Bugfix: wavenumber calculation for even nx in routines maketri
36	!
37	! 85 2007-05-11 09:35:14Z raasch
38	! Bugfix: work_fft*_vec removed from some PRIVATE-declarations
39	!
40	! 76 2007-03-29 00:58:32Z raasch
41	! Tridiagonal coefficients adjusted for Neumann boundary conditions both at
42	! the bottom and the top.
43	!
44	! RCS Log replace by Id keyword, revision history cleaned up
45	!
46	! Revision 1.24 2006/08/04 15:00:24 raasch
47	! Default setting of the thread number tn in case of not using OpenMP
48	!
49	! Revision 1.23 2006/02/23 12:48:38 raasch
50	! Additional compiler directive in routine tridia_1dd for preventing loop
51	! exchange on NEC-SX6
52	!
53	! Revision 1.20 2004/04/30 12:38:09 raasch
54	! Parts of former poisfft_hybrid moved to this subroutine,
55	! former subroutine changed to a module, renaming of FFT-subroutines and
56	! -module, FFTs completely substituted by calls of fft_x and fft_y,
57	! NAG fft used in the non-parallel case completely removed, l in maketri
58	! is now a 1d-array, variables passed by modules instead of using parameter
59	! lists, enlarged transposition arrays introduced
60	!
61	! Revision 1.1 1997/07/24 11:24:14 raasch
62	! Initial revision
63	!
64	!
65	! Description:
66	! ------------
67	! See below.
68	!------------------------------------------------------------------------------!
69
70	!--------------------------------------------------------------------------!
71	! poisfft !
72	! !
73	! Original version: Stephan Siano (pois3d) !
74	! !
75	! Institute of Meteorology and Climatology, University of Hannover !
76	! Germany !
77	! !
78	! Version as of July 23,1996 !
79	! !
80	! !
81	! Version for parallel computers: Siegfried Raasch !
82	! !
83	! Version as of July 03,1997 !
84	! !
85	! Solves the Poisson equation with a 2D spectral method !
86	! d^2 p / dx^2 + d^2 p / dy^2 + d^2 p / dz^2 = s !
87	! !
88	! Input: !
89	! real ar contains in the (nnx,nny,nnz) elements, !
90	! starting from the element (1,nys,nxl), the !
91	! values for s !
92	! real work Temporary array !
93	! !
94	! Output: !
95	! real ar contains the solution for p !
96	!--------------------------------------------------------------------------!
97
98	USE fft_xy
99	USE indices
100	USE transpose_indices
101
102	IMPLICIT NONE
103
104	PRIVATE
105	PUBLIC poisfft, poisfft_init
106
107	INTERFACE poisfft
108	MODULE PROCEDURE poisfft
109	END INTERFACE poisfft
110
111	INTERFACE poisfft_init
112	MODULE PROCEDURE poisfft_init
113	END INTERFACE poisfft_init
114
115	CONTAINS
116
117	SUBROUTINE poisfft_init
118
119	CALL fft_init
120
121	END SUBROUTINE poisfft_init
122
123
124	SUBROUTINE poisfft( ar, work )
125
126	USE cpulog
127	USE interfaces
128	USE pegrid
129
130	IMPLICIT NONE
131
132	REAL, DIMENSION(1:nza,nys:nyna,nxl:nxra) :: ar, work
133
134
135	CALL cpu_log( log_point_s(3), 'poisfft', 'start' )
136
137	!
138	!-- Two-dimensional Fourier Transformation in x- and y-direction.
139	#if defined( __parallel )
140	IF ( pdims(2) == 1 ) THEN
141
142	!
143	!-- 1d-domain-decomposition along x:
144	!-- FFT along y and transposition y --> x
145	CALL ffty_tr_yx( ar, work, ar )
146
147	!
148	!-- FFT along x, solving the tridiagonal system and backward FFT
149	CALL fftx_tri_fftx( ar )
150
151	!
152	!-- Transposition x --> y and backward FFT along y
153	CALL tr_xy_ffty( ar, work, ar )
154
155	ELSEIF ( pdims(1) == 1 ) THEN
156
157	!
158	!-- 1d-domain-decomposition along y:
159	!-- FFT along x and transposition x --> y
160	CALL fftx_tr_xy( ar, work, ar )
161
162	!
163	!-- FFT along y, solving the tridiagonal system and backward FFT
164	CALL ffty_tri_ffty( ar )
165
166	!
167	!-- Transposition y --> x and backward FFT along x
168	CALL tr_yx_fftx( ar, work, ar )
169
170	ELSE
171
172	!
173	!-- 2d-domain-decomposition
174	!-- Transposition z --> x
175	CALL cpu_log( log_point_s(5), 'transpo forward', 'start' )
176	CALL transpose_zx( ar, work, ar )
177	CALL cpu_log( log_point_s(5), 'transpo forward', 'pause' )
178
179	CALL cpu_log( log_point_s(4), 'fft_x', 'start' )
180	CALL fftxp( ar, 'forward' )
181	CALL cpu_log( log_point_s(4), 'fft_x', 'pause' )
182
183	!
184	!-- Transposition x --> y
185	CALL cpu_log( log_point_s(5), 'transpo forward', 'continue' )
186	CALL transpose_xy( ar, work, ar )
187	CALL cpu_log( log_point_s(5), 'transpo forward', 'pause' )
188
189	CALL cpu_log( log_point_s(7), 'fft_y', 'start' )
190	CALL fftyp( ar, 'forward' )
191	CALL cpu_log( log_point_s(7), 'fft_y', 'pause' )
192
193	!
194	!-- Transposition y --> z
195	CALL cpu_log( log_point_s(5), 'transpo forward', 'continue' )
196	CALL transpose_yz( ar, work, ar )
197	CALL cpu_log( log_point_s(5), 'transpo forward', 'stop' )
198
199	!
200	!-- Solve the Poisson equation in z-direction in cartesian space.
201	CALL cpu_log( log_point_s(6), 'tridia', 'start' )
202	CALL tridia( ar )
203	CALL cpu_log( log_point_s(6), 'tridia', 'stop' )
204
205	!
206	!-- Inverse Fourier Transformation
207	!-- Transposition z --> y
208	CALL cpu_log( log_point_s(8), 'transpo invers', 'start' )
209	CALL transpose_zy( ar, work, ar )
210	CALL cpu_log( log_point_s(8), 'transpo invers', 'pause' )
211
212	CALL cpu_log( log_point_s(7), 'fft_y', 'continue' )
213	CALL fftyp( ar, 'backward' )
214	CALL cpu_log( log_point_s(7), 'fft_y', 'stop' )
215
216	!
217	!-- Transposition y --> x
218	CALL cpu_log( log_point_s(8), 'transpo invers', 'continue' )
219	CALL transpose_yx( ar, work, ar )
220	CALL cpu_log( log_point_s(8), 'transpo invers', 'pause' )
221
222	CALL cpu_log( log_point_s(4), 'fft_x', 'continue' )
223	CALL fftxp( ar, 'backward' )
224	CALL cpu_log( log_point_s(4), 'fft_x', 'stop' )
225
226	!
227	!-- Transposition x --> z
228	CALL cpu_log( log_point_s(8), 'transpo invers', 'continue' )
229	CALL transpose_xz( ar, work, ar )
230	CALL cpu_log( log_point_s(8), 'transpo invers', 'stop' )
231
232	ENDIF
233
234	#else
235
236	!
237	!-- Two-dimensional Fourier Transformation along x- and y-direction.
238	CALL cpu_log( log_point_s(4), 'fft_x', 'start' )
239	CALL fftx( ar, 'forward' )
240	CALL cpu_log( log_point_s(4), 'fft_x', 'pause' )
241	CALL cpu_log( log_point_s(7), 'fft_y', 'start' )
242	CALL ffty( ar, 'forward' )
243	CALL cpu_log( log_point_s(7), 'fft_y', 'pause' )
244
245	!
246	!-- Solve the Poisson equation in z-direction in cartesian space.
247	CALL cpu_log( log_point_s(6), 'tridia', 'start' )
248	CALL tridia( ar )
249	CALL cpu_log( log_point_s(6), 'tridia', 'stop' )
250
251	!
252	!-- Inverse Fourier Transformation.
253	CALL cpu_log( log_point_s(7), 'fft_y', 'continue' )
254	CALL ffty( ar, 'backward' )
255	CALL cpu_log( log_point_s(7), 'fft_y', 'stop' )
256	CALL cpu_log( log_point_s(4), 'fft_x', 'continue' )
257	CALL fftx( ar, 'backward' )
258	CALL cpu_log( log_point_s(4), 'fft_x', 'stop' )
259
260	#endif
261
262	CALL cpu_log( log_point_s(3), 'poisfft', 'stop' )
263
264	END SUBROUTINE poisfft
265
266
267
268	SUBROUTINE tridia( ar )
269
270	!------------------------------------------------------------------------------!
271	! solves the linear system of equations:
272	!
273	! -(4 pi^2(i^2/(dx^2nnx^2)+j^2/(dy^2nny^2))+
274	! 1/(dzu(k)dzw(k))+1/(dzu(k-1)dzw(k)))*p(i,j,k)+
275	! 1/(dzu(k)dzw(k))p(i,j,k+1)+1/(dzu(k-1)dzw(k))p(i,j,k-1)=d(i,j,k)
276	!
277	! by using the Thomas algorithm
278	!------------------------------------------------------------------------------!
279
280	USE arrays_3d
281
282	IMPLICIT NONE
283
284	INTEGER :: i, j, k, nnyh
285
286	REAL, DIMENSION(nxl_z:nxr_z,0:nz-1) :: ar1
287	REAL, DIMENSION(5,nxl_z:nxr_z,0:nz-1) :: tri
288
289	#if defined( __parallel )
290	REAL :: ar(nxl_z:nxr_za,nys_z:nyn_za,1:nza)
291	#else
292	REAL :: ar(1:nz,nys_z:nyn_z,nxl_z:nxr_z)
293	#endif
294
295
296	nnyh = (ny+1) / 2
297
298	!
299	!-- Define constant elements of the tridiagonal matrix.
300	!$OMP PARALLEL PRIVATE ( k, i )
301	!$OMP DO
302	DO k = 0, nz-1
303	DO i = nxl_z, nxr_z
304	tri(2,i,k) = ddzu_pres(k+1) * ddzw(k+1)
305	tri(3,i,k) = ddzu_pres(k+2) * ddzw(k+1)
306	ENDDO
307	ENDDO
308	!$OMP END PARALLEL
309
310	#if defined( __parallel )
311	!
312	!-- Repeat for all y-levels.
313	!$OMP PARALLEL FIRSTPRIVATE( tri ) PRIVATE ( ar1, j )
314	!$OMP DO
315	DO j = nys_z, nyn_z
316	IF ( j <= nnyh ) THEN
317	CALL maketri( tri, j )
318	ELSE
319	CALL maketri( tri, ny+1-j )
320	ENDIF
321	CALL split( tri )
322	CALL substi( ar, ar1, tri, j )
323	ENDDO
324	!$OMP END PARALLEL
325	#else
326	!
327	!-- First y-level.
328	CALL maketri( tri, nys_z )
329	CALL split( tri )
330	CALL substi( ar, ar1, tri, 0 )
331
332	!
333	!-- Further y-levels.
334	DO j = 1, nnyh - 1
335	CALL maketri( tri, j )
336	CALL split( tri )
337	CALL substi( ar, ar1, tri, j )
338	CALL substi( ar, ar1, tri, ny+1-j )
339	ENDDO
340	CALL maketri( tri, nnyh )
341	CALL split( tri )
342	CALL substi( ar, ar1, tri, nnyh+nys )
343	#endif
344
345	CONTAINS
346
347	SUBROUTINE maketri( tri, j )
348
349	!------------------------------------------------------------------------------!
350	! Computes the i- and j-dependent component of the matrix
351	!------------------------------------------------------------------------------!
352
353	USE arrays_3d
354	USE constants
355	USE control_parameters
356	USE grid_variables
357
358	IMPLICIT NONE
359
360	INTEGER :: i, j, k, nnxh
361	REAL :: a, c
362	REAL :: ll(nxl_z:nxr_z)
363	REAL :: tri(5,nxl_z:nxr_z,0:nz-1)
364
365
366	nnxh = ( nx + 1 ) / 2
367
368	!
369	!-- Provide the tridiagonal matrix for solution of the Poisson equation in
370	!-- Fourier space. The coefficients are computed following the method of
371	!-- Schmidt et al. (DFVLR-Mitteilung 84-15), which departs from Stephan
372	!-- Siano's original version by discretizing the Poisson equation,
373	!-- before it is Fourier-transformed
374	#if defined( __parallel )
375	DO i = nxl_z, nxr_z
376	IF ( i >= 0 .AND. i <= nnxh ) THEN
377	ll(i) = 2.0 * ( 1.0 - COS( ( 2.0 * pi * i ) / &
378	FLOAT( nx+1 ) ) ) / ( dx * dx ) + &
379	2.0 * ( 1.0 - COS( ( 2.0 * pi * j ) / &
380	FLOAT( ny+1 ) ) ) / ( dy * dy )
381	ELSE
382	ll(i) = 2.0 * ( 1.0 - COS( ( 2.0 * pi * ( nx+1-i ) ) / &
383	FLOAT( nx+1 ) ) ) / ( dx * dx ) + &
384	2.0 * ( 1.0 - COS( ( 2.0 * pi * j ) / &
385	FLOAT( ny+1 ) ) ) / ( dy * dy )
386	ENDIF
387	DO k = 0,nz-1
388	a = -1.0 * ddzu_pres(k+2) * ddzw(k+1)
389	c = -1.0 * ddzu_pres(k+1) * ddzw(k+1)
390	tri(1,i,k) = a + c - ll(i)
391	ENDDO
392	ENDDO
393	#else
394	DO i = 0, nnxh
395	ll(i) = 2.0 * ( 1.0 - COS( ( 2.0 * pi * i ) / FLOAT( nx+1 ) ) ) / &
396	( dx * dx ) + &
397	2.0 * ( 1.0 - COS( ( 2.0 * pi * j ) / FLOAT( ny+1 ) ) ) / &
398	( dy * dy )
399	DO k = 0, nz-1
400	a = -1.0 * ddzu_pres(k+2) * ddzw(k+1)
401	c = -1.0 * ddzu_pres(k+1) * ddzw(k+1)
402	tri(1,i,k) = a + c - ll(i)
403	IF ( i >= 1 .and. i < nnxh ) THEN
404	tri(1,nx+1-i,k) = tri(1,i,k)
405	ENDIF
406	ENDDO
407	ENDDO
408	#endif
409	IF ( ibc_p_b == 1 .OR. ibc_p_b == 2 ) THEN
410	DO i = nxl_z, nxr_z
411	tri(1,i,0) = tri(1,i,0) + tri(2,i,0)
412	ENDDO
413	ENDIF
414	IF ( ibc_p_t == 1 ) THEN
415	DO i = nxl_z, nxr_z
416	tri(1,i,nz-1) = tri(1,i,nz-1) + tri(3,i,nz-1)
417	ENDDO
418	ENDIF
419
420	END SUBROUTINE maketri
421
422
423	SUBROUTINE substi( ar, ar1, tri, j )
424
425	!------------------------------------------------------------------------------!
426	! Substitution (Forward and Backward) (Thomas algorithm)
427	!------------------------------------------------------------------------------!
428
429	USE control_parameters
430
431	IMPLICIT NONE
432
433	INTEGER :: i, j, k
434	REAL :: ar1(nxl_z:nxr_z,0:nz-1)
435	REAL :: tri(5,nxl_z:nxr_z,0:nz-1)
436	#if defined( __parallel )
437	REAL :: ar(nxl_z:nxr_za,nys_z:nyn_za,1:nza)
438	#else
439	REAL :: ar(1:nz,nys_z:nyn_z,nxl_z:nxr_z)
440	#endif
441
442	!
443	!-- Forward substitution.
444	DO i = nxl_z, nxr_z
445	#if defined( __parallel )
446	ar1(i,0) = ar(i,j,1)
447	#else
448	ar1(i,0) = ar(1,j,i)
449	#endif
450	ENDDO
451	DO k = 1, nz - 1
452	DO i = nxl_z, nxr_z
453	#if defined( __parallel )
454	ar1(i,k) = ar(i,j,k+1) - tri(5,i,k) * ar1(i,k-1)
455	#else
456	ar1(i,k) = ar(k+1,j,i) - tri(5,i,k) * ar1(i,k-1)
457	#endif
458	ENDDO
459	ENDDO
460
461	!
462	!-- Backward substitution.
463	DO i = nxl_z, nxr_z
464	#if defined( __parallel )
465	ar(i,j,nz) = ar1(i,nz-1) / tri(4,i,nz-1)
466	#else
467	ar(nz,j,i) = ar1(i,nz-1) / tri(4,i,nz-1)
468	#endif
469	ENDDO
470	DO k = nz-2, 0, -1
471	DO i = nxl_z, nxr_z
472	#if defined( __parallel )
473	ar(i,j,k+1) = ( ar1(i,k) - tri(3,i,k) * ar(i,j,k+2) ) &
474	/ tri(4,i,k)
475	#else
476	ar(k+1,j,i) = ( ar1(i,k) - tri(3,i,k) * ar(k+2,j,i) ) &
477	/ tri(4,i,k)
478	#endif
479	ENDDO
480	ENDDO
481
482	!
483	!-- Indices i=0, j=0 correspond to horizontally averaged pressure.
484	!-- The respective values of ar should be zero at all k-levels if
485	!-- acceleration of horizontally averaged vertical velocity is zero.
486	IF ( ibc_p_b == 1 .AND. ibc_p_t == 1 ) THEN
487	IF ( j == 0 .AND. nxl_z == 0 ) THEN
488	#if defined( __parallel )
489	DO k = 1, nz
490	ar(nxl_z,j,k) = 0.0
491	ENDDO
492	#else
493	DO k = 1, nz
494	ar(k,j,nxl_z) = 0.0
495	ENDDO
496	#endif
497	ENDIF
498	ENDIF
499
500	END SUBROUTINE substi
501
502
503	SUBROUTINE split( tri )
504
505	!------------------------------------------------------------------------------!
506	! Splitting of the tridiagonal matrix (Thomas algorithm)
507	!------------------------------------------------------------------------------!
508
509	IMPLICIT NONE
510
511	INTEGER :: i, k
512	REAL :: tri(5,nxl_z:nxr_z,0:nz-1)
513
514	!
515	!-- Splitting.
516	DO i = nxl_z, nxr_z
517	tri(4,i,0) = tri(1,i,0)
518	ENDDO
519	DO k = 1, nz-1
520	DO i = nxl_z, nxr_z
521	tri(5,i,k) = tri(2,i,k) / tri(4,i,k-1)
522	tri(4,i,k) = tri(1,i,k) - tri(3,i,k-1) * tri(5,i,k)
523	ENDDO
524	ENDDO
525
526	END SUBROUTINE split
527
528	END SUBROUTINE tridia
529
530
531	#if defined( __parallel )
532	SUBROUTINE fftxp( ar, direction )
533
534	!------------------------------------------------------------------------------!
535	! Fourier-transformation along x-direction Parallelized version
536	!------------------------------------------------------------------------------!
537
538	IMPLICIT NONE
539
540	CHARACTER (LEN=*) :: direction
541	INTEGER :: j, k
542	REAL :: ar(0:nxa,nys_x:nyn_xa,nzb_x:nzt_xa)
543
544	!
545	!-- Performing the fft with one of the methods implemented
546	!$OMP PARALLEL PRIVATE ( j, k )
547	!$OMP DO
548	DO k = nzb_x, nzt_x
549	DO j = nys_x, nyn_x
550	CALL fft_x( ar(0:nx,j,k), direction )
551	ENDDO
552	ENDDO
553	!$OMP END PARALLEL
554
555	END SUBROUTINE fftxp
556
557	#else
558	SUBROUTINE fftx( ar, direction )
559
560	!------------------------------------------------------------------------------!
561	! Fourier-transformation along x-direction Non parallel version
562	!------------------------------------------------------------------------------!
563
564	IMPLICIT NONE
565
566	CHARACTER (LEN=*) :: direction
567	INTEGER :: i, j, k
568	REAL :: ar(1:nz,0:ny,0:nx)
569
570	!
571	!-- Performing the fft with one of the methods implemented
572	!$OMP PARALLEL PRIVATE ( j, k )
573	!$OMP DO
574	DO k = 1, nz
575	DO j = 0, ny
576	CALL fft_x( ar(k,j,0:nx), direction )
577	ENDDO
578	ENDDO
579	!$OMP END PARALLEL
580
581	END SUBROUTINE fftx
582	#endif
583
584
585	#if defined( __parallel )
586	SUBROUTINE fftyp( ar, direction )
587
588	!------------------------------------------------------------------------------!
589	! Fourier-transformation along y-direction Parallelized version
590	!------------------------------------------------------------------------------!
591
592	IMPLICIT NONE
593
594	CHARACTER (LEN=*) :: direction
595	INTEGER :: i, k
596	REAL :: ar(0:nya,nxl_y:nxr_ya,nzb_y:nzt_ya)
597
598	!
599	!-- Performing the fft with one of the methods implemented
600	!$OMP PARALLEL PRIVATE ( i, k )
601	!$OMP DO
602	DO k = nzb_y, nzt_y
603	DO i = nxl_y, nxr_y
604	CALL fft_y( ar(0:ny,i,k), direction )
605	ENDDO
606	ENDDO
607	!$OMP END PARALLEL
608
609	END SUBROUTINE fftyp
610
611	#else
612	SUBROUTINE ffty( ar, direction )
613
614	!------------------------------------------------------------------------------!
615	! Fourier-transformation along y-direction Non parallel version
616	!------------------------------------------------------------------------------!
617
618	IMPLICIT NONE
619
620	CHARACTER (LEN=*) :: direction
621	INTEGER :: i, k
622	REAL :: ar(1:nz,0:ny,0:nx)
623
624	!
625	!-- Performing the fft with one of the methods implemented
626	!$OMP PARALLEL PRIVATE ( i, k )
627	!$OMP DO
628	DO k = 1, nz
629	DO i = 0, nx
630	CALL fft_y( ar(k,0:ny,i), direction )
631	ENDDO
632	ENDDO
633	!$OMP END PARALLEL
634
635	END SUBROUTINE ffty
636	#endif
637
638	#if defined( __parallel )
639	SUBROUTINE ffty_tr_yx( f_in, work, f_out )
640
641	!------------------------------------------------------------------------------!
642	! Fourier-transformation along y with subsequent transposition y --> x for
643	! a 1d-decomposition along x
644	!
645	! ATTENTION: The performance of this routine is much faster on the NEC-SX6,
646	! if the first index of work_ffty_vec is odd. Otherwise
647	! memory bank conflicts may occur (especially if the index is a
648	! multiple of 128). That's why work_ffty_vec is dimensioned as
649	! 0:ny+1.
650	! Of course, this will not work if users are using an odd number
651	! of gridpoints along y.
652	!------------------------------------------------------------------------------!
653
654	USE control_parameters
655	USE cpulog
656	USE indices
657	USE interfaces
658	USE pegrid
659	USE transpose_indices
660
661	IMPLICIT NONE
662
663	INTEGER :: i, iend, iouter, ir, j, k
664	INTEGER, PARAMETER :: stridex = 4
665
666	REAL, DIMENSION(0:ny,stridex) :: work_ffty
667	#if defined( __nec )
668	REAL, DIMENSION(0:ny+1,1:nz,nxl:nxr) :: work_ffty_vec
669	#endif
670	REAL, DIMENSION(1:nza,0:nya,nxl:nxra) :: f_in
671	REAL, DIMENSION(nnx,1:nza,nys_x:nyn_xa,pdims(1)) :: f_out
672	REAL, DIMENSION(nxl:nxra,1:nza,0:nya) :: work
673
674	!
675	!-- Carry out the FFT along y, where all data are present due to the
676	!-- 1d-decomposition along x. Resort the data in a way that x becomes
677	!-- the first index.
678	CALL cpu_log( log_point_s(7), 'fft_y', 'start' )
679
680	IF ( host(1:3) == 'nec' ) THEN
681	#if defined( __nec )
682	!
683	!-- Code optimized for vector processors
684	!$OMP PARALLEL PRIVATE ( i, j, k )
685	!$OMP DO
686	DO i = nxl, nxr
687
688	DO j = 0, ny
689	DO k = 1, nz
690	work_ffty_vec(j,k,i) = f_in(k,j,i)
691	ENDDO
692	ENDDO
693
694	CALL fft_y_m( work_ffty_vec(:,:,i), ny+1, 'forward' )
695
696	ENDDO
697
698	!$OMP DO
699	DO k = 1, nz
700	DO j = 0, ny
701	DO i = nxl, nxr
702	work(i,k,j) = work_ffty_vec(j,k,i)
703	ENDDO
704	ENDDO
705	ENDDO
706	!$OMP END PARALLEL
707	#endif
708
709	ELSE
710
711	!
712	!-- Cache optimized code.
713	!-- The i-(x-)direction is split into a strided outer loop and an inner
714	!-- loop for better cache performance
715	!$OMP PARALLEL PRIVATE (i,iend,iouter,ir,j,k,work_ffty)
716	!$OMP DO
717	DO iouter = nxl, nxr, stridex
718
719	iend = MIN( iouter+stridex-1, nxr ) ! Upper bound for inner i loop
720
721	DO k = 1, nz
722
723	DO i = iouter, iend
724
725	ir = i-iouter+1 ! counter within a stride
726	DO j = 0, ny
727	work_ffty(j,ir) = f_in(k,j,i)
728	ENDDO
729	!
730	!-- FFT along y
731	CALL fft_y( work_ffty(:,ir), 'forward' )
732
733	ENDDO
734
735	!
736	!-- Resort
737	DO j = 0, ny
738	DO i = iouter, iend
739	work(i,k,j) = work_ffty(j,i-iouter+1)
740	ENDDO
741	ENDDO
742
743	ENDDO
744
745	ENDDO
746	!$OMP END PARALLEL
747
748	ENDIF
749	CALL cpu_log( log_point_s(7), 'fft_y', 'pause' )
750
751	!
752	!-- Transpose array
753	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
754	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
755	CALL MPI_ALLTOALL( work(nxl,1,0), sendrecvcount_xy, MPI_REAL, &
756	f_out(1,1,nys_x,1), sendrecvcount_xy, MPI_REAL, &
757	comm1dx, ierr )
758	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
759
760	END SUBROUTINE ffty_tr_yx
761
762
763	SUBROUTINE tr_xy_ffty( f_in, work, f_out )
764
765	!------------------------------------------------------------------------------!
766	! Transposition x --> y with a subsequent backward Fourier transformation for
767	! a 1d-decomposition along x
768	!------------------------------------------------------------------------------!
769
770	USE control_parameters
771	USE cpulog
772	USE indices
773	USE interfaces
774	USE pegrid
775	USE transpose_indices
776
777	IMPLICIT NONE
778
779	INTEGER :: i, iend, iouter, ir, j, k
780	INTEGER, PARAMETER :: stridex = 4
781
782	REAL, DIMENSION(0:ny,stridex) :: work_ffty
783	#if defined( __nec )
784	REAL, DIMENSION(0:ny+1,1:nz,nxl:nxr) :: work_ffty_vec
785	#endif
786	REAL, DIMENSION(nnx,1:nza,nys_x:nyn_xa,pdims(1)) :: f_in
787	REAL, DIMENSION(1:nza,0:nya,nxl:nxra) :: f_out
788	REAL, DIMENSION(nxl:nxra,1:nza,0:nya) :: work
789
790	!
791	!-- Transpose array
792	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
793	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
794	CALL MPI_ALLTOALL( f_in(1,1,nys_x,1), sendrecvcount_xy, MPI_REAL, &
795	work(nxl,1,0), sendrecvcount_xy, MPI_REAL, &
796	comm1dx, ierr )
797	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
798
799	!
800	!-- Resort the data in a way that y becomes the first index and carry out the
801	!-- backward fft along y.
802	CALL cpu_log( log_point_s(7), 'fft_y', 'continue' )
803
804	IF ( host(1:3) == 'nec' ) THEN
805	#if defined( __nec )
806	!
807	!-- Code optimized for vector processors
808	!$OMP PARALLEL PRIVATE ( i, j, k )
809	!$OMP DO
810	DO k = 1, nz
811	DO j = 0, ny
812	DO i = nxl, nxr
813	work_ffty_vec(j,k,i) = work(i,k,j)
814	ENDDO
815	ENDDO
816	ENDDO
817
818	!$OMP DO
819	DO i = nxl, nxr
820
821	CALL fft_y_m( work_ffty_vec(:,:,i), ny+1, 'backward' )
822
823	DO j = 0, ny
824	DO k = 1, nz
825	f_out(k,j,i) = work_ffty_vec(j,k,i)
826	ENDDO
827	ENDDO
828
829	ENDDO
830	!$OMP END PARALLEL
831	#endif
832
833	ELSE
834
835	!
836	!-- Cache optimized code.
837	!-- The i-(x-)direction is split into a strided outer loop and an inner
838	!-- loop for better cache performance
839	!$OMP PARALLEL PRIVATE ( i, iend, iouter, ir, j, k, work_ffty )
840	!$OMP DO
841	DO iouter = nxl, nxr, stridex
842
843	iend = MIN( iouter+stridex-1, nxr ) ! Upper bound for inner i loop
844
845	DO k = 1, nz
846	!
847	!-- Resort
848	DO j = 0, ny
849	DO i = iouter, iend
850	work_ffty(j,i-iouter+1) = work(i,k,j)
851	ENDDO
852	ENDDO
853
854	DO i = iouter, iend
855
856	!
857	!-- FFT along y
858	ir = i-iouter+1 ! counter within a stride
859	CALL fft_y( work_ffty(:,ir), 'backward' )
860
861	DO j = 0, ny
862	f_out(k,j,i) = work_ffty(j,ir)
863	ENDDO
864	ENDDO
865
866	ENDDO
867
868	ENDDO
869	!$OMP END PARALLEL
870
871	ENDIF
872
873	CALL cpu_log( log_point_s(7), 'fft_y', 'stop' )
874
875	END SUBROUTINE tr_xy_ffty
876
877
878	SUBROUTINE fftx_tri_fftx( ar )
879
880	!------------------------------------------------------------------------------!
881	! FFT along x, solution of the tridiagonal system and backward FFT for
882	! a 1d-decomposition along x
883	!
884	! WARNING: this subroutine may still not work for hybrid parallelization
885	! with OpenMP (for possible necessary changes see the original
886	! routine poisfft_hybrid, developed by Klaus Ketelsen, May 2002)
887	!------------------------------------------------------------------------------!
888
889	USE control_parameters
890	USE cpulog
891	USE grid_variables
892	USE indices
893	USE interfaces
894	USE pegrid
895	USE transpose_indices
896
897	IMPLICIT NONE
898
899	character(len=3) :: myth_char
900
901	INTEGER :: i, j, k, m, n, omp_get_thread_num, tn
902
903	REAL, DIMENSION(0:nx) :: work_fftx
904	REAL, DIMENSION(0:nx,1:nz) :: work_trix
905	REAL, DIMENSION(nnx,1:nza,nys_x:nyn_xa,pdims(1)) :: ar
906	REAL, DIMENSION(:,:,:,:), ALLOCATABLE :: tri
907
908
909	CALL cpu_log( log_point_s(33), 'fft_x + tridia', 'start' )
910
911	ALLOCATE( tri(5,0:nx,0:nz-1,0:threads_per_task-1) )
912
913	tn = 0 ! Default thread number in case of one thread
914	!$OMP PARALLEL DO PRIVATE ( i, j, k, m, n, tn, work_fftx, work_trix )
915	DO j = nys_x, nyn_x
916
917	!$ tn = omp_get_thread_num()
918
919	IF ( host(1:3) == 'nec' ) THEN
920	!
921	!-- Code optimized for vector processors
922	DO k = 1, nz
923
924	m = 0
925	DO n = 1, pdims(1)
926	DO i = 1, nnx_pe( n-1 ) ! WARN: pcoord(i) should be used!!
927	work_trix(m,k) = ar(i,k,j,n)
928	m = m + 1
929	ENDDO
930	ENDDO
931
932	ENDDO
933
934	CALL fft_x_m( work_trix, 'forward' )
935
936	ELSE
937	!
938	!-- Cache optimized code
939	DO k = 1, nz
940
941	m = 0
942	DO n = 1, pdims(1)
943	DO i = 1, nnx_pe( n-1 ) ! WARN: pcoord(i) should be used!!
944	work_fftx(m) = ar(i,k,j,n)
945	m = m + 1
946	ENDDO
947	ENDDO
948
949	CALL fft_x( work_fftx, 'forward' )
950
951	DO i = 0, nx
952	work_trix(i,k) = work_fftx(i)
953	ENDDO
954
955	ENDDO
956
957	ENDIF
958
959	!
960	!-- Solve the linear equation system
961	CALL tridia_1dd( ddx2, ddy2, nx, ny, j, work_trix, tri(:,:,:,tn) )
962
963	IF ( host(1:3) == 'nec' ) THEN
964	!
965	!-- Code optimized for vector processors
966	CALL fft_x_m( work_trix, 'backward' )
967
968	DO k = 1, nz
969
970	m = 0
971	DO n = 1, pdims(1)
972	DO i = 1, nnx_pe( n-1 ) ! WARN: pcoord(i) should be used!!
973	ar(i,k,j,n) = work_trix(m,k)
974	m = m + 1
975	ENDDO
976	ENDDO
977
978	ENDDO
979
980	ELSE
981	!
982	!-- Cache optimized code
983	DO k = 1, nz
984
985	DO i = 0, nx
986	work_fftx(i) = work_trix(i,k)
987	ENDDO
988
989	CALL fft_x( work_fftx, 'backward' )
990
991	m = 0
992	DO n = 1, pdims(1)
993	DO i = 1, nnx_pe( n-1 ) ! WARN: pcoord(i) should be used!!
994	ar(i,k,j,n) = work_fftx(m)
995	m = m + 1
996	ENDDO
997	ENDDO
998
999	ENDDO
1000
1001	ENDIF
1002
1003	ENDDO
1004
1005	DEALLOCATE( tri )
1006
1007	CALL cpu_log( log_point_s(33), 'fft_x + tridia', 'stop' )
1008
1009	END SUBROUTINE fftx_tri_fftx
1010
1011
1012	SUBROUTINE fftx_tr_xy( f_in, work, f_out )
1013
1014	!------------------------------------------------------------------------------!
1015	! Fourier-transformation along x with subsequent transposition x --> y for
1016	! a 1d-decomposition along y
1017	!
1018	! ATTENTION: The NEC-branch of this routine may significantly profit from
1019	! further optimizations. So far, performance is much worse than
1020	! for routine ffty_tr_yx (more than three times slower).
1021	!------------------------------------------------------------------------------!
1022
1023	USE control_parameters
1024	USE cpulog
1025	USE indices
1026	USE interfaces
1027	USE pegrid
1028	USE transpose_indices
1029
1030	IMPLICIT NONE
1031
1032	INTEGER :: i, j, k
1033
1034	REAL, DIMENSION(0:nx,1:nz,nys:nyn) :: work_fftx
1035	REAL, DIMENSION(1:nza,nys:nyna,0:nxa) :: f_in
1036	REAL, DIMENSION(nny,1:nza,nxl_y:nxr_ya,pdims(2)) :: f_out
1037	REAL, DIMENSION(nys:nyna,1:nza,0:nxa) :: work
1038
1039	!
1040	!-- Carry out the FFT along x, where all data are present due to the
1041	!-- 1d-decomposition along y. Resort the data in a way that y becomes
1042	!-- the first index.
1043	CALL cpu_log( log_point_s(4), 'fft_x', 'start' )
1044
1045	IF ( host(1:3) == 'nec' ) THEN
1046	!
1047	!-- Code for vector processors
1048	!$OMP PARALLEL PRIVATE ( i, j, k )
1049	!$OMP DO
1050	DO i = 0, nx
1051
1052	DO j = nys, nyn
1053	DO k = 1, nz
1054	work_fftx(i,k,j) = f_in(k,j,i)
1055	ENDDO
1056	ENDDO
1057
1058	ENDDO
1059
1060	!$OMP DO
1061	DO j = nys, nyn
1062
1063	CALL fft_x_m( work_fftx(:,:,j), 'forward' )
1064
1065	DO k = 1, nz
1066	DO i = 0, nx
1067	work(j,k,i) = work_fftx(i,k,j)
1068	ENDDO
1069	ENDDO
1070
1071	ENDDO
1072	!$OMP END PARALLEL
1073
1074	ELSE
1075
1076	!
1077	!-- Cache optimized code (there might be still a potential for better
1078	!-- optimization).
1079	!$OMP PARALLEL PRIVATE (i,j,k)
1080	!$OMP DO
1081	DO i = 0, nx
1082
1083	DO j = nys, nyn
1084	DO k = 1, nz
1085	work_fftx(i,k,j) = f_in(k,j,i)
1086	ENDDO
1087	ENDDO
1088
1089	ENDDO
1090
1091	!$OMP DO
1092	DO j = nys, nyn
1093	DO k = 1, nz
1094
1095	CALL fft_x( work_fftx(0:nx,k,j), 'forward' )
1096
1097	DO i = 0, nx
1098	work(j,k,i) = work_fftx(i,k,j)
1099	ENDDO
1100	ENDDO
1101
1102	ENDDO
1103	!$OMP END PARALLEL
1104
1105	ENDIF
1106	CALL cpu_log( log_point_s(4), 'fft_x', 'pause' )
1107
1108	!
1109	!-- Transpose array
1110	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
1111	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
1112	CALL MPI_ALLTOALL( work(nys,1,0), sendrecvcount_xy, MPI_REAL, &
1113	f_out(1,1,nxl_y,1), sendrecvcount_xy, MPI_REAL, &
1114	comm1dy, ierr )
1115	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
1116
1117	END SUBROUTINE fftx_tr_xy
1118
1119
1120	SUBROUTINE tr_yx_fftx( f_in, work, f_out )
1121
1122	!------------------------------------------------------------------------------!
1123	! Transposition y --> x with a subsequent backward Fourier transformation for
1124	! a 1d-decomposition along x
1125	!------------------------------------------------------------------------------!
1126
1127	USE control_parameters
1128	USE cpulog
1129	USE indices
1130	USE interfaces
1131	USE pegrid
1132	USE transpose_indices
1133
1134	IMPLICIT NONE
1135
1136	INTEGER :: i, j, k
1137
1138	REAL, DIMENSION(0:nx,1:nz,nys:nyn) :: work_fftx
1139	REAL, DIMENSION(nny,1:nza,nxl_y:nxr_ya,pdims(2)) :: f_in
1140	REAL, DIMENSION(1:nza,nys:nyna,0:nxa) :: f_out
1141	REAL, DIMENSION(nys:nyna,1:nza,0:nxa) :: work
1142
1143	!
1144	!-- Transpose array
1145	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
1146	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
1147	CALL MPI_ALLTOALL( f_in(1,1,nxl_y,1), sendrecvcount_xy, MPI_REAL, &
1148	work(nys,1,0), sendrecvcount_xy, MPI_REAL, &
1149	comm1dy, ierr )
1150	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
1151
1152	!
1153	!-- Carry out the FFT along x, where all data are present due to the
1154	!-- 1d-decomposition along y. Resort the data in a way that y becomes
1155	!-- the first index.
1156	CALL cpu_log( log_point_s(4), 'fft_x', 'continue' )
1157
1158	IF ( host(1:3) == 'nec' ) THEN
1159	!
1160	!-- Code optimized for vector processors
1161	!$OMP PARALLEL PRIVATE ( i, j, k )
1162	!$OMP DO
1163	DO j = nys, nyn
1164
1165	DO k = 1, nz
1166	DO i = 0, nx
1167	work_fftx(i,k,j) = work(j,k,i)
1168	ENDDO
1169	ENDDO
1170
1171	CALL fft_x_m( work_fftx(:,:,j), 'backward' )
1172
1173	ENDDO
1174
1175	!$OMP DO
1176	DO i = 0, nx
1177	DO j = nys, nyn
1178	DO k = 1, nz
1179	f_out(k,j,i) = work_fftx(i,k,j)
1180	ENDDO
1181	ENDDO
1182	ENDDO
1183	!$OMP END PARALLEL
1184
1185	ELSE
1186
1187	!
1188	!-- Cache optimized code (there might be still a potential for better
1189	!-- optimization).
1190	!$OMP PARALLEL PRIVATE (i,j,k)
1191	!$OMP DO
1192	DO j = nys, nyn
1193	DO k = 1, nz
1194
1195	DO i = 0, nx
1196	work_fftx(i,k,j) = work(j,k,i)
1197	ENDDO
1198
1199	CALL fft_x( work_fftx(0:nx,k,j), 'backward' )
1200
1201	ENDDO
1202	ENDDO
1203
1204	!$OMP DO
1205	DO i = 0, nx
1206	DO j = nys, nyn
1207	DO k = 1, nz
1208	f_out(k,j,i) = work_fftx(i,k,j)
1209	ENDDO
1210	ENDDO
1211	ENDDO
1212	!$OMP END PARALLEL
1213
1214	ENDIF
1215	CALL cpu_log( log_point_s(4), 'fft_x', 'stop' )
1216
1217	END SUBROUTINE tr_yx_fftx
1218
1219
1220	SUBROUTINE ffty_tri_ffty( ar )
1221
1222	!------------------------------------------------------------------------------!
1223	! FFT along y, solution of the tridiagonal system and backward FFT for
1224	! a 1d-decomposition along y
1225	!
1226	! WARNING: this subroutine may still not work for hybrid parallelization
1227	! with OpenMP (for possible necessary changes see the original
1228	! routine poisfft_hybrid, developed by Klaus Ketelsen, May 2002)
1229	!------------------------------------------------------------------------------!
1230
1231	USE control_parameters
1232	USE cpulog
1233	USE grid_variables
1234	USE indices
1235	USE interfaces
1236	USE pegrid
1237	USE transpose_indices
1238
1239	IMPLICIT NONE
1240
1241	INTEGER :: i, j, k, m, n, omp_get_thread_num, tn
1242
1243	REAL, DIMENSION(0:ny) :: work_ffty
1244	REAL, DIMENSION(0:ny,1:nz) :: work_triy
1245	REAL, DIMENSION(nny,1:nza,nxl_y:nxr_ya,pdims(2)) :: ar
1246	REAL, DIMENSION(:,:,:,:), ALLOCATABLE :: tri
1247
1248
1249	CALL cpu_log( log_point_s(39), 'fft_y + tridia', 'start' )
1250
1251	ALLOCATE( tri(5,0:ny,0:nz-1,0:threads_per_task-1) )
1252
1253	tn = 0 ! Default thread number in case of one thread
1254	!$OMP PARALLEL DO PRIVATE ( i, j, k, m, n, tn, work_ffty, work_triy )
1255	DO i = nxl_y, nxr_y
1256
1257	!$ tn = omp_get_thread_num()
1258
1259	IF ( host(1:3) == 'nec' ) THEN
1260	!
1261	!-- Code optimized for vector processors
1262	DO k = 1, nz
1263
1264	m = 0
1265	DO n = 1, pdims(2)
1266	DO j = 1, nny_pe( n-1 ) ! WARN: pcoord(j) should be used!!
1267	work_triy(m,k) = ar(j,k,i,n)
1268	m = m + 1
1269	ENDDO
1270	ENDDO
1271
1272	ENDDO
1273
1274	CALL fft_y_m( work_triy, ny, 'forward' )
1275
1276	ELSE
1277	!
1278	!-- Cache optimized code
1279	DO k = 1, nz
1280
1281	m = 0
1282	DO n = 1, pdims(2)
1283	DO j = 1, nny_pe( n-1 ) ! WARN: pcoord(j) should be used!!
1284	work_ffty(m) = ar(j,k,i,n)
1285	m = m + 1
1286	ENDDO
1287	ENDDO
1288
1289	CALL fft_y( work_ffty, 'forward' )
1290
1291	DO j = 0, ny
1292	work_triy(j,k) = work_ffty(j)
1293	ENDDO
1294
1295	ENDDO
1296
1297	ENDIF
1298
1299	!
1300	!-- Solve the linear equation system
1301	CALL tridia_1dd( ddy2, ddx2, ny, nx, i, work_triy, tri(:,:,:,tn) )
1302
1303	IF ( host(1:3) == 'nec' ) THEN
1304	!
1305	!-- Code optimized for vector processors
1306	CALL fft_y_m( work_triy, ny, 'backward' )
1307
1308	DO k = 1, nz
1309
1310	m = 0
1311	DO n = 1, pdims(2)
1312	DO j = 1, nny_pe( n-1 ) ! WARN: pcoord(j) should be used!!
1313	ar(j,k,i,n) = work_triy(m,k)
1314	m = m + 1
1315	ENDDO
1316	ENDDO
1317
1318	ENDDO
1319
1320	ELSE
1321	!
1322	!-- Cache optimized code
1323	DO k = 1, nz
1324
1325	DO j = 0, ny
1326	work_ffty(j) = work_triy(j,k)
1327	ENDDO
1328
1329	CALL fft_y( work_ffty, 'backward' )
1330
1331	m = 0
1332	DO n = 1, pdims(2)
1333	DO j = 1, nny_pe( n-1 ) ! WARN: pcoord(j) should be used!!
1334	ar(j,k,i,n) = work_ffty(m)
1335	m = m + 1
1336	ENDDO
1337	ENDDO
1338
1339	ENDDO
1340
1341	ENDIF
1342
1343	ENDDO
1344
1345	DEALLOCATE( tri )
1346
1347	CALL cpu_log( log_point_s(39), 'fft_y + tridia', 'stop' )
1348
1349	END SUBROUTINE ffty_tri_ffty
1350
1351
1352	SUBROUTINE tridia_1dd( ddx2, ddy2, nx, ny, j, ar, tri )
1353
1354	!------------------------------------------------------------------------------!
1355	! Solves the linear system of equations for a 1d-decomposition along x (see
1356	! tridia)
1357	!
1358	! Attention: when using the intel compiler, array tri must be passed as an
1359	! argument to the contained subroutines. Otherwise addres faults
1360	! will occur.
1361	! On NEC, tri should not be passed (except for routine substi_1dd)
1362	! because this causes very bad performance.
1363	!------------------------------------------------------------------------------!
1364
1365	USE arrays_3d
1366	USE control_parameters
1367
1368	USE pegrid
1369
1370	IMPLICIT NONE
1371
1372	INTEGER :: i, j, k, nnyh, nx, ny, omp_get_thread_num, tn
1373
1374	REAL :: ddx2, ddy2
1375
1376	REAL, DIMENSION(0:nx,1:nz) :: ar
1377	REAL, DIMENSION(0:nx,0:nz-1) :: ar1
1378	REAL, DIMENSION(5,0:nx,0:nz-1) :: tri
1379
1380
1381	nnyh = ( ny + 1 ) / 2
1382
1383	!
1384	!-- Define constant elements of the tridiagonal matrix.
1385	!-- The compiler on SX6 does loop exchange. If 0:nx is a high power of 2,
1386	!-- the exchanged loops create bank conflicts. The following directive
1387	!-- prohibits loop exchange and the loops perform much better.
1388	! tn = omp_get_thread_num()
1389	! WRITE( 120+tn, * ) '+++ id=',myid,' nx=',nx,' thread=', omp_get_thread_num()
1390	! CALL local_flush( 120+tn )
1391	!CDIR NOLOOPCHG
1392	DO k = 0, nz-1
1393	DO i = 0,nx
1394	tri(2,i,k) = ddzu_pres(k+1) * ddzw(k+1)
1395	tri(3,i,k) = ddzu_pres(k+2) * ddzw(k+1)
1396	ENDDO
1397	ENDDO
1398	! WRITE( 120+tn, * ) '+++ id=',myid,' end of first tridia loop thread=', omp_get_thread_num()
1399	! CALL local_flush( 120+tn )
1400
1401	IF ( j <= nnyh ) THEN
1402	#if defined( __lc )
1403	CALL maketri_1dd( j, tri )
1404	#else
1405	CALL maketri_1dd( j )
1406	#endif
1407	ELSE
1408	#if defined( __lc )
1409	CALL maketri_1dd( ny+1-j, tri )
1410	#else
1411	CALL maketri_1dd( ny+1-j )
1412	#endif
1413	ENDIF
1414	#if defined( __lc )
1415	CALL split_1dd( tri )
1416	#else
1417	CALL split_1dd
1418	#endif
1419	CALL substi_1dd( ar, tri )
1420
1421	CONTAINS
1422
1423	#if defined( __lc )
1424	SUBROUTINE maketri_1dd( j, tri )
1425	#else
1426	SUBROUTINE maketri_1dd( j )
1427	#endif
1428
1429	!------------------------------------------------------------------------------!
1430	! computes the i- and j-dependent component of the matrix
1431	!------------------------------------------------------------------------------!
1432
1433	USE constants
1434
1435	IMPLICIT NONE
1436
1437	INTEGER :: i, j, k, nnxh
1438	REAL :: a, c
1439
1440	REAL, DIMENSION(0:nx) :: l
1441
1442	#if defined( __lc )
1443	REAL, DIMENSION(5,0:nx,0:nz-1) :: tri
1444	#endif
1445
1446
1447	nnxh = ( nx + 1 ) / 2
1448	!
1449	!-- Provide the tridiagonal matrix for solution of the Poisson equation in
1450	!-- Fourier space. The coefficients are computed following the method of
1451	!-- Schmidt et al. (DFVLR-Mitteilung 84-15), which departs from Stephan
1452	!-- Siano's original version by discretizing the Poisson equation,
1453	!-- before it is Fourier-transformed
1454	DO i = 0, nx
1455	IF ( i >= 0 .AND. i <= nnxh ) THEN
1456	l(i) = 2.0 * ( 1.0 - COS( ( 2.0 * pi * i ) / &
1457	FLOAT( nx+1 ) ) ) * ddx2 + &
1458	2.0 * ( 1.0 - COS( ( 2.0 * pi * j ) / &
1459	FLOAT( ny+1 ) ) ) * ddy2
1460	ELSE
1461	l(i) = 2.0 * ( 1.0 - COS( ( 2.0 * pi * ( nx+1-i ) ) / &
1462	FLOAT( nx+1 ) ) ) * ddx2 + &
1463	2.0 * ( 1.0 - COS( ( 2.0 * pi * j ) / &
1464	FLOAT( ny+1 ) ) ) * ddy2
1465	ENDIF
1466	ENDDO
1467
1468	DO k = 0, nz-1
1469	DO i = 0, nx
1470	a = -1.0 * ddzu_pres(k+2) * ddzw(k+1)
1471	c = -1.0 * ddzu_pres(k+1) * ddzw(k+1)
1472	tri(1,i,k) = a + c - l(i)
1473	ENDDO
1474	ENDDO
1475	IF ( ibc_p_b == 1 .OR. ibc_p_b == 2 ) THEN
1476	DO i = 0, nx
1477	tri(1,i,0) = tri(1,i,0) + tri(2,i,0)
1478	ENDDO
1479	ENDIF
1480	IF ( ibc_p_t == 1 ) THEN
1481	DO i = 0, nx
1482	tri(1,i,nz-1) = tri(1,i,nz-1) + tri(3,i,nz-1)
1483	ENDDO
1484	ENDIF
1485
1486	END SUBROUTINE maketri_1dd
1487
1488
1489	#if defined( __lc )
1490	SUBROUTINE split_1dd( tri )
1491	#else
1492	SUBROUTINE split_1dd
1493	#endif
1494
1495	!------------------------------------------------------------------------------!
1496	! Splitting of the tridiagonal matrix (Thomas algorithm)
1497	!------------------------------------------------------------------------------!
1498
1499	IMPLICIT NONE
1500
1501	INTEGER :: i, k
1502
1503	#if defined( __lc )
1504	REAL, DIMENSION(5,0:nx,0:nz-1) :: tri
1505	#endif
1506
1507
1508	!
1509	!-- Splitting
1510	DO i = 0, nx
1511	tri(4,i,0) = tri(1,i,0)
1512	ENDDO
1513	DO k = 1, nz-1
1514	DO i = 0, nx
1515	tri(5,i,k) = tri(2,i,k) / tri(4,i,k-1)
1516	tri(4,i,k) = tri(1,i,k) - tri(3,i,k-1) * tri(5,i,k)
1517	ENDDO
1518	ENDDO
1519
1520	END SUBROUTINE split_1dd
1521
1522
1523	SUBROUTINE substi_1dd( ar, tri )
1524
1525	!------------------------------------------------------------------------------!
1526	! Substitution (Forward and Backward) (Thomas algorithm)
1527	!------------------------------------------------------------------------------!
1528
1529	IMPLICIT NONE
1530
1531	INTEGER :: i, k
1532
1533	REAL, DIMENSION(0:nx,nz) :: ar
1534	REAL, DIMENSION(0:nx,0:nz-1) :: ar1
1535	REAL, DIMENSION(5,0:nx,0:nz-1) :: tri
1536
1537	!
1538	!-- Forward substitution
1539	DO i = 0, nx
1540	ar1(i,0) = ar(i,1)
1541	ENDDO
1542	DO k = 1, nz-1
1543	DO i = 0, nx
1544	ar1(i,k) = ar(i,k+1) - tri(5,i,k) * ar1(i,k-1)
1545	ENDDO
1546	ENDDO
1547
1548	!
1549	!-- Backward substitution
1550	DO i = 0, nx
1551	ar(i,nz) = ar1(i,nz-1) / ( tri(4,i,nz-1) + 1.0E-20 )
1552	ENDDO
1553	DO k = nz-2, 0, -1
1554	DO i = 0, nx
1555	ar(i,k+1) = ( ar1(i,k) - tri(3,i,k) * ar(i,k+2) ) &
1556	/ tri(4,i,k)
1557	ENDDO
1558	ENDDO
1559
1560	!
1561	!-- Indices i=0, j=0 correspond to horizontally averaged pressure.
1562	!-- The respective values of ar should be zero at all k-levels if
1563	!-- acceleration of horizontally averaged vertical velocity is zero.
1564	IF ( ibc_p_b == 1 .AND. ibc_p_t == 1 ) THEN
1565	IF ( j == 0 ) THEN
1566	DO k = 1, nz
1567	ar(0,k) = 0.0
1568	ENDDO
1569	ENDIF
1570	ENDIF
1571
1572	END SUBROUTINE substi_1dd
1573
1574	END SUBROUTINE tridia_1dd
1575
1576	#endif
1577
1578	END MODULE poisfft_mod

Note: See TracBrowser for help on using the repository browser.

Download in other formats:

| Impressum | ©Leibniz Universität Hannover |