Home

Context Navigation

source: palm/trunk/SOURCE/poisfft.f90 @ 798

Last change on this file since 798 was 764, checked in by suehring, 13 years ago
last commit documented
Property svn:keywords set to `Id`
File size: 46.0 KB

Line
1	MODULE poisfft_mod
2
3	!------------------------------------------------------------------------------!
4	! Current revisions:
5	! -----------------
6	!
7	!
8	! Former revisions:
9	! -----------------
10	! $Id: poisfft.f90 764 2011-10-06 09:34:55Z maronga $
11	!
12	! 763 2011-10-06 09:32:09Z suehring
13	! Comment added concerning the last change.
14	!
15	! 761 2011-10-05 17:58:52Z suehring
16	! Bugfix: Avoid divisions by zero in case of using a 'neumann' bc for the
17	! pressure at the top of the model domain.
18	!
19	! 696 2011-03-18 07:03:49Z raasch
20	! work_fftx removed from PRIVATE clauses in fftx_tr_xy and tr_yx_fftx
21	!
22	! 683 2011-02-09 14:25:15Z raasch
23	! openMP parallelization for 2d-domain-decomposition
24	!
25	! 667 2010-12-23 12:06:00Z suehring/gryschka
26	! ddzu replaced by ddzu_pres due to changes in zu(0)
27	!
28	! 622 2010-12-10 08:08:13Z raasch
29	! optional barriers included in order to speed up collective operations
30	!
31	! 377 2009-09-04 11:09:00Z raasch
32	! __lcmuk changed to __lc to avoid problems with Intel compiler on sgi-ice
33	!
34	! 164 2008-05-15 08:46:15Z raasch
35	! Arguments removed from transpose routines
36	!
37	! 128 2007-10-26 13:11:14Z raasch
38	! Bugfix: wavenumber calculation for even nx in routines maketri
39	!
40	! 85 2007-05-11 09:35:14Z raasch
41	! Bugfix: work_fft*_vec removed from some PRIVATE-declarations
42	!
43	! 76 2007-03-29 00:58:32Z raasch
44	! Tridiagonal coefficients adjusted for Neumann boundary conditions both at
45	! the bottom and the top.
46	!
47	! RCS Log replace by Id keyword, revision history cleaned up
48	!
49	! Revision 1.24 2006/08/04 15:00:24 raasch
50	! Default setting of the thread number tn in case of not using OpenMP
51	!
52	! Revision 1.23 2006/02/23 12:48:38 raasch
53	! Additional compiler directive in routine tridia_1dd for preventing loop
54	! exchange on NEC-SX6
55	!
56	! Revision 1.20 2004/04/30 12:38:09 raasch
57	! Parts of former poisfft_hybrid moved to this subroutine,
58	! former subroutine changed to a module, renaming of FFT-subroutines and
59	! -module, FFTs completely substituted by calls of fft_x and fft_y,
60	! NAG fft used in the non-parallel case completely removed, l in maketri
61	! is now a 1d-array, variables passed by modules instead of using parameter
62	! lists, enlarged transposition arrays introduced
63	!
64	! Revision 1.1 1997/07/24 11:24:14 raasch
65	! Initial revision
66	!
67	!
68	! Description:
69	! ------------
70	! See below.
71	!------------------------------------------------------------------------------!
72
73	!--------------------------------------------------------------------------!
74	! poisfft !
75	! !
76	! Original version: Stephan Siano (pois3d) !
77	! !
78	! Institute of Meteorology and Climatology, University of Hannover !
79	! Germany !
80	! !
81	! Version as of July 23,1996 !
82	! !
83	! !
84	! Version for parallel computers: Siegfried Raasch !
85	! !
86	! Version as of July 03,1997 !
87	! !
88	! Solves the Poisson equation with a 2D spectral method !
89	! d^2 p / dx^2 + d^2 p / dy^2 + d^2 p / dz^2 = s !
90	! !
91	! Input: !
92	! real ar contains in the (nnx,nny,nnz) elements, !
93	! starting from the element (1,nys,nxl), the !
94	! values for s !
95	! real work Temporary array !
96	! !
97	! Output: !
98	! real ar contains the solution for p !
99	!--------------------------------------------------------------------------!
100
101	USE fft_xy
102	USE indices
103	USE transpose_indices
104
105	IMPLICIT NONE
106
107	PRIVATE
108	PUBLIC poisfft, poisfft_init
109
110	INTERFACE poisfft
111	MODULE PROCEDURE poisfft
112	END INTERFACE poisfft
113
114	INTERFACE poisfft_init
115	MODULE PROCEDURE poisfft_init
116	END INTERFACE poisfft_init
117
118	CONTAINS
119
120	SUBROUTINE poisfft_init
121
122	CALL fft_init
123
124	END SUBROUTINE poisfft_init
125
126
127	SUBROUTINE poisfft( ar, work )
128
129	USE cpulog
130	USE interfaces
131	USE pegrid
132
133	IMPLICIT NONE
134
135	REAL, DIMENSION(1:nza,nys:nyna,nxl:nxra) :: ar, work
136
137
138	CALL cpu_log( log_point_s(3), 'poisfft', 'start' )
139
140	!
141	!-- Two-dimensional Fourier Transformation in x- and y-direction.
142	#if defined( __parallel )
143	IF ( pdims(2) == 1 ) THEN
144
145	!
146	!-- 1d-domain-decomposition along x:
147	!-- FFT along y and transposition y --> x
148	CALL ffty_tr_yx( ar, work, ar )
149
150	!
151	!-- FFT along x, solving the tridiagonal system and backward FFT
152	CALL fftx_tri_fftx( ar )
153
154	!
155	!-- Transposition x --> y and backward FFT along y
156	CALL tr_xy_ffty( ar, work, ar )
157
158	ELSEIF ( pdims(1) == 1 ) THEN
159
160	!
161	!-- 1d-domain-decomposition along y:
162	!-- FFT along x and transposition x --> y
163	CALL fftx_tr_xy( ar, work, ar )
164
165	!
166	!-- FFT along y, solving the tridiagonal system and backward FFT
167	CALL ffty_tri_ffty( ar )
168
169	!
170	!-- Transposition y --> x and backward FFT along x
171	CALL tr_yx_fftx( ar, work, ar )
172
173	ELSE
174
175	!
176	!-- 2d-domain-decomposition
177	!-- Transposition z --> x
178	CALL cpu_log( log_point_s(5), 'transpo forward', 'start' )
179	CALL transpose_zx( ar, work, ar )
180	CALL cpu_log( log_point_s(5), 'transpo forward', 'pause' )
181
182	CALL cpu_log( log_point_s(4), 'fft_x', 'start' )
183	CALL fftxp( ar, 'forward' )
184	CALL cpu_log( log_point_s(4), 'fft_x', 'pause' )
185
186	!
187	!-- Transposition x --> y
188	CALL cpu_log( log_point_s(5), 'transpo forward', 'continue' )
189	CALL transpose_xy( ar, work, ar )
190	CALL cpu_log( log_point_s(5), 'transpo forward', 'pause' )
191
192	CALL cpu_log( log_point_s(7), 'fft_y', 'start' )
193	CALL fftyp( ar, 'forward' )
194	CALL cpu_log( log_point_s(7), 'fft_y', 'pause' )
195
196	!
197	!-- Transposition y --> z
198	CALL cpu_log( log_point_s(5), 'transpo forward', 'continue' )
199	CALL transpose_yz( ar, work, ar )
200	CALL cpu_log( log_point_s(5), 'transpo forward', 'stop' )
201
202	!
203	!-- Solve the Poisson equation in z-direction in cartesian space.
204	CALL cpu_log( log_point_s(6), 'tridia', 'start' )
205	CALL tridia( ar )
206	CALL cpu_log( log_point_s(6), 'tridia', 'stop' )
207
208	!
209	!-- Inverse Fourier Transformation
210	!-- Transposition z --> y
211	CALL cpu_log( log_point_s(8), 'transpo invers', 'start' )
212	CALL transpose_zy( ar, work, ar )
213	CALL cpu_log( log_point_s(8), 'transpo invers', 'pause' )
214
215	CALL cpu_log( log_point_s(7), 'fft_y', 'continue' )
216	CALL fftyp( ar, 'backward' )
217	CALL cpu_log( log_point_s(7), 'fft_y', 'stop' )
218
219	!
220	!-- Transposition y --> x
221	CALL cpu_log( log_point_s(8), 'transpo invers', 'continue' )
222	CALL transpose_yx( ar, work, ar )
223	CALL cpu_log( log_point_s(8), 'transpo invers', 'pause' )
224
225	CALL cpu_log( log_point_s(4), 'fft_x', 'continue' )
226	CALL fftxp( ar, 'backward' )
227	CALL cpu_log( log_point_s(4), 'fft_x', 'stop' )
228
229	!
230	!-- Transposition x --> z
231	CALL cpu_log( log_point_s(8), 'transpo invers', 'continue' )
232	CALL transpose_xz( ar, work, ar )
233	CALL cpu_log( log_point_s(8), 'transpo invers', 'stop' )
234
235	ENDIF
236
237	#else
238
239	!
240	!-- Two-dimensional Fourier Transformation along x- and y-direction.
241	CALL cpu_log( log_point_s(4), 'fft_x', 'start' )
242	CALL fftx( ar, 'forward' )
243	CALL cpu_log( log_point_s(4), 'fft_x', 'pause' )
244	CALL cpu_log( log_point_s(7), 'fft_y', 'start' )
245	CALL ffty( ar, 'forward' )
246	CALL cpu_log( log_point_s(7), 'fft_y', 'pause' )
247
248	!
249	!-- Solve the Poisson equation in z-direction in cartesian space.
250	CALL cpu_log( log_point_s(6), 'tridia', 'start' )
251	CALL tridia( ar )
252	CALL cpu_log( log_point_s(6), 'tridia', 'stop' )
253
254	!
255	!-- Inverse Fourier Transformation.
256	CALL cpu_log( log_point_s(7), 'fft_y', 'continue' )
257	CALL ffty( ar, 'backward' )
258	CALL cpu_log( log_point_s(7), 'fft_y', 'stop' )
259	CALL cpu_log( log_point_s(4), 'fft_x', 'continue' )
260	CALL fftx( ar, 'backward' )
261	CALL cpu_log( log_point_s(4), 'fft_x', 'stop' )
262
263	#endif
264
265	CALL cpu_log( log_point_s(3), 'poisfft', 'stop' )
266
267	END SUBROUTINE poisfft
268
269
270
271	SUBROUTINE tridia( ar )
272
273	!------------------------------------------------------------------------------!
274	! solves the linear system of equations:
275	!
276	! -(4 pi^2(i^2/(dx^2nnx^2)+j^2/(dy^2nny^2))+
277	! 1/(dzu(k)dzw(k))+1/(dzu(k-1)dzw(k)))*p(i,j,k)+
278	! 1/(dzu(k)dzw(k))p(i,j,k+1)+1/(dzu(k-1)dzw(k))p(i,j,k-1)=d(i,j,k)
279	!
280	! by using the Thomas algorithm
281	!------------------------------------------------------------------------------!
282
283	USE arrays_3d
284
285	IMPLICIT NONE
286
287	INTEGER :: i, j, k, nnyh
288
289	REAL, DIMENSION(nxl_z:nxr_z,0:nz-1) :: ar1
290	REAL, DIMENSION(5,nxl_z:nxr_z,0:nz-1) :: tri
291
292	#if defined( __parallel )
293	REAL :: ar(nxl_z:nxr_za,nys_z:nyn_za,1:nza)
294	#else
295	REAL :: ar(1:nz,nys_z:nyn_z,nxl_z:nxr_z)
296	#endif
297
298
299	nnyh = (ny+1) / 2
300
301	!
302	!-- Define constant elements of the tridiagonal matrix.
303	!$OMP PARALLEL PRIVATE ( k, i )
304	!$OMP DO
305	DO k = 0, nz-1
306	DO i = nxl_z, nxr_z
307	tri(2,i,k) = ddzu_pres(k+1) * ddzw(k+1)
308	tri(3,i,k) = ddzu_pres(k+2) * ddzw(k+1)
309	ENDDO
310	ENDDO
311	!$OMP END PARALLEL
312
313	#if defined( __parallel )
314	!
315	!-- Repeat for all y-levels.
316	!$OMP PARALLEL FIRSTPRIVATE( tri ) PRIVATE ( ar1, j )
317	!$OMP DO
318	DO j = nys_z, nyn_z
319	IF ( j <= nnyh ) THEN
320	CALL maketri( tri, j )
321	ELSE
322	CALL maketri( tri, ny+1-j )
323	ENDIF
324	CALL split( tri )
325	CALL substi( ar, ar1, tri, j )
326	ENDDO
327	!$OMP END PARALLEL
328	#else
329	!
330	!-- First y-level.
331	CALL maketri( tri, nys_z )
332	CALL split( tri )
333	CALL substi( ar, ar1, tri, 0 )
334
335	!
336	!-- Further y-levels.
337	DO j = 1, nnyh - 1
338	CALL maketri( tri, j )
339	CALL split( tri )
340	CALL substi( ar, ar1, tri, j )
341	CALL substi( ar, ar1, tri, ny+1-j )
342	ENDDO
343	CALL maketri( tri, nnyh )
344	CALL split( tri )
345	CALL substi( ar, ar1, tri, nnyh+nys )
346	#endif
347
348	CONTAINS
349
350	SUBROUTINE maketri( tri, j )
351
352	!------------------------------------------------------------------------------!
353	! Computes the i- and j-dependent component of the matrix
354	!------------------------------------------------------------------------------!
355
356	USE arrays_3d
357	USE constants
358	USE control_parameters
359	USE grid_variables
360
361	IMPLICIT NONE
362
363	INTEGER :: i, j, k, nnxh
364	REAL :: a, c
365	REAL :: ll(nxl_z:nxr_z)
366	REAL :: tri(5,nxl_z:nxr_z,0:nz-1)
367
368
369	nnxh = ( nx + 1 ) / 2
370
371	!
372	!-- Provide the tridiagonal matrix for solution of the Poisson equation in
373	!-- Fourier space. The coefficients are computed following the method of
374	!-- Schmidt et al. (DFVLR-Mitteilung 84-15), which departs from Stephan
375	!-- Siano's original version by discretizing the Poisson equation,
376	!-- before it is Fourier-transformed
377	#if defined( __parallel )
378	DO i = nxl_z, nxr_z
379	IF ( i >= 0 .AND. i <= nnxh ) THEN
380	ll(i) = 2.0 * ( 1.0 - COS( ( 2.0 * pi * i ) / &
381	FLOAT( nx+1 ) ) ) / ( dx * dx ) + &
382	2.0 * ( 1.0 - COS( ( 2.0 * pi * j ) / &
383	FLOAT( ny+1 ) ) ) / ( dy * dy )
384	ELSE
385	ll(i) = 2.0 * ( 1.0 - COS( ( 2.0 * pi * ( nx+1-i ) ) / &
386	FLOAT( nx+1 ) ) ) / ( dx * dx ) + &
387	2.0 * ( 1.0 - COS( ( 2.0 * pi * j ) / &
388	FLOAT( ny+1 ) ) ) / ( dy * dy )
389	ENDIF
390	DO k = 0,nz-1
391	a = -1.0 * ddzu_pres(k+2) * ddzw(k+1)
392	c = -1.0 * ddzu_pres(k+1) * ddzw(k+1)
393	tri(1,i,k) = a + c - ll(i)
394	ENDDO
395	ENDDO
396	#else
397	DO i = 0, nnxh
398	ll(i) = 2.0 * ( 1.0 - COS( ( 2.0 * pi * i ) / FLOAT( nx+1 ) ) ) / &
399	( dx * dx ) + &
400	2.0 * ( 1.0 - COS( ( 2.0 * pi * j ) / FLOAT( ny+1 ) ) ) / &
401	( dy * dy )
402	DO k = 0, nz-1
403	a = -1.0 * ddzu_pres(k+2) * ddzw(k+1)
404	c = -1.0 * ddzu_pres(k+1) * ddzw(k+1)
405	tri(1,i,k) = a + c - ll(i)
406	IF ( i >= 1 .and. i < nnxh ) THEN
407	tri(1,nx+1-i,k) = tri(1,i,k)
408	ENDIF
409	ENDDO
410	ENDDO
411	#endif
412	IF ( ibc_p_b == 1 .OR. ibc_p_b == 2 ) THEN
413	DO i = nxl_z, nxr_z
414	tri(1,i,0) = tri(1,i,0) + tri(2,i,0)
415	ENDDO
416	ENDIF
417	IF ( ibc_p_t == 1 ) THEN
418	DO i = nxl_z, nxr_z
419	tri(1,i,nz-1) = tri(1,i,nz-1) + tri(3,i,nz-1)
420	ENDDO
421	ENDIF
422
423	END SUBROUTINE maketri
424
425
426	SUBROUTINE substi( ar, ar1, tri, j )
427
428	!------------------------------------------------------------------------------!
429	! Substitution (Forward and Backward) (Thomas algorithm)
430	!------------------------------------------------------------------------------!
431
432	USE control_parameters
433
434	IMPLICIT NONE
435
436	INTEGER :: i, j, k
437	REAL :: ar1(nxl_z:nxr_z,0:nz-1)
438	REAL :: tri(5,nxl_z:nxr_z,0:nz-1)
439	#if defined( __parallel )
440	REAL :: ar(nxl_z:nxr_za,nys_z:nyn_za,1:nza)
441	#else
442	REAL :: ar(1:nz,nys_z:nyn_z,nxl_z:nxr_z)
443	#endif
444
445	!
446	!-- Forward substitution.
447	DO i = nxl_z, nxr_z
448	#if defined( __parallel )
449	ar1(i,0) = ar(i,j,1)
450	#else
451	ar1(i,0) = ar(1,j,i)
452	#endif
453	ENDDO
454	DO k = 1, nz - 1
455	DO i = nxl_z, nxr_z
456	#if defined( __parallel )
457	ar1(i,k) = ar(i,j,k+1) - tri(5,i,k) * ar1(i,k-1)
458	#else
459	ar1(i,k) = ar(k+1,j,i) - tri(5,i,k) * ar1(i,k-1)
460	#endif
461	ENDDO
462	ENDDO
463
464	!
465	!-- Backward substitution.
466	DO i = nxl_z, nxr_z
467	#if defined( __parallel )
468	ar(i,j,nz) = ar1(i,nz-1) / tri(4,i,nz-1)
469	#else
470	ar(nz,j,i) = ar1(i,nz-1) / tri(4,i,nz-1)
471	#endif
472	ENDDO
473	DO k = nz-2, 0, -1
474	DO i = nxl_z, nxr_z
475	#if defined( __parallel )
476	ar(i,j,k+1) = ( ar1(i,k) - tri(3,i,k) * ar(i,j,k+2) ) &
477	/ tri(4,i,k)
478	#else
479	ar(k+1,j,i) = ( ar1(i,k) - tri(3,i,k) * ar(k+2,j,i) ) &
480	/ tri(4,i,k)
481	#endif
482	ENDDO
483	ENDDO
484
485	!
486	!-- Indices i=0, j=0 correspond to horizontally averaged pressure.
487	!-- The respective values of ar should be zero at all k-levels if
488	!-- acceleration of horizontally averaged vertical velocity is zero.
489	IF ( ibc_p_b == 1 .AND. ibc_p_t == 1 ) THEN
490	IF ( j == 0 .AND. nxl_z == 0 ) THEN
491	#if defined( __parallel )
492	DO k = 1, nz
493	ar(nxl_z,j,k) = 0.0
494	ENDDO
495	#else
496	DO k = 1, nz
497	ar(k,j,nxl_z) = 0.0
498	ENDDO
499	#endif
500	ENDIF
501	ENDIF
502
503	END SUBROUTINE substi
504
505
506	SUBROUTINE split( tri )
507
508	!------------------------------------------------------------------------------!
509	! Splitting of the tridiagonal matrix (Thomas algorithm)
510	!------------------------------------------------------------------------------!
511
512	IMPLICIT NONE
513
514	INTEGER :: i, k
515	REAL :: tri(5,nxl_z:nxr_z,0:nz-1)
516
517	!
518	!-- Splitting.
519	DO i = nxl_z, nxr_z
520	tri(4,i,0) = tri(1,i,0)
521	ENDDO
522	DO k = 1, nz-1
523	DO i = nxl_z, nxr_z
524	tri(5,i,k) = tri(2,i,k) / tri(4,i,k-1)
525	tri(4,i,k) = tri(1,i,k) - tri(3,i,k-1) * tri(5,i,k)
526	ENDDO
527	ENDDO
528
529	END SUBROUTINE split
530
531	END SUBROUTINE tridia
532
533
534	#if defined( __parallel )
535	SUBROUTINE fftxp( ar, direction )
536
537	!------------------------------------------------------------------------------!
538	! Fourier-transformation along x-direction Parallelized version
539	!------------------------------------------------------------------------------!
540
541	IMPLICIT NONE
542
543	CHARACTER (LEN=*) :: direction
544	INTEGER :: j, k
545	REAL :: ar(0:nxa,nys_x:nyn_xa,nzb_x:nzt_xa)
546
547	!
548	!-- Performing the fft with one of the methods implemented
549	!$OMP PARALLEL PRIVATE ( j, k )
550	!$OMP DO
551	DO k = nzb_x, nzt_x
552	DO j = nys_x, nyn_x
553	CALL fft_x( ar(0:nx,j,k), direction )
554	ENDDO
555	ENDDO
556	!$OMP END PARALLEL
557
558	END SUBROUTINE fftxp
559
560	#else
561	SUBROUTINE fftx( ar, direction )
562
563	!------------------------------------------------------------------------------!
564	! Fourier-transformation along x-direction Non parallel version
565	!------------------------------------------------------------------------------!
566
567	IMPLICIT NONE
568
569	CHARACTER (LEN=*) :: direction
570	INTEGER :: i, j, k
571	REAL :: ar(1:nz,0:ny,0:nx)
572
573	!
574	!-- Performing the fft with one of the methods implemented
575	!$OMP PARALLEL PRIVATE ( j, k )
576	!$OMP DO
577	DO k = 1, nz
578	DO j = 0, ny
579	CALL fft_x( ar(k,j,0:nx), direction )
580	ENDDO
581	ENDDO
582	!$OMP END PARALLEL
583
584	END SUBROUTINE fftx
585	#endif
586
587
588	#if defined( __parallel )
589	SUBROUTINE fftyp( ar, direction )
590
591	!------------------------------------------------------------------------------!
592	! Fourier-transformation along y-direction Parallelized version
593	!------------------------------------------------------------------------------!
594
595	IMPLICIT NONE
596
597	CHARACTER (LEN=*) :: direction
598	INTEGER :: i, k
599	REAL :: ar(0:nya,nxl_y:nxr_ya,nzb_y:nzt_ya)
600
601	!
602	!-- Performing the fft with one of the methods implemented
603	!$OMP PARALLEL PRIVATE ( i, k )
604	!$OMP DO
605	DO k = nzb_y, nzt_y
606	DO i = nxl_y, nxr_y
607	CALL fft_y( ar(0:ny,i,k), direction )
608	ENDDO
609	ENDDO
610	!$OMP END PARALLEL
611
612	END SUBROUTINE fftyp
613
614	#else
615	SUBROUTINE ffty( ar, direction )
616
617	!------------------------------------------------------------------------------!
618	! Fourier-transformation along y-direction Non parallel version
619	!------------------------------------------------------------------------------!
620
621	IMPLICIT NONE
622
623	CHARACTER (LEN=*) :: direction
624	INTEGER :: i, k
625	REAL :: ar(1:nz,0:ny,0:nx)
626
627	!
628	!-- Performing the fft with one of the methods implemented
629	!$OMP PARALLEL PRIVATE ( i, k )
630	!$OMP DO
631	DO k = 1, nz
632	DO i = 0, nx
633	CALL fft_y( ar(k,0:ny,i), direction )
634	ENDDO
635	ENDDO
636	!$OMP END PARALLEL
637
638	END SUBROUTINE ffty
639	#endif
640
641	#if defined( __parallel )
642	SUBROUTINE ffty_tr_yx( f_in, work, f_out )
643
644	!------------------------------------------------------------------------------!
645	! Fourier-transformation along y with subsequent transposition y --> x for
646	! a 1d-decomposition along x
647	!
648	! ATTENTION: The performance of this routine is much faster on the NEC-SX6,
649	! if the first index of work_ffty_vec is odd. Otherwise
650	! memory bank conflicts may occur (especially if the index is a
651	! multiple of 128). That's why work_ffty_vec is dimensioned as
652	! 0:ny+1.
653	! Of course, this will not work if users are using an odd number
654	! of gridpoints along y.
655	!------------------------------------------------------------------------------!
656
657	USE control_parameters
658	USE cpulog
659	USE indices
660	USE interfaces
661	USE pegrid
662	USE transpose_indices
663
664	IMPLICIT NONE
665
666	INTEGER :: i, iend, iouter, ir, j, k
667	INTEGER, PARAMETER :: stridex = 4
668
669	REAL, DIMENSION(0:ny,stridex) :: work_ffty
670	#if defined( __nec )
671	REAL, DIMENSION(0:ny+1,1:nz,nxl:nxr) :: work_ffty_vec
672	#endif
673	REAL, DIMENSION(1:nza,0:nya,nxl:nxra) :: f_in
674	REAL, DIMENSION(nnx,1:nza,nys_x:nyn_xa,pdims(1)) :: f_out
675	REAL, DIMENSION(nxl:nxra,1:nza,0:nya) :: work
676
677	!
678	!-- Carry out the FFT along y, where all data are present due to the
679	!-- 1d-decomposition along x. Resort the data in a way that x becomes
680	!-- the first index.
681	CALL cpu_log( log_point_s(7), 'fft_y', 'start' )
682
683	IF ( host(1:3) == 'nec' ) THEN
684	#if defined( __nec )
685	!
686	!-- Code optimized for vector processors
687	!$OMP PARALLEL PRIVATE ( i, j, k )
688	!$OMP DO
689	DO i = nxl, nxr
690
691	DO j = 0, ny
692	DO k = 1, nz
693	work_ffty_vec(j,k,i) = f_in(k,j,i)
694	ENDDO
695	ENDDO
696
697	CALL fft_y_m( work_ffty_vec(:,:,i), ny+1, 'forward' )
698
699	ENDDO
700
701	!$OMP DO
702	DO k = 1, nz
703	DO j = 0, ny
704	DO i = nxl, nxr
705	work(i,k,j) = work_ffty_vec(j,k,i)
706	ENDDO
707	ENDDO
708	ENDDO
709	!$OMP END PARALLEL
710	#endif
711
712	ELSE
713
714	!
715	!-- Cache optimized code.
716	!-- The i-(x-)direction is split into a strided outer loop and an inner
717	!-- loop for better cache performance
718	!$OMP PARALLEL PRIVATE (i,iend,iouter,ir,j,k,work_ffty)
719	!$OMP DO
720	DO iouter = nxl, nxr, stridex
721
722	iend = MIN( iouter+stridex-1, nxr ) ! Upper bound for inner i loop
723
724	DO k = 1, nz
725
726	DO i = iouter, iend
727
728	ir = i-iouter+1 ! counter within a stride
729	DO j = 0, ny
730	work_ffty(j,ir) = f_in(k,j,i)
731	ENDDO
732	!
733	!-- FFT along y
734	CALL fft_y( work_ffty(:,ir), 'forward' )
735
736	ENDDO
737
738	!
739	!-- Resort
740	DO j = 0, ny
741	DO i = iouter, iend
742	work(i,k,j) = work_ffty(j,i-iouter+1)
743	ENDDO
744	ENDDO
745
746	ENDDO
747
748	ENDDO
749	!$OMP END PARALLEL
750
751	ENDIF
752	CALL cpu_log( log_point_s(7), 'fft_y', 'pause' )
753
754	!
755	!-- Transpose array
756	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
757	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
758	CALL MPI_ALLTOALL( work(nxl,1,0), sendrecvcount_xy, MPI_REAL, &
759	f_out(1,1,nys_x,1), sendrecvcount_xy, MPI_REAL, &
760	comm1dx, ierr )
761	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
762
763	END SUBROUTINE ffty_tr_yx
764
765
766	SUBROUTINE tr_xy_ffty( f_in, work, f_out )
767
768	!------------------------------------------------------------------------------!
769	! Transposition x --> y with a subsequent backward Fourier transformation for
770	! a 1d-decomposition along x
771	!------------------------------------------------------------------------------!
772
773	USE control_parameters
774	USE cpulog
775	USE indices
776	USE interfaces
777	USE pegrid
778	USE transpose_indices
779
780	IMPLICIT NONE
781
782	INTEGER :: i, iend, iouter, ir, j, k
783	INTEGER, PARAMETER :: stridex = 4
784
785	REAL, DIMENSION(0:ny,stridex) :: work_ffty
786	#if defined( __nec )
787	REAL, DIMENSION(0:ny+1,1:nz,nxl:nxr) :: work_ffty_vec
788	#endif
789	REAL, DIMENSION(nnx,1:nza,nys_x:nyn_xa,pdims(1)) :: f_in
790	REAL, DIMENSION(1:nza,0:nya,nxl:nxra) :: f_out
791	REAL, DIMENSION(nxl:nxra,1:nza,0:nya) :: work
792
793	!
794	!-- Transpose array
795	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
796	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
797	CALL MPI_ALLTOALL( f_in(1,1,nys_x,1), sendrecvcount_xy, MPI_REAL, &
798	work(nxl,1,0), sendrecvcount_xy, MPI_REAL, &
799	comm1dx, ierr )
800	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
801
802	!
803	!-- Resort the data in a way that y becomes the first index and carry out the
804	!-- backward fft along y.
805	CALL cpu_log( log_point_s(7), 'fft_y', 'continue' )
806
807	IF ( host(1:3) == 'nec' ) THEN
808	#if defined( __nec )
809	!
810	!-- Code optimized for vector processors
811	!$OMP PARALLEL PRIVATE ( i, j, k )
812	!$OMP DO
813	DO k = 1, nz
814	DO j = 0, ny
815	DO i = nxl, nxr
816	work_ffty_vec(j,k,i) = work(i,k,j)
817	ENDDO
818	ENDDO
819	ENDDO
820
821	!$OMP DO
822	DO i = nxl, nxr
823
824	CALL fft_y_m( work_ffty_vec(:,:,i), ny+1, 'backward' )
825
826	DO j = 0, ny
827	DO k = 1, nz
828	f_out(k,j,i) = work_ffty_vec(j,k,i)
829	ENDDO
830	ENDDO
831
832	ENDDO
833	!$OMP END PARALLEL
834	#endif
835
836	ELSE
837
838	!
839	!-- Cache optimized code.
840	!-- The i-(x-)direction is split into a strided outer loop and an inner
841	!-- loop for better cache performance
842	!$OMP PARALLEL PRIVATE ( i, iend, iouter, ir, j, k, work_ffty )
843	!$OMP DO
844	DO iouter = nxl, nxr, stridex
845
846	iend = MIN( iouter+stridex-1, nxr ) ! Upper bound for inner i loop
847
848	DO k = 1, nz
849	!
850	!-- Resort
851	DO j = 0, ny
852	DO i = iouter, iend
853	work_ffty(j,i-iouter+1) = work(i,k,j)
854	ENDDO
855	ENDDO
856
857	DO i = iouter, iend
858
859	!
860	!-- FFT along y
861	ir = i-iouter+1 ! counter within a stride
862	CALL fft_y( work_ffty(:,ir), 'backward' )
863
864	DO j = 0, ny
865	f_out(k,j,i) = work_ffty(j,ir)
866	ENDDO
867	ENDDO
868
869	ENDDO
870
871	ENDDO
872	!$OMP END PARALLEL
873
874	ENDIF
875
876	CALL cpu_log( log_point_s(7), 'fft_y', 'stop' )
877
878	END SUBROUTINE tr_xy_ffty
879
880
881	SUBROUTINE fftx_tri_fftx( ar )
882
883	!------------------------------------------------------------------------------!
884	! FFT along x, solution of the tridiagonal system and backward FFT for
885	! a 1d-decomposition along x
886	!
887	! WARNING: this subroutine may still not work for hybrid parallelization
888	! with OpenMP (for possible necessary changes see the original
889	! routine poisfft_hybrid, developed by Klaus Ketelsen, May 2002)
890	!------------------------------------------------------------------------------!
891
892	USE control_parameters
893	USE cpulog
894	USE grid_variables
895	USE indices
896	USE interfaces
897	USE pegrid
898	USE transpose_indices
899
900	IMPLICIT NONE
901
902	character(len=3) :: myth_char
903
904	INTEGER :: i, j, k, m, n, omp_get_thread_num, tn
905
906	REAL, DIMENSION(0:nx) :: work_fftx
907	REAL, DIMENSION(0:nx,1:nz) :: work_trix
908	REAL, DIMENSION(nnx,1:nza,nys_x:nyn_xa,pdims(1)) :: ar
909	REAL, DIMENSION(:,:,:,:), ALLOCATABLE :: tri
910
911
912	CALL cpu_log( log_point_s(33), 'fft_x + tridia', 'start' )
913
914	ALLOCATE( tri(5,0:nx,0:nz-1,0:threads_per_task-1) )
915
916	tn = 0 ! Default thread number in case of one thread
917	!$OMP PARALLEL DO PRIVATE ( i, j, k, m, n, tn, work_fftx, work_trix )
918	DO j = nys_x, nyn_x
919
920	!$ tn = omp_get_thread_num()
921
922	IF ( host(1:3) == 'nec' ) THEN
923	!
924	!-- Code optimized for vector processors
925	DO k = 1, nz
926
927	m = 0
928	DO n = 1, pdims(1)
929	DO i = 1, nnx_pe( n-1 ) ! WARN: pcoord(i) should be used!!
930	work_trix(m,k) = ar(i,k,j,n)
931	m = m + 1
932	ENDDO
933	ENDDO
934
935	ENDDO
936
937	CALL fft_x_m( work_trix, 'forward' )
938
939	ELSE
940	!
941	!-- Cache optimized code
942	DO k = 1, nz
943
944	m = 0
945	DO n = 1, pdims(1)
946	DO i = 1, nnx_pe( n-1 ) ! WARN: pcoord(i) should be used!!
947	work_fftx(m) = ar(i,k,j,n)
948	m = m + 1
949	ENDDO
950	ENDDO
951
952	CALL fft_x( work_fftx, 'forward' )
953
954	DO i = 0, nx
955	work_trix(i,k) = work_fftx(i)
956	ENDDO
957
958	ENDDO
959
960	ENDIF
961
962	!
963	!-- Solve the linear equation system
964	CALL tridia_1dd( ddx2, ddy2, nx, ny, j, work_trix, tri(:,:,:,tn) )
965
966	IF ( host(1:3) == 'nec' ) THEN
967	!
968	!-- Code optimized for vector processors
969	CALL fft_x_m( work_trix, 'backward' )
970
971	DO k = 1, nz
972
973	m = 0
974	DO n = 1, pdims(1)
975	DO i = 1, nnx_pe( n-1 ) ! WARN: pcoord(i) should be used!!
976	ar(i,k,j,n) = work_trix(m,k)
977	m = m + 1
978	ENDDO
979	ENDDO
980
981	ENDDO
982
983	ELSE
984	!
985	!-- Cache optimized code
986	DO k = 1, nz
987
988	DO i = 0, nx
989	work_fftx(i) = work_trix(i,k)
990	ENDDO
991
992	CALL fft_x( work_fftx, 'backward' )
993
994	m = 0
995	DO n = 1, pdims(1)
996	DO i = 1, nnx_pe( n-1 ) ! WARN: pcoord(i) should be used!!
997	ar(i,k,j,n) = work_fftx(m)
998	m = m + 1
999	ENDDO
1000	ENDDO
1001
1002	ENDDO
1003
1004	ENDIF
1005
1006	ENDDO
1007
1008	DEALLOCATE( tri )
1009
1010	CALL cpu_log( log_point_s(33), 'fft_x + tridia', 'stop' )
1011
1012	END SUBROUTINE fftx_tri_fftx
1013
1014
1015	SUBROUTINE fftx_tr_xy( f_in, work, f_out )
1016
1017	!------------------------------------------------------------------------------!
1018	! Fourier-transformation along x with subsequent transposition x --> y for
1019	! a 1d-decomposition along y
1020	!
1021	! ATTENTION: The NEC-branch of this routine may significantly profit from
1022	! further optimizations. So far, performance is much worse than
1023	! for routine ffty_tr_yx (more than three times slower).
1024	!------------------------------------------------------------------------------!
1025
1026	USE control_parameters
1027	USE cpulog
1028	USE indices
1029	USE interfaces
1030	USE pegrid
1031	USE transpose_indices
1032
1033	IMPLICIT NONE
1034
1035	INTEGER :: i, j, k
1036
1037	REAL, DIMENSION(0:nx,1:nz,nys:nyn) :: work_fftx
1038	REAL, DIMENSION(1:nza,nys:nyna,0:nxa) :: f_in
1039	REAL, DIMENSION(nny,1:nza,nxl_y:nxr_ya,pdims(2)) :: f_out
1040	REAL, DIMENSION(nys:nyna,1:nza,0:nxa) :: work
1041
1042	!
1043	!-- Carry out the FFT along x, where all data are present due to the
1044	!-- 1d-decomposition along y. Resort the data in a way that y becomes
1045	!-- the first index.
1046	CALL cpu_log( log_point_s(4), 'fft_x', 'start' )
1047
1048	IF ( host(1:3) == 'nec' ) THEN
1049	!
1050	!-- Code for vector processors
1051	!$OMP PARALLEL PRIVATE ( i, j, k )
1052	!$OMP DO
1053	DO i = 0, nx
1054
1055	DO j = nys, nyn
1056	DO k = 1, nz
1057	work_fftx(i,k,j) = f_in(k,j,i)
1058	ENDDO
1059	ENDDO
1060
1061	ENDDO
1062
1063	!$OMP DO
1064	DO j = nys, nyn
1065
1066	CALL fft_x_m( work_fftx(:,:,j), 'forward' )
1067
1068	DO k = 1, nz
1069	DO i = 0, nx
1070	work(j,k,i) = work_fftx(i,k,j)
1071	ENDDO
1072	ENDDO
1073
1074	ENDDO
1075	!$OMP END PARALLEL
1076
1077	ELSE
1078
1079	!
1080	!-- Cache optimized code (there might be still a potential for better
1081	!-- optimization).
1082	!$OMP PARALLEL PRIVATE (i,j,k)
1083	!$OMP DO
1084	DO i = 0, nx
1085
1086	DO j = nys, nyn
1087	DO k = 1, nz
1088	work_fftx(i,k,j) = f_in(k,j,i)
1089	ENDDO
1090	ENDDO
1091
1092	ENDDO
1093
1094	!$OMP DO
1095	DO j = nys, nyn
1096	DO k = 1, nz
1097
1098	CALL fft_x( work_fftx(0:nx,k,j), 'forward' )
1099
1100	DO i = 0, nx
1101	work(j,k,i) = work_fftx(i,k,j)
1102	ENDDO
1103	ENDDO
1104
1105	ENDDO
1106	!$OMP END PARALLEL
1107
1108	ENDIF
1109	CALL cpu_log( log_point_s(4), 'fft_x', 'pause' )
1110
1111	!
1112	!-- Transpose array
1113	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
1114	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
1115	CALL MPI_ALLTOALL( work(nys,1,0), sendrecvcount_xy, MPI_REAL, &
1116	f_out(1,1,nxl_y,1), sendrecvcount_xy, MPI_REAL, &
1117	comm1dy, ierr )
1118	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
1119
1120	END SUBROUTINE fftx_tr_xy
1121
1122
1123	SUBROUTINE tr_yx_fftx( f_in, work, f_out )
1124
1125	!------------------------------------------------------------------------------!
1126	! Transposition y --> x with a subsequent backward Fourier transformation for
1127	! a 1d-decomposition along x
1128	!------------------------------------------------------------------------------!
1129
1130	USE control_parameters
1131	USE cpulog
1132	USE indices
1133	USE interfaces
1134	USE pegrid
1135	USE transpose_indices
1136
1137	IMPLICIT NONE
1138
1139	INTEGER :: i, j, k
1140
1141	REAL, DIMENSION(0:nx,1:nz,nys:nyn) :: work_fftx
1142	REAL, DIMENSION(nny,1:nza,nxl_y:nxr_ya,pdims(2)) :: f_in
1143	REAL, DIMENSION(1:nza,nys:nyna,0:nxa) :: f_out
1144	REAL, DIMENSION(nys:nyna,1:nza,0:nxa) :: work
1145
1146	!
1147	!-- Transpose array
1148	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
1149	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
1150	CALL MPI_ALLTOALL( f_in(1,1,nxl_y,1), sendrecvcount_xy, MPI_REAL, &
1151	work(nys,1,0), sendrecvcount_xy, MPI_REAL, &
1152	comm1dy, ierr )
1153	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
1154
1155	!
1156	!-- Carry out the FFT along x, where all data are present due to the
1157	!-- 1d-decomposition along y. Resort the data in a way that y becomes
1158	!-- the first index.
1159	CALL cpu_log( log_point_s(4), 'fft_x', 'continue' )
1160
1161	IF ( host(1:3) == 'nec' ) THEN
1162	!
1163	!-- Code optimized for vector processors
1164	!$OMP PARALLEL PRIVATE ( i, j, k )
1165	!$OMP DO
1166	DO j = nys, nyn
1167
1168	DO k = 1, nz
1169	DO i = 0, nx
1170	work_fftx(i,k,j) = work(j,k,i)
1171	ENDDO
1172	ENDDO
1173
1174	CALL fft_x_m( work_fftx(:,:,j), 'backward' )
1175
1176	ENDDO
1177
1178	!$OMP DO
1179	DO i = 0, nx
1180	DO j = nys, nyn
1181	DO k = 1, nz
1182	f_out(k,j,i) = work_fftx(i,k,j)
1183	ENDDO
1184	ENDDO
1185	ENDDO
1186	!$OMP END PARALLEL
1187
1188	ELSE
1189
1190	!
1191	!-- Cache optimized code (there might be still a potential for better
1192	!-- optimization).
1193	!$OMP PARALLEL PRIVATE (i,j,k)
1194	!$OMP DO
1195	DO j = nys, nyn
1196	DO k = 1, nz
1197
1198	DO i = 0, nx
1199	work_fftx(i,k,j) = work(j,k,i)
1200	ENDDO
1201
1202	CALL fft_x( work_fftx(0:nx,k,j), 'backward' )
1203
1204	ENDDO
1205	ENDDO
1206
1207	!$OMP DO
1208	DO i = 0, nx
1209	DO j = nys, nyn
1210	DO k = 1, nz
1211	f_out(k,j,i) = work_fftx(i,k,j)
1212	ENDDO
1213	ENDDO
1214	ENDDO
1215	!$OMP END PARALLEL
1216
1217	ENDIF
1218	CALL cpu_log( log_point_s(4), 'fft_x', 'stop' )
1219
1220	END SUBROUTINE tr_yx_fftx
1221
1222
1223	SUBROUTINE ffty_tri_ffty( ar )
1224
1225	!------------------------------------------------------------------------------!
1226	! FFT along y, solution of the tridiagonal system and backward FFT for
1227	! a 1d-decomposition along y
1228	!
1229	! WARNING: this subroutine may still not work for hybrid parallelization
1230	! with OpenMP (for possible necessary changes see the original
1231	! routine poisfft_hybrid, developed by Klaus Ketelsen, May 2002)
1232	!------------------------------------------------------------------------------!
1233
1234	USE control_parameters
1235	USE cpulog
1236	USE grid_variables
1237	USE indices
1238	USE interfaces
1239	USE pegrid
1240	USE transpose_indices
1241
1242	IMPLICIT NONE
1243
1244	INTEGER :: i, j, k, m, n, omp_get_thread_num, tn
1245
1246	REAL, DIMENSION(0:ny) :: work_ffty
1247	REAL, DIMENSION(0:ny,1:nz) :: work_triy
1248	REAL, DIMENSION(nny,1:nza,nxl_y:nxr_ya,pdims(2)) :: ar
1249	REAL, DIMENSION(:,:,:,:), ALLOCATABLE :: tri
1250
1251
1252	CALL cpu_log( log_point_s(39), 'fft_y + tridia', 'start' )
1253
1254	ALLOCATE( tri(5,0:ny,0:nz-1,0:threads_per_task-1) )
1255
1256	tn = 0 ! Default thread number in case of one thread
1257	!$OMP PARALLEL DO PRIVATE ( i, j, k, m, n, tn, work_ffty, work_triy )
1258	DO i = nxl_y, nxr_y
1259
1260	!$ tn = omp_get_thread_num()
1261
1262	IF ( host(1:3) == 'nec' ) THEN
1263	!
1264	!-- Code optimized for vector processors
1265	DO k = 1, nz
1266
1267	m = 0
1268	DO n = 1, pdims(2)
1269	DO j = 1, nny_pe( n-1 ) ! WARN: pcoord(j) should be used!!
1270	work_triy(m,k) = ar(j,k,i,n)
1271	m = m + 1
1272	ENDDO
1273	ENDDO
1274
1275	ENDDO
1276
1277	CALL fft_y_m( work_triy, ny, 'forward' )
1278
1279	ELSE
1280	!
1281	!-- Cache optimized code
1282	DO k = 1, nz
1283
1284	m = 0
1285	DO n = 1, pdims(2)
1286	DO j = 1, nny_pe( n-1 ) ! WARN: pcoord(j) should be used!!
1287	work_ffty(m) = ar(j,k,i,n)
1288	m = m + 1
1289	ENDDO
1290	ENDDO
1291
1292	CALL fft_y( work_ffty, 'forward' )
1293
1294	DO j = 0, ny
1295	work_triy(j,k) = work_ffty(j)
1296	ENDDO
1297
1298	ENDDO
1299
1300	ENDIF
1301
1302	!
1303	!-- Solve the linear equation system
1304	CALL tridia_1dd( ddy2, ddx2, ny, nx, i, work_triy, tri(:,:,:,tn) )
1305
1306	IF ( host(1:3) == 'nec' ) THEN
1307	!
1308	!-- Code optimized for vector processors
1309	CALL fft_y_m( work_triy, ny, 'backward' )
1310
1311	DO k = 1, nz
1312
1313	m = 0
1314	DO n = 1, pdims(2)
1315	DO j = 1, nny_pe( n-1 ) ! WARN: pcoord(j) should be used!!
1316	ar(j,k,i,n) = work_triy(m,k)
1317	m = m + 1
1318	ENDDO
1319	ENDDO
1320
1321	ENDDO
1322
1323	ELSE
1324	!
1325	!-- Cache optimized code
1326	DO k = 1, nz
1327
1328	DO j = 0, ny
1329	work_ffty(j) = work_triy(j,k)
1330	ENDDO
1331
1332	CALL fft_y( work_ffty, 'backward' )
1333
1334	m = 0
1335	DO n = 1, pdims(2)
1336	DO j = 1, nny_pe( n-1 ) ! WARN: pcoord(j) should be used!!
1337	ar(j,k,i,n) = work_ffty(m)
1338	m = m + 1
1339	ENDDO
1340	ENDDO
1341
1342	ENDDO
1343
1344	ENDIF
1345
1346	ENDDO
1347
1348	DEALLOCATE( tri )
1349
1350	CALL cpu_log( log_point_s(39), 'fft_y + tridia', 'stop' )
1351
1352	END SUBROUTINE ffty_tri_ffty
1353
1354
1355	SUBROUTINE tridia_1dd( ddx2, ddy2, nx, ny, j, ar, tri )
1356
1357	!------------------------------------------------------------------------------!
1358	! Solves the linear system of equations for a 1d-decomposition along x (see
1359	! tridia)
1360	!
1361	! Attention: when using the intel compiler, array tri must be passed as an
1362	! argument to the contained subroutines. Otherwise addres faults
1363	! will occur.
1364	! On NEC, tri should not be passed (except for routine substi_1dd)
1365	! because this causes very bad performance.
1366	!------------------------------------------------------------------------------!
1367
1368	USE arrays_3d
1369	USE control_parameters
1370
1371	USE pegrid
1372
1373	IMPLICIT NONE
1374
1375	INTEGER :: i, j, k, nnyh, nx, ny, omp_get_thread_num, tn
1376
1377	REAL :: ddx2, ddy2
1378
1379	REAL, DIMENSION(0:nx,1:nz) :: ar
1380	REAL, DIMENSION(0:nx,0:nz-1) :: ar1
1381	REAL, DIMENSION(5,0:nx,0:nz-1) :: tri
1382
1383
1384	nnyh = ( ny + 1 ) / 2
1385
1386	!
1387	!-- Define constant elements of the tridiagonal matrix.
1388	!-- The compiler on SX6 does loop exchange. If 0:nx is a high power of 2,
1389	!-- the exchanged loops create bank conflicts. The following directive
1390	!-- prohibits loop exchange and the loops perform much better.
1391	! tn = omp_get_thread_num()
1392	! WRITE( 120+tn, * ) '+++ id=',myid,' nx=',nx,' thread=', omp_get_thread_num()
1393	! CALL local_flush( 120+tn )
1394	!CDIR NOLOOPCHG
1395	DO k = 0, nz-1
1396	DO i = 0,nx
1397	tri(2,i,k) = ddzu_pres(k+1) * ddzw(k+1)
1398	tri(3,i,k) = ddzu_pres(k+2) * ddzw(k+1)
1399	ENDDO
1400	ENDDO
1401	! WRITE( 120+tn, * ) '+++ id=',myid,' end of first tridia loop thread=', omp_get_thread_num()
1402	! CALL local_flush( 120+tn )
1403
1404	IF ( j <= nnyh ) THEN
1405	#if defined( __lc )
1406	CALL maketri_1dd( j, tri )
1407	#else
1408	CALL maketri_1dd( j )
1409	#endif
1410	ELSE
1411	#if defined( __lc )
1412	CALL maketri_1dd( ny+1-j, tri )
1413	#else
1414	CALL maketri_1dd( ny+1-j )
1415	#endif
1416	ENDIF
1417	#if defined( __lc )
1418	CALL split_1dd( tri )
1419	#else
1420	CALL split_1dd
1421	#endif
1422	CALL substi_1dd( ar, tri )
1423
1424	CONTAINS
1425
1426	#if defined( __lc )
1427	SUBROUTINE maketri_1dd( j, tri )
1428	#else
1429	SUBROUTINE maketri_1dd( j )
1430	#endif
1431
1432	!------------------------------------------------------------------------------!
1433	! computes the i- and j-dependent component of the matrix
1434	!------------------------------------------------------------------------------!
1435
1436	USE constants
1437
1438	IMPLICIT NONE
1439
1440	INTEGER :: i, j, k, nnxh
1441	REAL :: a, c
1442
1443	REAL, DIMENSION(0:nx) :: l
1444
1445	#if defined( __lc )
1446	REAL, DIMENSION(5,0:nx,0:nz-1) :: tri
1447	#endif
1448
1449
1450	nnxh = ( nx + 1 ) / 2
1451	!
1452	!-- Provide the tridiagonal matrix for solution of the Poisson equation in
1453	!-- Fourier space. The coefficients are computed following the method of
1454	!-- Schmidt et al. (DFVLR-Mitteilung 84-15), which departs from Stephan
1455	!-- Siano's original version by discretizing the Poisson equation,
1456	!-- before it is Fourier-transformed
1457	DO i = 0, nx
1458	IF ( i >= 0 .AND. i <= nnxh ) THEN
1459	l(i) = 2.0 * ( 1.0 - COS( ( 2.0 * pi * i ) / &
1460	FLOAT( nx+1 ) ) ) * ddx2 + &
1461	2.0 * ( 1.0 - COS( ( 2.0 * pi * j ) / &
1462	FLOAT( ny+1 ) ) ) * ddy2
1463	ELSE
1464	l(i) = 2.0 * ( 1.0 - COS( ( 2.0 * pi * ( nx+1-i ) ) / &
1465	FLOAT( nx+1 ) ) ) * ddx2 + &
1466	2.0 * ( 1.0 - COS( ( 2.0 * pi * j ) / &
1467	FLOAT( ny+1 ) ) ) * ddy2
1468	ENDIF
1469	ENDDO
1470
1471	DO k = 0, nz-1
1472	DO i = 0, nx
1473	a = -1.0 * ddzu_pres(k+2) * ddzw(k+1)
1474	c = -1.0 * ddzu_pres(k+1) * ddzw(k+1)
1475	tri(1,i,k) = a + c - l(i)
1476	ENDDO
1477	ENDDO
1478	IF ( ibc_p_b == 1 .OR. ibc_p_b == 2 ) THEN
1479	DO i = 0, nx
1480	tri(1,i,0) = tri(1,i,0) + tri(2,i,0)
1481	ENDDO
1482	ENDIF
1483	IF ( ibc_p_t == 1 ) THEN
1484	DO i = 0, nx
1485	tri(1,i,nz-1) = tri(1,i,nz-1) + tri(3,i,nz-1)
1486	ENDDO
1487	ENDIF
1488
1489	END SUBROUTINE maketri_1dd
1490
1491
1492	#if defined( __lc )
1493	SUBROUTINE split_1dd( tri )
1494	#else
1495	SUBROUTINE split_1dd
1496	#endif
1497
1498	!------------------------------------------------------------------------------!
1499	! Splitting of the tridiagonal matrix (Thomas algorithm)
1500	!------------------------------------------------------------------------------!
1501
1502	IMPLICIT NONE
1503
1504	INTEGER :: i, k
1505
1506	#if defined( __lc )
1507	REAL, DIMENSION(5,0:nx,0:nz-1) :: tri
1508	#endif
1509
1510
1511	!
1512	!-- Splitting
1513	DO i = 0, nx
1514	tri(4,i,0) = tri(1,i,0)
1515	ENDDO
1516	DO k = 1, nz-1
1517	DO i = 0, nx
1518	tri(5,i,k) = tri(2,i,k) / tri(4,i,k-1)
1519	tri(4,i,k) = tri(1,i,k) - tri(3,i,k-1) * tri(5,i,k)
1520	ENDDO
1521	ENDDO
1522
1523	END SUBROUTINE split_1dd
1524
1525
1526	SUBROUTINE substi_1dd( ar, tri )
1527
1528	!------------------------------------------------------------------------------!
1529	! Substitution (Forward and Backward) (Thomas algorithm)
1530	!------------------------------------------------------------------------------!
1531
1532	IMPLICIT NONE
1533
1534	INTEGER :: i, k
1535
1536	REAL, DIMENSION(0:nx,nz) :: ar
1537	REAL, DIMENSION(0:nx,0:nz-1) :: ar1
1538	REAL, DIMENSION(5,0:nx,0:nz-1) :: tri
1539
1540	!
1541	!-- Forward substitution
1542	DO i = 0, nx
1543	ar1(i,0) = ar(i,1)
1544	ENDDO
1545	DO k = 1, nz-1
1546	DO i = 0, nx
1547	ar1(i,k) = ar(i,k+1) - tri(5,i,k) * ar1(i,k-1)
1548	ENDDO
1549	ENDDO
1550
1551	!
1552	!-- Backward substitution
1553	!-- Note, the add of 1.0E-20 in the denominator is due to avoid divisions
1554	!-- by zero appearing if the pressure bc is set to neumann at the top of
1555	!-- the model domain.
1556	DO i = 0, nx
1557	ar(i,nz) = ar1(i,nz-1) / ( tri(4,i,nz-1) + 1.0E-20 )
1558	ENDDO
1559	DO k = nz-2, 0, -1
1560	DO i = 0, nx
1561	ar(i,k+1) = ( ar1(i,k) - tri(3,i,k) * ar(i,k+2) ) &
1562	/ tri(4,i,k)
1563	ENDDO
1564	ENDDO
1565
1566	!
1567	!-- Indices i=0, j=0 correspond to horizontally averaged pressure.
1568	!-- The respective values of ar should be zero at all k-levels if
1569	!-- acceleration of horizontally averaged vertical velocity is zero.
1570	IF ( ibc_p_b == 1 .AND. ibc_p_t == 1 ) THEN
1571	IF ( j == 0 ) THEN
1572	DO k = 1, nz
1573	ar(0,k) = 0.0
1574	ENDDO
1575	ENDIF
1576	ENDIF
1577
1578	END SUBROUTINE substi_1dd
1579
1580	END SUBROUTINE tridia_1dd
1581
1582	#endif
1583
1584	END MODULE poisfft_mod

Note: See TracBrowser for help on using the repository browser.

Download in other formats:

| Impressum | ©Leibniz Universität Hannover |