Home

Context Navigation

source: palm/trunk/SOURCE/poisfft.f90 @ 164

Last change on this file since 164 was 164, checked in by raasch, 16 years ago
optimization of transpositions for 2D decompositions, workaround for using -env option with mpiexec, adjustments for lcxt4
Property svn:keywords set to `Id`
File size: 44.4 KB

Line
1	MODULE poisfft_mod
2
3	!------------------------------------------------------------------------------!
4	! Actual revisions:
5	! -----------------
6	! Arguments removed from transpose routines
7	!
8	! Former revisions:
9	! -----------------
10	! $Id: poisfft.f90 164 2008-05-15 08:46:15Z raasch $
11	!
12	! 128 2007-10-26 13:11:14Z raasch
13	! Bugfix: wavenumber calculation for even nx in routines maketri
14	!
15	! 85 2007-05-11 09:35:14Z raasch
16	! Bugfix: work_fft*_vec removed from some PRIVATE-declarations
17	!
18	! 76 2007-03-29 00:58:32Z raasch
19	! Tridiagonal coefficients adjusted for Neumann boundary conditions both at
20	! the bottom and the top.
21	!
22	! RCS Log replace by Id keyword, revision history cleaned up
23	!
24	! Revision 1.24 2006/08/04 15:00:24 raasch
25	! Default setting of the thread number tn in case of not using OpenMP
26	!
27	! Revision 1.23 2006/02/23 12:48:38 raasch
28	! Additional compiler directive in routine tridia_1dd for preventing loop
29	! exchange on NEC-SX6
30	!
31	! Revision 1.20 2004/04/30 12:38:09 raasch
32	! Parts of former poisfft_hybrid moved to this subroutine,
33	! former subroutine changed to a module, renaming of FFT-subroutines and
34	! -module, FFTs completely substituted by calls of fft_x and fft_y,
35	! NAG fft used in the non-parallel case completely removed, l in maketri
36	! is now a 1d-array, variables passed by modules instead of using parameter
37	! lists, enlarged transposition arrays introduced
38	!
39	! Revision 1.1 1997/07/24 11:24:14 raasch
40	! Initial revision
41	!
42	!
43	! Description:
44	! ------------
45	! See below.
46	!------------------------------------------------------------------------------!
47
48	!--------------------------------------------------------------------------!
49	! poisfft !
50	! !
51	! Original version: Stephan Siano (pois3d) !
52	! !
53	! Institute of Meteorology and Climatology, University of Hannover !
54	! Germany !
55	! !
56	! Version as of July 23,1996 !
57	! !
58	! !
59	! Version for parallel computers: Siegfried Raasch !
60	! !
61	! Version as of July 03,1997 !
62	! !
63	! Solves the Poisson equation with a 2D spectral method !
64	! d^2 p / dx^2 + d^2 p / dy^2 + d^2 p / dz^2 = s !
65	! !
66	! Input: !
67	! real ar contains in the (nnx,nny,nnz) elements, !
68	! starting from the element (1,nys,nxl), the !
69	! values for s !
70	! real work Temporary array !
71	! !
72	! Output: !
73	! real ar contains the solution for p !
74	!--------------------------------------------------------------------------!
75
76	USE fft_xy
77	USE indices
78	USE transpose_indices
79
80	IMPLICIT NONE
81
82	PRIVATE
83	PUBLIC poisfft, poisfft_init
84
85	INTERFACE poisfft
86	MODULE PROCEDURE poisfft
87	END INTERFACE poisfft
88
89	INTERFACE poisfft_init
90	MODULE PROCEDURE poisfft_init
91	END INTERFACE poisfft_init
92
93	CONTAINS
94
95	SUBROUTINE poisfft_init
96
97	CALL fft_init
98
99	END SUBROUTINE poisfft_init
100
101
102	SUBROUTINE poisfft( ar, work )
103
104	USE cpulog
105	USE interfaces
106	USE pegrid
107
108	IMPLICIT NONE
109
110	REAL, DIMENSION(1:nza,nys:nyna,nxl:nxra) :: ar, work
111
112
113	CALL cpu_log( log_point_s(3), 'poisfft', 'start' )
114
115	!
116	!-- Two-dimensional Fourier Transformation in x- and y-direction.
117	#if defined( __parallel )
118	IF ( pdims(2) == 1 ) THEN
119
120	!
121	!-- 1d-domain-decomposition along x:
122	!-- FFT along y and transposition y --> x
123	CALL ffty_tr_yx( ar, work, ar )
124
125	!
126	!-- FFT along x, solving the tridiagonal system and backward FFT
127	CALL fftx_tri_fftx( ar )
128
129	!
130	!-- Transposition x --> y and backward FFT along y
131	CALL tr_xy_ffty( ar, work, ar )
132
133	ELSEIF ( pdims(1) == 1 ) THEN
134
135	!
136	!-- 1d-domain-decomposition along y:
137	!-- FFT along x and transposition x --> y
138	CALL fftx_tr_xy( ar, work, ar )
139
140	!
141	!-- FFT along y, solving the tridiagonal system and backward FFT
142	CALL ffty_tri_ffty( ar )
143
144	!
145	!-- Transposition y --> x and backward FFT along x
146	CALL tr_yx_fftx( ar, work, ar )
147
148	ELSE
149
150	!
151	!-- 2d-domain-decomposition
152	!-- Transposition z --> x
153	CALL cpu_log( log_point_s(5), 'transpo forward', 'start' )
154	CALL transpose_zx( ar, work, ar )
155	CALL cpu_log( log_point_s(5), 'transpo forward', 'pause' )
156
157	CALL cpu_log( log_point_s(4), 'fft_x', 'start' )
158	CALL fftxp( ar, 'forward' )
159	CALL cpu_log( log_point_s(4), 'fft_x', 'pause' )
160
161	!
162	!-- Transposition x --> y
163	CALL cpu_log( log_point_s(5), 'transpo forward', 'continue' )
164	CALL transpose_xy( ar, work, ar )
165	CALL cpu_log( log_point_s(5), 'transpo forward', 'pause' )
166
167	CALL cpu_log( log_point_s(7), 'fft_y', 'start' )
168	CALL fftyp( ar, 'forward' )
169	CALL cpu_log( log_point_s(7), 'fft_y', 'pause' )
170
171	!
172	!-- Transposition y --> z
173	CALL cpu_log( log_point_s(5), 'transpo forward', 'continue' )
174	CALL transpose_yz( ar, work, ar )
175	CALL cpu_log( log_point_s(5), 'transpo forward', 'stop' )
176
177	!
178	!-- Solve the Poisson equation in z-direction in cartesian space.
179	CALL cpu_log( log_point_s(6), 'tridia', 'start' )
180	CALL tridia( ar )
181	CALL cpu_log( log_point_s(6), 'tridia', 'stop' )
182
183	!
184	!-- Inverse Fourier Transformation
185	!-- Transposition z --> y
186	CALL cpu_log( log_point_s(8), 'transpo invers', 'start' )
187	CALL transpose_zy( ar, work, ar )
188	CALL cpu_log( log_point_s(8), 'transpo invers', 'pause' )
189
190	CALL cpu_log( log_point_s(7), 'fft_y', 'continue' )
191	CALL fftyp( ar, 'backward' )
192	CALL cpu_log( log_point_s(7), 'fft_y', 'stop' )
193
194	!
195	!-- Transposition y --> x
196	CALL cpu_log( log_point_s(8), 'transpo invers', 'continue' )
197	CALL transpose_yx( ar, work, ar )
198	CALL cpu_log( log_point_s(8), 'transpo invers', 'pause' )
199
200	CALL cpu_log( log_point_s(4), 'fft_x', 'continue' )
201	CALL fftxp( ar, 'backward' )
202	CALL cpu_log( log_point_s(4), 'fft_x', 'stop' )
203
204	!
205	!-- Transposition x --> z
206	CALL cpu_log( log_point_s(8), 'transpo invers', 'continue' )
207	CALL transpose_xz( ar, work, ar )
208	CALL cpu_log( log_point_s(8), 'transpo invers', 'stop' )
209
210	ENDIF
211
212	#else
213
214	!
215	!-- Two-dimensional Fourier Transformation along x- and y-direction.
216	CALL cpu_log( log_point_s(4), 'fft_x', 'start' )
217	CALL fftx( ar, 'forward' )
218	CALL cpu_log( log_point_s(4), 'fft_x', 'pause' )
219	CALL cpu_log( log_point_s(7), 'fft_y', 'start' )
220	CALL ffty( ar, 'forward' )
221	CALL cpu_log( log_point_s(7), 'fft_y', 'pause' )
222
223	!
224	!-- Solve the Poisson equation in z-direction in cartesian space.
225	CALL cpu_log( log_point_s(6), 'tridia', 'start' )
226	CALL tridia( ar )
227	CALL cpu_log( log_point_s(6), 'tridia', 'stop' )
228
229	!
230	!-- Inverse Fourier Transformation.
231	CALL cpu_log( log_point_s(7), 'fft_y', 'continue' )
232	CALL ffty( ar, 'backward' )
233	CALL cpu_log( log_point_s(7), 'fft_y', 'stop' )
234	CALL cpu_log( log_point_s(4), 'fft_x', 'continue' )
235	CALL fftx( ar, 'backward' )
236	CALL cpu_log( log_point_s(4), 'fft_x', 'stop' )
237
238	#endif
239
240	CALL cpu_log( log_point_s(3), 'poisfft', 'stop' )
241
242	END SUBROUTINE poisfft
243
244
245
246	SUBROUTINE tridia( ar )
247
248	!------------------------------------------------------------------------------!
249	! solves the linear system of equations:
250	!
251	! -(4 pi^2(i^2/(dx^2nnx^2)+j^2/(dy^2nny^2))+
252	! 1/(dzu(k)dzw(k))+1/(dzu(k-1)dzw(k)))*p(i,j,k)+
253	! 1/(dzu(k)dzw(k))p(i,j,k+1)+1/(dzu(k-1)dzw(k))p(i,j,k-1)=d(i,j,k)
254	!
255	! by using the Thomas algorithm
256	!------------------------------------------------------------------------------!
257
258	USE arrays_3d
259
260	IMPLICIT NONE
261
262	INTEGER :: i, j, k, nnyh
263
264	REAL, DIMENSION(nxl_z:nxr_z,0:nz-1) :: ar1
265	REAL, DIMENSION(5,nxl_z:nxr_z,0:nz-1) :: tri
266
267	#if defined( __parallel )
268	REAL :: ar(nxl_z:nxr_za,nys_z:nyn_za,1:nza)
269	#else
270	REAL :: ar(1:nz,nys_z:nyn_z,nxl_z:nxr_z)
271	#endif
272
273
274	nnyh = (ny+1) / 2
275
276	!
277	!-- Define constant elements of the tridiagonal matrix.
278	DO k = 0, nz-1
279	DO i = nxl_z, nxr_z
280	tri(2,i,k) = ddzu(k+1) * ddzw(k+1)
281	tri(3,i,k) = ddzu(k+2) * ddzw(k+1)
282	ENDDO
283	ENDDO
284
285	#if defined( __parallel )
286	!
287	!-- Repeat for all y-levels.
288	DO j = nys_z, nyn_z
289	IF ( j <= nnyh ) THEN
290	CALL maketri( tri, j )
291	ELSE
292	CALL maketri( tri, ny+1-j )
293	ENDIF
294	CALL split( tri )
295	CALL substi( ar, ar1, tri, j )
296	ENDDO
297	#else
298	!
299	!-- First y-level.
300	CALL maketri( tri, nys_z )
301	CALL split( tri )
302	CALL substi( ar, ar1, tri, 0 )
303
304	!
305	!-- Further y-levels.
306	DO j = 1, nnyh - 1
307	CALL maketri( tri, j )
308	CALL split( tri )
309	CALL substi( ar, ar1, tri, j )
310	CALL substi( ar, ar1, tri, ny+1-j )
311	ENDDO
312	CALL maketri( tri, nnyh )
313	CALL split( tri )
314	CALL substi( ar, ar1, tri, nnyh+nys )
315	#endif
316
317	CONTAINS
318
319	SUBROUTINE maketri( tri, j )
320
321	!------------------------------------------------------------------------------!
322	! Computes the i- and j-dependent component of the matrix
323	!------------------------------------------------------------------------------!
324
325	USE arrays_3d
326	USE constants
327	USE control_parameters
328	USE grid_variables
329
330	IMPLICIT NONE
331
332	INTEGER :: i, j, k, nnxh
333	REAL :: a, c
334	REAL :: ll(nxl_z:nxr_z)
335	REAL :: tri(5,nxl_z:nxr_z,0:nz-1)
336
337
338	nnxh = ( nx + 1 ) / 2
339
340	!
341	!-- Provide the tridiagonal matrix for solution of the Poisson equation in
342	!-- Fourier space. The coefficients are computed following the method of
343	!-- Schmidt et al. (DFVLR-Mitteilung 84-15), which departs from Stephan
344	!-- Siano's original version by discretizing the Poisson equation,
345	!-- before it is Fourier-transformed
346	#if defined( __parallel )
347	DO i = nxl_z, nxr_z
348	IF ( i >= 0 .AND. i <= nnxh ) THEN
349	ll(i) = 2.0 * ( 1.0 - COS( ( 2.0 * pi * i ) / &
350	FLOAT( nx+1 ) ) ) / ( dx * dx ) + &
351	2.0 * ( 1.0 - COS( ( 2.0 * pi * j ) / &
352	FLOAT( ny+1 ) ) ) / ( dy * dy )
353	ELSE
354	ll(i) = 2.0 * ( 1.0 - COS( ( 2.0 * pi * ( nx+1-i ) ) / &
355	FLOAT( nx+1 ) ) ) / ( dx * dx ) + &
356	2.0 * ( 1.0 - COS( ( 2.0 * pi * j ) / &
357	FLOAT( ny+1 ) ) ) / ( dy * dy )
358	ENDIF
359	DO k = 0,nz-1
360	a = -1.0 * ddzu(k+2) * ddzw(k+1)
361	c = -1.0 * ddzu(k+1) * ddzw(k+1)
362	tri(1,i,k) = a + c - ll(i)
363	ENDDO
364	ENDDO
365	#else
366	DO i = 0, nnxh
367	ll(i) = 2.0 * ( 1.0 - COS( ( 2.0 * pi * i ) / FLOAT( nx+1 ) ) ) / &
368	( dx * dx ) + &
369	2.0 * ( 1.0 - COS( ( 2.0 * pi * j ) / FLOAT( ny+1 ) ) ) / &
370	( dy * dy )
371	DO k = 0, nz-1
372	a = -1.0 * ddzu(k+2) * ddzw(k+1)
373	c = -1.0 * ddzu(k+1) * ddzw(k+1)
374	tri(1,i,k) = a + c - ll(i)
375	IF ( i >= 1 .and. i < nnxh ) THEN
376	tri(1,nx+1-i,k) = tri(1,i,k)
377	ENDIF
378	ENDDO
379	ENDDO
380	#endif
381	IF ( ibc_p_b == 1 .OR. ibc_p_b == 2 ) THEN
382	DO i = nxl_z, nxr_z
383	tri(1,i,0) = tri(1,i,0) + tri(2,i,0)
384	ENDDO
385	ENDIF
386	IF ( ibc_p_t == 1 ) THEN
387	DO i = nxl_z, nxr_z
388	tri(1,i,nz-1) = tri(1,i,nz-1) + tri(3,i,nz-1)
389	ENDDO
390	ENDIF
391
392	END SUBROUTINE maketri
393
394
395	SUBROUTINE substi( ar, ar1, tri, j )
396
397	!------------------------------------------------------------------------------!
398	! Substitution (Forward and Backward) (Thomas algorithm)
399	!------------------------------------------------------------------------------!
400
401	USE control_parameters
402
403	IMPLICIT NONE
404
405	INTEGER :: i, j, k
406	REAL :: ar1(nxl_z:nxr_z,0:nz-1)
407	REAL :: tri(5,nxl_z:nxr_z,0:nz-1)
408	#if defined( __parallel )
409	REAL :: ar(nxl_z:nxr_za,nys_z:nyn_za,1:nza)
410	#else
411	REAL :: ar(1:nz,nys_z:nyn_z,nxl_z:nxr_z)
412	#endif
413
414	!
415	!-- Forward substitution.
416	DO i = nxl_z, nxr_z
417	#if defined( __parallel )
418	ar1(i,0) = ar(i,j,1)
419	#else
420	ar1(i,0) = ar(1,j,i)
421	#endif
422	ENDDO
423	DO k = 1, nz - 1
424	DO i = nxl_z, nxr_z
425	#if defined( __parallel )
426	ar1(i,k) = ar(i,j,k+1) - tri(5,i,k) * ar1(i,k-1)
427	#else
428	ar1(i,k) = ar(k+1,j,i) - tri(5,i,k) * ar1(i,k-1)
429	#endif
430	ENDDO
431	ENDDO
432
433	!
434	!-- Backward substitution.
435	DO i = nxl_z, nxr_z
436	#if defined( __parallel )
437	ar(i,j,nz) = ar1(i,nz-1) / tri(4,i,nz-1)
438	#else
439	ar(nz,j,i) = ar1(i,nz-1) / tri(4,i,nz-1)
440	#endif
441	ENDDO
442	DO k = nz-2, 0, -1
443	DO i = nxl_z, nxr_z
444	#if defined( __parallel )
445	ar(i,j,k+1) = ( ar1(i,k) - tri(3,i,k) * ar(i,j,k+2) ) &
446	/ tri(4,i,k)
447	#else
448	ar(k+1,j,i) = ( ar1(i,k) - tri(3,i,k) * ar(k+2,j,i) ) &
449	/ tri(4,i,k)
450	#endif
451	ENDDO
452	ENDDO
453
454	!
455	!-- Indices i=0, j=0 correspond to horizontally averaged pressure.
456	!-- The respective values of ar should be zero at all k-levels if
457	!-- acceleration of horizontally averaged vertical velocity is zero.
458	IF ( ibc_p_b == 1 .AND. ibc_p_t == 1 ) THEN
459	IF ( j == 0 .AND. nxl_z == 0 ) THEN
460	#if defined( __parallel )
461	DO k = 1, nz
462	ar(nxl_z,j,k) = 0.0
463	ENDDO
464	#else
465	DO k = 1, nz
466	ar(k,j,nxl_z) = 0.0
467	ENDDO
468	#endif
469	ENDIF
470	ENDIF
471
472	END SUBROUTINE substi
473
474
475	SUBROUTINE split( tri )
476
477	!------------------------------------------------------------------------------!
478	! Splitting of the tridiagonal matrix (Thomas algorithm)
479	!------------------------------------------------------------------------------!
480
481	IMPLICIT NONE
482
483	INTEGER :: i, k
484	REAL :: tri(5,nxl_z:nxr_z,0:nz-1)
485
486	!
487	!-- Splitting.
488	DO i = nxl_z, nxr_z
489	tri(4,i,0) = tri(1,i,0)
490	ENDDO
491	DO k = 1, nz-1
492	DO i = nxl_z, nxr_z
493	tri(5,i,k) = tri(2,i,k) / tri(4,i,k-1)
494	tri(4,i,k) = tri(1,i,k) - tri(3,i,k-1) * tri(5,i,k)
495	ENDDO
496	ENDDO
497
498	END SUBROUTINE split
499
500	END SUBROUTINE tridia
501
502
503	#if defined( __parallel )
504	SUBROUTINE fftxp( ar, direction )
505
506	!------------------------------------------------------------------------------!
507	! Fourier-transformation along x-direction Parallelized version
508	!------------------------------------------------------------------------------!
509
510	IMPLICIT NONE
511
512	CHARACTER (LEN=*) :: direction
513	INTEGER :: j, k
514	REAL :: ar(0:nxa,nys_x:nyn_xa,nzb_x:nzt_xa)
515
516	!
517	!-- Performing the fft with one of the methods implemented
518	DO k = nzb_x, nzt_x
519	DO j = nys_x, nyn_x
520	CALL fft_x( ar(0:nx,j,k), direction )
521	ENDDO
522	ENDDO
523
524	END SUBROUTINE fftxp
525
526	#else
527	SUBROUTINE fftx( ar, direction )
528
529	!------------------------------------------------------------------------------!
530	! Fourier-transformation along x-direction Non parallel version
531	!------------------------------------------------------------------------------!
532
533	IMPLICIT NONE
534
535	CHARACTER (LEN=*) :: direction
536	INTEGER :: i, j, k
537	REAL :: ar(1:nz,0:ny,0:nx)
538
539	!
540	!-- Performing the fft with one of the methods implemented
541	DO k = 1, nz
542	DO j = 0, ny
543	CALL fft_x( ar(k,j,0:nx), direction )
544	ENDDO
545	ENDDO
546
547	END SUBROUTINE fftx
548	#endif
549
550
551	#if defined( __parallel )
552	SUBROUTINE fftyp( ar, direction )
553
554	!------------------------------------------------------------------------------!
555	! Fourier-transformation along y-direction Parallelized version
556	!------------------------------------------------------------------------------!
557
558	IMPLICIT NONE
559
560	CHARACTER (LEN=*) :: direction
561	INTEGER :: i, k
562	REAL :: ar(0:nya,nxl_y:nxr_ya,nzb_y:nzt_ya)
563
564	!
565	!-- Performing the fft with one of the methods implemented
566	DO k = nzb_y, nzt_y
567	DO i = nxl_y, nxr_y
568	CALL fft_y( ar(0:ny,i,k), direction )
569	ENDDO
570	ENDDO
571
572	END SUBROUTINE fftyp
573
574	#else
575	SUBROUTINE ffty( ar, direction )
576
577	!------------------------------------------------------------------------------!
578	! Fourier-transformation along y-direction Non parallel version
579	!------------------------------------------------------------------------------!
580
581	IMPLICIT NONE
582
583	CHARACTER (LEN=*) :: direction
584	INTEGER :: i, k
585	REAL :: ar(1:nz,0:ny,0:nx)
586
587	!
588	!-- Performing the fft with one of the methods implemented
589	DO k = 1, nz
590	DO i = 0, nx
591	CALL fft_y( ar(k,0:ny,i), direction )
592	ENDDO
593	ENDDO
594
595	END SUBROUTINE ffty
596	#endif
597
598	#if defined( __parallel )
599	SUBROUTINE ffty_tr_yx( f_in, work, f_out )
600
601	!------------------------------------------------------------------------------!
602	! Fourier-transformation along y with subsequent transposition y --> x for
603	! a 1d-decomposition along x
604	!
605	! ATTENTION: The performance of this routine is much faster on the NEC-SX6,
606	! if the first index of work_ffty_vec is odd. Otherwise
607	! memory bank conflicts may occur (especially if the index is a
608	! multiple of 128). That's why work_ffty_vec is dimensioned as
609	! 0:ny+1.
610	! Of course, this will not work if users are using an odd number
611	! of gridpoints along y.
612	!------------------------------------------------------------------------------!
613
614	USE control_parameters
615	USE cpulog
616	USE indices
617	USE interfaces
618	USE pegrid
619	USE transpose_indices
620
621	IMPLICIT NONE
622
623	INTEGER :: i, iend, iouter, ir, j, k
624	INTEGER, PARAMETER :: stridex = 4
625
626	REAL, DIMENSION(0:ny,stridex) :: work_ffty
627	#if defined( __nec )
628	REAL, DIMENSION(0:ny+1,1:nz,nxl:nxr) :: work_ffty_vec
629	#endif
630	REAL, DIMENSION(1:nza,0:nya,nxl:nxra) :: f_in
631	REAL, DIMENSION(nnx,1:nza,nys_x:nyn_xa,pdims(1)) :: f_out
632	REAL, DIMENSION(nxl:nxra,1:nza,0:nya) :: work
633
634	!
635	!-- Carry out the FFT along y, where all data are present due to the
636	!-- 1d-decomposition along x. Resort the data in a way that x becomes
637	!-- the first index.
638	CALL cpu_log( log_point_s(7), 'fft_y', 'start' )
639
640	IF ( host(1:3) == 'nec' ) THEN
641	#if defined( __nec )
642	!
643	!-- Code optimized for vector processors
644	!$OMP PARALLEL PRIVATE ( i, j, k )
645	!$OMP DO
646	DO i = nxl, nxr
647
648	DO j = 0, ny
649	DO k = 1, nz
650	work_ffty_vec(j,k,i) = f_in(k,j,i)
651	ENDDO
652	ENDDO
653
654	CALL fft_y_m( work_ffty_vec(:,:,i), ny+1, 'forward' )
655
656	ENDDO
657
658	!$OMP DO
659	DO k = 1, nz
660	DO j = 0, ny
661	DO i = nxl, nxr
662	work(i,k,j) = work_ffty_vec(j,k,i)
663	ENDDO
664	ENDDO
665	ENDDO
666	!$OMP END PARALLEL
667	#endif
668
669	ELSE
670
671	!
672	!-- Cache optimized code.
673	!-- The i-(x-)direction is split into a strided outer loop and an inner
674	!-- loop for better cache performance
675	!$OMP PARALLEL PRIVATE (i,iend,iouter,ir,j,k,work_ffty)
676	!$OMP DO
677	DO iouter = nxl, nxr, stridex
678
679	iend = MIN( iouter+stridex-1, nxr ) ! Upper bound for inner i loop
680
681	DO k = 1, nz
682
683	DO i = iouter, iend
684
685	ir = i-iouter+1 ! counter within a stride
686	DO j = 0, ny
687	work_ffty(j,ir) = f_in(k,j,i)
688	ENDDO
689	!
690	!-- FFT along y
691	CALL fft_y( work_ffty(:,ir), 'forward' )
692
693	ENDDO
694
695	!
696	!-- Resort
697	DO j = 0, ny
698	DO i = iouter, iend
699	work(i,k,j) = work_ffty(j,i-iouter+1)
700	ENDDO
701	ENDDO
702
703	ENDDO
704
705	ENDDO
706	!$OMP END PARALLEL
707
708	ENDIF
709	CALL cpu_log( log_point_s(7), 'fft_y', 'pause' )
710
711	!
712	!-- Transpose array
713	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
714	CALL MPI_ALLTOALL( work(nxl,1,0), sendrecvcount_xy, MPI_REAL, &
715	f_out(1,1,nys_x,1), sendrecvcount_xy, MPI_REAL, &
716	comm1dx, ierr )
717	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
718
719	END SUBROUTINE ffty_tr_yx
720
721
722	SUBROUTINE tr_xy_ffty( f_in, work, f_out )
723
724	!------------------------------------------------------------------------------!
725	! Transposition x --> y with a subsequent backward Fourier transformation for
726	! a 1d-decomposition along x
727	!------------------------------------------------------------------------------!
728
729	USE control_parameters
730	USE cpulog
731	USE indices
732	USE interfaces
733	USE pegrid
734	USE transpose_indices
735
736	IMPLICIT NONE
737
738	INTEGER :: i, iend, iouter, ir, j, k
739	INTEGER, PARAMETER :: stridex = 4
740
741	REAL, DIMENSION(0:ny,stridex) :: work_ffty
742	#if defined( __nec )
743	REAL, DIMENSION(0:ny+1,1:nz,nxl:nxr) :: work_ffty_vec
744	#endif
745	REAL, DIMENSION(nnx,1:nza,nys_x:nyn_xa,pdims(1)) :: f_in
746	REAL, DIMENSION(1:nza,0:nya,nxl:nxra) :: f_out
747	REAL, DIMENSION(nxl:nxra,1:nza,0:nya) :: work
748
749	!
750	!-- Transpose array
751	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
752	CALL MPI_ALLTOALL( f_in(1,1,nys_x,1), sendrecvcount_xy, MPI_REAL, &
753	work(nxl,1,0), sendrecvcount_xy, MPI_REAL, &
754	comm1dx, ierr )
755	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
756
757	!
758	!-- Resort the data in a way that y becomes the first index and carry out the
759	!-- backward fft along y.
760	CALL cpu_log( log_point_s(7), 'fft_y', 'continue' )
761
762	IF ( host(1:3) == 'nec' ) THEN
763	#if defined( __nec )
764	!
765	!-- Code optimized for vector processors
766	!$OMP PARALLEL PRIVATE ( i, j, k )
767	!$OMP DO
768	DO k = 1, nz
769	DO j = 0, ny
770	DO i = nxl, nxr
771	work_ffty_vec(j,k,i) = work(i,k,j)
772	ENDDO
773	ENDDO
774	ENDDO
775
776	!$OMP DO
777	DO i = nxl, nxr
778
779	CALL fft_y_m( work_ffty_vec(:,:,i), ny+1, 'backward' )
780
781	DO j = 0, ny
782	DO k = 1, nz
783	f_out(k,j,i) = work_ffty_vec(j,k,i)
784	ENDDO
785	ENDDO
786
787	ENDDO
788	!$OMP END PARALLEL
789	#endif
790
791	ELSE
792
793	!
794	!-- Cache optimized code.
795	!-- The i-(x-)direction is split into a strided outer loop and an inner
796	!-- loop for better cache performance
797	!$OMP PARALLEL PRIVATE ( i, iend, iouter, ir, j, k, work_ffty )
798	!$OMP DO
799	DO iouter = nxl, nxr, stridex
800
801	iend = MIN( iouter+stridex-1, nxr ) ! Upper bound for inner i loop
802
803	DO k = 1, nz
804	!
805	!-- Resort
806	DO j = 0, ny
807	DO i = iouter, iend
808	work_ffty(j,i-iouter+1) = work(i,k,j)
809	ENDDO
810	ENDDO
811
812	DO i = iouter, iend
813
814	!
815	!-- FFT along y
816	ir = i-iouter+1 ! counter within a stride
817	CALL fft_y( work_ffty(:,ir), 'backward' )
818
819	DO j = 0, ny
820	f_out(k,j,i) = work_ffty(j,ir)
821	ENDDO
822	ENDDO
823
824	ENDDO
825
826	ENDDO
827	!$OMP END PARALLEL
828
829	ENDIF
830
831	CALL cpu_log( log_point_s(7), 'fft_y', 'stop' )
832
833	END SUBROUTINE tr_xy_ffty
834
835
836	SUBROUTINE fftx_tri_fftx( ar )
837
838	!------------------------------------------------------------------------------!
839	! FFT along x, solution of the tridiagonal system and backward FFT for
840	! a 1d-decomposition along x
841	!
842	! WARNING: this subroutine may still not work for hybrid parallelization
843	! with OpenMP (for possible necessary changes see the original
844	! routine poisfft_hybrid, developed by Klaus Ketelsen, May 2002)
845	!------------------------------------------------------------------------------!
846
847	USE control_parameters
848	USE cpulog
849	USE grid_variables
850	USE indices
851	USE interfaces
852	USE pegrid
853	USE transpose_indices
854
855	IMPLICIT NONE
856
857	character(len=3) :: myth_char
858
859	INTEGER :: i, j, k, m, n, omp_get_thread_num, tn
860
861	REAL, DIMENSION(0:nx) :: work_fftx
862	REAL, DIMENSION(0:nx,1:nz) :: work_trix
863	REAL, DIMENSION(nnx,1:nza,nys_x:nyn_xa,pdims(1)) :: ar
864	REAL, DIMENSION(:,:,:,:), ALLOCATABLE :: tri
865
866
867	CALL cpu_log( log_point_s(33), 'fft_x + tridia', 'start' )
868
869	ALLOCATE( tri(5,0:nx,0:nz-1,0:threads_per_task-1) )
870
871	tn = 0 ! Default thread number in case of one thread
872	!$OMP PARALLEL DO PRIVATE ( i, j, k, m, n, tn, work_fftx, work_trix )
873	DO j = nys_x, nyn_x
874
875	!$ tn = omp_get_thread_num()
876
877	IF ( host(1:3) == 'nec' ) THEN
878	!
879	!-- Code optimized for vector processors
880	DO k = 1, nz
881
882	m = 0
883	DO n = 1, pdims(1)
884	DO i = 1, nnx_pe( n-1 ) ! WARN: pcoord(i) should be used!!
885	work_trix(m,k) = ar(i,k,j,n)
886	m = m + 1
887	ENDDO
888	ENDDO
889
890	ENDDO
891
892	CALL fft_x_m( work_trix, 'forward' )
893
894	ELSE
895	!
896	!-- Cache optimized code
897	DO k = 1, nz
898
899	m = 0
900	DO n = 1, pdims(1)
901	DO i = 1, nnx_pe( n-1 ) ! WARN: pcoord(i) should be used!!
902	work_fftx(m) = ar(i,k,j,n)
903	m = m + 1
904	ENDDO
905	ENDDO
906
907	CALL fft_x( work_fftx, 'forward' )
908
909	DO i = 0, nx
910	work_trix(i,k) = work_fftx(i)
911	ENDDO
912
913	ENDDO
914
915	ENDIF
916
917	!
918	!-- Solve the linear equation system
919	CALL tridia_1dd( ddx2, ddy2, nx, ny, j, work_trix, tri(:,:,:,tn) )
920
921	IF ( host(1:3) == 'nec' ) THEN
922	!
923	!-- Code optimized for vector processors
924	CALL fft_x_m( work_trix, 'backward' )
925
926	DO k = 1, nz
927
928	m = 0
929	DO n = 1, pdims(1)
930	DO i = 1, nnx_pe( n-1 ) ! WARN: pcoord(i) should be used!!
931	ar(i,k,j,n) = work_trix(m,k)
932	m = m + 1
933	ENDDO
934	ENDDO
935
936	ENDDO
937
938	ELSE
939	!
940	!-- Cache optimized code
941	DO k = 1, nz
942
943	DO i = 0, nx
944	work_fftx(i) = work_trix(i,k)
945	ENDDO
946
947	CALL fft_x( work_fftx, 'backward' )
948
949	m = 0
950	DO n = 1, pdims(1)
951	DO i = 1, nnx_pe( n-1 ) ! WARN: pcoord(i) should be used!!
952	ar(i,k,j,n) = work_fftx(m)
953	m = m + 1
954	ENDDO
955	ENDDO
956
957	ENDDO
958
959	ENDIF
960
961	ENDDO
962
963	DEALLOCATE( tri )
964
965	CALL cpu_log( log_point_s(33), 'fft_x + tridia', 'stop' )
966
967	END SUBROUTINE fftx_tri_fftx
968
969
970	SUBROUTINE fftx_tr_xy( f_in, work, f_out )
971
972	!------------------------------------------------------------------------------!
973	! Fourier-transformation along x with subsequent transposition x --> y for
974	! a 1d-decomposition along y
975	!
976	! ATTENTION: The NEC-branch of this routine may significantly profit from
977	! further optimizations. So far, performance is much worse than
978	! for routine ffty_tr_yx (more than three times slower).
979	!------------------------------------------------------------------------------!
980
981	USE control_parameters
982	USE cpulog
983	USE indices
984	USE interfaces
985	USE pegrid
986	USE transpose_indices
987
988	IMPLICIT NONE
989
990	INTEGER :: i, j, k
991
992	REAL, DIMENSION(0:nx,1:nz,nys:nyn) :: work_fftx
993	REAL, DIMENSION(1:nza,nys:nyna,0:nxa) :: f_in
994	REAL, DIMENSION(nny,1:nza,nxl_y:nxr_ya,pdims(2)) :: f_out
995	REAL, DIMENSION(nys:nyna,1:nza,0:nxa) :: work
996
997	!
998	!-- Carry out the FFT along x, where all data are present due to the
999	!-- 1d-decomposition along y. Resort the data in a way that y becomes
1000	!-- the first index.
1001	CALL cpu_log( log_point_s(4), 'fft_x', 'start' )
1002
1003	IF ( host(1:3) == 'nec' ) THEN
1004	!
1005	!-- Code for vector processors
1006	!$OMP PARALLEL PRIVATE ( i, j, k )
1007	!$OMP DO
1008	DO i = 0, nx
1009
1010	DO j = nys, nyn
1011	DO k = 1, nz
1012	work_fftx(i,k,j) = f_in(k,j,i)
1013	ENDDO
1014	ENDDO
1015
1016	ENDDO
1017
1018	!$OMP DO
1019	DO j = nys, nyn
1020
1021	CALL fft_x_m( work_fftx(:,:,j), 'forward' )
1022
1023	DO k = 1, nz
1024	DO i = 0, nx
1025	work(j,k,i) = work_fftx(i,k,j)
1026	ENDDO
1027	ENDDO
1028
1029	ENDDO
1030	!$OMP END PARALLEL
1031
1032	ELSE
1033
1034	!
1035	!-- Cache optimized code (there might be still a potential for better
1036	!-- optimization).
1037	!$OMP PARALLEL PRIVATE (i,j,k,work_fftx)
1038	!$OMP DO
1039	DO i = 0, nx
1040
1041	DO j = nys, nyn
1042	DO k = 1, nz
1043	work_fftx(i,k,j) = f_in(k,j,i)
1044	ENDDO
1045	ENDDO
1046
1047	ENDDO
1048
1049	!$OMP DO
1050	DO j = nys, nyn
1051	DO k = 1, nz
1052
1053	CALL fft_x( work_fftx(0:nx,k,j), 'forward' )
1054
1055	DO i = 0, nx
1056	work(j,k,i) = work_fftx(i,k,j)
1057	ENDDO
1058	ENDDO
1059
1060	ENDDO
1061	!$OMP END PARALLEL
1062
1063	ENDIF
1064	CALL cpu_log( log_point_s(4), 'fft_x', 'pause' )
1065
1066	!
1067	!-- Transpose array
1068	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
1069	CALL MPI_ALLTOALL( work(nys,1,0), sendrecvcount_xy, MPI_REAL, &
1070	f_out(1,1,nxl_y,1), sendrecvcount_xy, MPI_REAL, &
1071	comm1dy, ierr )
1072	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
1073
1074	END SUBROUTINE fftx_tr_xy
1075
1076
1077	SUBROUTINE tr_yx_fftx( f_in, work, f_out )
1078
1079	!------------------------------------------------------------------------------!
1080	! Transposition y --> x with a subsequent backward Fourier transformation for
1081	! a 1d-decomposition along x
1082	!------------------------------------------------------------------------------!
1083
1084	USE control_parameters
1085	USE cpulog
1086	USE indices
1087	USE interfaces
1088	USE pegrid
1089	USE transpose_indices
1090
1091	IMPLICIT NONE
1092
1093	INTEGER :: i, j, k
1094
1095	REAL, DIMENSION(0:nx,1:nz,nys:nyn) :: work_fftx
1096	REAL, DIMENSION(nny,1:nza,nxl_y:nxr_ya,pdims(2)) :: f_in
1097	REAL, DIMENSION(1:nza,nys:nyna,0:nxa) :: f_out
1098	REAL, DIMENSION(nys:nyna,1:nza,0:nxa) :: work
1099
1100	!
1101	!-- Transpose array
1102	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
1103	CALL MPI_ALLTOALL( f_in(1,1,nxl_y,1), sendrecvcount_xy, MPI_REAL, &
1104	work(nys,1,0), sendrecvcount_xy, MPI_REAL, &
1105	comm1dy, ierr )
1106	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
1107
1108	!
1109	!-- Carry out the FFT along x, where all data are present due to the
1110	!-- 1d-decomposition along y. Resort the data in a way that y becomes
1111	!-- the first index.
1112	CALL cpu_log( log_point_s(4), 'fft_x', 'continue' )
1113
1114	IF ( host(1:3) == 'nec' ) THEN
1115	!
1116	!-- Code optimized for vector processors
1117	!$OMP PARALLEL PRIVATE ( i, j, k )
1118	!$OMP DO
1119	DO j = nys, nyn
1120
1121	DO k = 1, nz
1122	DO i = 0, nx
1123	work_fftx(i,k,j) = work(j,k,i)
1124	ENDDO
1125	ENDDO
1126
1127	CALL fft_x_m( work_fftx(:,:,j), 'backward' )
1128
1129	ENDDO
1130
1131	!$OMP DO
1132	DO i = 0, nx
1133	DO j = nys, nyn
1134	DO k = 1, nz
1135	f_out(k,j,i) = work_fftx(i,k,j)
1136	ENDDO
1137	ENDDO
1138	ENDDO
1139	!$OMP END PARALLEL
1140
1141	ELSE
1142
1143	!
1144	!-- Cache optimized code (there might be still a potential for better
1145	!-- optimization).
1146	!$OMP PARALLEL PRIVATE (i,j,k,work_fftx)
1147	!$OMP DO
1148	DO j = nys, nyn
1149	DO k = 1, nz
1150
1151	DO i = 0, nx
1152	work_fftx(i,k,j) = work(j,k,i)
1153	ENDDO
1154
1155	CALL fft_x( work_fftx(0:nx,k,j), 'backward' )
1156
1157	ENDDO
1158	ENDDO
1159
1160	!$OMP DO
1161	DO i = 0, nx
1162	DO j = nys, nyn
1163	DO k = 1, nz
1164	f_out(k,j,i) = work_fftx(i,k,j)
1165	ENDDO
1166	ENDDO
1167	ENDDO
1168	!$OMP END PARALLEL
1169
1170	ENDIF
1171	CALL cpu_log( log_point_s(4), 'fft_x', 'stop' )
1172
1173	END SUBROUTINE tr_yx_fftx
1174
1175
1176	SUBROUTINE ffty_tri_ffty( ar )
1177
1178	!------------------------------------------------------------------------------!
1179	! FFT along y, solution of the tridiagonal system and backward FFT for
1180	! a 1d-decomposition along y
1181	!
1182	! WARNING: this subroutine may still not work for hybrid parallelization
1183	! with OpenMP (for possible necessary changes see the original
1184	! routine poisfft_hybrid, developed by Klaus Ketelsen, May 2002)
1185	!------------------------------------------------------------------------------!
1186
1187	USE control_parameters
1188	USE cpulog
1189	USE grid_variables
1190	USE indices
1191	USE interfaces
1192	USE pegrid
1193	USE transpose_indices
1194
1195	IMPLICIT NONE
1196
1197	INTEGER :: i, j, k, m, n, omp_get_thread_num, tn
1198
1199	REAL, DIMENSION(0:ny) :: work_ffty
1200	REAL, DIMENSION(0:ny,1:nz) :: work_triy
1201	REAL, DIMENSION(nny,1:nza,nxl_y:nxr_ya,pdims(2)) :: ar
1202	REAL, DIMENSION(:,:,:,:), ALLOCATABLE :: tri
1203
1204
1205	CALL cpu_log( log_point_s(39), 'fft_y + tridia', 'start' )
1206
1207	ALLOCATE( tri(5,0:ny,0:nz-1,0:threads_per_task-1) )
1208
1209	tn = 0 ! Default thread number in case of one thread
1210	!$OMP PARALLEL PRIVATE ( i, j, k, m, n, tn, work_ffty, work_triy )
1211	!$OMP DO
1212	DO i = nxl_y, nxr_y
1213
1214	!$ tn = omp_get_thread_num()
1215
1216	IF ( host(1:3) == 'nec' ) THEN
1217	!
1218	!-- Code optimized for vector processors
1219	DO k = 1, nz
1220
1221	m = 0
1222	DO n = 1, pdims(2)
1223	DO j = 1, nny_pe( n-1 ) ! WARN: pcoord(j) should be used!!
1224	work_triy(m,k) = ar(j,k,i,n)
1225	m = m + 1
1226	ENDDO
1227	ENDDO
1228
1229	ENDDO
1230
1231	CALL fft_y_m( work_triy, ny, 'forward' )
1232
1233	ELSE
1234	!
1235	!-- Cache optimized code
1236	DO k = 1, nz
1237
1238	m = 0
1239	DO n = 1, pdims(2)
1240	DO j = 1, nny_pe( n-1 ) ! WARN: pcoord(j) should be used!!
1241	work_ffty(m) = ar(j,k,i,n)
1242	m = m + 1
1243	ENDDO
1244	ENDDO
1245
1246	CALL fft_y( work_ffty, 'forward' )
1247
1248	DO j = 0, ny
1249	work_triy(j,k) = work_ffty(j)
1250	ENDDO
1251
1252	ENDDO
1253
1254	ENDIF
1255
1256	!
1257	!-- Solve the linear equation system
1258	CALL tridia_1dd( ddy2, ddx2, ny, nx, i, work_triy, tri(:,:,:,tn) )
1259
1260	IF ( host(1:3) == 'nec' ) THEN
1261	!
1262	!-- Code optimized for vector processors
1263	CALL fft_y_m( work_triy, ny, 'backward' )
1264
1265	DO k = 1, nz
1266
1267	m = 0
1268	DO n = 1, pdims(2)
1269	DO j = 1, nny_pe( n-1 ) ! WARN: pcoord(j) should be used!!
1270	ar(j,k,i,n) = work_triy(m,k)
1271	m = m + 1
1272	ENDDO
1273	ENDDO
1274
1275	ENDDO
1276
1277	ELSE
1278	!
1279	!-- Cache optimized code
1280	DO k = 1, nz
1281
1282	DO j = 0, ny
1283	work_ffty(j) = work_triy(j,k)
1284	ENDDO
1285
1286	CALL fft_y( work_ffty, 'backward' )
1287
1288	m = 0
1289	DO n = 1, pdims(2)
1290	DO j = 1, nny_pe( n-1 ) ! WARN: pcoord(j) should be used!!
1291	ar(j,k,i,n) = work_ffty(m)
1292	m = m + 1
1293	ENDDO
1294	ENDDO
1295
1296	ENDDO
1297
1298	ENDIF
1299
1300	ENDDO
1301	!$OMP END PARALLEL
1302
1303	DEALLOCATE( tri )
1304
1305	CALL cpu_log( log_point_s(39), 'fft_y + tridia', 'stop' )
1306
1307	END SUBROUTINE ffty_tri_ffty
1308
1309
1310	SUBROUTINE tridia_1dd( ddx2, ddy2, nx, ny, j, ar, tri )
1311
1312	!------------------------------------------------------------------------------!
1313	! Solves the linear system of equations for a 1d-decomposition along x (see
1314	! tridia)
1315	!
1316	! Attention: when using the intel compiler, array tri must be passed as an
1317	! argument to the contained subroutines. Otherwise addres faults
1318	! will occur.
1319	! On NEC, tri should not be passed (except for routine substi_1dd)
1320	! because this causes very bad performance.
1321	!------------------------------------------------------------------------------!
1322
1323	USE arrays_3d
1324	USE control_parameters
1325
1326	USE pegrid
1327
1328	IMPLICIT NONE
1329
1330	INTEGER :: i, j, k, nnyh, nx, ny, omp_get_thread_num, tn
1331
1332	REAL :: ddx2, ddy2
1333
1334	REAL, DIMENSION(0:nx,1:nz) :: ar
1335	REAL, DIMENSION(0:nx,0:nz-1) :: ar1
1336	REAL, DIMENSION(5,0:nx,0:nz-1) :: tri
1337
1338
1339	nnyh = ( ny + 1 ) / 2
1340
1341	!
1342	!-- Define constant elements of the tridiagonal matrix.
1343	!-- The compiler on SX6 does loop exchange. If 0:nx is a high power of 2,
1344	!-- the exchanged loops create bank conflicts. The following directive
1345	!-- prohibits loop exchange and the loops perform much better.
1346	! tn = omp_get_thread_num()
1347	! WRITE( 120+tn, * ) '+++ id=',myid,' nx=',nx,' thread=', omp_get_thread_num()
1348	! CALL local_flush( 120+tn )
1349	!CDIR NOLOOPCHG
1350	DO k = 0, nz-1
1351	DO i = 0,nx
1352	tri(2,i,k) = ddzu(k+1) * ddzw(k+1)
1353	tri(3,i,k) = ddzu(k+2) * ddzw(k+1)
1354	ENDDO
1355	ENDDO
1356	! WRITE( 120+tn, * ) '+++ id=',myid,' end of first tridia loop thread=', omp_get_thread_num()
1357	! CALL local_flush( 120+tn )
1358
1359	IF ( j <= nnyh ) THEN
1360	#if defined( __lcmuk )
1361	CALL maketri_1dd( j, tri )
1362	#else
1363	CALL maketri_1dd( j )
1364	#endif
1365	ELSE
1366	#if defined( __lcmuk )
1367	CALL maketri_1dd( ny+1-j, tri )
1368	#else
1369	CALL maketri_1dd( ny+1-j )
1370	#endif
1371	ENDIF
1372	#if defined( __lcmuk )
1373	CALL split_1dd( tri )
1374	#else
1375	CALL split_1dd
1376	#endif
1377	CALL substi_1dd( ar, tri )
1378
1379	CONTAINS
1380
1381	#if defined( __lcmuk )
1382	SUBROUTINE maketri_1dd( j, tri )
1383	#else
1384	SUBROUTINE maketri_1dd( j )
1385	#endif
1386
1387	!------------------------------------------------------------------------------!
1388	! computes the i- and j-dependent component of the matrix
1389	!------------------------------------------------------------------------------!
1390
1391	USE constants
1392
1393	IMPLICIT NONE
1394
1395	INTEGER :: i, j, k, nnxh
1396	REAL :: a, c
1397
1398	REAL, DIMENSION(0:nx) :: l
1399
1400	#if defined( __lcmuk )
1401	REAL, DIMENSION(5,0:nx,0:nz-1) :: tri
1402	#endif
1403
1404
1405	nnxh = ( nx + 1 ) / 2
1406	!
1407	!-- Provide the tridiagonal matrix for solution of the Poisson equation in
1408	!-- Fourier space. The coefficients are computed following the method of
1409	!-- Schmidt et al. (DFVLR-Mitteilung 84-15), which departs from Stephan
1410	!-- Siano's original version by discretizing the Poisson equation,
1411	!-- before it is Fourier-transformed
1412	DO i = 0, nx
1413	IF ( i >= 0 .AND. i <= nnxh ) THEN
1414	l(i) = 2.0 * ( 1.0 - COS( ( 2.0 * pi * i ) / &
1415	FLOAT( nx+1 ) ) ) * ddx2 + &
1416	2.0 * ( 1.0 - COS( ( 2.0 * pi * j ) / &
1417	FLOAT( ny+1 ) ) ) * ddy2
1418	ELSE
1419	l(i) = 2.0 * ( 1.0 - COS( ( 2.0 * pi * ( nx+1-i ) ) / &
1420	FLOAT( nx+1 ) ) ) * ddx2 + &
1421	2.0 * ( 1.0 - COS( ( 2.0 * pi * j ) / &
1422	FLOAT( ny+1 ) ) ) * ddy2
1423	ENDIF
1424	ENDDO
1425
1426	DO k = 0, nz-1
1427	DO i = 0, nx
1428	a = -1.0 * ddzu(k+2) * ddzw(k+1)
1429	c = -1.0 * ddzu(k+1) * ddzw(k+1)
1430	tri(1,i,k) = a + c - l(i)
1431	ENDDO
1432	ENDDO
1433	IF ( ibc_p_b == 1 .OR. ibc_p_b == 2 ) THEN
1434	DO i = 0, nx
1435	tri(1,i,0) = tri(1,i,0) + tri(2,i,0)
1436	ENDDO
1437	ENDIF
1438	IF ( ibc_p_t == 1 ) THEN
1439	DO i = 0, nx
1440	tri(1,i,nz-1) = tri(1,i,nz-1) + tri(3,i,nz-1)
1441	ENDDO
1442	ENDIF
1443
1444	END SUBROUTINE maketri_1dd
1445
1446
1447	#if defined( __lcmuk )
1448	SUBROUTINE split_1dd( tri )
1449	#else
1450	SUBROUTINE split_1dd
1451	#endif
1452
1453	!------------------------------------------------------------------------------!
1454	! Splitting of the tridiagonal matrix (Thomas algorithm)
1455	!------------------------------------------------------------------------------!
1456
1457	IMPLICIT NONE
1458
1459	INTEGER :: i, k
1460
1461	#if defined( __lcmuk )
1462	REAL, DIMENSION(5,0:nx,0:nz-1) :: tri
1463	#endif
1464
1465
1466	!
1467	!-- Splitting
1468	DO i = 0, nx
1469	tri(4,i,0) = tri(1,i,0)
1470	ENDDO
1471	DO k = 1, nz-1
1472	DO i = 0, nx
1473	tri(5,i,k) = tri(2,i,k) / tri(4,i,k-1)
1474	tri(4,i,k) = tri(1,i,k) - tri(3,i,k-1) * tri(5,i,k)
1475	ENDDO
1476	ENDDO
1477
1478	END SUBROUTINE split_1dd
1479
1480
1481	SUBROUTINE substi_1dd( ar, tri )
1482
1483	!------------------------------------------------------------------------------!
1484	! Substitution (Forward and Backward) (Thomas algorithm)
1485	!------------------------------------------------------------------------------!
1486
1487	IMPLICIT NONE
1488
1489	INTEGER :: i, k
1490
1491	REAL, DIMENSION(0:nx,nz) :: ar
1492	REAL, DIMENSION(0:nx,0:nz-1) :: ar1
1493	REAL, DIMENSION(5,0:nx,0:nz-1) :: tri
1494
1495	!
1496	!-- Forward substitution
1497	DO i = 0, nx
1498	ar1(i,0) = ar(i,1)
1499	ENDDO
1500	DO k = 1, nz-1
1501	DO i = 0, nx
1502	ar1(i,k) = ar(i,k+1) - tri(5,i,k) * ar1(i,k-1)
1503	ENDDO
1504	ENDDO
1505
1506	!
1507	!-- Backward substitution
1508	DO i = 0, nx
1509	ar(i,nz) = ar1(i,nz-1) / tri(4,i,nz-1)
1510	ENDDO
1511	DO k = nz-2, 0, -1
1512	DO i = 0, nx
1513	ar(i,k+1) = ( ar1(i,k) - tri(3,i,k) * ar(i,k+2) ) &
1514	/ tri(4,i,k)
1515	ENDDO
1516	ENDDO
1517
1518	!
1519	!-- Indices i=0, j=0 correspond to horizontally averaged pressure.
1520	!-- The respective values of ar should be zero at all k-levels if
1521	!-- acceleration of horizontally averaged vertical velocity is zero.
1522	IF ( ibc_p_b == 1 .AND. ibc_p_t == 1 ) THEN
1523	IF ( j == 0 ) THEN
1524	DO k = 1, nz
1525	ar(0,k) = 0.0
1526	ENDDO
1527	ENDIF
1528	ENDIF
1529
1530	END SUBROUTINE substi_1dd
1531
1532	END SUBROUTINE tridia_1dd
1533
1534	#endif
1535
1536	END MODULE poisfft_mod

Note: See TracBrowser for help on using the repository browser.

Download in other formats:

| Impressum | ©Leibniz Universität Hannover |