Home

Context Navigation

source: palm/trunk/SOURCE/poisfft.f90 @ 812

Last change on this file since 812 was 810, checked in by maronga, 13 years ago
last commit documented
Property svn:keywords set to `Id`
File size: 46.5 KB

Line
1	MODULE poisfft_mod
2
3	!------------------------------------------------------------------------------!
4	! Current revisions:
5	! -----------------
6	!
7	!
8	! Former revisions:
9	! -----------------
10	! $Id: poisfft.f90 810 2012-01-30 13:40:12Z maronga $
11	!
12	! 809 2012-01-30 13:32:58Z maronga
13	! Bugfix: replaced .AND. and .NOT. with && and ! in the preprocessor directives
14	!
15	! 807 2012-01-25 11:53:51Z maronga
16	! New cpp directive "__check" implemented which is used by check_namelist_files
17	! (most of the code is unneeded by check_namelist_files).
18	!
19	! 763 2011-10-06 09:32:09Z suehring
20	! Comment added concerning the last change.
21	!
22	! 761 2011-10-05 17:58:52Z suehring
23	! Bugfix: Avoid divisions by zero in case of using a 'neumann' bc for the
24	! pressure at the top of the model domain.
25	!
26	! 696 2011-03-18 07:03:49Z raasch
27	! work_fftx removed from PRIVATE clauses in fftx_tr_xy and tr_yx_fftx
28	!
29	! 683 2011-02-09 14:25:15Z raasch
30	! openMP parallelization for 2d-domain-decomposition
31	!
32	! 667 2010-12-23 12:06:00Z suehring/gryschka
33	! ddzu replaced by ddzu_pres due to changes in zu(0)
34	!
35	! 622 2010-12-10 08:08:13Z raasch
36	! optional barriers included in order to speed up collective operations
37	!
38	! 377 2009-09-04 11:09:00Z raasch
39	! __lcmuk changed to __lc to avoid problems with Intel compiler on sgi-ice
40	!
41	! 164 2008-05-15 08:46:15Z raasch
42	! Arguments removed from transpose routines
43	!
44	! 128 2007-10-26 13:11:14Z raasch
45	! Bugfix: wavenumber calculation for even nx in routines maketri
46	!
47	! 85 2007-05-11 09:35:14Z raasch
48	! Bugfix: work_fft*_vec removed from some PRIVATE-declarations
49	!
50	! 76 2007-03-29 00:58:32Z raasch
51	! Tridiagonal coefficients adjusted for Neumann boundary conditions both at
52	! the bottom and the top.
53	!
54	! RCS Log replace by Id keyword, revision history cleaned up
55	!
56	! Revision 1.24 2006/08/04 15:00:24 raasch
57	! Default setting of the thread number tn in case of not using OpenMP
58	!
59	! Revision 1.23 2006/02/23 12:48:38 raasch
60	! Additional compiler directive in routine tridia_1dd for preventing loop
61	! exchange on NEC-SX6
62	!
63	! Revision 1.20 2004/04/30 12:38:09 raasch
64	! Parts of former poisfft_hybrid moved to this subroutine,
65	! former subroutine changed to a module, renaming of FFT-subroutines and
66	! -module, FFTs completely substituted by calls of fft_x and fft_y,
67	! NAG fft used in the non-parallel case completely removed, l in maketri
68	! is now a 1d-array, variables passed by modules instead of using parameter
69	! lists, enlarged transposition arrays introduced
70	!
71	! Revision 1.1 1997/07/24 11:24:14 raasch
72	! Initial revision
73	!
74	!
75	! Description:
76	! ------------
77	! See below.
78	!------------------------------------------------------------------------------!
79
80	!--------------------------------------------------------------------------!
81	! poisfft !
82	! !
83	! Original version: Stephan Siano (pois3d) !
84	! !
85	! Institute of Meteorology and Climatology, University of Hannover !
86	! Germany !
87	! !
88	! Version as of July 23,1996 !
89	! !
90	! !
91	! Version for parallel computers: Siegfried Raasch !
92	! !
93	! Version as of July 03,1997 !
94	! !
95	! Solves the Poisson equation with a 2D spectral method !
96	! d^2 p / dx^2 + d^2 p / dy^2 + d^2 p / dz^2 = s !
97	! !
98	! Input: !
99	! real ar contains in the (nnx,nny,nnz) elements, !
100	! starting from the element (1,nys,nxl), the !
101	! values for s !
102	! real work Temporary array !
103	! !
104	! Output: !
105	! real ar contains the solution for p !
106	!--------------------------------------------------------------------------!
107
108	USE fft_xy
109	USE indices
110	USE transpose_indices
111
112	IMPLICIT NONE
113
114	PRIVATE
115
116	#if ! defined ( __check )
117	PUBLIC poisfft, poisfft_init
118
119	INTERFACE poisfft
120	MODULE PROCEDURE poisfft
121	END INTERFACE poisfft
122
123	INTERFACE poisfft_init
124	MODULE PROCEDURE poisfft_init
125	END INTERFACE poisfft_init
126	#else
127	PUBLIC poisfft_init
128
129	INTERFACE poisfft_init
130	MODULE PROCEDURE poisfft_init
131	END INTERFACE poisfft_init
132	#endif
133
134	CONTAINS
135
136	SUBROUTINE poisfft_init
137
138	CALL fft_init
139
140	END SUBROUTINE poisfft_init
141
142	#if ! defined ( __check )
143	SUBROUTINE poisfft( ar, work )
144
145	USE cpulog
146	USE interfaces
147	USE pegrid
148
149	IMPLICIT NONE
150
151	REAL, DIMENSION(1:nza,nys:nyna,nxl:nxra) :: ar, work
152
153
154	CALL cpu_log( log_point_s(3), 'poisfft', 'start' )
155
156	!
157	!-- Two-dimensional Fourier Transformation in x- and y-direction.
158	#if defined( __parallel )
159	IF ( pdims(2) == 1 ) THEN
160
161	!
162	!-- 1d-domain-decomposition along x:
163	!-- FFT along y and transposition y --> x
164	CALL ffty_tr_yx( ar, work, ar )
165
166	!
167	!-- FFT along x, solving the tridiagonal system and backward FFT
168	CALL fftx_tri_fftx( ar )
169
170	!
171	!-- Transposition x --> y and backward FFT along y
172	CALL tr_xy_ffty( ar, work, ar )
173
174	ELSEIF ( pdims(1) == 1 ) THEN
175
176	!
177	!-- 1d-domain-decomposition along y:
178	!-- FFT along x and transposition x --> y
179	CALL fftx_tr_xy( ar, work, ar )
180
181	!
182	!-- FFT along y, solving the tridiagonal system and backward FFT
183	CALL ffty_tri_ffty( ar )
184
185	!
186	!-- Transposition y --> x and backward FFT along x
187	CALL tr_yx_fftx( ar, work, ar )
188
189	ELSE
190
191	!
192	!-- 2d-domain-decomposition
193	!-- Transposition z --> x
194	CALL cpu_log( log_point_s(5), 'transpo forward', 'start' )
195	CALL transpose_zx( ar, work, ar )
196	CALL cpu_log( log_point_s(5), 'transpo forward', 'pause' )
197
198	CALL cpu_log( log_point_s(4), 'fft_x', 'start' )
199	CALL fftxp( ar, 'forward' )
200	CALL cpu_log( log_point_s(4), 'fft_x', 'pause' )
201
202	!
203	!-- Transposition x --> y
204	CALL cpu_log( log_point_s(5), 'transpo forward', 'continue' )
205	CALL transpose_xy( ar, work, ar )
206	CALL cpu_log( log_point_s(5), 'transpo forward', 'pause' )
207
208	CALL cpu_log( log_point_s(7), 'fft_y', 'start' )
209	CALL fftyp( ar, 'forward' )
210	CALL cpu_log( log_point_s(7), 'fft_y', 'pause' )
211
212	!
213	!-- Transposition y --> z
214	CALL cpu_log( log_point_s(5), 'transpo forward', 'continue' )
215	CALL transpose_yz( ar, work, ar )
216	CALL cpu_log( log_point_s(5), 'transpo forward', 'stop' )
217
218	!
219	!-- Solve the Poisson equation in z-direction in cartesian space.
220	CALL cpu_log( log_point_s(6), 'tridia', 'start' )
221	CALL tridia( ar )
222	CALL cpu_log( log_point_s(6), 'tridia', 'stop' )
223
224	!
225	!-- Inverse Fourier Transformation
226	!-- Transposition z --> y
227	CALL cpu_log( log_point_s(8), 'transpo invers', 'start' )
228	CALL transpose_zy( ar, work, ar )
229	CALL cpu_log( log_point_s(8), 'transpo invers', 'pause' )
230
231	CALL cpu_log( log_point_s(7), 'fft_y', 'continue' )
232	CALL fftyp( ar, 'backward' )
233	CALL cpu_log( log_point_s(7), 'fft_y', 'stop' )
234
235	!
236	!-- Transposition y --> x
237	CALL cpu_log( log_point_s(8), 'transpo invers', 'continue' )
238	CALL transpose_yx( ar, work, ar )
239	CALL cpu_log( log_point_s(8), 'transpo invers', 'pause' )
240
241	CALL cpu_log( log_point_s(4), 'fft_x', 'continue' )
242	CALL fftxp( ar, 'backward' )
243	CALL cpu_log( log_point_s(4), 'fft_x', 'stop' )
244
245	!
246	!-- Transposition x --> z
247	CALL cpu_log( log_point_s(8), 'transpo invers', 'continue' )
248	CALL transpose_xz( ar, work, ar )
249	CALL cpu_log( log_point_s(8), 'transpo invers', 'stop' )
250
251	ENDIF
252
253	#else
254
255	!
256	!-- Two-dimensional Fourier Transformation along x- and y-direction.
257	CALL cpu_log( log_point_s(4), 'fft_x', 'start' )
258	CALL fftx( ar, 'forward' )
259	CALL cpu_log( log_point_s(4), 'fft_x', 'pause' )
260	CALL cpu_log( log_point_s(7), 'fft_y', 'start' )
261	CALL ffty( ar, 'forward' )
262	CALL cpu_log( log_point_s(7), 'fft_y', 'pause' )
263
264	!
265	!-- Solve the Poisson equation in z-direction in cartesian space.
266	CALL cpu_log( log_point_s(6), 'tridia', 'start' )
267	CALL tridia( ar )
268	CALL cpu_log( log_point_s(6), 'tridia', 'stop' )
269
270	!
271	!-- Inverse Fourier Transformation.
272	CALL cpu_log( log_point_s(7), 'fft_y', 'continue' )
273	CALL ffty( ar, 'backward' )
274	CALL cpu_log( log_point_s(7), 'fft_y', 'stop' )
275	CALL cpu_log( log_point_s(4), 'fft_x', 'continue' )
276	CALL fftx( ar, 'backward' )
277	CALL cpu_log( log_point_s(4), 'fft_x', 'stop' )
278
279	#endif
280
281	CALL cpu_log( log_point_s(3), 'poisfft', 'stop' )
282
283	END SUBROUTINE poisfft
284
285
286
287	SUBROUTINE tridia( ar )
288
289	!------------------------------------------------------------------------------!
290	! solves the linear system of equations:
291	!
292	! -(4 pi^2(i^2/(dx^2nnx^2)+j^2/(dy^2nny^2))+
293	! 1/(dzu(k)dzw(k))+1/(dzu(k-1)dzw(k)))*p(i,j,k)+
294	! 1/(dzu(k)dzw(k))p(i,j,k+1)+1/(dzu(k-1)dzw(k))p(i,j,k-1)=d(i,j,k)
295	!
296	! by using the Thomas algorithm
297	!------------------------------------------------------------------------------!
298
299	USE arrays_3d
300
301	IMPLICIT NONE
302
303	INTEGER :: i, j, k, nnyh
304
305	REAL, DIMENSION(nxl_z:nxr_z,0:nz-1) :: ar1
306	REAL, DIMENSION(5,nxl_z:nxr_z,0:nz-1) :: tri
307
308	#if defined( __parallel )
309	REAL :: ar(nxl_z:nxr_za,nys_z:nyn_za,1:nza)
310	#else
311	REAL :: ar(1:nz,nys_z:nyn_z,nxl_z:nxr_z)
312	#endif
313
314
315	nnyh = (ny+1) / 2
316
317	!
318	!-- Define constant elements of the tridiagonal matrix.
319	!$OMP PARALLEL PRIVATE ( k, i )
320	!$OMP DO
321	DO k = 0, nz-1
322	DO i = nxl_z, nxr_z
323	tri(2,i,k) = ddzu_pres(k+1) * ddzw(k+1)
324	tri(3,i,k) = ddzu_pres(k+2) * ddzw(k+1)
325	ENDDO
326	ENDDO
327	!$OMP END PARALLEL
328
329	#if defined( __parallel )
330	!
331	!-- Repeat for all y-levels.
332	!$OMP PARALLEL FIRSTPRIVATE( tri ) PRIVATE ( ar1, j )
333	!$OMP DO
334	DO j = nys_z, nyn_z
335	IF ( j <= nnyh ) THEN
336	CALL maketri( tri, j )
337	ELSE
338	CALL maketri( tri, ny+1-j )
339	ENDIF
340	CALL split( tri )
341	CALL substi( ar, ar1, tri, j )
342	ENDDO
343	!$OMP END PARALLEL
344	#else
345	!
346	!-- First y-level.
347	CALL maketri( tri, nys_z )
348	CALL split( tri )
349	CALL substi( ar, ar1, tri, 0 )
350
351	!
352	!-- Further y-levels.
353	DO j = 1, nnyh - 1
354	CALL maketri( tri, j )
355	CALL split( tri )
356	CALL substi( ar, ar1, tri, j )
357	CALL substi( ar, ar1, tri, ny+1-j )
358	ENDDO
359	CALL maketri( tri, nnyh )
360	CALL split( tri )
361	CALL substi( ar, ar1, tri, nnyh+nys )
362	#endif
363
364	CONTAINS
365
366	SUBROUTINE maketri( tri, j )
367
368	!------------------------------------------------------------------------------!
369	! Computes the i- and j-dependent component of the matrix
370	!------------------------------------------------------------------------------!
371
372	USE arrays_3d
373	USE constants
374	USE control_parameters
375	USE grid_variables
376
377	IMPLICIT NONE
378
379	INTEGER :: i, j, k, nnxh
380	REAL :: a, c
381	REAL :: ll(nxl_z:nxr_z)
382	REAL :: tri(5,nxl_z:nxr_z,0:nz-1)
383
384
385	nnxh = ( nx + 1 ) / 2
386
387	!
388	!-- Provide the tridiagonal matrix for solution of the Poisson equation in
389	!-- Fourier space. The coefficients are computed following the method of
390	!-- Schmidt et al. (DFVLR-Mitteilung 84-15), which departs from Stephan
391	!-- Siano's original version by discretizing the Poisson equation,
392	!-- before it is Fourier-transformed
393	#if defined( __parallel )
394	DO i = nxl_z, nxr_z
395	IF ( i >= 0 .AND. i <= nnxh ) THEN
396	ll(i) = 2.0 * ( 1.0 - COS( ( 2.0 * pi * i ) / &
397	FLOAT( nx+1 ) ) ) / ( dx * dx ) + &
398	2.0 * ( 1.0 - COS( ( 2.0 * pi * j ) / &
399	FLOAT( ny+1 ) ) ) / ( dy * dy )
400	ELSE
401	ll(i) = 2.0 * ( 1.0 - COS( ( 2.0 * pi * ( nx+1-i ) ) / &
402	FLOAT( nx+1 ) ) ) / ( dx * dx ) + &
403	2.0 * ( 1.0 - COS( ( 2.0 * pi * j ) / &
404	FLOAT( ny+1 ) ) ) / ( dy * dy )
405	ENDIF
406	DO k = 0,nz-1
407	a = -1.0 * ddzu_pres(k+2) * ddzw(k+1)
408	c = -1.0 * ddzu_pres(k+1) * ddzw(k+1)
409	tri(1,i,k) = a + c - ll(i)
410	ENDDO
411	ENDDO
412	#else
413	DO i = 0, nnxh
414	ll(i) = 2.0 * ( 1.0 - COS( ( 2.0 * pi * i ) / FLOAT( nx+1 ) ) ) / &
415	( dx * dx ) + &
416	2.0 * ( 1.0 - COS( ( 2.0 * pi * j ) / FLOAT( ny+1 ) ) ) / &
417	( dy * dy )
418	DO k = 0, nz-1
419	a = -1.0 * ddzu_pres(k+2) * ddzw(k+1)
420	c = -1.0 * ddzu_pres(k+1) * ddzw(k+1)
421	tri(1,i,k) = a + c - ll(i)
422	IF ( i >= 1 .and. i < nnxh ) THEN
423	tri(1,nx+1-i,k) = tri(1,i,k)
424	ENDIF
425	ENDDO
426	ENDDO
427	#endif
428	IF ( ibc_p_b == 1 .OR. ibc_p_b == 2 ) THEN
429	DO i = nxl_z, nxr_z
430	tri(1,i,0) = tri(1,i,0) + tri(2,i,0)
431	ENDDO
432	ENDIF
433	IF ( ibc_p_t == 1 ) THEN
434	DO i = nxl_z, nxr_z
435	tri(1,i,nz-1) = tri(1,i,nz-1) + tri(3,i,nz-1)
436	ENDDO
437	ENDIF
438
439	END SUBROUTINE maketri
440
441
442	SUBROUTINE substi( ar, ar1, tri, j )
443
444	!------------------------------------------------------------------------------!
445	! Substitution (Forward and Backward) (Thomas algorithm)
446	!------------------------------------------------------------------------------!
447
448	USE control_parameters
449
450	IMPLICIT NONE
451
452	INTEGER :: i, j, k
453	REAL :: ar1(nxl_z:nxr_z,0:nz-1)
454	REAL :: tri(5,nxl_z:nxr_z,0:nz-1)
455	#if defined( __parallel )
456	REAL :: ar(nxl_z:nxr_za,nys_z:nyn_za,1:nza)
457	#else
458	REAL :: ar(1:nz,nys_z:nyn_z,nxl_z:nxr_z)
459	#endif
460
461	!
462	!-- Forward substitution.
463	DO i = nxl_z, nxr_z
464	#if defined( __parallel )
465	ar1(i,0) = ar(i,j,1)
466	#else
467	ar1(i,0) = ar(1,j,i)
468	#endif
469	ENDDO
470	DO k = 1, nz - 1
471	DO i = nxl_z, nxr_z
472	#if defined( __parallel )
473	ar1(i,k) = ar(i,j,k+1) - tri(5,i,k) * ar1(i,k-1)
474	#else
475	ar1(i,k) = ar(k+1,j,i) - tri(5,i,k) * ar1(i,k-1)
476	#endif
477	ENDDO
478	ENDDO
479
480	!
481	!-- Backward substitution.
482	DO i = nxl_z, nxr_z
483	#if defined( __parallel )
484	ar(i,j,nz) = ar1(i,nz-1) / tri(4,i,nz-1)
485	#else
486	ar(nz,j,i) = ar1(i,nz-1) / tri(4,i,nz-1)
487	#endif
488	ENDDO
489	DO k = nz-2, 0, -1
490	DO i = nxl_z, nxr_z
491	#if defined( __parallel )
492	ar(i,j,k+1) = ( ar1(i,k) - tri(3,i,k) * ar(i,j,k+2) ) &
493	/ tri(4,i,k)
494	#else
495	ar(k+1,j,i) = ( ar1(i,k) - tri(3,i,k) * ar(k+2,j,i) ) &
496	/ tri(4,i,k)
497	#endif
498	ENDDO
499	ENDDO
500
501	!
502	!-- Indices i=0, j=0 correspond to horizontally averaged pressure.
503	!-- The respective values of ar should be zero at all k-levels if
504	!-- acceleration of horizontally averaged vertical velocity is zero.
505	IF ( ibc_p_b == 1 .AND. ibc_p_t == 1 ) THEN
506	IF ( j == 0 .AND. nxl_z == 0 ) THEN
507	#if defined( __parallel )
508	DO k = 1, nz
509	ar(nxl_z,j,k) = 0.0
510	ENDDO
511	#else
512	DO k = 1, nz
513	ar(k,j,nxl_z) = 0.0
514	ENDDO
515	#endif
516	ENDIF
517	ENDIF
518
519	END SUBROUTINE substi
520
521
522	SUBROUTINE split( tri )
523
524	!------------------------------------------------------------------------------!
525	! Splitting of the tridiagonal matrix (Thomas algorithm)
526	!------------------------------------------------------------------------------!
527
528	IMPLICIT NONE
529
530	INTEGER :: i, k
531	REAL :: tri(5,nxl_z:nxr_z,0:nz-1)
532
533	!
534	!-- Splitting.
535	DO i = nxl_z, nxr_z
536	tri(4,i,0) = tri(1,i,0)
537	ENDDO
538	DO k = 1, nz-1
539	DO i = nxl_z, nxr_z
540	tri(5,i,k) = tri(2,i,k) / tri(4,i,k-1)
541	tri(4,i,k) = tri(1,i,k) - tri(3,i,k-1) * tri(5,i,k)
542	ENDDO
543	ENDDO
544
545	END SUBROUTINE split
546
547	END SUBROUTINE tridia
548
549
550	#if defined( __parallel )
551	SUBROUTINE fftxp( ar, direction )
552
553	!------------------------------------------------------------------------------!
554	! Fourier-transformation along x-direction Parallelized version
555	!------------------------------------------------------------------------------!
556
557	IMPLICIT NONE
558
559	CHARACTER (LEN=*) :: direction
560	INTEGER :: j, k
561	REAL :: ar(0:nxa,nys_x:nyn_xa,nzb_x:nzt_xa)
562
563	!
564	!-- Performing the fft with one of the methods implemented
565	!$OMP PARALLEL PRIVATE ( j, k )
566	!$OMP DO
567	DO k = nzb_x, nzt_x
568	DO j = nys_x, nyn_x
569	CALL fft_x( ar(0:nx,j,k), direction )
570	ENDDO
571	ENDDO
572	!$OMP END PARALLEL
573
574	END SUBROUTINE fftxp
575
576	#else
577	SUBROUTINE fftx( ar, direction )
578
579	!------------------------------------------------------------------------------!
580	! Fourier-transformation along x-direction Non parallel version
581	!------------------------------------------------------------------------------!
582
583	IMPLICIT NONE
584
585	CHARACTER (LEN=*) :: direction
586	INTEGER :: i, j, k
587	REAL :: ar(1:nz,0:ny,0:nx)
588
589	!
590	!-- Performing the fft with one of the methods implemented
591	!$OMP PARALLEL PRIVATE ( j, k )
592	!$OMP DO
593	DO k = 1, nz
594	DO j = 0, ny
595	CALL fft_x( ar(k,j,0:nx), direction )
596	ENDDO
597	ENDDO
598	!$OMP END PARALLEL
599
600	END SUBROUTINE fftx
601	#endif
602
603
604	#if defined( __parallel )
605	SUBROUTINE fftyp( ar, direction )
606
607	!------------------------------------------------------------------------------!
608	! Fourier-transformation along y-direction Parallelized version
609	!------------------------------------------------------------------------------!
610
611	IMPLICIT NONE
612
613	CHARACTER (LEN=*) :: direction
614	INTEGER :: i, k
615	REAL :: ar(0:nya,nxl_y:nxr_ya,nzb_y:nzt_ya)
616
617	!
618	!-- Performing the fft with one of the methods implemented
619	!$OMP PARALLEL PRIVATE ( i, k )
620	!$OMP DO
621	DO k = nzb_y, nzt_y
622	DO i = nxl_y, nxr_y
623	CALL fft_y( ar(0:ny,i,k), direction )
624	ENDDO
625	ENDDO
626	!$OMP END PARALLEL
627
628	END SUBROUTINE fftyp
629
630	#else
631	SUBROUTINE ffty( ar, direction )
632
633	!------------------------------------------------------------------------------!
634	! Fourier-transformation along y-direction Non parallel version
635	!------------------------------------------------------------------------------!
636
637	IMPLICIT NONE
638
639	CHARACTER (LEN=*) :: direction
640	INTEGER :: i, k
641	REAL :: ar(1:nz,0:ny,0:nx)
642
643	!
644	!-- Performing the fft with one of the methods implemented
645	!$OMP PARALLEL PRIVATE ( i, k )
646	!$OMP DO
647	DO k = 1, nz
648	DO i = 0, nx
649	CALL fft_y( ar(k,0:ny,i), direction )
650	ENDDO
651	ENDDO
652	!$OMP END PARALLEL
653
654	END SUBROUTINE ffty
655	#endif
656
657	#if defined( __parallel )
658	SUBROUTINE ffty_tr_yx( f_in, work, f_out )
659
660	!------------------------------------------------------------------------------!
661	! Fourier-transformation along y with subsequent transposition y --> x for
662	! a 1d-decomposition along x
663	!
664	! ATTENTION: The performance of this routine is much faster on the NEC-SX6,
665	! if the first index of work_ffty_vec is odd. Otherwise
666	! memory bank conflicts may occur (especially if the index is a
667	! multiple of 128). That's why work_ffty_vec is dimensioned as
668	! 0:ny+1.
669	! Of course, this will not work if users are using an odd number
670	! of gridpoints along y.
671	!------------------------------------------------------------------------------!
672
673	USE control_parameters
674	USE cpulog
675	USE indices
676	USE interfaces
677	USE pegrid
678	USE transpose_indices
679
680	IMPLICIT NONE
681
682	INTEGER :: i, iend, iouter, ir, j, k
683	INTEGER, PARAMETER :: stridex = 4
684
685	REAL, DIMENSION(0:ny,stridex) :: work_ffty
686	#if defined( __nec )
687	REAL, DIMENSION(0:ny+1,1:nz,nxl:nxr) :: work_ffty_vec
688	#endif
689	REAL, DIMENSION(1:nza,0:nya,nxl:nxra) :: f_in
690	REAL, DIMENSION(nnx,1:nza,nys_x:nyn_xa,pdims(1)) :: f_out
691	REAL, DIMENSION(nxl:nxra,1:nza,0:nya) :: work
692
693	!
694	!-- Carry out the FFT along y, where all data are present due to the
695	!-- 1d-decomposition along x. Resort the data in a way that x becomes
696	!-- the first index.
697	CALL cpu_log( log_point_s(7), 'fft_y', 'start' )
698
699	IF ( host(1:3) == 'nec' ) THEN
700	#if defined( __nec )
701	!
702	!-- Code optimized for vector processors
703	!$OMP PARALLEL PRIVATE ( i, j, k )
704	!$OMP DO
705	DO i = nxl, nxr
706
707	DO j = 0, ny
708	DO k = 1, nz
709	work_ffty_vec(j,k,i) = f_in(k,j,i)
710	ENDDO
711	ENDDO
712
713	CALL fft_y_m( work_ffty_vec(:,:,i), ny+1, 'forward' )
714
715	ENDDO
716
717	!$OMP DO
718	DO k = 1, nz
719	DO j = 0, ny
720	DO i = nxl, nxr
721	work(i,k,j) = work_ffty_vec(j,k,i)
722	ENDDO
723	ENDDO
724	ENDDO
725	!$OMP END PARALLEL
726	#endif
727
728	ELSE
729
730	!
731	!-- Cache optimized code.
732	!-- The i-(x-)direction is split into a strided outer loop and an inner
733	!-- loop for better cache performance
734	!$OMP PARALLEL PRIVATE (i,iend,iouter,ir,j,k,work_ffty)
735	!$OMP DO
736	DO iouter = nxl, nxr, stridex
737
738	iend = MIN( iouter+stridex-1, nxr ) ! Upper bound for inner i loop
739
740	DO k = 1, nz
741
742	DO i = iouter, iend
743
744	ir = i-iouter+1 ! counter within a stride
745	DO j = 0, ny
746	work_ffty(j,ir) = f_in(k,j,i)
747	ENDDO
748	!
749	!-- FFT along y
750	CALL fft_y( work_ffty(:,ir), 'forward' )
751
752	ENDDO
753
754	!
755	!-- Resort
756	DO j = 0, ny
757	DO i = iouter, iend
758	work(i,k,j) = work_ffty(j,i-iouter+1)
759	ENDDO
760	ENDDO
761
762	ENDDO
763
764	ENDDO
765	!$OMP END PARALLEL
766
767	ENDIF
768	CALL cpu_log( log_point_s(7), 'fft_y', 'pause' )
769
770	!
771	!-- Transpose array
772	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
773	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
774	CALL MPI_ALLTOALL( work(nxl,1,0), sendrecvcount_xy, MPI_REAL, &
775	f_out(1,1,nys_x,1), sendrecvcount_xy, MPI_REAL, &
776	comm1dx, ierr )
777	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
778
779	END SUBROUTINE ffty_tr_yx
780
781
782	SUBROUTINE tr_xy_ffty( f_in, work, f_out )
783
784	!------------------------------------------------------------------------------!
785	! Transposition x --> y with a subsequent backward Fourier transformation for
786	! a 1d-decomposition along x
787	!------------------------------------------------------------------------------!
788
789	USE control_parameters
790	USE cpulog
791	USE indices
792	USE interfaces
793	USE pegrid
794	USE transpose_indices
795
796	IMPLICIT NONE
797
798	INTEGER :: i, iend, iouter, ir, j, k
799	INTEGER, PARAMETER :: stridex = 4
800
801	REAL, DIMENSION(0:ny,stridex) :: work_ffty
802	#if defined( __nec )
803	REAL, DIMENSION(0:ny+1,1:nz,nxl:nxr) :: work_ffty_vec
804	#endif
805	REAL, DIMENSION(nnx,1:nza,nys_x:nyn_xa,pdims(1)) :: f_in
806	REAL, DIMENSION(1:nza,0:nya,nxl:nxra) :: f_out
807	REAL, DIMENSION(nxl:nxra,1:nza,0:nya) :: work
808
809	!
810	!-- Transpose array
811	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
812	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
813	CALL MPI_ALLTOALL( f_in(1,1,nys_x,1), sendrecvcount_xy, MPI_REAL, &
814	work(nxl,1,0), sendrecvcount_xy, MPI_REAL, &
815	comm1dx, ierr )
816	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
817
818	!
819	!-- Resort the data in a way that y becomes the first index and carry out the
820	!-- backward fft along y.
821	CALL cpu_log( log_point_s(7), 'fft_y', 'continue' )
822
823	IF ( host(1:3) == 'nec' ) THEN
824	#if defined( __nec )
825	!
826	!-- Code optimized for vector processors
827	!$OMP PARALLEL PRIVATE ( i, j, k )
828	!$OMP DO
829	DO k = 1, nz
830	DO j = 0, ny
831	DO i = nxl, nxr
832	work_ffty_vec(j,k,i) = work(i,k,j)
833	ENDDO
834	ENDDO
835	ENDDO
836
837	!$OMP DO
838	DO i = nxl, nxr
839
840	CALL fft_y_m( work_ffty_vec(:,:,i), ny+1, 'backward' )
841
842	DO j = 0, ny
843	DO k = 1, nz
844	f_out(k,j,i) = work_ffty_vec(j,k,i)
845	ENDDO
846	ENDDO
847
848	ENDDO
849	!$OMP END PARALLEL
850	#endif
851
852	ELSE
853
854	!
855	!-- Cache optimized code.
856	!-- The i-(x-)direction is split into a strided outer loop and an inner
857	!-- loop for better cache performance
858	!$OMP PARALLEL PRIVATE ( i, iend, iouter, ir, j, k, work_ffty )
859	!$OMP DO
860	DO iouter = nxl, nxr, stridex
861
862	iend = MIN( iouter+stridex-1, nxr ) ! Upper bound for inner i loop
863
864	DO k = 1, nz
865	!
866	!-- Resort
867	DO j = 0, ny
868	DO i = iouter, iend
869	work_ffty(j,i-iouter+1) = work(i,k,j)
870	ENDDO
871	ENDDO
872
873	DO i = iouter, iend
874
875	!
876	!-- FFT along y
877	ir = i-iouter+1 ! counter within a stride
878	CALL fft_y( work_ffty(:,ir), 'backward' )
879
880	DO j = 0, ny
881	f_out(k,j,i) = work_ffty(j,ir)
882	ENDDO
883	ENDDO
884
885	ENDDO
886
887	ENDDO
888	!$OMP END PARALLEL
889
890	ENDIF
891
892	CALL cpu_log( log_point_s(7), 'fft_y', 'stop' )
893
894	END SUBROUTINE tr_xy_ffty
895
896
897	SUBROUTINE fftx_tri_fftx( ar )
898
899	!------------------------------------------------------------------------------!
900	! FFT along x, solution of the tridiagonal system and backward FFT for
901	! a 1d-decomposition along x
902	!
903	! WARNING: this subroutine may still not work for hybrid parallelization
904	! with OpenMP (for possible necessary changes see the original
905	! routine poisfft_hybrid, developed by Klaus Ketelsen, May 2002)
906	!------------------------------------------------------------------------------!
907
908	USE control_parameters
909	USE cpulog
910	USE grid_variables
911	USE indices
912	USE interfaces
913	USE pegrid
914	USE transpose_indices
915
916	IMPLICIT NONE
917
918	character(len=3) :: myth_char
919
920	INTEGER :: i, j, k, m, n, omp_get_thread_num, tn
921
922	REAL, DIMENSION(0:nx) :: work_fftx
923	REAL, DIMENSION(0:nx,1:nz) :: work_trix
924	REAL, DIMENSION(nnx,1:nza,nys_x:nyn_xa,pdims(1)) :: ar
925	REAL, DIMENSION(:,:,:,:), ALLOCATABLE :: tri
926
927
928	CALL cpu_log( log_point_s(33), 'fft_x + tridia', 'start' )
929
930	ALLOCATE( tri(5,0:nx,0:nz-1,0:threads_per_task-1) )
931
932	tn = 0 ! Default thread number in case of one thread
933	!$OMP PARALLEL DO PRIVATE ( i, j, k, m, n, tn, work_fftx, work_trix )
934	DO j = nys_x, nyn_x
935
936	!$ tn = omp_get_thread_num()
937
938	IF ( host(1:3) == 'nec' ) THEN
939	!
940	!-- Code optimized for vector processors
941	DO k = 1, nz
942
943	m = 0
944	DO n = 1, pdims(1)
945	DO i = 1, nnx_pe( n-1 ) ! WARN: pcoord(i) should be used!!
946	work_trix(m,k) = ar(i,k,j,n)
947	m = m + 1
948	ENDDO
949	ENDDO
950
951	ENDDO
952
953	CALL fft_x_m( work_trix, 'forward' )
954
955	ELSE
956	!
957	!-- Cache optimized code
958	DO k = 1, nz
959
960	m = 0
961	DO n = 1, pdims(1)
962	DO i = 1, nnx_pe( n-1 ) ! WARN: pcoord(i) should be used!!
963	work_fftx(m) = ar(i,k,j,n)
964	m = m + 1
965	ENDDO
966	ENDDO
967
968	CALL fft_x( work_fftx, 'forward' )
969
970	DO i = 0, nx
971	work_trix(i,k) = work_fftx(i)
972	ENDDO
973
974	ENDDO
975
976	ENDIF
977
978	!
979	!-- Solve the linear equation system
980	CALL tridia_1dd( ddx2, ddy2, nx, ny, j, work_trix, tri(:,:,:,tn) )
981
982	IF ( host(1:3) == 'nec' ) THEN
983	!
984	!-- Code optimized for vector processors
985	CALL fft_x_m( work_trix, 'backward' )
986
987	DO k = 1, nz
988
989	m = 0
990	DO n = 1, pdims(1)
991	DO i = 1, nnx_pe( n-1 ) ! WARN: pcoord(i) should be used!!
992	ar(i,k,j,n) = work_trix(m,k)
993	m = m + 1
994	ENDDO
995	ENDDO
996
997	ENDDO
998
999	ELSE
1000	!
1001	!-- Cache optimized code
1002	DO k = 1, nz
1003
1004	DO i = 0, nx
1005	work_fftx(i) = work_trix(i,k)
1006	ENDDO
1007
1008	CALL fft_x( work_fftx, 'backward' )
1009
1010	m = 0
1011	DO n = 1, pdims(1)
1012	DO i = 1, nnx_pe( n-1 ) ! WARN: pcoord(i) should be used!!
1013	ar(i,k,j,n) = work_fftx(m)
1014	m = m + 1
1015	ENDDO
1016	ENDDO
1017
1018	ENDDO
1019
1020	ENDIF
1021
1022	ENDDO
1023
1024	DEALLOCATE( tri )
1025
1026	CALL cpu_log( log_point_s(33), 'fft_x + tridia', 'stop' )
1027
1028	END SUBROUTINE fftx_tri_fftx
1029
1030
1031	SUBROUTINE fftx_tr_xy( f_in, work, f_out )
1032
1033	!------------------------------------------------------------------------------!
1034	! Fourier-transformation along x with subsequent transposition x --> y for
1035	! a 1d-decomposition along y
1036	!
1037	! ATTENTION: The NEC-branch of this routine may significantly profit from
1038	! further optimizations. So far, performance is much worse than
1039	! for routine ffty_tr_yx (more than three times slower).
1040	!------------------------------------------------------------------------------!
1041
1042	USE control_parameters
1043	USE cpulog
1044	USE indices
1045	USE interfaces
1046	USE pegrid
1047	USE transpose_indices
1048
1049	IMPLICIT NONE
1050
1051	INTEGER :: i, j, k
1052
1053	REAL, DIMENSION(0:nx,1:nz,nys:nyn) :: work_fftx
1054	REAL, DIMENSION(1:nza,nys:nyna,0:nxa) :: f_in
1055	REAL, DIMENSION(nny,1:nza,nxl_y:nxr_ya,pdims(2)) :: f_out
1056	REAL, DIMENSION(nys:nyna,1:nza,0:nxa) :: work
1057
1058	!
1059	!-- Carry out the FFT along x, where all data are present due to the
1060	!-- 1d-decomposition along y. Resort the data in a way that y becomes
1061	!-- the first index.
1062	CALL cpu_log( log_point_s(4), 'fft_x', 'start' )
1063
1064	IF ( host(1:3) == 'nec' ) THEN
1065	!
1066	!-- Code for vector processors
1067	!$OMP PARALLEL PRIVATE ( i, j, k )
1068	!$OMP DO
1069	DO i = 0, nx
1070
1071	DO j = nys, nyn
1072	DO k = 1, nz
1073	work_fftx(i,k,j) = f_in(k,j,i)
1074	ENDDO
1075	ENDDO
1076
1077	ENDDO
1078
1079	!$OMP DO
1080	DO j = nys, nyn
1081
1082	CALL fft_x_m( work_fftx(:,:,j), 'forward' )
1083
1084	DO k = 1, nz
1085	DO i = 0, nx
1086	work(j,k,i) = work_fftx(i,k,j)
1087	ENDDO
1088	ENDDO
1089
1090	ENDDO
1091	!$OMP END PARALLEL
1092
1093	ELSE
1094
1095	!
1096	!-- Cache optimized code (there might be still a potential for better
1097	!-- optimization).
1098	!$OMP PARALLEL PRIVATE (i,j,k)
1099	!$OMP DO
1100	DO i = 0, nx
1101
1102	DO j = nys, nyn
1103	DO k = 1, nz
1104	work_fftx(i,k,j) = f_in(k,j,i)
1105	ENDDO
1106	ENDDO
1107
1108	ENDDO
1109
1110	!$OMP DO
1111	DO j = nys, nyn
1112	DO k = 1, nz
1113
1114	CALL fft_x( work_fftx(0:nx,k,j), 'forward' )
1115
1116	DO i = 0, nx
1117	work(j,k,i) = work_fftx(i,k,j)
1118	ENDDO
1119	ENDDO
1120
1121	ENDDO
1122	!$OMP END PARALLEL
1123
1124	ENDIF
1125	CALL cpu_log( log_point_s(4), 'fft_x', 'pause' )
1126
1127	!
1128	!-- Transpose array
1129	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
1130	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
1131	CALL MPI_ALLTOALL( work(nys,1,0), sendrecvcount_xy, MPI_REAL, &
1132	f_out(1,1,nxl_y,1), sendrecvcount_xy, MPI_REAL, &
1133	comm1dy, ierr )
1134	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
1135
1136	END SUBROUTINE fftx_tr_xy
1137
1138
1139	SUBROUTINE tr_yx_fftx( f_in, work, f_out )
1140
1141	!------------------------------------------------------------------------------!
1142	! Transposition y --> x with a subsequent backward Fourier transformation for
1143	! a 1d-decomposition along x
1144	!------------------------------------------------------------------------------!
1145
1146	USE control_parameters
1147	USE cpulog
1148	USE indices
1149	USE interfaces
1150	USE pegrid
1151	USE transpose_indices
1152
1153	IMPLICIT NONE
1154
1155	INTEGER :: i, j, k
1156
1157	REAL, DIMENSION(0:nx,1:nz,nys:nyn) :: work_fftx
1158	REAL, DIMENSION(nny,1:nza,nxl_y:nxr_ya,pdims(2)) :: f_in
1159	REAL, DIMENSION(1:nza,nys:nyna,0:nxa) :: f_out
1160	REAL, DIMENSION(nys:nyna,1:nza,0:nxa) :: work
1161
1162	!
1163	!-- Transpose array
1164	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
1165	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
1166	CALL MPI_ALLTOALL( f_in(1,1,nxl_y,1), sendrecvcount_xy, MPI_REAL, &
1167	work(nys,1,0), sendrecvcount_xy, MPI_REAL, &
1168	comm1dy, ierr )
1169	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
1170
1171	!
1172	!-- Carry out the FFT along x, where all data are present due to the
1173	!-- 1d-decomposition along y. Resort the data in a way that y becomes
1174	!-- the first index.
1175	CALL cpu_log( log_point_s(4), 'fft_x', 'continue' )
1176
1177	IF ( host(1:3) == 'nec' ) THEN
1178	!
1179	!-- Code optimized for vector processors
1180	!$OMP PARALLEL PRIVATE ( i, j, k )
1181	!$OMP DO
1182	DO j = nys, nyn
1183
1184	DO k = 1, nz
1185	DO i = 0, nx
1186	work_fftx(i,k,j) = work(j,k,i)
1187	ENDDO
1188	ENDDO
1189
1190	CALL fft_x_m( work_fftx(:,:,j), 'backward' )
1191
1192	ENDDO
1193
1194	!$OMP DO
1195	DO i = 0, nx
1196	DO j = nys, nyn
1197	DO k = 1, nz
1198	f_out(k,j,i) = work_fftx(i,k,j)
1199	ENDDO
1200	ENDDO
1201	ENDDO
1202	!$OMP END PARALLEL
1203
1204	ELSE
1205
1206	!
1207	!-- Cache optimized code (there might be still a potential for better
1208	!-- optimization).
1209	!$OMP PARALLEL PRIVATE (i,j,k)
1210	!$OMP DO
1211	DO j = nys, nyn
1212	DO k = 1, nz
1213
1214	DO i = 0, nx
1215	work_fftx(i,k,j) = work(j,k,i)
1216	ENDDO
1217
1218	CALL fft_x( work_fftx(0:nx,k,j), 'backward' )
1219
1220	ENDDO
1221	ENDDO
1222
1223	!$OMP DO
1224	DO i = 0, nx
1225	DO j = nys, nyn
1226	DO k = 1, nz
1227	f_out(k,j,i) = work_fftx(i,k,j)
1228	ENDDO
1229	ENDDO
1230	ENDDO
1231	!$OMP END PARALLEL
1232
1233	ENDIF
1234	CALL cpu_log( log_point_s(4), 'fft_x', 'stop' )
1235
1236	END SUBROUTINE tr_yx_fftx
1237
1238
1239	SUBROUTINE ffty_tri_ffty( ar )
1240
1241	!------------------------------------------------------------------------------!
1242	! FFT along y, solution of the tridiagonal system and backward FFT for
1243	! a 1d-decomposition along y
1244	!
1245	! WARNING: this subroutine may still not work for hybrid parallelization
1246	! with OpenMP (for possible necessary changes see the original
1247	! routine poisfft_hybrid, developed by Klaus Ketelsen, May 2002)
1248	!------------------------------------------------------------------------------!
1249
1250	USE control_parameters
1251	USE cpulog
1252	USE grid_variables
1253	USE indices
1254	USE interfaces
1255	USE pegrid
1256	USE transpose_indices
1257
1258	IMPLICIT NONE
1259
1260	INTEGER :: i, j, k, m, n, omp_get_thread_num, tn
1261
1262	REAL, DIMENSION(0:ny) :: work_ffty
1263	REAL, DIMENSION(0:ny,1:nz) :: work_triy
1264	REAL, DIMENSION(nny,1:nza,nxl_y:nxr_ya,pdims(2)) :: ar
1265	REAL, DIMENSION(:,:,:,:), ALLOCATABLE :: tri
1266
1267
1268	CALL cpu_log( log_point_s(39), 'fft_y + tridia', 'start' )
1269
1270	ALLOCATE( tri(5,0:ny,0:nz-1,0:threads_per_task-1) )
1271
1272	tn = 0 ! Default thread number in case of one thread
1273	!$OMP PARALLEL DO PRIVATE ( i, j, k, m, n, tn, work_ffty, work_triy )
1274	DO i = nxl_y, nxr_y
1275
1276	!$ tn = omp_get_thread_num()
1277
1278	IF ( host(1:3) == 'nec' ) THEN
1279	!
1280	!-- Code optimized for vector processors
1281	DO k = 1, nz
1282
1283	m = 0
1284	DO n = 1, pdims(2)
1285	DO j = 1, nny_pe( n-1 ) ! WARN: pcoord(j) should be used!!
1286	work_triy(m,k) = ar(j,k,i,n)
1287	m = m + 1
1288	ENDDO
1289	ENDDO
1290
1291	ENDDO
1292
1293	CALL fft_y_m( work_triy, ny, 'forward' )
1294
1295	ELSE
1296	!
1297	!-- Cache optimized code
1298	DO k = 1, nz
1299
1300	m = 0
1301	DO n = 1, pdims(2)
1302	DO j = 1, nny_pe( n-1 ) ! WARN: pcoord(j) should be used!!
1303	work_ffty(m) = ar(j,k,i,n)
1304	m = m + 1
1305	ENDDO
1306	ENDDO
1307
1308	CALL fft_y( work_ffty, 'forward' )
1309
1310	DO j = 0, ny
1311	work_triy(j,k) = work_ffty(j)
1312	ENDDO
1313
1314	ENDDO
1315
1316	ENDIF
1317
1318	!
1319	!-- Solve the linear equation system
1320	CALL tridia_1dd( ddy2, ddx2, ny, nx, i, work_triy, tri(:,:,:,tn) )
1321
1322	IF ( host(1:3) == 'nec' ) THEN
1323	!
1324	!-- Code optimized for vector processors
1325	CALL fft_y_m( work_triy, ny, 'backward' )
1326
1327	DO k = 1, nz
1328
1329	m = 0
1330	DO n = 1, pdims(2)
1331	DO j = 1, nny_pe( n-1 ) ! WARN: pcoord(j) should be used!!
1332	ar(j,k,i,n) = work_triy(m,k)
1333	m = m + 1
1334	ENDDO
1335	ENDDO
1336
1337	ENDDO
1338
1339	ELSE
1340	!
1341	!-- Cache optimized code
1342	DO k = 1, nz
1343
1344	DO j = 0, ny
1345	work_ffty(j) = work_triy(j,k)
1346	ENDDO
1347
1348	CALL fft_y( work_ffty, 'backward' )
1349
1350	m = 0
1351	DO n = 1, pdims(2)
1352	DO j = 1, nny_pe( n-1 ) ! WARN: pcoord(j) should be used!!
1353	ar(j,k,i,n) = work_ffty(m)
1354	m = m + 1
1355	ENDDO
1356	ENDDO
1357
1358	ENDDO
1359
1360	ENDIF
1361
1362	ENDDO
1363
1364	DEALLOCATE( tri )
1365
1366	CALL cpu_log( log_point_s(39), 'fft_y + tridia', 'stop' )
1367
1368	END SUBROUTINE ffty_tri_ffty
1369
1370
1371	SUBROUTINE tridia_1dd( ddx2, ddy2, nx, ny, j, ar, tri )
1372
1373	!------------------------------------------------------------------------------!
1374	! Solves the linear system of equations for a 1d-decomposition along x (see
1375	! tridia)
1376	!
1377	! Attention: when using the intel compiler, array tri must be passed as an
1378	! argument to the contained subroutines. Otherwise addres faults
1379	! will occur.
1380	! On NEC, tri should not be passed (except for routine substi_1dd)
1381	! because this causes very bad performance.
1382	!------------------------------------------------------------------------------!
1383
1384	USE arrays_3d
1385	USE control_parameters
1386
1387	USE pegrid
1388
1389	IMPLICIT NONE
1390
1391	INTEGER :: i, j, k, nnyh, nx, ny, omp_get_thread_num, tn
1392
1393	REAL :: ddx2, ddy2
1394
1395	REAL, DIMENSION(0:nx,1:nz) :: ar
1396	REAL, DIMENSION(0:nx,0:nz-1) :: ar1
1397	REAL, DIMENSION(5,0:nx,0:nz-1) :: tri
1398
1399
1400	nnyh = ( ny + 1 ) / 2
1401
1402	!
1403	!-- Define constant elements of the tridiagonal matrix.
1404	!-- The compiler on SX6 does loop exchange. If 0:nx is a high power of 2,
1405	!-- the exchanged loops create bank conflicts. The following directive
1406	!-- prohibits loop exchange and the loops perform much better.
1407	! tn = omp_get_thread_num()
1408	! WRITE( 120+tn, * ) '+++ id=',myid,' nx=',nx,' thread=', omp_get_thread_num()
1409	! CALL local_flush( 120+tn )
1410	!CDIR NOLOOPCHG
1411	DO k = 0, nz-1
1412	DO i = 0,nx
1413	tri(2,i,k) = ddzu_pres(k+1) * ddzw(k+1)
1414	tri(3,i,k) = ddzu_pres(k+2) * ddzw(k+1)
1415	ENDDO
1416	ENDDO
1417	! WRITE( 120+tn, * ) '+++ id=',myid,' end of first tridia loop thread=', omp_get_thread_num()
1418	! CALL local_flush( 120+tn )
1419
1420	IF ( j <= nnyh ) THEN
1421	#if defined( __lc )
1422	CALL maketri_1dd( j, tri )
1423	#else
1424	CALL maketri_1dd( j )
1425	#endif
1426	ELSE
1427	#if defined( __lc )
1428	CALL maketri_1dd( ny+1-j, tri )
1429	#else
1430	CALL maketri_1dd( ny+1-j )
1431	#endif
1432	ENDIF
1433	#if defined( __lc )
1434	CALL split_1dd( tri )
1435	#else
1436	CALL split_1dd
1437	#endif
1438	CALL substi_1dd( ar, tri )
1439
1440	CONTAINS
1441
1442	#if defined( __lc )
1443	SUBROUTINE maketri_1dd( j, tri )
1444	#else
1445	SUBROUTINE maketri_1dd( j )
1446	#endif
1447
1448	!------------------------------------------------------------------------------!
1449	! computes the i- and j-dependent component of the matrix
1450	!------------------------------------------------------------------------------!
1451
1452	USE constants
1453
1454	IMPLICIT NONE
1455
1456	INTEGER :: i, j, k, nnxh
1457	REAL :: a, c
1458
1459	REAL, DIMENSION(0:nx) :: l
1460
1461	#if defined( __lc )
1462	REAL, DIMENSION(5,0:nx,0:nz-1) :: tri
1463	#endif
1464
1465
1466	nnxh = ( nx + 1 ) / 2
1467	!
1468	!-- Provide the tridiagonal matrix for solution of the Poisson equation in
1469	!-- Fourier space. The coefficients are computed following the method of
1470	!-- Schmidt et al. (DFVLR-Mitteilung 84-15), which departs from Stephan
1471	!-- Siano's original version by discretizing the Poisson equation,
1472	!-- before it is Fourier-transformed
1473	DO i = 0, nx
1474	IF ( i >= 0 .AND. i <= nnxh ) THEN
1475	l(i) = 2.0 * ( 1.0 - COS( ( 2.0 * pi * i ) / &
1476	FLOAT( nx+1 ) ) ) * ddx2 + &
1477	2.0 * ( 1.0 - COS( ( 2.0 * pi * j ) / &
1478	FLOAT( ny+1 ) ) ) * ddy2
1479	ELSE
1480	l(i) = 2.0 * ( 1.0 - COS( ( 2.0 * pi * ( nx+1-i ) ) / &
1481	FLOAT( nx+1 ) ) ) * ddx2 + &
1482	2.0 * ( 1.0 - COS( ( 2.0 * pi * j ) / &
1483	FLOAT( ny+1 ) ) ) * ddy2
1484	ENDIF
1485	ENDDO
1486
1487	DO k = 0, nz-1
1488	DO i = 0, nx
1489	a = -1.0 * ddzu_pres(k+2) * ddzw(k+1)
1490	c = -1.0 * ddzu_pres(k+1) * ddzw(k+1)
1491	tri(1,i,k) = a + c - l(i)
1492	ENDDO
1493	ENDDO
1494	IF ( ibc_p_b == 1 .OR. ibc_p_b == 2 ) THEN
1495	DO i = 0, nx
1496	tri(1,i,0) = tri(1,i,0) + tri(2,i,0)
1497	ENDDO
1498	ENDIF
1499	IF ( ibc_p_t == 1 ) THEN
1500	DO i = 0, nx
1501	tri(1,i,nz-1) = tri(1,i,nz-1) + tri(3,i,nz-1)
1502	ENDDO
1503	ENDIF
1504
1505	END SUBROUTINE maketri_1dd
1506
1507
1508	#if defined( __lc )
1509	SUBROUTINE split_1dd( tri )
1510	#else
1511	SUBROUTINE split_1dd
1512	#endif
1513
1514	!------------------------------------------------------------------------------!
1515	! Splitting of the tridiagonal matrix (Thomas algorithm)
1516	!------------------------------------------------------------------------------!
1517
1518	IMPLICIT NONE
1519
1520	INTEGER :: i, k
1521
1522	#if defined( __lc )
1523	REAL, DIMENSION(5,0:nx,0:nz-1) :: tri
1524	#endif
1525
1526
1527	!
1528	!-- Splitting
1529	DO i = 0, nx
1530	tri(4,i,0) = tri(1,i,0)
1531	ENDDO
1532	DO k = 1, nz-1
1533	DO i = 0, nx
1534	tri(5,i,k) = tri(2,i,k) / tri(4,i,k-1)
1535	tri(4,i,k) = tri(1,i,k) - tri(3,i,k-1) * tri(5,i,k)
1536	ENDDO
1537	ENDDO
1538
1539	END SUBROUTINE split_1dd
1540
1541
1542	SUBROUTINE substi_1dd( ar, tri )
1543
1544	!------------------------------------------------------------------------------!
1545	! Substitution (Forward and Backward) (Thomas algorithm)
1546	!------------------------------------------------------------------------------!
1547
1548	IMPLICIT NONE
1549
1550	INTEGER :: i, k
1551
1552	REAL, DIMENSION(0:nx,nz) :: ar
1553	REAL, DIMENSION(0:nx,0:nz-1) :: ar1
1554	REAL, DIMENSION(5,0:nx,0:nz-1) :: tri
1555
1556	!
1557	!-- Forward substitution
1558	DO i = 0, nx
1559	ar1(i,0) = ar(i,1)
1560	ENDDO
1561	DO k = 1, nz-1
1562	DO i = 0, nx
1563	ar1(i,k) = ar(i,k+1) - tri(5,i,k) * ar1(i,k-1)
1564	ENDDO
1565	ENDDO
1566
1567	!
1568	!-- Backward substitution
1569	!-- Note, the add of 1.0E-20 in the denominator is due to avoid divisions
1570	!-- by zero appearing if the pressure bc is set to neumann at the top of
1571	!-- the model domain.
1572	DO i = 0, nx
1573	ar(i,nz) = ar1(i,nz-1) / ( tri(4,i,nz-1) + 1.0E-20 )
1574	ENDDO
1575	DO k = nz-2, 0, -1
1576	DO i = 0, nx
1577	ar(i,k+1) = ( ar1(i,k) - tri(3,i,k) * ar(i,k+2) ) &
1578	/ tri(4,i,k)
1579	ENDDO
1580	ENDDO
1581
1582	!
1583	!-- Indices i=0, j=0 correspond to horizontally averaged pressure.
1584	!-- The respective values of ar should be zero at all k-levels if
1585	!-- acceleration of horizontally averaged vertical velocity is zero.
1586	IF ( ibc_p_b == 1 .AND. ibc_p_t == 1 ) THEN
1587	IF ( j == 0 ) THEN
1588	DO k = 1, nz
1589	ar(0,k) = 0.0
1590	ENDDO
1591	ENDIF
1592	ENDIF
1593
1594	END SUBROUTINE substi_1dd
1595
1596	END SUBROUTINE tridia_1dd
1597
1598	#endif
1599	#endif
1600	END MODULE poisfft_mod

Note: See TracBrowser for help on using the repository browser.

Download in other formats:

| Impressum | ©Leibniz Universität Hannover |