Home

Context Navigation

source: palm/trunk/SOURCE/poisfft.f90 @ 807

Last change on this file since 807 was 807, checked in by maronga, 12 years ago
new utility check_namelist_files implemented
Property svn:keywords set to `Id`
File size: 46.4 KB

Line
1	MODULE poisfft_mod
2
3	!------------------------------------------------------------------------------!
4	! Current revisions:
5	! -----------------
6	! New cpp directive "__check" implemented which is used by check_namelist_files
7	! (most of the code is unneeded by check_namelist_files).
8	!
9	!
10	! Former revisions:
11	! -----------------
12	! $Id: poisfft.f90 807 2012-01-25 11:53:51Z maronga $
13	!
14	! 763 2011-10-06 09:32:09Z suehring
15	! Comment added concerning the last change.
16	!
17	! 761 2011-10-05 17:58:52Z suehring
18	! Bugfix: Avoid divisions by zero in case of using a 'neumann' bc for the
19	! pressure at the top of the model domain.
20	!
21	! 696 2011-03-18 07:03:49Z raasch
22	! work_fftx removed from PRIVATE clauses in fftx_tr_xy and tr_yx_fftx
23	!
24	! 683 2011-02-09 14:25:15Z raasch
25	! openMP parallelization for 2d-domain-decomposition
26	!
27	! 667 2010-12-23 12:06:00Z suehring/gryschka
28	! ddzu replaced by ddzu_pres due to changes in zu(0)
29	!
30	! 622 2010-12-10 08:08:13Z raasch
31	! optional barriers included in order to speed up collective operations
32	!
33	! 377 2009-09-04 11:09:00Z raasch
34	! __lcmuk changed to __lc to avoid problems with Intel compiler on sgi-ice
35	!
36	! 164 2008-05-15 08:46:15Z raasch
37	! Arguments removed from transpose routines
38	!
39	! 128 2007-10-26 13:11:14Z raasch
40	! Bugfix: wavenumber calculation for even nx in routines maketri
41	!
42	! 85 2007-05-11 09:35:14Z raasch
43	! Bugfix: work_fft*_vec removed from some PRIVATE-declarations
44	!
45	! 76 2007-03-29 00:58:32Z raasch
46	! Tridiagonal coefficients adjusted for Neumann boundary conditions both at
47	! the bottom and the top.
48	!
49	! RCS Log replace by Id keyword, revision history cleaned up
50	!
51	! Revision 1.24 2006/08/04 15:00:24 raasch
52	! Default setting of the thread number tn in case of not using OpenMP
53	!
54	! Revision 1.23 2006/02/23 12:48:38 raasch
55	! Additional compiler directive in routine tridia_1dd for preventing loop
56	! exchange on NEC-SX6
57	!
58	! Revision 1.20 2004/04/30 12:38:09 raasch
59	! Parts of former poisfft_hybrid moved to this subroutine,
60	! former subroutine changed to a module, renaming of FFT-subroutines and
61	! -module, FFTs completely substituted by calls of fft_x and fft_y,
62	! NAG fft used in the non-parallel case completely removed, l in maketri
63	! is now a 1d-array, variables passed by modules instead of using parameter
64	! lists, enlarged transposition arrays introduced
65	!
66	! Revision 1.1 1997/07/24 11:24:14 raasch
67	! Initial revision
68	!
69	!
70	! Description:
71	! ------------
72	! See below.
73	!------------------------------------------------------------------------------!
74
75	!--------------------------------------------------------------------------!
76	! poisfft !
77	! !
78	! Original version: Stephan Siano (pois3d) !
79	! !
80	! Institute of Meteorology and Climatology, University of Hannover !
81	! Germany !
82	! !
83	! Version as of July 23,1996 !
84	! !
85	! !
86	! Version for parallel computers: Siegfried Raasch !
87	! !
88	! Version as of July 03,1997 !
89	! !
90	! Solves the Poisson equation with a 2D spectral method !
91	! d^2 p / dx^2 + d^2 p / dy^2 + d^2 p / dz^2 = s !
92	! !
93	! Input: !
94	! real ar contains in the (nnx,nny,nnz) elements, !
95	! starting from the element (1,nys,nxl), the !
96	! values for s !
97	! real work Temporary array !
98	! !
99	! Output: !
100	! real ar contains the solution for p !
101	!--------------------------------------------------------------------------!
102
103	USE fft_xy
104	USE indices
105	USE transpose_indices
106
107	IMPLICIT NONE
108
109	PRIVATE
110
111	#if .NOT. defined ( __check )
112	PUBLIC poisfft, poisfft_init
113
114	INTERFACE poisfft
115	MODULE PROCEDURE poisfft
116	END INTERFACE poisfft
117
118	INTERFACE poisfft_init
119	MODULE PROCEDURE poisfft_init
120	END INTERFACE poisfft_init
121	#else
122	PUBLIC poisfft_init
123
124	INTERFACE poisfft_init
125	MODULE PROCEDURE poisfft_init
126	END INTERFACE poisfft_init
127	#endif
128
129	CONTAINS
130
131	SUBROUTINE poisfft_init
132
133	CALL fft_init
134
135	END SUBROUTINE poisfft_init
136
137	#if .NOT. defined ( __check )
138	SUBROUTINE poisfft( ar, work )
139
140	USE cpulog
141	USE interfaces
142	USE pegrid
143
144	IMPLICIT NONE
145
146	REAL, DIMENSION(1:nza,nys:nyna,nxl:nxra) :: ar, work
147
148
149	CALL cpu_log( log_point_s(3), 'poisfft', 'start' )
150
151	!
152	!-- Two-dimensional Fourier Transformation in x- and y-direction.
153	#if defined( __parallel )
154	IF ( pdims(2) == 1 ) THEN
155
156	!
157	!-- 1d-domain-decomposition along x:
158	!-- FFT along y and transposition y --> x
159	CALL ffty_tr_yx( ar, work, ar )
160
161	!
162	!-- FFT along x, solving the tridiagonal system and backward FFT
163	CALL fftx_tri_fftx( ar )
164
165	!
166	!-- Transposition x --> y and backward FFT along y
167	CALL tr_xy_ffty( ar, work, ar )
168
169	ELSEIF ( pdims(1) == 1 ) THEN
170
171	!
172	!-- 1d-domain-decomposition along y:
173	!-- FFT along x and transposition x --> y
174	CALL fftx_tr_xy( ar, work, ar )
175
176	!
177	!-- FFT along y, solving the tridiagonal system and backward FFT
178	CALL ffty_tri_ffty( ar )
179
180	!
181	!-- Transposition y --> x and backward FFT along x
182	CALL tr_yx_fftx( ar, work, ar )
183
184	ELSE
185
186	!
187	!-- 2d-domain-decomposition
188	!-- Transposition z --> x
189	CALL cpu_log( log_point_s(5), 'transpo forward', 'start' )
190	CALL transpose_zx( ar, work, ar )
191	CALL cpu_log( log_point_s(5), 'transpo forward', 'pause' )
192
193	CALL cpu_log( log_point_s(4), 'fft_x', 'start' )
194	CALL fftxp( ar, 'forward' )
195	CALL cpu_log( log_point_s(4), 'fft_x', 'pause' )
196
197	!
198	!-- Transposition x --> y
199	CALL cpu_log( log_point_s(5), 'transpo forward', 'continue' )
200	CALL transpose_xy( ar, work, ar )
201	CALL cpu_log( log_point_s(5), 'transpo forward', 'pause' )
202
203	CALL cpu_log( log_point_s(7), 'fft_y', 'start' )
204	CALL fftyp( ar, 'forward' )
205	CALL cpu_log( log_point_s(7), 'fft_y', 'pause' )
206
207	!
208	!-- Transposition y --> z
209	CALL cpu_log( log_point_s(5), 'transpo forward', 'continue' )
210	CALL transpose_yz( ar, work, ar )
211	CALL cpu_log( log_point_s(5), 'transpo forward', 'stop' )
212
213	!
214	!-- Solve the Poisson equation in z-direction in cartesian space.
215	CALL cpu_log( log_point_s(6), 'tridia', 'start' )
216	CALL tridia( ar )
217	CALL cpu_log( log_point_s(6), 'tridia', 'stop' )
218
219	!
220	!-- Inverse Fourier Transformation
221	!-- Transposition z --> y
222	CALL cpu_log( log_point_s(8), 'transpo invers', 'start' )
223	CALL transpose_zy( ar, work, ar )
224	CALL cpu_log( log_point_s(8), 'transpo invers', 'pause' )
225
226	CALL cpu_log( log_point_s(7), 'fft_y', 'continue' )
227	CALL fftyp( ar, 'backward' )
228	CALL cpu_log( log_point_s(7), 'fft_y', 'stop' )
229
230	!
231	!-- Transposition y --> x
232	CALL cpu_log( log_point_s(8), 'transpo invers', 'continue' )
233	CALL transpose_yx( ar, work, ar )
234	CALL cpu_log( log_point_s(8), 'transpo invers', 'pause' )
235
236	CALL cpu_log( log_point_s(4), 'fft_x', 'continue' )
237	CALL fftxp( ar, 'backward' )
238	CALL cpu_log( log_point_s(4), 'fft_x', 'stop' )
239
240	!
241	!-- Transposition x --> z
242	CALL cpu_log( log_point_s(8), 'transpo invers', 'continue' )
243	CALL transpose_xz( ar, work, ar )
244	CALL cpu_log( log_point_s(8), 'transpo invers', 'stop' )
245
246	ENDIF
247
248	#else
249
250	!
251	!-- Two-dimensional Fourier Transformation along x- and y-direction.
252	CALL cpu_log( log_point_s(4), 'fft_x', 'start' )
253	CALL fftx( ar, 'forward' )
254	CALL cpu_log( log_point_s(4), 'fft_x', 'pause' )
255	CALL cpu_log( log_point_s(7), 'fft_y', 'start' )
256	CALL ffty( ar, 'forward' )
257	CALL cpu_log( log_point_s(7), 'fft_y', 'pause' )
258
259	!
260	!-- Solve the Poisson equation in z-direction in cartesian space.
261	CALL cpu_log( log_point_s(6), 'tridia', 'start' )
262	CALL tridia( ar )
263	CALL cpu_log( log_point_s(6), 'tridia', 'stop' )
264
265	!
266	!-- Inverse Fourier Transformation.
267	CALL cpu_log( log_point_s(7), 'fft_y', 'continue' )
268	CALL ffty( ar, 'backward' )
269	CALL cpu_log( log_point_s(7), 'fft_y', 'stop' )
270	CALL cpu_log( log_point_s(4), 'fft_x', 'continue' )
271	CALL fftx( ar, 'backward' )
272	CALL cpu_log( log_point_s(4), 'fft_x', 'stop' )
273
274	#endif
275
276	CALL cpu_log( log_point_s(3), 'poisfft', 'stop' )
277
278	END SUBROUTINE poisfft
279
280
281
282	SUBROUTINE tridia( ar )
283
284	!------------------------------------------------------------------------------!
285	! solves the linear system of equations:
286	!
287	! -(4 pi^2(i^2/(dx^2nnx^2)+j^2/(dy^2nny^2))+
288	! 1/(dzu(k)dzw(k))+1/(dzu(k-1)dzw(k)))*p(i,j,k)+
289	! 1/(dzu(k)dzw(k))p(i,j,k+1)+1/(dzu(k-1)dzw(k))p(i,j,k-1)=d(i,j,k)
290	!
291	! by using the Thomas algorithm
292	!------------------------------------------------------------------------------!
293
294	USE arrays_3d
295
296	IMPLICIT NONE
297
298	INTEGER :: i, j, k, nnyh
299
300	REAL, DIMENSION(nxl_z:nxr_z,0:nz-1) :: ar1
301	REAL, DIMENSION(5,nxl_z:nxr_z,0:nz-1) :: tri
302
303	#if defined( __parallel )
304	REAL :: ar(nxl_z:nxr_za,nys_z:nyn_za,1:nza)
305	#else
306	REAL :: ar(1:nz,nys_z:nyn_z,nxl_z:nxr_z)
307	#endif
308
309
310	nnyh = (ny+1) / 2
311
312	!
313	!-- Define constant elements of the tridiagonal matrix.
314	!$OMP PARALLEL PRIVATE ( k, i )
315	!$OMP DO
316	DO k = 0, nz-1
317	DO i = nxl_z, nxr_z
318	tri(2,i,k) = ddzu_pres(k+1) * ddzw(k+1)
319	tri(3,i,k) = ddzu_pres(k+2) * ddzw(k+1)
320	ENDDO
321	ENDDO
322	!$OMP END PARALLEL
323
324	#if defined( __parallel )
325	!
326	!-- Repeat for all y-levels.
327	!$OMP PARALLEL FIRSTPRIVATE( tri ) PRIVATE ( ar1, j )
328	!$OMP DO
329	DO j = nys_z, nyn_z
330	IF ( j <= nnyh ) THEN
331	CALL maketri( tri, j )
332	ELSE
333	CALL maketri( tri, ny+1-j )
334	ENDIF
335	CALL split( tri )
336	CALL substi( ar, ar1, tri, j )
337	ENDDO
338	!$OMP END PARALLEL
339	#else
340	!
341	!-- First y-level.
342	CALL maketri( tri, nys_z )
343	CALL split( tri )
344	CALL substi( ar, ar1, tri, 0 )
345
346	!
347	!-- Further y-levels.
348	DO j = 1, nnyh - 1
349	CALL maketri( tri, j )
350	CALL split( tri )
351	CALL substi( ar, ar1, tri, j )
352	CALL substi( ar, ar1, tri, ny+1-j )
353	ENDDO
354	CALL maketri( tri, nnyh )
355	CALL split( tri )
356	CALL substi( ar, ar1, tri, nnyh+nys )
357	#endif
358
359	CONTAINS
360
361	SUBROUTINE maketri( tri, j )
362
363	!------------------------------------------------------------------------------!
364	! Computes the i- and j-dependent component of the matrix
365	!------------------------------------------------------------------------------!
366
367	USE arrays_3d
368	USE constants
369	USE control_parameters
370	USE grid_variables
371
372	IMPLICIT NONE
373
374	INTEGER :: i, j, k, nnxh
375	REAL :: a, c
376	REAL :: ll(nxl_z:nxr_z)
377	REAL :: tri(5,nxl_z:nxr_z,0:nz-1)
378
379
380	nnxh = ( nx + 1 ) / 2
381
382	!
383	!-- Provide the tridiagonal matrix for solution of the Poisson equation in
384	!-- Fourier space. The coefficients are computed following the method of
385	!-- Schmidt et al. (DFVLR-Mitteilung 84-15), which departs from Stephan
386	!-- Siano's original version by discretizing the Poisson equation,
387	!-- before it is Fourier-transformed
388	#if defined( __parallel )
389	DO i = nxl_z, nxr_z
390	IF ( i >= 0 .AND. i <= nnxh ) THEN
391	ll(i) = 2.0 * ( 1.0 - COS( ( 2.0 * pi * i ) / &
392	FLOAT( nx+1 ) ) ) / ( dx * dx ) + &
393	2.0 * ( 1.0 - COS( ( 2.0 * pi * j ) / &
394	FLOAT( ny+1 ) ) ) / ( dy * dy )
395	ELSE
396	ll(i) = 2.0 * ( 1.0 - COS( ( 2.0 * pi * ( nx+1-i ) ) / &
397	FLOAT( nx+1 ) ) ) / ( dx * dx ) + &
398	2.0 * ( 1.0 - COS( ( 2.0 * pi * j ) / &
399	FLOAT( ny+1 ) ) ) / ( dy * dy )
400	ENDIF
401	DO k = 0,nz-1
402	a = -1.0 * ddzu_pres(k+2) * ddzw(k+1)
403	c = -1.0 * ddzu_pres(k+1) * ddzw(k+1)
404	tri(1,i,k) = a + c - ll(i)
405	ENDDO
406	ENDDO
407	#else
408	DO i = 0, nnxh
409	ll(i) = 2.0 * ( 1.0 - COS( ( 2.0 * pi * i ) / FLOAT( nx+1 ) ) ) / &
410	( dx * dx ) + &
411	2.0 * ( 1.0 - COS( ( 2.0 * pi * j ) / FLOAT( ny+1 ) ) ) / &
412	( dy * dy )
413	DO k = 0, nz-1
414	a = -1.0 * ddzu_pres(k+2) * ddzw(k+1)
415	c = -1.0 * ddzu_pres(k+1) * ddzw(k+1)
416	tri(1,i,k) = a + c - ll(i)
417	IF ( i >= 1 .and. i < nnxh ) THEN
418	tri(1,nx+1-i,k) = tri(1,i,k)
419	ENDIF
420	ENDDO
421	ENDDO
422	#endif
423	IF ( ibc_p_b == 1 .OR. ibc_p_b == 2 ) THEN
424	DO i = nxl_z, nxr_z
425	tri(1,i,0) = tri(1,i,0) + tri(2,i,0)
426	ENDDO
427	ENDIF
428	IF ( ibc_p_t == 1 ) THEN
429	DO i = nxl_z, nxr_z
430	tri(1,i,nz-1) = tri(1,i,nz-1) + tri(3,i,nz-1)
431	ENDDO
432	ENDIF
433
434	END SUBROUTINE maketri
435
436
437	SUBROUTINE substi( ar, ar1, tri, j )
438
439	!------------------------------------------------------------------------------!
440	! Substitution (Forward and Backward) (Thomas algorithm)
441	!------------------------------------------------------------------------------!
442
443	USE control_parameters
444
445	IMPLICIT NONE
446
447	INTEGER :: i, j, k
448	REAL :: ar1(nxl_z:nxr_z,0:nz-1)
449	REAL :: tri(5,nxl_z:nxr_z,0:nz-1)
450	#if defined( __parallel )
451	REAL :: ar(nxl_z:nxr_za,nys_z:nyn_za,1:nza)
452	#else
453	REAL :: ar(1:nz,nys_z:nyn_z,nxl_z:nxr_z)
454	#endif
455
456	!
457	!-- Forward substitution.
458	DO i = nxl_z, nxr_z
459	#if defined( __parallel )
460	ar1(i,0) = ar(i,j,1)
461	#else
462	ar1(i,0) = ar(1,j,i)
463	#endif
464	ENDDO
465	DO k = 1, nz - 1
466	DO i = nxl_z, nxr_z
467	#if defined( __parallel )
468	ar1(i,k) = ar(i,j,k+1) - tri(5,i,k) * ar1(i,k-1)
469	#else
470	ar1(i,k) = ar(k+1,j,i) - tri(5,i,k) * ar1(i,k-1)
471	#endif
472	ENDDO
473	ENDDO
474
475	!
476	!-- Backward substitution.
477	DO i = nxl_z, nxr_z
478	#if defined( __parallel )
479	ar(i,j,nz) = ar1(i,nz-1) / tri(4,i,nz-1)
480	#else
481	ar(nz,j,i) = ar1(i,nz-1) / tri(4,i,nz-1)
482	#endif
483	ENDDO
484	DO k = nz-2, 0, -1
485	DO i = nxl_z, nxr_z
486	#if defined( __parallel )
487	ar(i,j,k+1) = ( ar1(i,k) - tri(3,i,k) * ar(i,j,k+2) ) &
488	/ tri(4,i,k)
489	#else
490	ar(k+1,j,i) = ( ar1(i,k) - tri(3,i,k) * ar(k+2,j,i) ) &
491	/ tri(4,i,k)
492	#endif
493	ENDDO
494	ENDDO
495
496	!
497	!-- Indices i=0, j=0 correspond to horizontally averaged pressure.
498	!-- The respective values of ar should be zero at all k-levels if
499	!-- acceleration of horizontally averaged vertical velocity is zero.
500	IF ( ibc_p_b == 1 .AND. ibc_p_t == 1 ) THEN
501	IF ( j == 0 .AND. nxl_z == 0 ) THEN
502	#if defined( __parallel )
503	DO k = 1, nz
504	ar(nxl_z,j,k) = 0.0
505	ENDDO
506	#else
507	DO k = 1, nz
508	ar(k,j,nxl_z) = 0.0
509	ENDDO
510	#endif
511	ENDIF
512	ENDIF
513
514	END SUBROUTINE substi
515
516
517	SUBROUTINE split( tri )
518
519	!------------------------------------------------------------------------------!
520	! Splitting of the tridiagonal matrix (Thomas algorithm)
521	!------------------------------------------------------------------------------!
522
523	IMPLICIT NONE
524
525	INTEGER :: i, k
526	REAL :: tri(5,nxl_z:nxr_z,0:nz-1)
527
528	!
529	!-- Splitting.
530	DO i = nxl_z, nxr_z
531	tri(4,i,0) = tri(1,i,0)
532	ENDDO
533	DO k = 1, nz-1
534	DO i = nxl_z, nxr_z
535	tri(5,i,k) = tri(2,i,k) / tri(4,i,k-1)
536	tri(4,i,k) = tri(1,i,k) - tri(3,i,k-1) * tri(5,i,k)
537	ENDDO
538	ENDDO
539
540	END SUBROUTINE split
541
542	END SUBROUTINE tridia
543
544
545	#if defined( __parallel )
546	SUBROUTINE fftxp( ar, direction )
547
548	!------------------------------------------------------------------------------!
549	! Fourier-transformation along x-direction Parallelized version
550	!------------------------------------------------------------------------------!
551
552	IMPLICIT NONE
553
554	CHARACTER (LEN=*) :: direction
555	INTEGER :: j, k
556	REAL :: ar(0:nxa,nys_x:nyn_xa,nzb_x:nzt_xa)
557
558	!
559	!-- Performing the fft with one of the methods implemented
560	!$OMP PARALLEL PRIVATE ( j, k )
561	!$OMP DO
562	DO k = nzb_x, nzt_x
563	DO j = nys_x, nyn_x
564	CALL fft_x( ar(0:nx,j,k), direction )
565	ENDDO
566	ENDDO
567	!$OMP END PARALLEL
568
569	END SUBROUTINE fftxp
570
571	#else
572	SUBROUTINE fftx( ar, direction )
573
574	!------------------------------------------------------------------------------!
575	! Fourier-transformation along x-direction Non parallel version
576	!------------------------------------------------------------------------------!
577
578	IMPLICIT NONE
579
580	CHARACTER (LEN=*) :: direction
581	INTEGER :: i, j, k
582	REAL :: ar(1:nz,0:ny,0:nx)
583
584	!
585	!-- Performing the fft with one of the methods implemented
586	!$OMP PARALLEL PRIVATE ( j, k )
587	!$OMP DO
588	DO k = 1, nz
589	DO j = 0, ny
590	CALL fft_x( ar(k,j,0:nx), direction )
591	ENDDO
592	ENDDO
593	!$OMP END PARALLEL
594
595	END SUBROUTINE fftx
596	#endif
597
598
599	#if defined( __parallel )
600	SUBROUTINE fftyp( ar, direction )
601
602	!------------------------------------------------------------------------------!
603	! Fourier-transformation along y-direction Parallelized version
604	!------------------------------------------------------------------------------!
605
606	IMPLICIT NONE
607
608	CHARACTER (LEN=*) :: direction
609	INTEGER :: i, k
610	REAL :: ar(0:nya,nxl_y:nxr_ya,nzb_y:nzt_ya)
611
612	!
613	!-- Performing the fft with one of the methods implemented
614	!$OMP PARALLEL PRIVATE ( i, k )
615	!$OMP DO
616	DO k = nzb_y, nzt_y
617	DO i = nxl_y, nxr_y
618	CALL fft_y( ar(0:ny,i,k), direction )
619	ENDDO
620	ENDDO
621	!$OMP END PARALLEL
622
623	END SUBROUTINE fftyp
624
625	#else
626	SUBROUTINE ffty( ar, direction )
627
628	!------------------------------------------------------------------------------!
629	! Fourier-transformation along y-direction Non parallel version
630	!------------------------------------------------------------------------------!
631
632	IMPLICIT NONE
633
634	CHARACTER (LEN=*) :: direction
635	INTEGER :: i, k
636	REAL :: ar(1:nz,0:ny,0:nx)
637
638	!
639	!-- Performing the fft with one of the methods implemented
640	!$OMP PARALLEL PRIVATE ( i, k )
641	!$OMP DO
642	DO k = 1, nz
643	DO i = 0, nx
644	CALL fft_y( ar(k,0:ny,i), direction )
645	ENDDO
646	ENDDO
647	!$OMP END PARALLEL
648
649	END SUBROUTINE ffty
650	#endif
651
652	#if defined( __parallel )
653	SUBROUTINE ffty_tr_yx( f_in, work, f_out )
654
655	!------------------------------------------------------------------------------!
656	! Fourier-transformation along y with subsequent transposition y --> x for
657	! a 1d-decomposition along x
658	!
659	! ATTENTION: The performance of this routine is much faster on the NEC-SX6,
660	! if the first index of work_ffty_vec is odd. Otherwise
661	! memory bank conflicts may occur (especially if the index is a
662	! multiple of 128). That's why work_ffty_vec is dimensioned as
663	! 0:ny+1.
664	! Of course, this will not work if users are using an odd number
665	! of gridpoints along y.
666	!------------------------------------------------------------------------------!
667
668	USE control_parameters
669	USE cpulog
670	USE indices
671	USE interfaces
672	USE pegrid
673	USE transpose_indices
674
675	IMPLICIT NONE
676
677	INTEGER :: i, iend, iouter, ir, j, k
678	INTEGER, PARAMETER :: stridex = 4
679
680	REAL, DIMENSION(0:ny,stridex) :: work_ffty
681	#if defined( __nec )
682	REAL, DIMENSION(0:ny+1,1:nz,nxl:nxr) :: work_ffty_vec
683	#endif
684	REAL, DIMENSION(1:nza,0:nya,nxl:nxra) :: f_in
685	REAL, DIMENSION(nnx,1:nza,nys_x:nyn_xa,pdims(1)) :: f_out
686	REAL, DIMENSION(nxl:nxra,1:nza,0:nya) :: work
687
688	!
689	!-- Carry out the FFT along y, where all data are present due to the
690	!-- 1d-decomposition along x. Resort the data in a way that x becomes
691	!-- the first index.
692	CALL cpu_log( log_point_s(7), 'fft_y', 'start' )
693
694	IF ( host(1:3) == 'nec' ) THEN
695	#if defined( __nec )
696	!
697	!-- Code optimized for vector processors
698	!$OMP PARALLEL PRIVATE ( i, j, k )
699	!$OMP DO
700	DO i = nxl, nxr
701
702	DO j = 0, ny
703	DO k = 1, nz
704	work_ffty_vec(j,k,i) = f_in(k,j,i)
705	ENDDO
706	ENDDO
707
708	CALL fft_y_m( work_ffty_vec(:,:,i), ny+1, 'forward' )
709
710	ENDDO
711
712	!$OMP DO
713	DO k = 1, nz
714	DO j = 0, ny
715	DO i = nxl, nxr
716	work(i,k,j) = work_ffty_vec(j,k,i)
717	ENDDO
718	ENDDO
719	ENDDO
720	!$OMP END PARALLEL
721	#endif
722
723	ELSE
724
725	!
726	!-- Cache optimized code.
727	!-- The i-(x-)direction is split into a strided outer loop and an inner
728	!-- loop for better cache performance
729	!$OMP PARALLEL PRIVATE (i,iend,iouter,ir,j,k,work_ffty)
730	!$OMP DO
731	DO iouter = nxl, nxr, stridex
732
733	iend = MIN( iouter+stridex-1, nxr ) ! Upper bound for inner i loop
734
735	DO k = 1, nz
736
737	DO i = iouter, iend
738
739	ir = i-iouter+1 ! counter within a stride
740	DO j = 0, ny
741	work_ffty(j,ir) = f_in(k,j,i)
742	ENDDO
743	!
744	!-- FFT along y
745	CALL fft_y( work_ffty(:,ir), 'forward' )
746
747	ENDDO
748
749	!
750	!-- Resort
751	DO j = 0, ny
752	DO i = iouter, iend
753	work(i,k,j) = work_ffty(j,i-iouter+1)
754	ENDDO
755	ENDDO
756
757	ENDDO
758
759	ENDDO
760	!$OMP END PARALLEL
761
762	ENDIF
763	CALL cpu_log( log_point_s(7), 'fft_y', 'pause' )
764
765	!
766	!-- Transpose array
767	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
768	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
769	CALL MPI_ALLTOALL( work(nxl,1,0), sendrecvcount_xy, MPI_REAL, &
770	f_out(1,1,nys_x,1), sendrecvcount_xy, MPI_REAL, &
771	comm1dx, ierr )
772	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
773
774	END SUBROUTINE ffty_tr_yx
775
776
777	SUBROUTINE tr_xy_ffty( f_in, work, f_out )
778
779	!------------------------------------------------------------------------------!
780	! Transposition x --> y with a subsequent backward Fourier transformation for
781	! a 1d-decomposition along x
782	!------------------------------------------------------------------------------!
783
784	USE control_parameters
785	USE cpulog
786	USE indices
787	USE interfaces
788	USE pegrid
789	USE transpose_indices
790
791	IMPLICIT NONE
792
793	INTEGER :: i, iend, iouter, ir, j, k
794	INTEGER, PARAMETER :: stridex = 4
795
796	REAL, DIMENSION(0:ny,stridex) :: work_ffty
797	#if defined( __nec )
798	REAL, DIMENSION(0:ny+1,1:nz,nxl:nxr) :: work_ffty_vec
799	#endif
800	REAL, DIMENSION(nnx,1:nza,nys_x:nyn_xa,pdims(1)) :: f_in
801	REAL, DIMENSION(1:nza,0:nya,nxl:nxra) :: f_out
802	REAL, DIMENSION(nxl:nxra,1:nza,0:nya) :: work
803
804	!
805	!-- Transpose array
806	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
807	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
808	CALL MPI_ALLTOALL( f_in(1,1,nys_x,1), sendrecvcount_xy, MPI_REAL, &
809	work(nxl,1,0), sendrecvcount_xy, MPI_REAL, &
810	comm1dx, ierr )
811	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
812
813	!
814	!-- Resort the data in a way that y becomes the first index and carry out the
815	!-- backward fft along y.
816	CALL cpu_log( log_point_s(7), 'fft_y', 'continue' )
817
818	IF ( host(1:3) == 'nec' ) THEN
819	#if defined( __nec )
820	!
821	!-- Code optimized for vector processors
822	!$OMP PARALLEL PRIVATE ( i, j, k )
823	!$OMP DO
824	DO k = 1, nz
825	DO j = 0, ny
826	DO i = nxl, nxr
827	work_ffty_vec(j,k,i) = work(i,k,j)
828	ENDDO
829	ENDDO
830	ENDDO
831
832	!$OMP DO
833	DO i = nxl, nxr
834
835	CALL fft_y_m( work_ffty_vec(:,:,i), ny+1, 'backward' )
836
837	DO j = 0, ny
838	DO k = 1, nz
839	f_out(k,j,i) = work_ffty_vec(j,k,i)
840	ENDDO
841	ENDDO
842
843	ENDDO
844	!$OMP END PARALLEL
845	#endif
846
847	ELSE
848
849	!
850	!-- Cache optimized code.
851	!-- The i-(x-)direction is split into a strided outer loop and an inner
852	!-- loop for better cache performance
853	!$OMP PARALLEL PRIVATE ( i, iend, iouter, ir, j, k, work_ffty )
854	!$OMP DO
855	DO iouter = nxl, nxr, stridex
856
857	iend = MIN( iouter+stridex-1, nxr ) ! Upper bound for inner i loop
858
859	DO k = 1, nz
860	!
861	!-- Resort
862	DO j = 0, ny
863	DO i = iouter, iend
864	work_ffty(j,i-iouter+1) = work(i,k,j)
865	ENDDO
866	ENDDO
867
868	DO i = iouter, iend
869
870	!
871	!-- FFT along y
872	ir = i-iouter+1 ! counter within a stride
873	CALL fft_y( work_ffty(:,ir), 'backward' )
874
875	DO j = 0, ny
876	f_out(k,j,i) = work_ffty(j,ir)
877	ENDDO
878	ENDDO
879
880	ENDDO
881
882	ENDDO
883	!$OMP END PARALLEL
884
885	ENDIF
886
887	CALL cpu_log( log_point_s(7), 'fft_y', 'stop' )
888
889	END SUBROUTINE tr_xy_ffty
890
891
892	SUBROUTINE fftx_tri_fftx( ar )
893
894	!------------------------------------------------------------------------------!
895	! FFT along x, solution of the tridiagonal system and backward FFT for
896	! a 1d-decomposition along x
897	!
898	! WARNING: this subroutine may still not work for hybrid parallelization
899	! with OpenMP (for possible necessary changes see the original
900	! routine poisfft_hybrid, developed by Klaus Ketelsen, May 2002)
901	!------------------------------------------------------------------------------!
902
903	USE control_parameters
904	USE cpulog
905	USE grid_variables
906	USE indices
907	USE interfaces
908	USE pegrid
909	USE transpose_indices
910
911	IMPLICIT NONE
912
913	character(len=3) :: myth_char
914
915	INTEGER :: i, j, k, m, n, omp_get_thread_num, tn
916
917	REAL, DIMENSION(0:nx) :: work_fftx
918	REAL, DIMENSION(0:nx,1:nz) :: work_trix
919	REAL, DIMENSION(nnx,1:nza,nys_x:nyn_xa,pdims(1)) :: ar
920	REAL, DIMENSION(:,:,:,:), ALLOCATABLE :: tri
921
922
923	CALL cpu_log( log_point_s(33), 'fft_x + tridia', 'start' )
924
925	ALLOCATE( tri(5,0:nx,0:nz-1,0:threads_per_task-1) )
926
927	tn = 0 ! Default thread number in case of one thread
928	!$OMP PARALLEL DO PRIVATE ( i, j, k, m, n, tn, work_fftx, work_trix )
929	DO j = nys_x, nyn_x
930
931	!$ tn = omp_get_thread_num()
932
933	IF ( host(1:3) == 'nec' ) THEN
934	!
935	!-- Code optimized for vector processors
936	DO k = 1, nz
937
938	m = 0
939	DO n = 1, pdims(1)
940	DO i = 1, nnx_pe( n-1 ) ! WARN: pcoord(i) should be used!!
941	work_trix(m,k) = ar(i,k,j,n)
942	m = m + 1
943	ENDDO
944	ENDDO
945
946	ENDDO
947
948	CALL fft_x_m( work_trix, 'forward' )
949
950	ELSE
951	!
952	!-- Cache optimized code
953	DO k = 1, nz
954
955	m = 0
956	DO n = 1, pdims(1)
957	DO i = 1, nnx_pe( n-1 ) ! WARN: pcoord(i) should be used!!
958	work_fftx(m) = ar(i,k,j,n)
959	m = m + 1
960	ENDDO
961	ENDDO
962
963	CALL fft_x( work_fftx, 'forward' )
964
965	DO i = 0, nx
966	work_trix(i,k) = work_fftx(i)
967	ENDDO
968
969	ENDDO
970
971	ENDIF
972
973	!
974	!-- Solve the linear equation system
975	CALL tridia_1dd( ddx2, ddy2, nx, ny, j, work_trix, tri(:,:,:,tn) )
976
977	IF ( host(1:3) == 'nec' ) THEN
978	!
979	!-- Code optimized for vector processors
980	CALL fft_x_m( work_trix, 'backward' )
981
982	DO k = 1, nz
983
984	m = 0
985	DO n = 1, pdims(1)
986	DO i = 1, nnx_pe( n-1 ) ! WARN: pcoord(i) should be used!!
987	ar(i,k,j,n) = work_trix(m,k)
988	m = m + 1
989	ENDDO
990	ENDDO
991
992	ENDDO
993
994	ELSE
995	!
996	!-- Cache optimized code
997	DO k = 1, nz
998
999	DO i = 0, nx
1000	work_fftx(i) = work_trix(i,k)
1001	ENDDO
1002
1003	CALL fft_x( work_fftx, 'backward' )
1004
1005	m = 0
1006	DO n = 1, pdims(1)
1007	DO i = 1, nnx_pe( n-1 ) ! WARN: pcoord(i) should be used!!
1008	ar(i,k,j,n) = work_fftx(m)
1009	m = m + 1
1010	ENDDO
1011	ENDDO
1012
1013	ENDDO
1014
1015	ENDIF
1016
1017	ENDDO
1018
1019	DEALLOCATE( tri )
1020
1021	CALL cpu_log( log_point_s(33), 'fft_x + tridia', 'stop' )
1022
1023	END SUBROUTINE fftx_tri_fftx
1024
1025
1026	SUBROUTINE fftx_tr_xy( f_in, work, f_out )
1027
1028	!------------------------------------------------------------------------------!
1029	! Fourier-transformation along x with subsequent transposition x --> y for
1030	! a 1d-decomposition along y
1031	!
1032	! ATTENTION: The NEC-branch of this routine may significantly profit from
1033	! further optimizations. So far, performance is much worse than
1034	! for routine ffty_tr_yx (more than three times slower).
1035	!------------------------------------------------------------------------------!
1036
1037	USE control_parameters
1038	USE cpulog
1039	USE indices
1040	USE interfaces
1041	USE pegrid
1042	USE transpose_indices
1043
1044	IMPLICIT NONE
1045
1046	INTEGER :: i, j, k
1047
1048	REAL, DIMENSION(0:nx,1:nz,nys:nyn) :: work_fftx
1049	REAL, DIMENSION(1:nza,nys:nyna,0:nxa) :: f_in
1050	REAL, DIMENSION(nny,1:nza,nxl_y:nxr_ya,pdims(2)) :: f_out
1051	REAL, DIMENSION(nys:nyna,1:nza,0:nxa) :: work
1052
1053	!
1054	!-- Carry out the FFT along x, where all data are present due to the
1055	!-- 1d-decomposition along y. Resort the data in a way that y becomes
1056	!-- the first index.
1057	CALL cpu_log( log_point_s(4), 'fft_x', 'start' )
1058
1059	IF ( host(1:3) == 'nec' ) THEN
1060	!
1061	!-- Code for vector processors
1062	!$OMP PARALLEL PRIVATE ( i, j, k )
1063	!$OMP DO
1064	DO i = 0, nx
1065
1066	DO j = nys, nyn
1067	DO k = 1, nz
1068	work_fftx(i,k,j) = f_in(k,j,i)
1069	ENDDO
1070	ENDDO
1071
1072	ENDDO
1073
1074	!$OMP DO
1075	DO j = nys, nyn
1076
1077	CALL fft_x_m( work_fftx(:,:,j), 'forward' )
1078
1079	DO k = 1, nz
1080	DO i = 0, nx
1081	work(j,k,i) = work_fftx(i,k,j)
1082	ENDDO
1083	ENDDO
1084
1085	ENDDO
1086	!$OMP END PARALLEL
1087
1088	ELSE
1089
1090	!
1091	!-- Cache optimized code (there might be still a potential for better
1092	!-- optimization).
1093	!$OMP PARALLEL PRIVATE (i,j,k)
1094	!$OMP DO
1095	DO i = 0, nx
1096
1097	DO j = nys, nyn
1098	DO k = 1, nz
1099	work_fftx(i,k,j) = f_in(k,j,i)
1100	ENDDO
1101	ENDDO
1102
1103	ENDDO
1104
1105	!$OMP DO
1106	DO j = nys, nyn
1107	DO k = 1, nz
1108
1109	CALL fft_x( work_fftx(0:nx,k,j), 'forward' )
1110
1111	DO i = 0, nx
1112	work(j,k,i) = work_fftx(i,k,j)
1113	ENDDO
1114	ENDDO
1115
1116	ENDDO
1117	!$OMP END PARALLEL
1118
1119	ENDIF
1120	CALL cpu_log( log_point_s(4), 'fft_x', 'pause' )
1121
1122	!
1123	!-- Transpose array
1124	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
1125	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
1126	CALL MPI_ALLTOALL( work(nys,1,0), sendrecvcount_xy, MPI_REAL, &
1127	f_out(1,1,nxl_y,1), sendrecvcount_xy, MPI_REAL, &
1128	comm1dy, ierr )
1129	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
1130
1131	END SUBROUTINE fftx_tr_xy
1132
1133
1134	SUBROUTINE tr_yx_fftx( f_in, work, f_out )
1135
1136	!------------------------------------------------------------------------------!
1137	! Transposition y --> x with a subsequent backward Fourier transformation for
1138	! a 1d-decomposition along x
1139	!------------------------------------------------------------------------------!
1140
1141	USE control_parameters
1142	USE cpulog
1143	USE indices
1144	USE interfaces
1145	USE pegrid
1146	USE transpose_indices
1147
1148	IMPLICIT NONE
1149
1150	INTEGER :: i, j, k
1151
1152	REAL, DIMENSION(0:nx,1:nz,nys:nyn) :: work_fftx
1153	REAL, DIMENSION(nny,1:nza,nxl_y:nxr_ya,pdims(2)) :: f_in
1154	REAL, DIMENSION(1:nza,nys:nyna,0:nxa) :: f_out
1155	REAL, DIMENSION(nys:nyna,1:nza,0:nxa) :: work
1156
1157	!
1158	!-- Transpose array
1159	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
1160	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
1161	CALL MPI_ALLTOALL( f_in(1,1,nxl_y,1), sendrecvcount_xy, MPI_REAL, &
1162	work(nys,1,0), sendrecvcount_xy, MPI_REAL, &
1163	comm1dy, ierr )
1164	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
1165
1166	!
1167	!-- Carry out the FFT along x, where all data are present due to the
1168	!-- 1d-decomposition along y. Resort the data in a way that y becomes
1169	!-- the first index.
1170	CALL cpu_log( log_point_s(4), 'fft_x', 'continue' )
1171
1172	IF ( host(1:3) == 'nec' ) THEN
1173	!
1174	!-- Code optimized for vector processors
1175	!$OMP PARALLEL PRIVATE ( i, j, k )
1176	!$OMP DO
1177	DO j = nys, nyn
1178
1179	DO k = 1, nz
1180	DO i = 0, nx
1181	work_fftx(i,k,j) = work(j,k,i)
1182	ENDDO
1183	ENDDO
1184
1185	CALL fft_x_m( work_fftx(:,:,j), 'backward' )
1186
1187	ENDDO
1188
1189	!$OMP DO
1190	DO i = 0, nx
1191	DO j = nys, nyn
1192	DO k = 1, nz
1193	f_out(k,j,i) = work_fftx(i,k,j)
1194	ENDDO
1195	ENDDO
1196	ENDDO
1197	!$OMP END PARALLEL
1198
1199	ELSE
1200
1201	!
1202	!-- Cache optimized code (there might be still a potential for better
1203	!-- optimization).
1204	!$OMP PARALLEL PRIVATE (i,j,k)
1205	!$OMP DO
1206	DO j = nys, nyn
1207	DO k = 1, nz
1208
1209	DO i = 0, nx
1210	work_fftx(i,k,j) = work(j,k,i)
1211	ENDDO
1212
1213	CALL fft_x( work_fftx(0:nx,k,j), 'backward' )
1214
1215	ENDDO
1216	ENDDO
1217
1218	!$OMP DO
1219	DO i = 0, nx
1220	DO j = nys, nyn
1221	DO k = 1, nz
1222	f_out(k,j,i) = work_fftx(i,k,j)
1223	ENDDO
1224	ENDDO
1225	ENDDO
1226	!$OMP END PARALLEL
1227
1228	ENDIF
1229	CALL cpu_log( log_point_s(4), 'fft_x', 'stop' )
1230
1231	END SUBROUTINE tr_yx_fftx
1232
1233
1234	SUBROUTINE ffty_tri_ffty( ar )
1235
1236	!------------------------------------------------------------------------------!
1237	! FFT along y, solution of the tridiagonal system and backward FFT for
1238	! a 1d-decomposition along y
1239	!
1240	! WARNING: this subroutine may still not work for hybrid parallelization
1241	! with OpenMP (for possible necessary changes see the original
1242	! routine poisfft_hybrid, developed by Klaus Ketelsen, May 2002)
1243	!------------------------------------------------------------------------------!
1244
1245	USE control_parameters
1246	USE cpulog
1247	USE grid_variables
1248	USE indices
1249	USE interfaces
1250	USE pegrid
1251	USE transpose_indices
1252
1253	IMPLICIT NONE
1254
1255	INTEGER :: i, j, k, m, n, omp_get_thread_num, tn
1256
1257	REAL, DIMENSION(0:ny) :: work_ffty
1258	REAL, DIMENSION(0:ny,1:nz) :: work_triy
1259	REAL, DIMENSION(nny,1:nza,nxl_y:nxr_ya,pdims(2)) :: ar
1260	REAL, DIMENSION(:,:,:,:), ALLOCATABLE :: tri
1261
1262
1263	CALL cpu_log( log_point_s(39), 'fft_y + tridia', 'start' )
1264
1265	ALLOCATE( tri(5,0:ny,0:nz-1,0:threads_per_task-1) )
1266
1267	tn = 0 ! Default thread number in case of one thread
1268	!$OMP PARALLEL DO PRIVATE ( i, j, k, m, n, tn, work_ffty, work_triy )
1269	DO i = nxl_y, nxr_y
1270
1271	!$ tn = omp_get_thread_num()
1272
1273	IF ( host(1:3) == 'nec' ) THEN
1274	!
1275	!-- Code optimized for vector processors
1276	DO k = 1, nz
1277
1278	m = 0
1279	DO n = 1, pdims(2)
1280	DO j = 1, nny_pe( n-1 ) ! WARN: pcoord(j) should be used!!
1281	work_triy(m,k) = ar(j,k,i,n)
1282	m = m + 1
1283	ENDDO
1284	ENDDO
1285
1286	ENDDO
1287
1288	CALL fft_y_m( work_triy, ny, 'forward' )
1289
1290	ELSE
1291	!
1292	!-- Cache optimized code
1293	DO k = 1, nz
1294
1295	m = 0
1296	DO n = 1, pdims(2)
1297	DO j = 1, nny_pe( n-1 ) ! WARN: pcoord(j) should be used!!
1298	work_ffty(m) = ar(j,k,i,n)
1299	m = m + 1
1300	ENDDO
1301	ENDDO
1302
1303	CALL fft_y( work_ffty, 'forward' )
1304
1305	DO j = 0, ny
1306	work_triy(j,k) = work_ffty(j)
1307	ENDDO
1308
1309	ENDDO
1310
1311	ENDIF
1312
1313	!
1314	!-- Solve the linear equation system
1315	CALL tridia_1dd( ddy2, ddx2, ny, nx, i, work_triy, tri(:,:,:,tn) )
1316
1317	IF ( host(1:3) == 'nec' ) THEN
1318	!
1319	!-- Code optimized for vector processors
1320	CALL fft_y_m( work_triy, ny, 'backward' )
1321
1322	DO k = 1, nz
1323
1324	m = 0
1325	DO n = 1, pdims(2)
1326	DO j = 1, nny_pe( n-1 ) ! WARN: pcoord(j) should be used!!
1327	ar(j,k,i,n) = work_triy(m,k)
1328	m = m + 1
1329	ENDDO
1330	ENDDO
1331
1332	ENDDO
1333
1334	ELSE
1335	!
1336	!-- Cache optimized code
1337	DO k = 1, nz
1338
1339	DO j = 0, ny
1340	work_ffty(j) = work_triy(j,k)
1341	ENDDO
1342
1343	CALL fft_y( work_ffty, 'backward' )
1344
1345	m = 0
1346	DO n = 1, pdims(2)
1347	DO j = 1, nny_pe( n-1 ) ! WARN: pcoord(j) should be used!!
1348	ar(j,k,i,n) = work_ffty(m)
1349	m = m + 1
1350	ENDDO
1351	ENDDO
1352
1353	ENDDO
1354
1355	ENDIF
1356
1357	ENDDO
1358
1359	DEALLOCATE( tri )
1360
1361	CALL cpu_log( log_point_s(39), 'fft_y + tridia', 'stop' )
1362
1363	END SUBROUTINE ffty_tri_ffty
1364
1365
1366	SUBROUTINE tridia_1dd( ddx2, ddy2, nx, ny, j, ar, tri )
1367
1368	!------------------------------------------------------------------------------!
1369	! Solves the linear system of equations for a 1d-decomposition along x (see
1370	! tridia)
1371	!
1372	! Attention: when using the intel compiler, array tri must be passed as an
1373	! argument to the contained subroutines. Otherwise addres faults
1374	! will occur.
1375	! On NEC, tri should not be passed (except for routine substi_1dd)
1376	! because this causes very bad performance.
1377	!------------------------------------------------------------------------------!
1378
1379	USE arrays_3d
1380	USE control_parameters
1381
1382	USE pegrid
1383
1384	IMPLICIT NONE
1385
1386	INTEGER :: i, j, k, nnyh, nx, ny, omp_get_thread_num, tn
1387
1388	REAL :: ddx2, ddy2
1389
1390	REAL, DIMENSION(0:nx,1:nz) :: ar
1391	REAL, DIMENSION(0:nx,0:nz-1) :: ar1
1392	REAL, DIMENSION(5,0:nx,0:nz-1) :: tri
1393
1394
1395	nnyh = ( ny + 1 ) / 2
1396
1397	!
1398	!-- Define constant elements of the tridiagonal matrix.
1399	!-- The compiler on SX6 does loop exchange. If 0:nx is a high power of 2,
1400	!-- the exchanged loops create bank conflicts. The following directive
1401	!-- prohibits loop exchange and the loops perform much better.
1402	! tn = omp_get_thread_num()
1403	! WRITE( 120+tn, * ) '+++ id=',myid,' nx=',nx,' thread=', omp_get_thread_num()
1404	! CALL local_flush( 120+tn )
1405	!CDIR NOLOOPCHG
1406	DO k = 0, nz-1
1407	DO i = 0,nx
1408	tri(2,i,k) = ddzu_pres(k+1) * ddzw(k+1)
1409	tri(3,i,k) = ddzu_pres(k+2) * ddzw(k+1)
1410	ENDDO
1411	ENDDO
1412	! WRITE( 120+tn, * ) '+++ id=',myid,' end of first tridia loop thread=', omp_get_thread_num()
1413	! CALL local_flush( 120+tn )
1414
1415	IF ( j <= nnyh ) THEN
1416	#if defined( __lc )
1417	CALL maketri_1dd( j, tri )
1418	#else
1419	CALL maketri_1dd( j )
1420	#endif
1421	ELSE
1422	#if defined( __lc )
1423	CALL maketri_1dd( ny+1-j, tri )
1424	#else
1425	CALL maketri_1dd( ny+1-j )
1426	#endif
1427	ENDIF
1428	#if defined( __lc )
1429	CALL split_1dd( tri )
1430	#else
1431	CALL split_1dd
1432	#endif
1433	CALL substi_1dd( ar, tri )
1434
1435	CONTAINS
1436
1437	#if defined( __lc )
1438	SUBROUTINE maketri_1dd( j, tri )
1439	#else
1440	SUBROUTINE maketri_1dd( j )
1441	#endif
1442
1443	!------------------------------------------------------------------------------!
1444	! computes the i- and j-dependent component of the matrix
1445	!------------------------------------------------------------------------------!
1446
1447	USE constants
1448
1449	IMPLICIT NONE
1450
1451	INTEGER :: i, j, k, nnxh
1452	REAL :: a, c
1453
1454	REAL, DIMENSION(0:nx) :: l
1455
1456	#if defined( __lc )
1457	REAL, DIMENSION(5,0:nx,0:nz-1) :: tri
1458	#endif
1459
1460
1461	nnxh = ( nx + 1 ) / 2
1462	!
1463	!-- Provide the tridiagonal matrix for solution of the Poisson equation in
1464	!-- Fourier space. The coefficients are computed following the method of
1465	!-- Schmidt et al. (DFVLR-Mitteilung 84-15), which departs from Stephan
1466	!-- Siano's original version by discretizing the Poisson equation,
1467	!-- before it is Fourier-transformed
1468	DO i = 0, nx
1469	IF ( i >= 0 .AND. i <= nnxh ) THEN
1470	l(i) = 2.0 * ( 1.0 - COS( ( 2.0 * pi * i ) / &
1471	FLOAT( nx+1 ) ) ) * ddx2 + &
1472	2.0 * ( 1.0 - COS( ( 2.0 * pi * j ) / &
1473	FLOAT( ny+1 ) ) ) * ddy2
1474	ELSE
1475	l(i) = 2.0 * ( 1.0 - COS( ( 2.0 * pi * ( nx+1-i ) ) / &
1476	FLOAT( nx+1 ) ) ) * ddx2 + &
1477	2.0 * ( 1.0 - COS( ( 2.0 * pi * j ) / &
1478	FLOAT( ny+1 ) ) ) * ddy2
1479	ENDIF
1480	ENDDO
1481
1482	DO k = 0, nz-1
1483	DO i = 0, nx
1484	a = -1.0 * ddzu_pres(k+2) * ddzw(k+1)
1485	c = -1.0 * ddzu_pres(k+1) * ddzw(k+1)
1486	tri(1,i,k) = a + c - l(i)
1487	ENDDO
1488	ENDDO
1489	IF ( ibc_p_b == 1 .OR. ibc_p_b == 2 ) THEN
1490	DO i = 0, nx
1491	tri(1,i,0) = tri(1,i,0) + tri(2,i,0)
1492	ENDDO
1493	ENDIF
1494	IF ( ibc_p_t == 1 ) THEN
1495	DO i = 0, nx
1496	tri(1,i,nz-1) = tri(1,i,nz-1) + tri(3,i,nz-1)
1497	ENDDO
1498	ENDIF
1499
1500	END SUBROUTINE maketri_1dd
1501
1502
1503	#if defined( __lc )
1504	SUBROUTINE split_1dd( tri )
1505	#else
1506	SUBROUTINE split_1dd
1507	#endif
1508
1509	!------------------------------------------------------------------------------!
1510	! Splitting of the tridiagonal matrix (Thomas algorithm)
1511	!------------------------------------------------------------------------------!
1512
1513	IMPLICIT NONE
1514
1515	INTEGER :: i, k
1516
1517	#if defined( __lc )
1518	REAL, DIMENSION(5,0:nx,0:nz-1) :: tri
1519	#endif
1520
1521
1522	!
1523	!-- Splitting
1524	DO i = 0, nx
1525	tri(4,i,0) = tri(1,i,0)
1526	ENDDO
1527	DO k = 1, nz-1
1528	DO i = 0, nx
1529	tri(5,i,k) = tri(2,i,k) / tri(4,i,k-1)
1530	tri(4,i,k) = tri(1,i,k) - tri(3,i,k-1) * tri(5,i,k)
1531	ENDDO
1532	ENDDO
1533
1534	END SUBROUTINE split_1dd
1535
1536
1537	SUBROUTINE substi_1dd( ar, tri )
1538
1539	!------------------------------------------------------------------------------!
1540	! Substitution (Forward and Backward) (Thomas algorithm)
1541	!------------------------------------------------------------------------------!
1542
1543	IMPLICIT NONE
1544
1545	INTEGER :: i, k
1546
1547	REAL, DIMENSION(0:nx,nz) :: ar
1548	REAL, DIMENSION(0:nx,0:nz-1) :: ar1
1549	REAL, DIMENSION(5,0:nx,0:nz-1) :: tri
1550
1551	!
1552	!-- Forward substitution
1553	DO i = 0, nx
1554	ar1(i,0) = ar(i,1)
1555	ENDDO
1556	DO k = 1, nz-1
1557	DO i = 0, nx
1558	ar1(i,k) = ar(i,k+1) - tri(5,i,k) * ar1(i,k-1)
1559	ENDDO
1560	ENDDO
1561
1562	!
1563	!-- Backward substitution
1564	!-- Note, the add of 1.0E-20 in the denominator is due to avoid divisions
1565	!-- by zero appearing if the pressure bc is set to neumann at the top of
1566	!-- the model domain.
1567	DO i = 0, nx
1568	ar(i,nz) = ar1(i,nz-1) / ( tri(4,i,nz-1) + 1.0E-20 )
1569	ENDDO
1570	DO k = nz-2, 0, -1
1571	DO i = 0, nx
1572	ar(i,k+1) = ( ar1(i,k) - tri(3,i,k) * ar(i,k+2) ) &
1573	/ tri(4,i,k)
1574	ENDDO
1575	ENDDO
1576
1577	!
1578	!-- Indices i=0, j=0 correspond to horizontally averaged pressure.
1579	!-- The respective values of ar should be zero at all k-levels if
1580	!-- acceleration of horizontally averaged vertical velocity is zero.
1581	IF ( ibc_p_b == 1 .AND. ibc_p_t == 1 ) THEN
1582	IF ( j == 0 ) THEN
1583	DO k = 1, nz
1584	ar(0,k) = 0.0
1585	ENDDO
1586	ENDIF
1587	ENDIF
1588
1589	END SUBROUTINE substi_1dd
1590
1591	END SUBROUTINE tridia_1dd
1592
1593	#endif
1594	#endif
1595	END MODULE poisfft_mod

Note: See TracBrowser for help on using the repository browser.

Download in other formats:

| Impressum | ©Leibniz Universität Hannover |