Home

Context Navigation

source: palm/trunk/SOURCE/poisfft.f90 @ 877

Last change on this file since 877 was 877, checked in by suehring, 12 years ago
Avoid divisions by zero in case of using a 'neumann' bc for the pressure at the top of the model domain.
Property svn:keywords set to `Id`
File size: 46.8 KB

Line
1	MODULE poisfft_mod
2
3	!------------------------------------------------------------------------------!
4	! Current revisions:
5	! -----------------
6	! Bugfix: Avoid divisions by zero in case of using a 'neumann' bc for the
7	! pressure at the top of the model domain.
8	!
9	! Former revisions:
10	! -----------------
11	! $Id: poisfft.f90 877 2012-04-03 11:21:44Z suehring $
12	!
13	! 809 2012-01-30 13:32:58Z maronga
14	! Bugfix: replaced .AND. and .NOT. with && and ! in the preprocessor directives
15	!
16	! 807 2012-01-25 11:53:51Z maronga
17	! New cpp directive "__check" implemented which is used by check_namelist_files
18	! (most of the code is unneeded by check_namelist_files).
19	!
20	! 763 2011-10-06 09:32:09Z suehring
21	! Comment added concerning the last change.
22	!
23	! 761 2011-10-05 17:58:52Z suehring
24	! Bugfix: Avoid divisions by zero in case of using a 'neumann' bc for the
25	! pressure at the top of the model domain.
26	!
27	! 696 2011-03-18 07:03:49Z raasch
28	! work_fftx removed from PRIVATE clauses in fftx_tr_xy and tr_yx_fftx
29	!
30	! 683 2011-02-09 14:25:15Z raasch
31	! openMP parallelization for 2d-domain-decomposition
32	!
33	! 667 2010-12-23 12:06:00Z suehring/gryschka
34	! ddzu replaced by ddzu_pres due to changes in zu(0)
35	!
36	! 622 2010-12-10 08:08:13Z raasch
37	! optional barriers included in order to speed up collective operations
38	!
39	! 377 2009-09-04 11:09:00Z raasch
40	! __lcmuk changed to __lc to avoid problems with Intel compiler on sgi-ice
41	!
42	! 164 2008-05-15 08:46:15Z raasch
43	! Arguments removed from transpose routines
44	!
45	! 128 2007-10-26 13:11:14Z raasch
46	! Bugfix: wavenumber calculation for even nx in routines maketri
47	!
48	! 85 2007-05-11 09:35:14Z raasch
49	! Bugfix: work_fft*_vec removed from some PRIVATE-declarations
50	!
51	! 76 2007-03-29 00:58:32Z raasch
52	! Tridiagonal coefficients adjusted for Neumann boundary conditions both at
53	! the bottom and the top.
54	!
55	! RCS Log replace by Id keyword, revision history cleaned up
56	!
57	! Revision 1.24 2006/08/04 15:00:24 raasch
58	! Default setting of the thread number tn in case of not using OpenMP
59	!
60	! Revision 1.23 2006/02/23 12:48:38 raasch
61	! Additional compiler directive in routine tridia_1dd for preventing loop
62	! exchange on NEC-SX6
63	!
64	! Revision 1.20 2004/04/30 12:38:09 raasch
65	! Parts of former poisfft_hybrid moved to this subroutine,
66	! former subroutine changed to a module, renaming of FFT-subroutines and
67	! -module, FFTs completely substituted by calls of fft_x and fft_y,
68	! NAG fft used in the non-parallel case completely removed, l in maketri
69	! is now a 1d-array, variables passed by modules instead of using parameter
70	! lists, enlarged transposition arrays introduced
71	!
72	! Revision 1.1 1997/07/24 11:24:14 raasch
73	! Initial revision
74	!
75	!
76	! Description:
77	! ------------
78	! See below.
79	!------------------------------------------------------------------------------!
80
81	!--------------------------------------------------------------------------!
82	! poisfft !
83	! !
84	! Original version: Stephan Siano (pois3d) !
85	! !
86	! Institute of Meteorology and Climatology, University of Hannover !
87	! Germany !
88	! !
89	! Version as of July 23,1996 !
90	! !
91	! !
92	! Version for parallel computers: Siegfried Raasch !
93	! !
94	! Version as of July 03,1997 !
95	! !
96	! Solves the Poisson equation with a 2D spectral method !
97	! d^2 p / dx^2 + d^2 p / dy^2 + d^2 p / dz^2 = s !
98	! !
99	! Input: !
100	! real ar contains in the (nnx,nny,nnz) elements, !
101	! starting from the element (1,nys,nxl), the !
102	! values for s !
103	! real work Temporary array !
104	! !
105	! Output: !
106	! real ar contains the solution for p !
107	!--------------------------------------------------------------------------!
108
109	USE fft_xy
110	USE indices
111	USE transpose_indices
112
113	IMPLICIT NONE
114
115	PRIVATE
116
117	#if ! defined ( __check )
118	PUBLIC poisfft, poisfft_init
119
120	INTERFACE poisfft
121	MODULE PROCEDURE poisfft
122	END INTERFACE poisfft
123
124	INTERFACE poisfft_init
125	MODULE PROCEDURE poisfft_init
126	END INTERFACE poisfft_init
127	#else
128	PUBLIC poisfft_init
129
130	INTERFACE poisfft_init
131	MODULE PROCEDURE poisfft_init
132	END INTERFACE poisfft_init
133	#endif
134
135	CONTAINS
136
137	SUBROUTINE poisfft_init
138
139	CALL fft_init
140
141	END SUBROUTINE poisfft_init
142
143	#if ! defined ( __check )
144	SUBROUTINE poisfft( ar, work )
145
146	USE cpulog
147	USE interfaces
148	USE pegrid
149
150	IMPLICIT NONE
151
152	REAL, DIMENSION(1:nza,nys:nyna,nxl:nxra) :: ar, work
153
154
155	CALL cpu_log( log_point_s(3), 'poisfft', 'start' )
156
157	!
158	!-- Two-dimensional Fourier Transformation in x- and y-direction.
159	#if defined( __parallel )
160	IF ( pdims(2) == 1 ) THEN
161
162	!
163	!-- 1d-domain-decomposition along x:
164	!-- FFT along y and transposition y --> x
165	CALL ffty_tr_yx( ar, work, ar )
166
167	!
168	!-- FFT along x, solving the tridiagonal system and backward FFT
169	CALL fftx_tri_fftx( ar )
170
171	!
172	!-- Transposition x --> y and backward FFT along y
173	CALL tr_xy_ffty( ar, work, ar )
174
175	ELSEIF ( pdims(1) == 1 ) THEN
176
177	!
178	!-- 1d-domain-decomposition along y:
179	!-- FFT along x and transposition x --> y
180	CALL fftx_tr_xy( ar, work, ar )
181
182	!
183	!-- FFT along y, solving the tridiagonal system and backward FFT
184	CALL ffty_tri_ffty( ar )
185
186	!
187	!-- Transposition y --> x and backward FFT along x
188	CALL tr_yx_fftx( ar, work, ar )
189
190	ELSE
191
192	!
193	!-- 2d-domain-decomposition
194	!-- Transposition z --> x
195	CALL cpu_log( log_point_s(5), 'transpo forward', 'start' )
196	CALL transpose_zx( ar, work, ar )
197	CALL cpu_log( log_point_s(5), 'transpo forward', 'pause' )
198
199	CALL cpu_log( log_point_s(4), 'fft_x', 'start' )
200	CALL fftxp( ar, 'forward' )
201	CALL cpu_log( log_point_s(4), 'fft_x', 'pause' )
202
203	!
204	!-- Transposition x --> y
205	CALL cpu_log( log_point_s(5), 'transpo forward', 'continue' )
206	CALL transpose_xy( ar, work, ar )
207	CALL cpu_log( log_point_s(5), 'transpo forward', 'pause' )
208
209	CALL cpu_log( log_point_s(7), 'fft_y', 'start' )
210	CALL fftyp( ar, 'forward' )
211	CALL cpu_log( log_point_s(7), 'fft_y', 'pause' )
212
213	!
214	!-- Transposition y --> z
215	CALL cpu_log( log_point_s(5), 'transpo forward', 'continue' )
216	CALL transpose_yz( ar, work, ar )
217	CALL cpu_log( log_point_s(5), 'transpo forward', 'stop' )
218
219	!
220	!-- Solve the Poisson equation in z-direction in cartesian space.
221	CALL cpu_log( log_point_s(6), 'tridia', 'start' )
222	CALL tridia( ar )
223	CALL cpu_log( log_point_s(6), 'tridia', 'stop' )
224
225	!
226	!-- Inverse Fourier Transformation
227	!-- Transposition z --> y
228	CALL cpu_log( log_point_s(8), 'transpo invers', 'start' )
229	CALL transpose_zy( ar, work, ar )
230	CALL cpu_log( log_point_s(8), 'transpo invers', 'pause' )
231
232	CALL cpu_log( log_point_s(7), 'fft_y', 'continue' )
233	CALL fftyp( ar, 'backward' )
234	CALL cpu_log( log_point_s(7), 'fft_y', 'stop' )
235
236	!
237	!-- Transposition y --> x
238	CALL cpu_log( log_point_s(8), 'transpo invers', 'continue' )
239	CALL transpose_yx( ar, work, ar )
240	CALL cpu_log( log_point_s(8), 'transpo invers', 'pause' )
241
242	CALL cpu_log( log_point_s(4), 'fft_x', 'continue' )
243	CALL fftxp( ar, 'backward' )
244	CALL cpu_log( log_point_s(4), 'fft_x', 'stop' )
245
246	!
247	!-- Transposition x --> z
248	CALL cpu_log( log_point_s(8), 'transpo invers', 'continue' )
249	CALL transpose_xz( ar, work, ar )
250	CALL cpu_log( log_point_s(8), 'transpo invers', 'stop' )
251
252	ENDIF
253
254	#else
255
256	!
257	!-- Two-dimensional Fourier Transformation along x- and y-direction.
258	CALL cpu_log( log_point_s(4), 'fft_x', 'start' )
259	CALL fftx( ar, 'forward' )
260	CALL cpu_log( log_point_s(4), 'fft_x', 'pause' )
261	CALL cpu_log( log_point_s(7), 'fft_y', 'start' )
262	CALL ffty( ar, 'forward' )
263	CALL cpu_log( log_point_s(7), 'fft_y', 'pause' )
264
265	!
266	!-- Solve the Poisson equation in z-direction in cartesian space.
267	CALL cpu_log( log_point_s(6), 'tridia', 'start' )
268	CALL tridia( ar )
269	CALL cpu_log( log_point_s(6), 'tridia', 'stop' )
270
271	!
272	!-- Inverse Fourier Transformation.
273	CALL cpu_log( log_point_s(7), 'fft_y', 'continue' )
274	CALL ffty( ar, 'backward' )
275	CALL cpu_log( log_point_s(7), 'fft_y', 'stop' )
276	CALL cpu_log( log_point_s(4), 'fft_x', 'continue' )
277	CALL fftx( ar, 'backward' )
278	CALL cpu_log( log_point_s(4), 'fft_x', 'stop' )
279
280	#endif
281
282	CALL cpu_log( log_point_s(3), 'poisfft', 'stop' )
283
284	END SUBROUTINE poisfft
285
286
287
288	SUBROUTINE tridia( ar )
289
290	!------------------------------------------------------------------------------!
291	! solves the linear system of equations:
292	!
293	! -(4 pi^2(i^2/(dx^2nnx^2)+j^2/(dy^2nny^2))+
294	! 1/(dzu(k)dzw(k))+1/(dzu(k-1)dzw(k)))*p(i,j,k)+
295	! 1/(dzu(k)dzw(k))p(i,j,k+1)+1/(dzu(k-1)dzw(k))p(i,j,k-1)=d(i,j,k)
296	!
297	! by using the Thomas algorithm
298	!------------------------------------------------------------------------------!
299
300	USE arrays_3d
301
302	IMPLICIT NONE
303
304	INTEGER :: i, j, k, nnyh
305
306	REAL, DIMENSION(nxl_z:nxr_z,0:nz-1) :: ar1
307	REAL, DIMENSION(5,nxl_z:nxr_z,0:nz-1) :: tri
308
309	#if defined( __parallel )
310	REAL :: ar(nxl_z:nxr_za,nys_z:nyn_za,1:nza)
311	#else
312	REAL :: ar(1:nz,nys_z:nyn_z,nxl_z:nxr_z)
313	#endif
314
315
316	nnyh = (ny+1) / 2
317
318	!
319	!-- Define constant elements of the tridiagonal matrix.
320	!$OMP PARALLEL PRIVATE ( k, i )
321	!$OMP DO
322	DO k = 0, nz-1
323	DO i = nxl_z, nxr_z
324	tri(2,i,k) = ddzu_pres(k+1) * ddzw(k+1)
325	tri(3,i,k) = ddzu_pres(k+2) * ddzw(k+1)
326	ENDDO
327	ENDDO
328	!$OMP END PARALLEL
329
330	#if defined( __parallel )
331	!
332	!-- Repeat for all y-levels.
333	!$OMP PARALLEL FIRSTPRIVATE( tri ) PRIVATE ( ar1, j )
334	!$OMP DO
335	DO j = nys_z, nyn_z
336	IF ( j <= nnyh ) THEN
337	CALL maketri( tri, j )
338	ELSE
339	CALL maketri( tri, ny+1-j )
340	ENDIF
341	CALL split( tri )
342	CALL substi( ar, ar1, tri, j )
343	ENDDO
344	!$OMP END PARALLEL
345	#else
346	!
347	!-- First y-level.
348	CALL maketri( tri, nys_z )
349	CALL split( tri )
350	CALL substi( ar, ar1, tri, 0 )
351
352	!
353	!-- Further y-levels.
354	DO j = 1, nnyh - 1
355	CALL maketri( tri, j )
356	CALL split( tri )
357	CALL substi( ar, ar1, tri, j )
358	CALL substi( ar, ar1, tri, ny+1-j )
359	ENDDO
360	CALL maketri( tri, nnyh )
361	CALL split( tri )
362	CALL substi( ar, ar1, tri, nnyh+nys )
363	#endif
364
365	CONTAINS
366
367	SUBROUTINE maketri( tri, j )
368
369	!------------------------------------------------------------------------------!
370	! Computes the i- and j-dependent component of the matrix
371	!------------------------------------------------------------------------------!
372
373	USE arrays_3d
374	USE constants
375	USE control_parameters
376	USE grid_variables
377
378	IMPLICIT NONE
379
380	INTEGER :: i, j, k, nnxh
381	REAL :: a, c
382	REAL :: ll(nxl_z:nxr_z)
383	REAL :: tri(5,nxl_z:nxr_z,0:nz-1)
384
385
386	nnxh = ( nx + 1 ) / 2
387
388	!
389	!-- Provide the tridiagonal matrix for solution of the Poisson equation in
390	!-- Fourier space. The coefficients are computed following the method of
391	!-- Schmidt et al. (DFVLR-Mitteilung 84-15), which departs from Stephan
392	!-- Siano's original version by discretizing the Poisson equation,
393	!-- before it is Fourier-transformed
394	#if defined( __parallel )
395	DO i = nxl_z, nxr_z
396	IF ( i >= 0 .AND. i <= nnxh ) THEN
397	ll(i) = 2.0 * ( 1.0 - COS( ( 2.0 * pi * i ) / &
398	FLOAT( nx+1 ) ) ) / ( dx * dx ) + &
399	2.0 * ( 1.0 - COS( ( 2.0 * pi * j ) / &
400	FLOAT( ny+1 ) ) ) / ( dy * dy )
401	ELSE
402	ll(i) = 2.0 * ( 1.0 - COS( ( 2.0 * pi * ( nx+1-i ) ) / &
403	FLOAT( nx+1 ) ) ) / ( dx * dx ) + &
404	2.0 * ( 1.0 - COS( ( 2.0 * pi * j ) / &
405	FLOAT( ny+1 ) ) ) / ( dy * dy )
406	ENDIF
407	DO k = 0,nz-1
408	a = -1.0 * ddzu_pres(k+2) * ddzw(k+1)
409	c = -1.0 * ddzu_pres(k+1) * ddzw(k+1)
410	tri(1,i,k) = a + c - ll(i)
411	ENDDO
412	ENDDO
413	#else
414	DO i = 0, nnxh
415	ll(i) = 2.0 * ( 1.0 - COS( ( 2.0 * pi * i ) / FLOAT( nx+1 ) ) ) / &
416	( dx * dx ) + &
417	2.0 * ( 1.0 - COS( ( 2.0 * pi * j ) / FLOAT( ny+1 ) ) ) / &
418	( dy * dy )
419	DO k = 0, nz-1
420	a = -1.0 * ddzu_pres(k+2) * ddzw(k+1)
421	c = -1.0 * ddzu_pres(k+1) * ddzw(k+1)
422	tri(1,i,k) = a + c - ll(i)
423	IF ( i >= 1 .and. i < nnxh ) THEN
424	tri(1,nx+1-i,k) = tri(1,i,k)
425	ENDIF
426	ENDDO
427	ENDDO
428	#endif
429	IF ( ibc_p_b == 1 .OR. ibc_p_b == 2 ) THEN
430	DO i = nxl_z, nxr_z
431	tri(1,i,0) = tri(1,i,0) + tri(2,i,0)
432	ENDDO
433	ENDIF
434	IF ( ibc_p_t == 1 ) THEN
435	DO i = nxl_z, nxr_z
436	tri(1,i,nz-1) = tri(1,i,nz-1) + tri(3,i,nz-1)
437	ENDDO
438	ENDIF
439
440	END SUBROUTINE maketri
441
442
443	SUBROUTINE substi( ar, ar1, tri, j )
444
445	!------------------------------------------------------------------------------!
446	! Substitution (Forward and Backward) (Thomas algorithm)
447	!------------------------------------------------------------------------------!
448
449	USE control_parameters
450
451	IMPLICIT NONE
452
453	INTEGER :: i, j, k
454	REAL :: ar1(nxl_z:nxr_z,0:nz-1)
455	REAL :: tri(5,nxl_z:nxr_z,0:nz-1)
456	#if defined( __parallel )
457	REAL :: ar(nxl_z:nxr_za,nys_z:nyn_za,1:nza)
458	#else
459	REAL :: ar(1:nz,nys_z:nyn_z,nxl_z:nxr_z)
460	#endif
461
462	!
463	!-- Forward substitution.
464	DO i = nxl_z, nxr_z
465	#if defined( __parallel )
466	ar1(i,0) = ar(i,j,1)
467	#else
468	ar1(i,0) = ar(1,j,i)
469	#endif
470	ENDDO
471	DO k = 1, nz - 1
472	DO i = nxl_z, nxr_z
473	#if defined( __parallel )
474	ar1(i,k) = ar(i,j,k+1) - tri(5,i,k) * ar1(i,k-1)
475	#else
476	ar1(i,k) = ar(k+1,j,i) - tri(5,i,k) * ar1(i,k-1)
477	#endif
478	ENDDO
479	ENDDO
480
481	!
482	!-- Backward substitution
483	!-- Note, the 1.0E-20 in the denominator is due to avoid divisions
484	!-- by zero appearing if the pressure bc is set to neumann at the top of
485	!-- the model domain.
486	DO i = nxl_z, nxr_z
487	#if defined( __parallel )
488	ar(i,j,nz) = ar1(i,nz-1) / ( tri(4,i,nz-1) + 1.0E-20 )
489	#else
490	ar(nz,j,i) = ar1(i,nz-1) / ( tri(4,i,nz-1) + 1.0E-20 )
491	#endif
492	ENDDO
493	DO k = nz-2, 0, -1
494	DO i = nxl_z, nxr_z
495	#if defined( __parallel )
496	ar(i,j,k+1) = ( ar1(i,k) - tri(3,i,k) * ar(i,j,k+2) ) &
497	/ tri(4,i,k)
498	#else
499	ar(k+1,j,i) = ( ar1(i,k) - tri(3,i,k) * ar(k+2,j,i) ) &
500	/ tri(4,i,k)
501	#endif
502	ENDDO
503	ENDDO
504
505	!
506	!-- Indices i=0, j=0 correspond to horizontally averaged pressure.
507	!-- The respective values of ar should be zero at all k-levels if
508	!-- acceleration of horizontally averaged vertical velocity is zero.
509	IF ( ibc_p_b == 1 .AND. ibc_p_t == 1 ) THEN
510	IF ( j == 0 .AND. nxl_z == 0 ) THEN
511	#if defined( __parallel )
512	DO k = 1, nz
513	ar(nxl_z,j,k) = 0.0
514	ENDDO
515	#else
516	DO k = 1, nz
517	ar(k,j,nxl_z) = 0.0
518	ENDDO
519	#endif
520	ENDIF
521	ENDIF
522
523	END SUBROUTINE substi
524
525
526	SUBROUTINE split( tri )
527
528	!------------------------------------------------------------------------------!
529	! Splitting of the tridiagonal matrix (Thomas algorithm)
530	!------------------------------------------------------------------------------!
531
532	IMPLICIT NONE
533
534	INTEGER :: i, k
535	REAL :: tri(5,nxl_z:nxr_z,0:nz-1)
536
537	!
538	!-- Splitting.
539	DO i = nxl_z, nxr_z
540	tri(4,i,0) = tri(1,i,0)
541	ENDDO
542	DO k = 1, nz-1
543	DO i = nxl_z, nxr_z
544	tri(5,i,k) = tri(2,i,k) / tri(4,i,k-1)
545	tri(4,i,k) = tri(1,i,k) - tri(3,i,k-1) * tri(5,i,k)
546	ENDDO
547	ENDDO
548
549	END SUBROUTINE split
550
551	END SUBROUTINE tridia
552
553
554	#if defined( __parallel )
555	SUBROUTINE fftxp( ar, direction )
556
557	!------------------------------------------------------------------------------!
558	! Fourier-transformation along x-direction Parallelized version
559	!------------------------------------------------------------------------------!
560
561	IMPLICIT NONE
562
563	CHARACTER (LEN=*) :: direction
564	INTEGER :: j, k
565	REAL :: ar(0:nxa,nys_x:nyn_xa,nzb_x:nzt_xa)
566
567	!
568	!-- Performing the fft with one of the methods implemented
569	!$OMP PARALLEL PRIVATE ( j, k )
570	!$OMP DO
571	DO k = nzb_x, nzt_x
572	DO j = nys_x, nyn_x
573	CALL fft_x( ar(0:nx,j,k), direction )
574	ENDDO
575	ENDDO
576	!$OMP END PARALLEL
577
578	END SUBROUTINE fftxp
579
580	#else
581	SUBROUTINE fftx( ar, direction )
582
583	!------------------------------------------------------------------------------!
584	! Fourier-transformation along x-direction Non parallel version
585	!------------------------------------------------------------------------------!
586
587	IMPLICIT NONE
588
589	CHARACTER (LEN=*) :: direction
590	INTEGER :: i, j, k
591	REAL :: ar(1:nz,0:ny,0:nx)
592
593	!
594	!-- Performing the fft with one of the methods implemented
595	!$OMP PARALLEL PRIVATE ( j, k )
596	!$OMP DO
597	DO k = 1, nz
598	DO j = 0, ny
599	CALL fft_x( ar(k,j,0:nx), direction )
600	ENDDO
601	ENDDO
602	!$OMP END PARALLEL
603
604	END SUBROUTINE fftx
605	#endif
606
607
608	#if defined( __parallel )
609	SUBROUTINE fftyp( ar, direction )
610
611	!------------------------------------------------------------------------------!
612	! Fourier-transformation along y-direction Parallelized version
613	!------------------------------------------------------------------------------!
614
615	IMPLICIT NONE
616
617	CHARACTER (LEN=*) :: direction
618	INTEGER :: i, k
619	REAL :: ar(0:nya,nxl_y:nxr_ya,nzb_y:nzt_ya)
620
621	!
622	!-- Performing the fft with one of the methods implemented
623	!$OMP PARALLEL PRIVATE ( i, k )
624	!$OMP DO
625	DO k = nzb_y, nzt_y
626	DO i = nxl_y, nxr_y
627	CALL fft_y( ar(0:ny,i,k), direction )
628	ENDDO
629	ENDDO
630	!$OMP END PARALLEL
631
632	END SUBROUTINE fftyp
633
634	#else
635	SUBROUTINE ffty( ar, direction )
636
637	!------------------------------------------------------------------------------!
638	! Fourier-transformation along y-direction Non parallel version
639	!------------------------------------------------------------------------------!
640
641	IMPLICIT NONE
642
643	CHARACTER (LEN=*) :: direction
644	INTEGER :: i, k
645	REAL :: ar(1:nz,0:ny,0:nx)
646
647	!
648	!-- Performing the fft with one of the methods implemented
649	!$OMP PARALLEL PRIVATE ( i, k )
650	!$OMP DO
651	DO k = 1, nz
652	DO i = 0, nx
653	CALL fft_y( ar(k,0:ny,i), direction )
654	ENDDO
655	ENDDO
656	!$OMP END PARALLEL
657
658	END SUBROUTINE ffty
659	#endif
660
661	#if defined( __parallel )
662	SUBROUTINE ffty_tr_yx( f_in, work, f_out )
663
664	!------------------------------------------------------------------------------!
665	! Fourier-transformation along y with subsequent transposition y --> x for
666	! a 1d-decomposition along x
667	!
668	! ATTENTION: The performance of this routine is much faster on the NEC-SX6,
669	! if the first index of work_ffty_vec is odd. Otherwise
670	! memory bank conflicts may occur (especially if the index is a
671	! multiple of 128). That's why work_ffty_vec is dimensioned as
672	! 0:ny+1.
673	! Of course, this will not work if users are using an odd number
674	! of gridpoints along y.
675	!------------------------------------------------------------------------------!
676
677	USE control_parameters
678	USE cpulog
679	USE indices
680	USE interfaces
681	USE pegrid
682	USE transpose_indices
683
684	IMPLICIT NONE
685
686	INTEGER :: i, iend, iouter, ir, j, k
687	INTEGER, PARAMETER :: stridex = 4
688
689	REAL, DIMENSION(0:ny,stridex) :: work_ffty
690	#if defined( __nec )
691	REAL, DIMENSION(0:ny+1,1:nz,nxl:nxr) :: work_ffty_vec
692	#endif
693	REAL, DIMENSION(1:nza,0:nya,nxl:nxra) :: f_in
694	REAL, DIMENSION(nnx,1:nza,nys_x:nyn_xa,pdims(1)) :: f_out
695	REAL, DIMENSION(nxl:nxra,1:nza,0:nya) :: work
696
697	!
698	!-- Carry out the FFT along y, where all data are present due to the
699	!-- 1d-decomposition along x. Resort the data in a way that x becomes
700	!-- the first index.
701	CALL cpu_log( log_point_s(7), 'fft_y', 'start' )
702
703	IF ( host(1:3) == 'nec' ) THEN
704	#if defined( __nec )
705	!
706	!-- Code optimized for vector processors
707	!$OMP PARALLEL PRIVATE ( i, j, k )
708	!$OMP DO
709	DO i = nxl, nxr
710
711	DO j = 0, ny
712	DO k = 1, nz
713	work_ffty_vec(j,k,i) = f_in(k,j,i)
714	ENDDO
715	ENDDO
716
717	CALL fft_y_m( work_ffty_vec(:,:,i), ny+1, 'forward' )
718
719	ENDDO
720
721	!$OMP DO
722	DO k = 1, nz
723	DO j = 0, ny
724	DO i = nxl, nxr
725	work(i,k,j) = work_ffty_vec(j,k,i)
726	ENDDO
727	ENDDO
728	ENDDO
729	!$OMP END PARALLEL
730	#endif
731
732	ELSE
733
734	!
735	!-- Cache optimized code.
736	!-- The i-(x-)direction is split into a strided outer loop and an inner
737	!-- loop for better cache performance
738	!$OMP PARALLEL PRIVATE (i,iend,iouter,ir,j,k,work_ffty)
739	!$OMP DO
740	DO iouter = nxl, nxr, stridex
741
742	iend = MIN( iouter+stridex-1, nxr ) ! Upper bound for inner i loop
743
744	DO k = 1, nz
745
746	DO i = iouter, iend
747
748	ir = i-iouter+1 ! counter within a stride
749	DO j = 0, ny
750	work_ffty(j,ir) = f_in(k,j,i)
751	ENDDO
752	!
753	!-- FFT along y
754	CALL fft_y( work_ffty(:,ir), 'forward' )
755
756	ENDDO
757
758	!
759	!-- Resort
760	DO j = 0, ny
761	DO i = iouter, iend
762	work(i,k,j) = work_ffty(j,i-iouter+1)
763	ENDDO
764	ENDDO
765
766	ENDDO
767
768	ENDDO
769	!$OMP END PARALLEL
770
771	ENDIF
772	CALL cpu_log( log_point_s(7), 'fft_y', 'pause' )
773
774	!
775	!-- Transpose array
776	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
777	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
778	CALL MPI_ALLTOALL( work(nxl,1,0), sendrecvcount_xy, MPI_REAL, &
779	f_out(1,1,nys_x,1), sendrecvcount_xy, MPI_REAL, &
780	comm1dx, ierr )
781	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
782
783	END SUBROUTINE ffty_tr_yx
784
785
786	SUBROUTINE tr_xy_ffty( f_in, work, f_out )
787
788	!------------------------------------------------------------------------------!
789	! Transposition x --> y with a subsequent backward Fourier transformation for
790	! a 1d-decomposition along x
791	!------------------------------------------------------------------------------!
792
793	USE control_parameters
794	USE cpulog
795	USE indices
796	USE interfaces
797	USE pegrid
798	USE transpose_indices
799
800	IMPLICIT NONE
801
802	INTEGER :: i, iend, iouter, ir, j, k
803	INTEGER, PARAMETER :: stridex = 4
804
805	REAL, DIMENSION(0:ny,stridex) :: work_ffty
806	#if defined( __nec )
807	REAL, DIMENSION(0:ny+1,1:nz,nxl:nxr) :: work_ffty_vec
808	#endif
809	REAL, DIMENSION(nnx,1:nza,nys_x:nyn_xa,pdims(1)) :: f_in
810	REAL, DIMENSION(1:nza,0:nya,nxl:nxra) :: f_out
811	REAL, DIMENSION(nxl:nxra,1:nza,0:nya) :: work
812
813	!
814	!-- Transpose array
815	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
816	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
817	CALL MPI_ALLTOALL( f_in(1,1,nys_x,1), sendrecvcount_xy, MPI_REAL, &
818	work(nxl,1,0), sendrecvcount_xy, MPI_REAL, &
819	comm1dx, ierr )
820	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
821
822	!
823	!-- Resort the data in a way that y becomes the first index and carry out the
824	!-- backward fft along y.
825	CALL cpu_log( log_point_s(7), 'fft_y', 'continue' )
826
827	IF ( host(1:3) == 'nec' ) THEN
828	#if defined( __nec )
829	!
830	!-- Code optimized for vector processors
831	!$OMP PARALLEL PRIVATE ( i, j, k )
832	!$OMP DO
833	DO k = 1, nz
834	DO j = 0, ny
835	DO i = nxl, nxr
836	work_ffty_vec(j,k,i) = work(i,k,j)
837	ENDDO
838	ENDDO
839	ENDDO
840
841	!$OMP DO
842	DO i = nxl, nxr
843
844	CALL fft_y_m( work_ffty_vec(:,:,i), ny+1, 'backward' )
845
846	DO j = 0, ny
847	DO k = 1, nz
848	f_out(k,j,i) = work_ffty_vec(j,k,i)
849	ENDDO
850	ENDDO
851
852	ENDDO
853	!$OMP END PARALLEL
854	#endif
855
856	ELSE
857
858	!
859	!-- Cache optimized code.
860	!-- The i-(x-)direction is split into a strided outer loop and an inner
861	!-- loop for better cache performance
862	!$OMP PARALLEL PRIVATE ( i, iend, iouter, ir, j, k, work_ffty )
863	!$OMP DO
864	DO iouter = nxl, nxr, stridex
865
866	iend = MIN( iouter+stridex-1, nxr ) ! Upper bound for inner i loop
867
868	DO k = 1, nz
869	!
870	!-- Resort
871	DO j = 0, ny
872	DO i = iouter, iend
873	work_ffty(j,i-iouter+1) = work(i,k,j)
874	ENDDO
875	ENDDO
876
877	DO i = iouter, iend
878
879	!
880	!-- FFT along y
881	ir = i-iouter+1 ! counter within a stride
882	CALL fft_y( work_ffty(:,ir), 'backward' )
883
884	DO j = 0, ny
885	f_out(k,j,i) = work_ffty(j,ir)
886	ENDDO
887	ENDDO
888
889	ENDDO
890
891	ENDDO
892	!$OMP END PARALLEL
893
894	ENDIF
895
896	CALL cpu_log( log_point_s(7), 'fft_y', 'stop' )
897
898	END SUBROUTINE tr_xy_ffty
899
900
901	SUBROUTINE fftx_tri_fftx( ar )
902
903	!------------------------------------------------------------------------------!
904	! FFT along x, solution of the tridiagonal system and backward FFT for
905	! a 1d-decomposition along x
906	!
907	! WARNING: this subroutine may still not work for hybrid parallelization
908	! with OpenMP (for possible necessary changes see the original
909	! routine poisfft_hybrid, developed by Klaus Ketelsen, May 2002)
910	!------------------------------------------------------------------------------!
911
912	USE control_parameters
913	USE cpulog
914	USE grid_variables
915	USE indices
916	USE interfaces
917	USE pegrid
918	USE transpose_indices
919
920	IMPLICIT NONE
921
922	character(len=3) :: myth_char
923
924	INTEGER :: i, j, k, m, n, omp_get_thread_num, tn
925
926	REAL, DIMENSION(0:nx) :: work_fftx
927	REAL, DIMENSION(0:nx,1:nz) :: work_trix
928	REAL, DIMENSION(nnx,1:nza,nys_x:nyn_xa,pdims(1)) :: ar
929	REAL, DIMENSION(:,:,:,:), ALLOCATABLE :: tri
930
931
932	CALL cpu_log( log_point_s(33), 'fft_x + tridia', 'start' )
933
934	ALLOCATE( tri(5,0:nx,0:nz-1,0:threads_per_task-1) )
935
936	tn = 0 ! Default thread number in case of one thread
937	!$OMP PARALLEL DO PRIVATE ( i, j, k, m, n, tn, work_fftx, work_trix )
938	DO j = nys_x, nyn_x
939
940	!$ tn = omp_get_thread_num()
941
942	IF ( host(1:3) == 'nec' ) THEN
943	!
944	!-- Code optimized for vector processors
945	DO k = 1, nz
946
947	m = 0
948	DO n = 1, pdims(1)
949	DO i = 1, nnx_pe( n-1 ) ! WARN: pcoord(i) should be used!!
950	work_trix(m,k) = ar(i,k,j,n)
951	m = m + 1
952	ENDDO
953	ENDDO
954
955	ENDDO
956
957	CALL fft_x_m( work_trix, 'forward' )
958
959	ELSE
960	!
961	!-- Cache optimized code
962	DO k = 1, nz
963
964	m = 0
965	DO n = 1, pdims(1)
966	DO i = 1, nnx_pe( n-1 ) ! WARN: pcoord(i) should be used!!
967	work_fftx(m) = ar(i,k,j,n)
968	m = m + 1
969	ENDDO
970	ENDDO
971
972	CALL fft_x( work_fftx, 'forward' )
973
974	DO i = 0, nx
975	work_trix(i,k) = work_fftx(i)
976	ENDDO
977
978	ENDDO
979
980	ENDIF
981
982	!
983	!-- Solve the linear equation system
984	CALL tridia_1dd( ddx2, ddy2, nx, ny, j, work_trix, tri(:,:,:,tn) )
985
986	IF ( host(1:3) == 'nec' ) THEN
987	!
988	!-- Code optimized for vector processors
989	CALL fft_x_m( work_trix, 'backward' )
990
991	DO k = 1, nz
992
993	m = 0
994	DO n = 1, pdims(1)
995	DO i = 1, nnx_pe( n-1 ) ! WARN: pcoord(i) should be used!!
996	ar(i,k,j,n) = work_trix(m,k)
997	m = m + 1
998	ENDDO
999	ENDDO
1000
1001	ENDDO
1002
1003	ELSE
1004	!
1005	!-- Cache optimized code
1006	DO k = 1, nz
1007
1008	DO i = 0, nx
1009	work_fftx(i) = work_trix(i,k)
1010	ENDDO
1011
1012	CALL fft_x( work_fftx, 'backward' )
1013
1014	m = 0
1015	DO n = 1, pdims(1)
1016	DO i = 1, nnx_pe( n-1 ) ! WARN: pcoord(i) should be used!!
1017	ar(i,k,j,n) = work_fftx(m)
1018	m = m + 1
1019	ENDDO
1020	ENDDO
1021
1022	ENDDO
1023
1024	ENDIF
1025
1026	ENDDO
1027
1028	DEALLOCATE( tri )
1029
1030	CALL cpu_log( log_point_s(33), 'fft_x + tridia', 'stop' )
1031
1032	END SUBROUTINE fftx_tri_fftx
1033
1034
1035	SUBROUTINE fftx_tr_xy( f_in, work, f_out )
1036
1037	!------------------------------------------------------------------------------!
1038	! Fourier-transformation along x with subsequent transposition x --> y for
1039	! a 1d-decomposition along y
1040	!
1041	! ATTENTION: The NEC-branch of this routine may significantly profit from
1042	! further optimizations. So far, performance is much worse than
1043	! for routine ffty_tr_yx (more than three times slower).
1044	!------------------------------------------------------------------------------!
1045
1046	USE control_parameters
1047	USE cpulog
1048	USE indices
1049	USE interfaces
1050	USE pegrid
1051	USE transpose_indices
1052
1053	IMPLICIT NONE
1054
1055	INTEGER :: i, j, k
1056
1057	REAL, DIMENSION(0:nx,1:nz,nys:nyn) :: work_fftx
1058	REAL, DIMENSION(1:nza,nys:nyna,0:nxa) :: f_in
1059	REAL, DIMENSION(nny,1:nza,nxl_y:nxr_ya,pdims(2)) :: f_out
1060	REAL, DIMENSION(nys:nyna,1:nza,0:nxa) :: work
1061
1062	!
1063	!-- Carry out the FFT along x, where all data are present due to the
1064	!-- 1d-decomposition along y. Resort the data in a way that y becomes
1065	!-- the first index.
1066	CALL cpu_log( log_point_s(4), 'fft_x', 'start' )
1067
1068	IF ( host(1:3) == 'nec' ) THEN
1069	!
1070	!-- Code for vector processors
1071	!$OMP PARALLEL PRIVATE ( i, j, k )
1072	!$OMP DO
1073	DO i = 0, nx
1074
1075	DO j = nys, nyn
1076	DO k = 1, nz
1077	work_fftx(i,k,j) = f_in(k,j,i)
1078	ENDDO
1079	ENDDO
1080
1081	ENDDO
1082
1083	!$OMP DO
1084	DO j = nys, nyn
1085
1086	CALL fft_x_m( work_fftx(:,:,j), 'forward' )
1087
1088	DO k = 1, nz
1089	DO i = 0, nx
1090	work(j,k,i) = work_fftx(i,k,j)
1091	ENDDO
1092	ENDDO
1093
1094	ENDDO
1095	!$OMP END PARALLEL
1096
1097	ELSE
1098
1099	!
1100	!-- Cache optimized code (there might be still a potential for better
1101	!-- optimization).
1102	!$OMP PARALLEL PRIVATE (i,j,k)
1103	!$OMP DO
1104	DO i = 0, nx
1105
1106	DO j = nys, nyn
1107	DO k = 1, nz
1108	work_fftx(i,k,j) = f_in(k,j,i)
1109	ENDDO
1110	ENDDO
1111
1112	ENDDO
1113
1114	!$OMP DO
1115	DO j = nys, nyn
1116	DO k = 1, nz
1117
1118	CALL fft_x( work_fftx(0:nx,k,j), 'forward' )
1119
1120	DO i = 0, nx
1121	work(j,k,i) = work_fftx(i,k,j)
1122	ENDDO
1123	ENDDO
1124
1125	ENDDO
1126	!$OMP END PARALLEL
1127
1128	ENDIF
1129	CALL cpu_log( log_point_s(4), 'fft_x', 'pause' )
1130
1131	!
1132	!-- Transpose array
1133	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
1134	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
1135	CALL MPI_ALLTOALL( work(nys,1,0), sendrecvcount_xy, MPI_REAL, &
1136	f_out(1,1,nxl_y,1), sendrecvcount_xy, MPI_REAL, &
1137	comm1dy, ierr )
1138	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
1139
1140	END SUBROUTINE fftx_tr_xy
1141
1142
1143	SUBROUTINE tr_yx_fftx( f_in, work, f_out )
1144
1145	!------------------------------------------------------------------------------!
1146	! Transposition y --> x with a subsequent backward Fourier transformation for
1147	! a 1d-decomposition along x
1148	!------------------------------------------------------------------------------!
1149
1150	USE control_parameters
1151	USE cpulog
1152	USE indices
1153	USE interfaces
1154	USE pegrid
1155	USE transpose_indices
1156
1157	IMPLICIT NONE
1158
1159	INTEGER :: i, j, k
1160
1161	REAL, DIMENSION(0:nx,1:nz,nys:nyn) :: work_fftx
1162	REAL, DIMENSION(nny,1:nza,nxl_y:nxr_ya,pdims(2)) :: f_in
1163	REAL, DIMENSION(1:nza,nys:nyna,0:nxa) :: f_out
1164	REAL, DIMENSION(nys:nyna,1:nza,0:nxa) :: work
1165
1166	!
1167	!-- Transpose array
1168	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
1169	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
1170	CALL MPI_ALLTOALL( f_in(1,1,nxl_y,1), sendrecvcount_xy, MPI_REAL, &
1171	work(nys,1,0), sendrecvcount_xy, MPI_REAL, &
1172	comm1dy, ierr )
1173	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
1174
1175	!
1176	!-- Carry out the FFT along x, where all data are present due to the
1177	!-- 1d-decomposition along y. Resort the data in a way that y becomes
1178	!-- the first index.
1179	CALL cpu_log( log_point_s(4), 'fft_x', 'continue' )
1180
1181	IF ( host(1:3) == 'nec' ) THEN
1182	!
1183	!-- Code optimized for vector processors
1184	!$OMP PARALLEL PRIVATE ( i, j, k )
1185	!$OMP DO
1186	DO j = nys, nyn
1187
1188	DO k = 1, nz
1189	DO i = 0, nx
1190	work_fftx(i,k,j) = work(j,k,i)
1191	ENDDO
1192	ENDDO
1193
1194	CALL fft_x_m( work_fftx(:,:,j), 'backward' )
1195
1196	ENDDO
1197
1198	!$OMP DO
1199	DO i = 0, nx
1200	DO j = nys, nyn
1201	DO k = 1, nz
1202	f_out(k,j,i) = work_fftx(i,k,j)
1203	ENDDO
1204	ENDDO
1205	ENDDO
1206	!$OMP END PARALLEL
1207
1208	ELSE
1209
1210	!
1211	!-- Cache optimized code (there might be still a potential for better
1212	!-- optimization).
1213	!$OMP PARALLEL PRIVATE (i,j,k)
1214	!$OMP DO
1215	DO j = nys, nyn
1216	DO k = 1, nz
1217
1218	DO i = 0, nx
1219	work_fftx(i,k,j) = work(j,k,i)
1220	ENDDO
1221
1222	CALL fft_x( work_fftx(0:nx,k,j), 'backward' )
1223
1224	ENDDO
1225	ENDDO
1226
1227	!$OMP DO
1228	DO i = 0, nx
1229	DO j = nys, nyn
1230	DO k = 1, nz
1231	f_out(k,j,i) = work_fftx(i,k,j)
1232	ENDDO
1233	ENDDO
1234	ENDDO
1235	!$OMP END PARALLEL
1236
1237	ENDIF
1238	CALL cpu_log( log_point_s(4), 'fft_x', 'stop' )
1239
1240	END SUBROUTINE tr_yx_fftx
1241
1242
1243	SUBROUTINE ffty_tri_ffty( ar )
1244
1245	!------------------------------------------------------------------------------!
1246	! FFT along y, solution of the tridiagonal system and backward FFT for
1247	! a 1d-decomposition along y
1248	!
1249	! WARNING: this subroutine may still not work for hybrid parallelization
1250	! with OpenMP (for possible necessary changes see the original
1251	! routine poisfft_hybrid, developed by Klaus Ketelsen, May 2002)
1252	!------------------------------------------------------------------------------!
1253
1254	USE control_parameters
1255	USE cpulog
1256	USE grid_variables
1257	USE indices
1258	USE interfaces
1259	USE pegrid
1260	USE transpose_indices
1261
1262	IMPLICIT NONE
1263
1264	INTEGER :: i, j, k, m, n, omp_get_thread_num, tn
1265
1266	REAL, DIMENSION(0:ny) :: work_ffty
1267	REAL, DIMENSION(0:ny,1:nz) :: work_triy
1268	REAL, DIMENSION(nny,1:nza,nxl_y:nxr_ya,pdims(2)) :: ar
1269	REAL, DIMENSION(:,:,:,:), ALLOCATABLE :: tri
1270
1271
1272	CALL cpu_log( log_point_s(39), 'fft_y + tridia', 'start' )
1273
1274	ALLOCATE( tri(5,0:ny,0:nz-1,0:threads_per_task-1) )
1275
1276	tn = 0 ! Default thread number in case of one thread
1277	!$OMP PARALLEL DO PRIVATE ( i, j, k, m, n, tn, work_ffty, work_triy )
1278	DO i = nxl_y, nxr_y
1279
1280	!$ tn = omp_get_thread_num()
1281
1282	IF ( host(1:3) == 'nec' ) THEN
1283	!
1284	!-- Code optimized for vector processors
1285	DO k = 1, nz
1286
1287	m = 0
1288	DO n = 1, pdims(2)
1289	DO j = 1, nny_pe( n-1 ) ! WARN: pcoord(j) should be used!!
1290	work_triy(m,k) = ar(j,k,i,n)
1291	m = m + 1
1292	ENDDO
1293	ENDDO
1294
1295	ENDDO
1296
1297	CALL fft_y_m( work_triy, ny, 'forward' )
1298
1299	ELSE
1300	!
1301	!-- Cache optimized code
1302	DO k = 1, nz
1303
1304	m = 0
1305	DO n = 1, pdims(2)
1306	DO j = 1, nny_pe( n-1 ) ! WARN: pcoord(j) should be used!!
1307	work_ffty(m) = ar(j,k,i,n)
1308	m = m + 1
1309	ENDDO
1310	ENDDO
1311
1312	CALL fft_y( work_ffty, 'forward' )
1313
1314	DO j = 0, ny
1315	work_triy(j,k) = work_ffty(j)
1316	ENDDO
1317
1318	ENDDO
1319
1320	ENDIF
1321
1322	!
1323	!-- Solve the linear equation system
1324	CALL tridia_1dd( ddy2, ddx2, ny, nx, i, work_triy, tri(:,:,:,tn) )
1325
1326	IF ( host(1:3) == 'nec' ) THEN
1327	!
1328	!-- Code optimized for vector processors
1329	CALL fft_y_m( work_triy, ny, 'backward' )
1330
1331	DO k = 1, nz
1332
1333	m = 0
1334	DO n = 1, pdims(2)
1335	DO j = 1, nny_pe( n-1 ) ! WARN: pcoord(j) should be used!!
1336	ar(j,k,i,n) = work_triy(m,k)
1337	m = m + 1
1338	ENDDO
1339	ENDDO
1340
1341	ENDDO
1342
1343	ELSE
1344	!
1345	!-- Cache optimized code
1346	DO k = 1, nz
1347
1348	DO j = 0, ny
1349	work_ffty(j) = work_triy(j,k)
1350	ENDDO
1351
1352	CALL fft_y( work_ffty, 'backward' )
1353
1354	m = 0
1355	DO n = 1, pdims(2)
1356	DO j = 1, nny_pe( n-1 ) ! WARN: pcoord(j) should be used!!
1357	ar(j,k,i,n) = work_ffty(m)
1358	m = m + 1
1359	ENDDO
1360	ENDDO
1361
1362	ENDDO
1363
1364	ENDIF
1365
1366	ENDDO
1367
1368	DEALLOCATE( tri )
1369
1370	CALL cpu_log( log_point_s(39), 'fft_y + tridia', 'stop' )
1371
1372	END SUBROUTINE ffty_tri_ffty
1373
1374
1375	SUBROUTINE tridia_1dd( ddx2, ddy2, nx, ny, j, ar, tri )
1376
1377	!------------------------------------------------------------------------------!
1378	! Solves the linear system of equations for a 1d-decomposition along x (see
1379	! tridia)
1380	!
1381	! Attention: when using the intel compiler, array tri must be passed as an
1382	! argument to the contained subroutines. Otherwise addres faults
1383	! will occur.
1384	! On NEC, tri should not be passed (except for routine substi_1dd)
1385	! because this causes very bad performance.
1386	!------------------------------------------------------------------------------!
1387
1388	USE arrays_3d
1389	USE control_parameters
1390
1391	USE pegrid
1392
1393	IMPLICIT NONE
1394
1395	INTEGER :: i, j, k, nnyh, nx, ny, omp_get_thread_num, tn
1396
1397	REAL :: ddx2, ddy2
1398
1399	REAL, DIMENSION(0:nx,1:nz) :: ar
1400	REAL, DIMENSION(0:nx,0:nz-1) :: ar1
1401	REAL, DIMENSION(5,0:nx,0:nz-1) :: tri
1402
1403
1404	nnyh = ( ny + 1 ) / 2
1405
1406	!
1407	!-- Define constant elements of the tridiagonal matrix.
1408	!-- The compiler on SX6 does loop exchange. If 0:nx is a high power of 2,
1409	!-- the exchanged loops create bank conflicts. The following directive
1410	!-- prohibits loop exchange and the loops perform much better.
1411	! tn = omp_get_thread_num()
1412	! WRITE( 120+tn, * ) '+++ id=',myid,' nx=',nx,' thread=', omp_get_thread_num()
1413	! CALL local_flush( 120+tn )
1414	!CDIR NOLOOPCHG
1415	DO k = 0, nz-1
1416	DO i = 0,nx
1417	tri(2,i,k) = ddzu_pres(k+1) * ddzw(k+1)
1418	tri(3,i,k) = ddzu_pres(k+2) * ddzw(k+1)
1419	ENDDO
1420	ENDDO
1421	! WRITE( 120+tn, * ) '+++ id=',myid,' end of first tridia loop thread=', omp_get_thread_num()
1422	! CALL local_flush( 120+tn )
1423
1424	IF ( j <= nnyh ) THEN
1425	#if defined( __lc )
1426	CALL maketri_1dd( j, tri )
1427	#else
1428	CALL maketri_1dd( j )
1429	#endif
1430	ELSE
1431	#if defined( __lc )
1432	CALL maketri_1dd( ny+1-j, tri )
1433	#else
1434	CALL maketri_1dd( ny+1-j )
1435	#endif
1436	ENDIF
1437	#if defined( __lc )
1438	CALL split_1dd( tri )
1439	#else
1440	CALL split_1dd
1441	#endif
1442	CALL substi_1dd( ar, tri )
1443
1444	CONTAINS
1445
1446	#if defined( __lc )
1447	SUBROUTINE maketri_1dd( j, tri )
1448	#else
1449	SUBROUTINE maketri_1dd( j )
1450	#endif
1451
1452	!------------------------------------------------------------------------------!
1453	! computes the i- and j-dependent component of the matrix
1454	!------------------------------------------------------------------------------!
1455
1456	USE constants
1457
1458	IMPLICIT NONE
1459
1460	INTEGER :: i, j, k, nnxh
1461	REAL :: a, c
1462
1463	REAL, DIMENSION(0:nx) :: l
1464
1465	#if defined( __lc )
1466	REAL, DIMENSION(5,0:nx,0:nz-1) :: tri
1467	#endif
1468
1469
1470	nnxh = ( nx + 1 ) / 2
1471	!
1472	!-- Provide the tridiagonal matrix for solution of the Poisson equation in
1473	!-- Fourier space. The coefficients are computed following the method of
1474	!-- Schmidt et al. (DFVLR-Mitteilung 84-15), which departs from Stephan
1475	!-- Siano's original version by discretizing the Poisson equation,
1476	!-- before it is Fourier-transformed
1477	DO i = 0, nx
1478	IF ( i >= 0 .AND. i <= nnxh ) THEN
1479	l(i) = 2.0 * ( 1.0 - COS( ( 2.0 * pi * i ) / &
1480	FLOAT( nx+1 ) ) ) * ddx2 + &
1481	2.0 * ( 1.0 - COS( ( 2.0 * pi * j ) / &
1482	FLOAT( ny+1 ) ) ) * ddy2
1483	ELSE
1484	l(i) = 2.0 * ( 1.0 - COS( ( 2.0 * pi * ( nx+1-i ) ) / &
1485	FLOAT( nx+1 ) ) ) * ddx2 + &
1486	2.0 * ( 1.0 - COS( ( 2.0 * pi * j ) / &
1487	FLOAT( ny+1 ) ) ) * ddy2
1488	ENDIF
1489	ENDDO
1490
1491	DO k = 0, nz-1
1492	DO i = 0, nx
1493	a = -1.0 * ddzu_pres(k+2) * ddzw(k+1)
1494	c = -1.0 * ddzu_pres(k+1) * ddzw(k+1)
1495	tri(1,i,k) = a + c - l(i)
1496	ENDDO
1497	ENDDO
1498	IF ( ibc_p_b == 1 .OR. ibc_p_b == 2 ) THEN
1499	DO i = 0, nx
1500	tri(1,i,0) = tri(1,i,0) + tri(2,i,0)
1501	ENDDO
1502	ENDIF
1503	IF ( ibc_p_t == 1 ) THEN
1504	DO i = 0, nx
1505	tri(1,i,nz-1) = tri(1,i,nz-1) + tri(3,i,nz-1)
1506	ENDDO
1507	ENDIF
1508
1509	END SUBROUTINE maketri_1dd
1510
1511
1512	#if defined( __lc )
1513	SUBROUTINE split_1dd( tri )
1514	#else
1515	SUBROUTINE split_1dd
1516	#endif
1517
1518	!------------------------------------------------------------------------------!
1519	! Splitting of the tridiagonal matrix (Thomas algorithm)
1520	!------------------------------------------------------------------------------!
1521
1522	IMPLICIT NONE
1523
1524	INTEGER :: i, k
1525
1526	#if defined( __lc )
1527	REAL, DIMENSION(5,0:nx,0:nz-1) :: tri
1528	#endif
1529
1530
1531	!
1532	!-- Splitting
1533	DO i = 0, nx
1534	tri(4,i,0) = tri(1,i,0)
1535	ENDDO
1536	DO k = 1, nz-1
1537	DO i = 0, nx
1538	tri(5,i,k) = tri(2,i,k) / tri(4,i,k-1)
1539	tri(4,i,k) = tri(1,i,k) - tri(3,i,k-1) * tri(5,i,k)
1540	ENDDO
1541	ENDDO
1542
1543	END SUBROUTINE split_1dd
1544
1545
1546	SUBROUTINE substi_1dd( ar, tri )
1547
1548	!------------------------------------------------------------------------------!
1549	! Substitution (Forward and Backward) (Thomas algorithm)
1550	!------------------------------------------------------------------------------!
1551
1552	IMPLICIT NONE
1553
1554	INTEGER :: i, k
1555
1556	REAL, DIMENSION(0:nx,nz) :: ar
1557	REAL, DIMENSION(0:nx,0:nz-1) :: ar1
1558	REAL, DIMENSION(5,0:nx,0:nz-1) :: tri
1559
1560	!
1561	!-- Forward substitution
1562	DO i = 0, nx
1563	ar1(i,0) = ar(i,1)
1564	ENDDO
1565	DO k = 1, nz-1
1566	DO i = 0, nx
1567	ar1(i,k) = ar(i,k+1) - tri(5,i,k) * ar1(i,k-1)
1568	ENDDO
1569	ENDDO
1570
1571	!
1572	!-- Backward substitution
1573	!-- Note, the add of 1.0E-20 in the denominator is due to avoid divisions
1574	!-- by zero appearing if the pressure bc is set to neumann at the top of
1575	!-- the model domain.
1576	DO i = 0, nx
1577	ar(i,nz) = ar1(i,nz-1) / ( tri(4,i,nz-1) + 1.0E-20 )
1578	ENDDO
1579	DO k = nz-2, 0, -1
1580	DO i = 0, nx
1581	ar(i,k+1) = ( ar1(i,k) - tri(3,i,k) * ar(i,k+2) ) &
1582	/ tri(4,i,k)
1583	ENDDO
1584	ENDDO
1585
1586	!
1587	!-- Indices i=0, j=0 correspond to horizontally averaged pressure.
1588	!-- The respective values of ar should be zero at all k-levels if
1589	!-- acceleration of horizontally averaged vertical velocity is zero.
1590	IF ( ibc_p_b == 1 .AND. ibc_p_t == 1 ) THEN
1591	IF ( j == 0 ) THEN
1592	DO k = 1, nz
1593	ar(0,k) = 0.0
1594	ENDDO
1595	ENDIF
1596	ENDIF
1597
1598	END SUBROUTINE substi_1dd
1599
1600	END SUBROUTINE tridia_1dd
1601
1602	#endif
1603	#endif
1604	END MODULE poisfft_mod

Note: See TracBrowser for help on using the repository browser.

Download in other formats:

| Impressum | ©Leibniz Universität Hannover |