Home

Context Navigation

source: palm/trunk/SOURCE/poisfft.f90 @ 931

Last change on this file since 931 was 878, checked in by suehring, 12 years ago
last commit documented
Property svn:keywords set to `Id`
File size: 46.9 KB

Line
1	MODULE poisfft_mod
2
3	!------------------------------------------------------------------------------!
4	! Current revisions:
5	! -----------------
6	!
7	! Former revisions:
8	! -----------------
9	! $Id: poisfft.f90 878 2012-04-03 11:23:30Z maronga $
10	!
11	! 877 2012-04-03 11:21:44Z suehring
12	! Bugfix: Avoid divisions by zero in case of using a 'neumann' bc for the
13	! pressure at the top of the model domain.
14	!
15	! 809 2012-01-30 13:32:58Z maronga
16	! Bugfix: replaced .AND. and .NOT. with && and ! in the preprocessor directives
17	!
18	! 807 2012-01-25 11:53:51Z maronga
19	! New cpp directive "__check" implemented which is used by check_namelist_files
20	! (most of the code is unneeded by check_namelist_files).
21	!
22	! 763 2011-10-06 09:32:09Z suehring
23	! Comment added concerning the last change.
24	!
25	! 761 2011-10-05 17:58:52Z suehring
26	! Bugfix: Avoid divisions by zero in case of using a 'neumann' bc for the
27	! pressure at the top of the model domain.
28	!
29	! 696 2011-03-18 07:03:49Z raasch
30	! work_fftx removed from PRIVATE clauses in fftx_tr_xy and tr_yx_fftx
31	!
32	! 683 2011-02-09 14:25:15Z raasch
33	! openMP parallelization for 2d-domain-decomposition
34	!
35	! 667 2010-12-23 12:06:00Z suehring/gryschka
36	! ddzu replaced by ddzu_pres due to changes in zu(0)
37	!
38	! 622 2010-12-10 08:08:13Z raasch
39	! optional barriers included in order to speed up collective operations
40	!
41	! 377 2009-09-04 11:09:00Z raasch
42	! __lcmuk changed to __lc to avoid problems with Intel compiler on sgi-ice
43	!
44	! 164 2008-05-15 08:46:15Z raasch
45	! Arguments removed from transpose routines
46	!
47	! 128 2007-10-26 13:11:14Z raasch
48	! Bugfix: wavenumber calculation for even nx in routines maketri
49	!
50	! 85 2007-05-11 09:35:14Z raasch
51	! Bugfix: work_fft*_vec removed from some PRIVATE-declarations
52	!
53	! 76 2007-03-29 00:58:32Z raasch
54	! Tridiagonal coefficients adjusted for Neumann boundary conditions both at
55	! the bottom and the top.
56	!
57	! RCS Log replace by Id keyword, revision history cleaned up
58	!
59	! Revision 1.24 2006/08/04 15:00:24 raasch
60	! Default setting of the thread number tn in case of not using OpenMP
61	!
62	! Revision 1.23 2006/02/23 12:48:38 raasch
63	! Additional compiler directive in routine tridia_1dd for preventing loop
64	! exchange on NEC-SX6
65	!
66	! Revision 1.20 2004/04/30 12:38:09 raasch
67	! Parts of former poisfft_hybrid moved to this subroutine,
68	! former subroutine changed to a module, renaming of FFT-subroutines and
69	! -module, FFTs completely substituted by calls of fft_x and fft_y,
70	! NAG fft used in the non-parallel case completely removed, l in maketri
71	! is now a 1d-array, variables passed by modules instead of using parameter
72	! lists, enlarged transposition arrays introduced
73	!
74	! Revision 1.1 1997/07/24 11:24:14 raasch
75	! Initial revision
76	!
77	!
78	! Description:
79	! ------------
80	! See below.
81	!------------------------------------------------------------------------------!
82
83	!--------------------------------------------------------------------------!
84	! poisfft !
85	! !
86	! Original version: Stephan Siano (pois3d) !
87	! !
88	! Institute of Meteorology and Climatology, University of Hannover !
89	! Germany !
90	! !
91	! Version as of July 23,1996 !
92	! !
93	! !
94	! Version for parallel computers: Siegfried Raasch !
95	! !
96	! Version as of July 03,1997 !
97	! !
98	! Solves the Poisson equation with a 2D spectral method !
99	! d^2 p / dx^2 + d^2 p / dy^2 + d^2 p / dz^2 = s !
100	! !
101	! Input: !
102	! real ar contains in the (nnx,nny,nnz) elements, !
103	! starting from the element (1,nys,nxl), the !
104	! values for s !
105	! real work Temporary array !
106	! !
107	! Output: !
108	! real ar contains the solution for p !
109	!--------------------------------------------------------------------------!
110
111	USE fft_xy
112	USE indices
113	USE transpose_indices
114
115	IMPLICIT NONE
116
117	PRIVATE
118
119	#if ! defined ( __check )
120	PUBLIC poisfft, poisfft_init
121
122	INTERFACE poisfft
123	MODULE PROCEDURE poisfft
124	END INTERFACE poisfft
125
126	INTERFACE poisfft_init
127	MODULE PROCEDURE poisfft_init
128	END INTERFACE poisfft_init
129	#else
130	PUBLIC poisfft_init
131
132	INTERFACE poisfft_init
133	MODULE PROCEDURE poisfft_init
134	END INTERFACE poisfft_init
135	#endif
136
137	CONTAINS
138
139	SUBROUTINE poisfft_init
140
141	CALL fft_init
142
143	END SUBROUTINE poisfft_init
144
145	#if ! defined ( __check )
146	SUBROUTINE poisfft( ar, work )
147
148	USE cpulog
149	USE interfaces
150	USE pegrid
151
152	IMPLICIT NONE
153
154	REAL, DIMENSION(1:nza,nys:nyna,nxl:nxra) :: ar, work
155
156
157	CALL cpu_log( log_point_s(3), 'poisfft', 'start' )
158
159	!
160	!-- Two-dimensional Fourier Transformation in x- and y-direction.
161	#if defined( __parallel )
162	IF ( pdims(2) == 1 ) THEN
163
164	!
165	!-- 1d-domain-decomposition along x:
166	!-- FFT along y and transposition y --> x
167	CALL ffty_tr_yx( ar, work, ar )
168
169	!
170	!-- FFT along x, solving the tridiagonal system and backward FFT
171	CALL fftx_tri_fftx( ar )
172
173	!
174	!-- Transposition x --> y and backward FFT along y
175	CALL tr_xy_ffty( ar, work, ar )
176
177	ELSEIF ( pdims(1) == 1 ) THEN
178
179	!
180	!-- 1d-domain-decomposition along y:
181	!-- FFT along x and transposition x --> y
182	CALL fftx_tr_xy( ar, work, ar )
183
184	!
185	!-- FFT along y, solving the tridiagonal system and backward FFT
186	CALL ffty_tri_ffty( ar )
187
188	!
189	!-- Transposition y --> x and backward FFT along x
190	CALL tr_yx_fftx( ar, work, ar )
191
192	ELSE
193
194	!
195	!-- 2d-domain-decomposition
196	!-- Transposition z --> x
197	CALL cpu_log( log_point_s(5), 'transpo forward', 'start' )
198	CALL transpose_zx( ar, work, ar )
199	CALL cpu_log( log_point_s(5), 'transpo forward', 'pause' )
200
201	CALL cpu_log( log_point_s(4), 'fft_x', 'start' )
202	CALL fftxp( ar, 'forward' )
203	CALL cpu_log( log_point_s(4), 'fft_x', 'pause' )
204
205	!
206	!-- Transposition x --> y
207	CALL cpu_log( log_point_s(5), 'transpo forward', 'continue' )
208	CALL transpose_xy( ar, work, ar )
209	CALL cpu_log( log_point_s(5), 'transpo forward', 'pause' )
210
211	CALL cpu_log( log_point_s(7), 'fft_y', 'start' )
212	CALL fftyp( ar, 'forward' )
213	CALL cpu_log( log_point_s(7), 'fft_y', 'pause' )
214
215	!
216	!-- Transposition y --> z
217	CALL cpu_log( log_point_s(5), 'transpo forward', 'continue' )
218	CALL transpose_yz( ar, work, ar )
219	CALL cpu_log( log_point_s(5), 'transpo forward', 'stop' )
220
221	!
222	!-- Solve the Poisson equation in z-direction in cartesian space.
223	CALL cpu_log( log_point_s(6), 'tridia', 'start' )
224	CALL tridia( ar )
225	CALL cpu_log( log_point_s(6), 'tridia', 'stop' )
226
227	!
228	!-- Inverse Fourier Transformation
229	!-- Transposition z --> y
230	CALL cpu_log( log_point_s(8), 'transpo invers', 'start' )
231	CALL transpose_zy( ar, work, ar )
232	CALL cpu_log( log_point_s(8), 'transpo invers', 'pause' )
233
234	CALL cpu_log( log_point_s(7), 'fft_y', 'continue' )
235	CALL fftyp( ar, 'backward' )
236	CALL cpu_log( log_point_s(7), 'fft_y', 'stop' )
237
238	!
239	!-- Transposition y --> x
240	CALL cpu_log( log_point_s(8), 'transpo invers', 'continue' )
241	CALL transpose_yx( ar, work, ar )
242	CALL cpu_log( log_point_s(8), 'transpo invers', 'pause' )
243
244	CALL cpu_log( log_point_s(4), 'fft_x', 'continue' )
245	CALL fftxp( ar, 'backward' )
246	CALL cpu_log( log_point_s(4), 'fft_x', 'stop' )
247
248	!
249	!-- Transposition x --> z
250	CALL cpu_log( log_point_s(8), 'transpo invers', 'continue' )
251	CALL transpose_xz( ar, work, ar )
252	CALL cpu_log( log_point_s(8), 'transpo invers', 'stop' )
253
254	ENDIF
255
256	#else
257
258	!
259	!-- Two-dimensional Fourier Transformation along x- and y-direction.
260	CALL cpu_log( log_point_s(4), 'fft_x', 'start' )
261	CALL fftx( ar, 'forward' )
262	CALL cpu_log( log_point_s(4), 'fft_x', 'pause' )
263	CALL cpu_log( log_point_s(7), 'fft_y', 'start' )
264	CALL ffty( ar, 'forward' )
265	CALL cpu_log( log_point_s(7), 'fft_y', 'pause' )
266
267	!
268	!-- Solve the Poisson equation in z-direction in cartesian space.
269	CALL cpu_log( log_point_s(6), 'tridia', 'start' )
270	CALL tridia( ar )
271	CALL cpu_log( log_point_s(6), 'tridia', 'stop' )
272
273	!
274	!-- Inverse Fourier Transformation.
275	CALL cpu_log( log_point_s(7), 'fft_y', 'continue' )
276	CALL ffty( ar, 'backward' )
277	CALL cpu_log( log_point_s(7), 'fft_y', 'stop' )
278	CALL cpu_log( log_point_s(4), 'fft_x', 'continue' )
279	CALL fftx( ar, 'backward' )
280	CALL cpu_log( log_point_s(4), 'fft_x', 'stop' )
281
282	#endif
283
284	CALL cpu_log( log_point_s(3), 'poisfft', 'stop' )
285
286	END SUBROUTINE poisfft
287
288
289
290	SUBROUTINE tridia( ar )
291
292	!------------------------------------------------------------------------------!
293	! solves the linear system of equations:
294	!
295	! -(4 pi^2(i^2/(dx^2nnx^2)+j^2/(dy^2nny^2))+
296	! 1/(dzu(k)dzw(k))+1/(dzu(k-1)dzw(k)))*p(i,j,k)+
297	! 1/(dzu(k)dzw(k))p(i,j,k+1)+1/(dzu(k-1)dzw(k))p(i,j,k-1)=d(i,j,k)
298	!
299	! by using the Thomas algorithm
300	!------------------------------------------------------------------------------!
301
302	USE arrays_3d
303
304	IMPLICIT NONE
305
306	INTEGER :: i, j, k, nnyh
307
308	REAL, DIMENSION(nxl_z:nxr_z,0:nz-1) :: ar1
309	REAL, DIMENSION(5,nxl_z:nxr_z,0:nz-1) :: tri
310
311	#if defined( __parallel )
312	REAL :: ar(nxl_z:nxr_za,nys_z:nyn_za,1:nza)
313	#else
314	REAL :: ar(1:nz,nys_z:nyn_z,nxl_z:nxr_z)
315	#endif
316
317
318	nnyh = (ny+1) / 2
319
320	!
321	!-- Define constant elements of the tridiagonal matrix.
322	!$OMP PARALLEL PRIVATE ( k, i )
323	!$OMP DO
324	DO k = 0, nz-1
325	DO i = nxl_z, nxr_z
326	tri(2,i,k) = ddzu_pres(k+1) * ddzw(k+1)
327	tri(3,i,k) = ddzu_pres(k+2) * ddzw(k+1)
328	ENDDO
329	ENDDO
330	!$OMP END PARALLEL
331
332	#if defined( __parallel )
333	!
334	!-- Repeat for all y-levels.
335	!$OMP PARALLEL FIRSTPRIVATE( tri ) PRIVATE ( ar1, j )
336	!$OMP DO
337	DO j = nys_z, nyn_z
338	IF ( j <= nnyh ) THEN
339	CALL maketri( tri, j )
340	ELSE
341	CALL maketri( tri, ny+1-j )
342	ENDIF
343	CALL split( tri )
344	CALL substi( ar, ar1, tri, j )
345	ENDDO
346	!$OMP END PARALLEL
347	#else
348	!
349	!-- First y-level.
350	CALL maketri( tri, nys_z )
351	CALL split( tri )
352	CALL substi( ar, ar1, tri, 0 )
353
354	!
355	!-- Further y-levels.
356	DO j = 1, nnyh - 1
357	CALL maketri( tri, j )
358	CALL split( tri )
359	CALL substi( ar, ar1, tri, j )
360	CALL substi( ar, ar1, tri, ny+1-j )
361	ENDDO
362	CALL maketri( tri, nnyh )
363	CALL split( tri )
364	CALL substi( ar, ar1, tri, nnyh+nys )
365	#endif
366
367	CONTAINS
368
369	SUBROUTINE maketri( tri, j )
370
371	!------------------------------------------------------------------------------!
372	! Computes the i- and j-dependent component of the matrix
373	!------------------------------------------------------------------------------!
374
375	USE arrays_3d
376	USE constants
377	USE control_parameters
378	USE grid_variables
379
380	IMPLICIT NONE
381
382	INTEGER :: i, j, k, nnxh
383	REAL :: a, c
384	REAL :: ll(nxl_z:nxr_z)
385	REAL :: tri(5,nxl_z:nxr_z,0:nz-1)
386
387
388	nnxh = ( nx + 1 ) / 2
389
390	!
391	!-- Provide the tridiagonal matrix for solution of the Poisson equation in
392	!-- Fourier space. The coefficients are computed following the method of
393	!-- Schmidt et al. (DFVLR-Mitteilung 84-15), which departs from Stephan
394	!-- Siano's original version by discretizing the Poisson equation,
395	!-- before it is Fourier-transformed
396	#if defined( __parallel )
397	DO i = nxl_z, nxr_z
398	IF ( i >= 0 .AND. i <= nnxh ) THEN
399	ll(i) = 2.0 * ( 1.0 - COS( ( 2.0 * pi * i ) / &
400	FLOAT( nx+1 ) ) ) / ( dx * dx ) + &
401	2.0 * ( 1.0 - COS( ( 2.0 * pi * j ) / &
402	FLOAT( ny+1 ) ) ) / ( dy * dy )
403	ELSE
404	ll(i) = 2.0 * ( 1.0 - COS( ( 2.0 * pi * ( nx+1-i ) ) / &
405	FLOAT( nx+1 ) ) ) / ( dx * dx ) + &
406	2.0 * ( 1.0 - COS( ( 2.0 * pi * j ) / &
407	FLOAT( ny+1 ) ) ) / ( dy * dy )
408	ENDIF
409	DO k = 0,nz-1
410	a = -1.0 * ddzu_pres(k+2) * ddzw(k+1)
411	c = -1.0 * ddzu_pres(k+1) * ddzw(k+1)
412	tri(1,i,k) = a + c - ll(i)
413	ENDDO
414	ENDDO
415	#else
416	DO i = 0, nnxh
417	ll(i) = 2.0 * ( 1.0 - COS( ( 2.0 * pi * i ) / FLOAT( nx+1 ) ) ) / &
418	( dx * dx ) + &
419	2.0 * ( 1.0 - COS( ( 2.0 * pi * j ) / FLOAT( ny+1 ) ) ) / &
420	( dy * dy )
421	DO k = 0, nz-1
422	a = -1.0 * ddzu_pres(k+2) * ddzw(k+1)
423	c = -1.0 * ddzu_pres(k+1) * ddzw(k+1)
424	tri(1,i,k) = a + c - ll(i)
425	IF ( i >= 1 .and. i < nnxh ) THEN
426	tri(1,nx+1-i,k) = tri(1,i,k)
427	ENDIF
428	ENDDO
429	ENDDO
430	#endif
431	IF ( ibc_p_b == 1 .OR. ibc_p_b == 2 ) THEN
432	DO i = nxl_z, nxr_z
433	tri(1,i,0) = tri(1,i,0) + tri(2,i,0)
434	ENDDO
435	ENDIF
436	IF ( ibc_p_t == 1 ) THEN
437	DO i = nxl_z, nxr_z
438	tri(1,i,nz-1) = tri(1,i,nz-1) + tri(3,i,nz-1)
439	ENDDO
440	ENDIF
441
442	END SUBROUTINE maketri
443
444
445	SUBROUTINE substi( ar, ar1, tri, j )
446
447	!------------------------------------------------------------------------------!
448	! Substitution (Forward and Backward) (Thomas algorithm)
449	!------------------------------------------------------------------------------!
450
451	USE control_parameters
452
453	IMPLICIT NONE
454
455	INTEGER :: i, j, k
456	REAL :: ar1(nxl_z:nxr_z,0:nz-1)
457	REAL :: tri(5,nxl_z:nxr_z,0:nz-1)
458	#if defined( __parallel )
459	REAL :: ar(nxl_z:nxr_za,nys_z:nyn_za,1:nza)
460	#else
461	REAL :: ar(1:nz,nys_z:nyn_z,nxl_z:nxr_z)
462	#endif
463
464	!
465	!-- Forward substitution.
466	DO i = nxl_z, nxr_z
467	#if defined( __parallel )
468	ar1(i,0) = ar(i,j,1)
469	#else
470	ar1(i,0) = ar(1,j,i)
471	#endif
472	ENDDO
473	DO k = 1, nz - 1
474	DO i = nxl_z, nxr_z
475	#if defined( __parallel )
476	ar1(i,k) = ar(i,j,k+1) - tri(5,i,k) * ar1(i,k-1)
477	#else
478	ar1(i,k) = ar(k+1,j,i) - tri(5,i,k) * ar1(i,k-1)
479	#endif
480	ENDDO
481	ENDDO
482
483	!
484	!-- Backward substitution
485	!-- Note, the 1.0E-20 in the denominator is due to avoid divisions
486	!-- by zero appearing if the pressure bc is set to neumann at the top of
487	!-- the model domain.
488	DO i = nxl_z, nxr_z
489	#if defined( __parallel )
490	ar(i,j,nz) = ar1(i,nz-1) / ( tri(4,i,nz-1) + 1.0E-20 )
491	#else
492	ar(nz,j,i) = ar1(i,nz-1) / ( tri(4,i,nz-1) + 1.0E-20 )
493	#endif
494	ENDDO
495	DO k = nz-2, 0, -1
496	DO i = nxl_z, nxr_z
497	#if defined( __parallel )
498	ar(i,j,k+1) = ( ar1(i,k) - tri(3,i,k) * ar(i,j,k+2) ) &
499	/ tri(4,i,k)
500	#else
501	ar(k+1,j,i) = ( ar1(i,k) - tri(3,i,k) * ar(k+2,j,i) ) &
502	/ tri(4,i,k)
503	#endif
504	ENDDO
505	ENDDO
506
507	!
508	!-- Indices i=0, j=0 correspond to horizontally averaged pressure.
509	!-- The respective values of ar should be zero at all k-levels if
510	!-- acceleration of horizontally averaged vertical velocity is zero.
511	IF ( ibc_p_b == 1 .AND. ibc_p_t == 1 ) THEN
512	IF ( j == 0 .AND. nxl_z == 0 ) THEN
513	#if defined( __parallel )
514	DO k = 1, nz
515	ar(nxl_z,j,k) = 0.0
516	ENDDO
517	#else
518	DO k = 1, nz
519	ar(k,j,nxl_z) = 0.0
520	ENDDO
521	#endif
522	ENDIF
523	ENDIF
524
525	END SUBROUTINE substi
526
527
528	SUBROUTINE split( tri )
529
530	!------------------------------------------------------------------------------!
531	! Splitting of the tridiagonal matrix (Thomas algorithm)
532	!------------------------------------------------------------------------------!
533
534	IMPLICIT NONE
535
536	INTEGER :: i, k
537	REAL :: tri(5,nxl_z:nxr_z,0:nz-1)
538
539	!
540	!-- Splitting.
541	DO i = nxl_z, nxr_z
542	tri(4,i,0) = tri(1,i,0)
543	ENDDO
544	DO k = 1, nz-1
545	DO i = nxl_z, nxr_z
546	tri(5,i,k) = tri(2,i,k) / tri(4,i,k-1)
547	tri(4,i,k) = tri(1,i,k) - tri(3,i,k-1) * tri(5,i,k)
548	ENDDO
549	ENDDO
550
551	END SUBROUTINE split
552
553	END SUBROUTINE tridia
554
555
556	#if defined( __parallel )
557	SUBROUTINE fftxp( ar, direction )
558
559	!------------------------------------------------------------------------------!
560	! Fourier-transformation along x-direction Parallelized version
561	!------------------------------------------------------------------------------!
562
563	IMPLICIT NONE
564
565	CHARACTER (LEN=*) :: direction
566	INTEGER :: j, k
567	REAL :: ar(0:nxa,nys_x:nyn_xa,nzb_x:nzt_xa)
568
569	!
570	!-- Performing the fft with one of the methods implemented
571	!$OMP PARALLEL PRIVATE ( j, k )
572	!$OMP DO
573	DO k = nzb_x, nzt_x
574	DO j = nys_x, nyn_x
575	CALL fft_x( ar(0:nx,j,k), direction )
576	ENDDO
577	ENDDO
578	!$OMP END PARALLEL
579
580	END SUBROUTINE fftxp
581
582	#else
583	SUBROUTINE fftx( ar, direction )
584
585	!------------------------------------------------------------------------------!
586	! Fourier-transformation along x-direction Non parallel version
587	!------------------------------------------------------------------------------!
588
589	IMPLICIT NONE
590
591	CHARACTER (LEN=*) :: direction
592	INTEGER :: i, j, k
593	REAL :: ar(1:nz,0:ny,0:nx)
594
595	!
596	!-- Performing the fft with one of the methods implemented
597	!$OMP PARALLEL PRIVATE ( j, k )
598	!$OMP DO
599	DO k = 1, nz
600	DO j = 0, ny
601	CALL fft_x( ar(k,j,0:nx), direction )
602	ENDDO
603	ENDDO
604	!$OMP END PARALLEL
605
606	END SUBROUTINE fftx
607	#endif
608
609
610	#if defined( __parallel )
611	SUBROUTINE fftyp( ar, direction )
612
613	!------------------------------------------------------------------------------!
614	! Fourier-transformation along y-direction Parallelized version
615	!------------------------------------------------------------------------------!
616
617	IMPLICIT NONE
618
619	CHARACTER (LEN=*) :: direction
620	INTEGER :: i, k
621	REAL :: ar(0:nya,nxl_y:nxr_ya,nzb_y:nzt_ya)
622
623	!
624	!-- Performing the fft with one of the methods implemented
625	!$OMP PARALLEL PRIVATE ( i, k )
626	!$OMP DO
627	DO k = nzb_y, nzt_y
628	DO i = nxl_y, nxr_y
629	CALL fft_y( ar(0:ny,i,k), direction )
630	ENDDO
631	ENDDO
632	!$OMP END PARALLEL
633
634	END SUBROUTINE fftyp
635
636	#else
637	SUBROUTINE ffty( ar, direction )
638
639	!------------------------------------------------------------------------------!
640	! Fourier-transformation along y-direction Non parallel version
641	!------------------------------------------------------------------------------!
642
643	IMPLICIT NONE
644
645	CHARACTER (LEN=*) :: direction
646	INTEGER :: i, k
647	REAL :: ar(1:nz,0:ny,0:nx)
648
649	!
650	!-- Performing the fft with one of the methods implemented
651	!$OMP PARALLEL PRIVATE ( i, k )
652	!$OMP DO
653	DO k = 1, nz
654	DO i = 0, nx
655	CALL fft_y( ar(k,0:ny,i), direction )
656	ENDDO
657	ENDDO
658	!$OMP END PARALLEL
659
660	END SUBROUTINE ffty
661	#endif
662
663	#if defined( __parallel )
664	SUBROUTINE ffty_tr_yx( f_in, work, f_out )
665
666	!------------------------------------------------------------------------------!
667	! Fourier-transformation along y with subsequent transposition y --> x for
668	! a 1d-decomposition along x
669	!
670	! ATTENTION: The performance of this routine is much faster on the NEC-SX6,
671	! if the first index of work_ffty_vec is odd. Otherwise
672	! memory bank conflicts may occur (especially if the index is a
673	! multiple of 128). That's why work_ffty_vec is dimensioned as
674	! 0:ny+1.
675	! Of course, this will not work if users are using an odd number
676	! of gridpoints along y.
677	!------------------------------------------------------------------------------!
678
679	USE control_parameters
680	USE cpulog
681	USE indices
682	USE interfaces
683	USE pegrid
684	USE transpose_indices
685
686	IMPLICIT NONE
687
688	INTEGER :: i, iend, iouter, ir, j, k
689	INTEGER, PARAMETER :: stridex = 4
690
691	REAL, DIMENSION(0:ny,stridex) :: work_ffty
692	#if defined( __nec )
693	REAL, DIMENSION(0:ny+1,1:nz,nxl:nxr) :: work_ffty_vec
694	#endif
695	REAL, DIMENSION(1:nza,0:nya,nxl:nxra) :: f_in
696	REAL, DIMENSION(nnx,1:nza,nys_x:nyn_xa,pdims(1)) :: f_out
697	REAL, DIMENSION(nxl:nxra,1:nza,0:nya) :: work
698
699	!
700	!-- Carry out the FFT along y, where all data are present due to the
701	!-- 1d-decomposition along x. Resort the data in a way that x becomes
702	!-- the first index.
703	CALL cpu_log( log_point_s(7), 'fft_y', 'start' )
704
705	IF ( host(1:3) == 'nec' ) THEN
706	#if defined( __nec )
707	!
708	!-- Code optimized for vector processors
709	!$OMP PARALLEL PRIVATE ( i, j, k )
710	!$OMP DO
711	DO i = nxl, nxr
712
713	DO j = 0, ny
714	DO k = 1, nz
715	work_ffty_vec(j,k,i) = f_in(k,j,i)
716	ENDDO
717	ENDDO
718
719	CALL fft_y_m( work_ffty_vec(:,:,i), ny+1, 'forward' )
720
721	ENDDO
722
723	!$OMP DO
724	DO k = 1, nz
725	DO j = 0, ny
726	DO i = nxl, nxr
727	work(i,k,j) = work_ffty_vec(j,k,i)
728	ENDDO
729	ENDDO
730	ENDDO
731	!$OMP END PARALLEL
732	#endif
733
734	ELSE
735
736	!
737	!-- Cache optimized code.
738	!-- The i-(x-)direction is split into a strided outer loop and an inner
739	!-- loop for better cache performance
740	!$OMP PARALLEL PRIVATE (i,iend,iouter,ir,j,k,work_ffty)
741	!$OMP DO
742	DO iouter = nxl, nxr, stridex
743
744	iend = MIN( iouter+stridex-1, nxr ) ! Upper bound for inner i loop
745
746	DO k = 1, nz
747
748	DO i = iouter, iend
749
750	ir = i-iouter+1 ! counter within a stride
751	DO j = 0, ny
752	work_ffty(j,ir) = f_in(k,j,i)
753	ENDDO
754	!
755	!-- FFT along y
756	CALL fft_y( work_ffty(:,ir), 'forward' )
757
758	ENDDO
759
760	!
761	!-- Resort
762	DO j = 0, ny
763	DO i = iouter, iend
764	work(i,k,j) = work_ffty(j,i-iouter+1)
765	ENDDO
766	ENDDO
767
768	ENDDO
769
770	ENDDO
771	!$OMP END PARALLEL
772
773	ENDIF
774	CALL cpu_log( log_point_s(7), 'fft_y', 'pause' )
775
776	!
777	!-- Transpose array
778	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
779	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
780	CALL MPI_ALLTOALL( work(nxl,1,0), sendrecvcount_xy, MPI_REAL, &
781	f_out(1,1,nys_x,1), sendrecvcount_xy, MPI_REAL, &
782	comm1dx, ierr )
783	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
784
785	END SUBROUTINE ffty_tr_yx
786
787
788	SUBROUTINE tr_xy_ffty( f_in, work, f_out )
789
790	!------------------------------------------------------------------------------!
791	! Transposition x --> y with a subsequent backward Fourier transformation for
792	! a 1d-decomposition along x
793	!------------------------------------------------------------------------------!
794
795	USE control_parameters
796	USE cpulog
797	USE indices
798	USE interfaces
799	USE pegrid
800	USE transpose_indices
801
802	IMPLICIT NONE
803
804	INTEGER :: i, iend, iouter, ir, j, k
805	INTEGER, PARAMETER :: stridex = 4
806
807	REAL, DIMENSION(0:ny,stridex) :: work_ffty
808	#if defined( __nec )
809	REAL, DIMENSION(0:ny+1,1:nz,nxl:nxr) :: work_ffty_vec
810	#endif
811	REAL, DIMENSION(nnx,1:nza,nys_x:nyn_xa,pdims(1)) :: f_in
812	REAL, DIMENSION(1:nza,0:nya,nxl:nxra) :: f_out
813	REAL, DIMENSION(nxl:nxra,1:nza,0:nya) :: work
814
815	!
816	!-- Transpose array
817	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
818	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
819	CALL MPI_ALLTOALL( f_in(1,1,nys_x,1), sendrecvcount_xy, MPI_REAL, &
820	work(nxl,1,0), sendrecvcount_xy, MPI_REAL, &
821	comm1dx, ierr )
822	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
823
824	!
825	!-- Resort the data in a way that y becomes the first index and carry out the
826	!-- backward fft along y.
827	CALL cpu_log( log_point_s(7), 'fft_y', 'continue' )
828
829	IF ( host(1:3) == 'nec' ) THEN
830	#if defined( __nec )
831	!
832	!-- Code optimized for vector processors
833	!$OMP PARALLEL PRIVATE ( i, j, k )
834	!$OMP DO
835	DO k = 1, nz
836	DO j = 0, ny
837	DO i = nxl, nxr
838	work_ffty_vec(j,k,i) = work(i,k,j)
839	ENDDO
840	ENDDO
841	ENDDO
842
843	!$OMP DO
844	DO i = nxl, nxr
845
846	CALL fft_y_m( work_ffty_vec(:,:,i), ny+1, 'backward' )
847
848	DO j = 0, ny
849	DO k = 1, nz
850	f_out(k,j,i) = work_ffty_vec(j,k,i)
851	ENDDO
852	ENDDO
853
854	ENDDO
855	!$OMP END PARALLEL
856	#endif
857
858	ELSE
859
860	!
861	!-- Cache optimized code.
862	!-- The i-(x-)direction is split into a strided outer loop and an inner
863	!-- loop for better cache performance
864	!$OMP PARALLEL PRIVATE ( i, iend, iouter, ir, j, k, work_ffty )
865	!$OMP DO
866	DO iouter = nxl, nxr, stridex
867
868	iend = MIN( iouter+stridex-1, nxr ) ! Upper bound for inner i loop
869
870	DO k = 1, nz
871	!
872	!-- Resort
873	DO j = 0, ny
874	DO i = iouter, iend
875	work_ffty(j,i-iouter+1) = work(i,k,j)
876	ENDDO
877	ENDDO
878
879	DO i = iouter, iend
880
881	!
882	!-- FFT along y
883	ir = i-iouter+1 ! counter within a stride
884	CALL fft_y( work_ffty(:,ir), 'backward' )
885
886	DO j = 0, ny
887	f_out(k,j,i) = work_ffty(j,ir)
888	ENDDO
889	ENDDO
890
891	ENDDO
892
893	ENDDO
894	!$OMP END PARALLEL
895
896	ENDIF
897
898	CALL cpu_log( log_point_s(7), 'fft_y', 'stop' )
899
900	END SUBROUTINE tr_xy_ffty
901
902
903	SUBROUTINE fftx_tri_fftx( ar )
904
905	!------------------------------------------------------------------------------!
906	! FFT along x, solution of the tridiagonal system and backward FFT for
907	! a 1d-decomposition along x
908	!
909	! WARNING: this subroutine may still not work for hybrid parallelization
910	! with OpenMP (for possible necessary changes see the original
911	! routine poisfft_hybrid, developed by Klaus Ketelsen, May 2002)
912	!------------------------------------------------------------------------------!
913
914	USE control_parameters
915	USE cpulog
916	USE grid_variables
917	USE indices
918	USE interfaces
919	USE pegrid
920	USE transpose_indices
921
922	IMPLICIT NONE
923
924	character(len=3) :: myth_char
925
926	INTEGER :: i, j, k, m, n, omp_get_thread_num, tn
927
928	REAL, DIMENSION(0:nx) :: work_fftx
929	REAL, DIMENSION(0:nx,1:nz) :: work_trix
930	REAL, DIMENSION(nnx,1:nza,nys_x:nyn_xa,pdims(1)) :: ar
931	REAL, DIMENSION(:,:,:,:), ALLOCATABLE :: tri
932
933
934	CALL cpu_log( log_point_s(33), 'fft_x + tridia', 'start' )
935
936	ALLOCATE( tri(5,0:nx,0:nz-1,0:threads_per_task-1) )
937
938	tn = 0 ! Default thread number in case of one thread
939	!$OMP PARALLEL DO PRIVATE ( i, j, k, m, n, tn, work_fftx, work_trix )
940	DO j = nys_x, nyn_x
941
942	!$ tn = omp_get_thread_num()
943
944	IF ( host(1:3) == 'nec' ) THEN
945	!
946	!-- Code optimized for vector processors
947	DO k = 1, nz
948
949	m = 0
950	DO n = 1, pdims(1)
951	DO i = 1, nnx_pe( n-1 ) ! WARN: pcoord(i) should be used!!
952	work_trix(m,k) = ar(i,k,j,n)
953	m = m + 1
954	ENDDO
955	ENDDO
956
957	ENDDO
958
959	CALL fft_x_m( work_trix, 'forward' )
960
961	ELSE
962	!
963	!-- Cache optimized code
964	DO k = 1, nz
965
966	m = 0
967	DO n = 1, pdims(1)
968	DO i = 1, nnx_pe( n-1 ) ! WARN: pcoord(i) should be used!!
969	work_fftx(m) = ar(i,k,j,n)
970	m = m + 1
971	ENDDO
972	ENDDO
973
974	CALL fft_x( work_fftx, 'forward' )
975
976	DO i = 0, nx
977	work_trix(i,k) = work_fftx(i)
978	ENDDO
979
980	ENDDO
981
982	ENDIF
983
984	!
985	!-- Solve the linear equation system
986	CALL tridia_1dd( ddx2, ddy2, nx, ny, j, work_trix, tri(:,:,:,tn) )
987
988	IF ( host(1:3) == 'nec' ) THEN
989	!
990	!-- Code optimized for vector processors
991	CALL fft_x_m( work_trix, 'backward' )
992
993	DO k = 1, nz
994
995	m = 0
996	DO n = 1, pdims(1)
997	DO i = 1, nnx_pe( n-1 ) ! WARN: pcoord(i) should be used!!
998	ar(i,k,j,n) = work_trix(m,k)
999	m = m + 1
1000	ENDDO
1001	ENDDO
1002
1003	ENDDO
1004
1005	ELSE
1006	!
1007	!-- Cache optimized code
1008	DO k = 1, nz
1009
1010	DO i = 0, nx
1011	work_fftx(i) = work_trix(i,k)
1012	ENDDO
1013
1014	CALL fft_x( work_fftx, 'backward' )
1015
1016	m = 0
1017	DO n = 1, pdims(1)
1018	DO i = 1, nnx_pe( n-1 ) ! WARN: pcoord(i) should be used!!
1019	ar(i,k,j,n) = work_fftx(m)
1020	m = m + 1
1021	ENDDO
1022	ENDDO
1023
1024	ENDDO
1025
1026	ENDIF
1027
1028	ENDDO
1029
1030	DEALLOCATE( tri )
1031
1032	CALL cpu_log( log_point_s(33), 'fft_x + tridia', 'stop' )
1033
1034	END SUBROUTINE fftx_tri_fftx
1035
1036
1037	SUBROUTINE fftx_tr_xy( f_in, work, f_out )
1038
1039	!------------------------------------------------------------------------------!
1040	! Fourier-transformation along x with subsequent transposition x --> y for
1041	! a 1d-decomposition along y
1042	!
1043	! ATTENTION: The NEC-branch of this routine may significantly profit from
1044	! further optimizations. So far, performance is much worse than
1045	! for routine ffty_tr_yx (more than three times slower).
1046	!------------------------------------------------------------------------------!
1047
1048	USE control_parameters
1049	USE cpulog
1050	USE indices
1051	USE interfaces
1052	USE pegrid
1053	USE transpose_indices
1054
1055	IMPLICIT NONE
1056
1057	INTEGER :: i, j, k
1058
1059	REAL, DIMENSION(0:nx,1:nz,nys:nyn) :: work_fftx
1060	REAL, DIMENSION(1:nza,nys:nyna,0:nxa) :: f_in
1061	REAL, DIMENSION(nny,1:nza,nxl_y:nxr_ya,pdims(2)) :: f_out
1062	REAL, DIMENSION(nys:nyna,1:nza,0:nxa) :: work
1063
1064	!
1065	!-- Carry out the FFT along x, where all data are present due to the
1066	!-- 1d-decomposition along y. Resort the data in a way that y becomes
1067	!-- the first index.
1068	CALL cpu_log( log_point_s(4), 'fft_x', 'start' )
1069
1070	IF ( host(1:3) == 'nec' ) THEN
1071	!
1072	!-- Code for vector processors
1073	!$OMP PARALLEL PRIVATE ( i, j, k )
1074	!$OMP DO
1075	DO i = 0, nx
1076
1077	DO j = nys, nyn
1078	DO k = 1, nz
1079	work_fftx(i,k,j) = f_in(k,j,i)
1080	ENDDO
1081	ENDDO
1082
1083	ENDDO
1084
1085	!$OMP DO
1086	DO j = nys, nyn
1087
1088	CALL fft_x_m( work_fftx(:,:,j), 'forward' )
1089
1090	DO k = 1, nz
1091	DO i = 0, nx
1092	work(j,k,i) = work_fftx(i,k,j)
1093	ENDDO
1094	ENDDO
1095
1096	ENDDO
1097	!$OMP END PARALLEL
1098
1099	ELSE
1100
1101	!
1102	!-- Cache optimized code (there might be still a potential for better
1103	!-- optimization).
1104	!$OMP PARALLEL PRIVATE (i,j,k)
1105	!$OMP DO
1106	DO i = 0, nx
1107
1108	DO j = nys, nyn
1109	DO k = 1, nz
1110	work_fftx(i,k,j) = f_in(k,j,i)
1111	ENDDO
1112	ENDDO
1113
1114	ENDDO
1115
1116	!$OMP DO
1117	DO j = nys, nyn
1118	DO k = 1, nz
1119
1120	CALL fft_x( work_fftx(0:nx,k,j), 'forward' )
1121
1122	DO i = 0, nx
1123	work(j,k,i) = work_fftx(i,k,j)
1124	ENDDO
1125	ENDDO
1126
1127	ENDDO
1128	!$OMP END PARALLEL
1129
1130	ENDIF
1131	CALL cpu_log( log_point_s(4), 'fft_x', 'pause' )
1132
1133	!
1134	!-- Transpose array
1135	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
1136	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
1137	CALL MPI_ALLTOALL( work(nys,1,0), sendrecvcount_xy, MPI_REAL, &
1138	f_out(1,1,nxl_y,1), sendrecvcount_xy, MPI_REAL, &
1139	comm1dy, ierr )
1140	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
1141
1142	END SUBROUTINE fftx_tr_xy
1143
1144
1145	SUBROUTINE tr_yx_fftx( f_in, work, f_out )
1146
1147	!------------------------------------------------------------------------------!
1148	! Transposition y --> x with a subsequent backward Fourier transformation for
1149	! a 1d-decomposition along x
1150	!------------------------------------------------------------------------------!
1151
1152	USE control_parameters
1153	USE cpulog
1154	USE indices
1155	USE interfaces
1156	USE pegrid
1157	USE transpose_indices
1158
1159	IMPLICIT NONE
1160
1161	INTEGER :: i, j, k
1162
1163	REAL, DIMENSION(0:nx,1:nz,nys:nyn) :: work_fftx
1164	REAL, DIMENSION(nny,1:nza,nxl_y:nxr_ya,pdims(2)) :: f_in
1165	REAL, DIMENSION(1:nza,nys:nyna,0:nxa) :: f_out
1166	REAL, DIMENSION(nys:nyna,1:nza,0:nxa) :: work
1167
1168	!
1169	!-- Transpose array
1170	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
1171	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
1172	CALL MPI_ALLTOALL( f_in(1,1,nxl_y,1), sendrecvcount_xy, MPI_REAL, &
1173	work(nys,1,0), sendrecvcount_xy, MPI_REAL, &
1174	comm1dy, ierr )
1175	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
1176
1177	!
1178	!-- Carry out the FFT along x, where all data are present due to the
1179	!-- 1d-decomposition along y. Resort the data in a way that y becomes
1180	!-- the first index.
1181	CALL cpu_log( log_point_s(4), 'fft_x', 'continue' )
1182
1183	IF ( host(1:3) == 'nec' ) THEN
1184	!
1185	!-- Code optimized for vector processors
1186	!$OMP PARALLEL PRIVATE ( i, j, k )
1187	!$OMP DO
1188	DO j = nys, nyn
1189
1190	DO k = 1, nz
1191	DO i = 0, nx
1192	work_fftx(i,k,j) = work(j,k,i)
1193	ENDDO
1194	ENDDO
1195
1196	CALL fft_x_m( work_fftx(:,:,j), 'backward' )
1197
1198	ENDDO
1199
1200	!$OMP DO
1201	DO i = 0, nx
1202	DO j = nys, nyn
1203	DO k = 1, nz
1204	f_out(k,j,i) = work_fftx(i,k,j)
1205	ENDDO
1206	ENDDO
1207	ENDDO
1208	!$OMP END PARALLEL
1209
1210	ELSE
1211
1212	!
1213	!-- Cache optimized code (there might be still a potential for better
1214	!-- optimization).
1215	!$OMP PARALLEL PRIVATE (i,j,k)
1216	!$OMP DO
1217	DO j = nys, nyn
1218	DO k = 1, nz
1219
1220	DO i = 0, nx
1221	work_fftx(i,k,j) = work(j,k,i)
1222	ENDDO
1223
1224	CALL fft_x( work_fftx(0:nx,k,j), 'backward' )
1225
1226	ENDDO
1227	ENDDO
1228
1229	!$OMP DO
1230	DO i = 0, nx
1231	DO j = nys, nyn
1232	DO k = 1, nz
1233	f_out(k,j,i) = work_fftx(i,k,j)
1234	ENDDO
1235	ENDDO
1236	ENDDO
1237	!$OMP END PARALLEL
1238
1239	ENDIF
1240	CALL cpu_log( log_point_s(4), 'fft_x', 'stop' )
1241
1242	END SUBROUTINE tr_yx_fftx
1243
1244
1245	SUBROUTINE ffty_tri_ffty( ar )
1246
1247	!------------------------------------------------------------------------------!
1248	! FFT along y, solution of the tridiagonal system and backward FFT for
1249	! a 1d-decomposition along y
1250	!
1251	! WARNING: this subroutine may still not work for hybrid parallelization
1252	! with OpenMP (for possible necessary changes see the original
1253	! routine poisfft_hybrid, developed by Klaus Ketelsen, May 2002)
1254	!------------------------------------------------------------------------------!
1255
1256	USE control_parameters
1257	USE cpulog
1258	USE grid_variables
1259	USE indices
1260	USE interfaces
1261	USE pegrid
1262	USE transpose_indices
1263
1264	IMPLICIT NONE
1265
1266	INTEGER :: i, j, k, m, n, omp_get_thread_num, tn
1267
1268	REAL, DIMENSION(0:ny) :: work_ffty
1269	REAL, DIMENSION(0:ny,1:nz) :: work_triy
1270	REAL, DIMENSION(nny,1:nza,nxl_y:nxr_ya,pdims(2)) :: ar
1271	REAL, DIMENSION(:,:,:,:), ALLOCATABLE :: tri
1272
1273
1274	CALL cpu_log( log_point_s(39), 'fft_y + tridia', 'start' )
1275
1276	ALLOCATE( tri(5,0:ny,0:nz-1,0:threads_per_task-1) )
1277
1278	tn = 0 ! Default thread number in case of one thread
1279	!$OMP PARALLEL DO PRIVATE ( i, j, k, m, n, tn, work_ffty, work_triy )
1280	DO i = nxl_y, nxr_y
1281
1282	!$ tn = omp_get_thread_num()
1283
1284	IF ( host(1:3) == 'nec' ) THEN
1285	!
1286	!-- Code optimized for vector processors
1287	DO k = 1, nz
1288
1289	m = 0
1290	DO n = 1, pdims(2)
1291	DO j = 1, nny_pe( n-1 ) ! WARN: pcoord(j) should be used!!
1292	work_triy(m,k) = ar(j,k,i,n)
1293	m = m + 1
1294	ENDDO
1295	ENDDO
1296
1297	ENDDO
1298
1299	CALL fft_y_m( work_triy, ny, 'forward' )
1300
1301	ELSE
1302	!
1303	!-- Cache optimized code
1304	DO k = 1, nz
1305
1306	m = 0
1307	DO n = 1, pdims(2)
1308	DO j = 1, nny_pe( n-1 ) ! WARN: pcoord(j) should be used!!
1309	work_ffty(m) = ar(j,k,i,n)
1310	m = m + 1
1311	ENDDO
1312	ENDDO
1313
1314	CALL fft_y( work_ffty, 'forward' )
1315
1316	DO j = 0, ny
1317	work_triy(j,k) = work_ffty(j)
1318	ENDDO
1319
1320	ENDDO
1321
1322	ENDIF
1323
1324	!
1325	!-- Solve the linear equation system
1326	CALL tridia_1dd( ddy2, ddx2, ny, nx, i, work_triy, tri(:,:,:,tn) )
1327
1328	IF ( host(1:3) == 'nec' ) THEN
1329	!
1330	!-- Code optimized for vector processors
1331	CALL fft_y_m( work_triy, ny, 'backward' )
1332
1333	DO k = 1, nz
1334
1335	m = 0
1336	DO n = 1, pdims(2)
1337	DO j = 1, nny_pe( n-1 ) ! WARN: pcoord(j) should be used!!
1338	ar(j,k,i,n) = work_triy(m,k)
1339	m = m + 1
1340	ENDDO
1341	ENDDO
1342
1343	ENDDO
1344
1345	ELSE
1346	!
1347	!-- Cache optimized code
1348	DO k = 1, nz
1349
1350	DO j = 0, ny
1351	work_ffty(j) = work_triy(j,k)
1352	ENDDO
1353
1354	CALL fft_y( work_ffty, 'backward' )
1355
1356	m = 0
1357	DO n = 1, pdims(2)
1358	DO j = 1, nny_pe( n-1 ) ! WARN: pcoord(j) should be used!!
1359	ar(j,k,i,n) = work_ffty(m)
1360	m = m + 1
1361	ENDDO
1362	ENDDO
1363
1364	ENDDO
1365
1366	ENDIF
1367
1368	ENDDO
1369
1370	DEALLOCATE( tri )
1371
1372	CALL cpu_log( log_point_s(39), 'fft_y + tridia', 'stop' )
1373
1374	END SUBROUTINE ffty_tri_ffty
1375
1376
1377	SUBROUTINE tridia_1dd( ddx2, ddy2, nx, ny, j, ar, tri )
1378
1379	!------------------------------------------------------------------------------!
1380	! Solves the linear system of equations for a 1d-decomposition along x (see
1381	! tridia)
1382	!
1383	! Attention: when using the intel compiler, array tri must be passed as an
1384	! argument to the contained subroutines. Otherwise addres faults
1385	! will occur.
1386	! On NEC, tri should not be passed (except for routine substi_1dd)
1387	! because this causes very bad performance.
1388	!------------------------------------------------------------------------------!
1389
1390	USE arrays_3d
1391	USE control_parameters
1392
1393	USE pegrid
1394
1395	IMPLICIT NONE
1396
1397	INTEGER :: i, j, k, nnyh, nx, ny, omp_get_thread_num, tn
1398
1399	REAL :: ddx2, ddy2
1400
1401	REAL, DIMENSION(0:nx,1:nz) :: ar
1402	REAL, DIMENSION(0:nx,0:nz-1) :: ar1
1403	REAL, DIMENSION(5,0:nx,0:nz-1) :: tri
1404
1405
1406	nnyh = ( ny + 1 ) / 2
1407
1408	!
1409	!-- Define constant elements of the tridiagonal matrix.
1410	!-- The compiler on SX6 does loop exchange. If 0:nx is a high power of 2,
1411	!-- the exchanged loops create bank conflicts. The following directive
1412	!-- prohibits loop exchange and the loops perform much better.
1413	! tn = omp_get_thread_num()
1414	! WRITE( 120+tn, * ) '+++ id=',myid,' nx=',nx,' thread=', omp_get_thread_num()
1415	! CALL local_flush( 120+tn )
1416	!CDIR NOLOOPCHG
1417	DO k = 0, nz-1
1418	DO i = 0,nx
1419	tri(2,i,k) = ddzu_pres(k+1) * ddzw(k+1)
1420	tri(3,i,k) = ddzu_pres(k+2) * ddzw(k+1)
1421	ENDDO
1422	ENDDO
1423	! WRITE( 120+tn, * ) '+++ id=',myid,' end of first tridia loop thread=', omp_get_thread_num()
1424	! CALL local_flush( 120+tn )
1425
1426	IF ( j <= nnyh ) THEN
1427	#if defined( __lc )
1428	CALL maketri_1dd( j, tri )
1429	#else
1430	CALL maketri_1dd( j )
1431	#endif
1432	ELSE
1433	#if defined( __lc )
1434	CALL maketri_1dd( ny+1-j, tri )
1435	#else
1436	CALL maketri_1dd( ny+1-j )
1437	#endif
1438	ENDIF
1439	#if defined( __lc )
1440	CALL split_1dd( tri )
1441	#else
1442	CALL split_1dd
1443	#endif
1444	CALL substi_1dd( ar, tri )
1445
1446	CONTAINS
1447
1448	#if defined( __lc )
1449	SUBROUTINE maketri_1dd( j, tri )
1450	#else
1451	SUBROUTINE maketri_1dd( j )
1452	#endif
1453
1454	!------------------------------------------------------------------------------!
1455	! computes the i- and j-dependent component of the matrix
1456	!------------------------------------------------------------------------------!
1457
1458	USE constants
1459
1460	IMPLICIT NONE
1461
1462	INTEGER :: i, j, k, nnxh
1463	REAL :: a, c
1464
1465	REAL, DIMENSION(0:nx) :: l
1466
1467	#if defined( __lc )
1468	REAL, DIMENSION(5,0:nx,0:nz-1) :: tri
1469	#endif
1470
1471
1472	nnxh = ( nx + 1 ) / 2
1473	!
1474	!-- Provide the tridiagonal matrix for solution of the Poisson equation in
1475	!-- Fourier space. The coefficients are computed following the method of
1476	!-- Schmidt et al. (DFVLR-Mitteilung 84-15), which departs from Stephan
1477	!-- Siano's original version by discretizing the Poisson equation,
1478	!-- before it is Fourier-transformed
1479	DO i = 0, nx
1480	IF ( i >= 0 .AND. i <= nnxh ) THEN
1481	l(i) = 2.0 * ( 1.0 - COS( ( 2.0 * pi * i ) / &
1482	FLOAT( nx+1 ) ) ) * ddx2 + &
1483	2.0 * ( 1.0 - COS( ( 2.0 * pi * j ) / &
1484	FLOAT( ny+1 ) ) ) * ddy2
1485	ELSE
1486	l(i) = 2.0 * ( 1.0 - COS( ( 2.0 * pi * ( nx+1-i ) ) / &
1487	FLOAT( nx+1 ) ) ) * ddx2 + &
1488	2.0 * ( 1.0 - COS( ( 2.0 * pi * j ) / &
1489	FLOAT( ny+1 ) ) ) * ddy2
1490	ENDIF
1491	ENDDO
1492
1493	DO k = 0, nz-1
1494	DO i = 0, nx
1495	a = -1.0 * ddzu_pres(k+2) * ddzw(k+1)
1496	c = -1.0 * ddzu_pres(k+1) * ddzw(k+1)
1497	tri(1,i,k) = a + c - l(i)
1498	ENDDO
1499	ENDDO
1500	IF ( ibc_p_b == 1 .OR. ibc_p_b == 2 ) THEN
1501	DO i = 0, nx
1502	tri(1,i,0) = tri(1,i,0) + tri(2,i,0)
1503	ENDDO
1504	ENDIF
1505	IF ( ibc_p_t == 1 ) THEN
1506	DO i = 0, nx
1507	tri(1,i,nz-1) = tri(1,i,nz-1) + tri(3,i,nz-1)
1508	ENDDO
1509	ENDIF
1510
1511	END SUBROUTINE maketri_1dd
1512
1513
1514	#if defined( __lc )
1515	SUBROUTINE split_1dd( tri )
1516	#else
1517	SUBROUTINE split_1dd
1518	#endif
1519
1520	!------------------------------------------------------------------------------!
1521	! Splitting of the tridiagonal matrix (Thomas algorithm)
1522	!------------------------------------------------------------------------------!
1523
1524	IMPLICIT NONE
1525
1526	INTEGER :: i, k
1527
1528	#if defined( __lc )
1529	REAL, DIMENSION(5,0:nx,0:nz-1) :: tri
1530	#endif
1531
1532
1533	!
1534	!-- Splitting
1535	DO i = 0, nx
1536	tri(4,i,0) = tri(1,i,0)
1537	ENDDO
1538	DO k = 1, nz-1
1539	DO i = 0, nx
1540	tri(5,i,k) = tri(2,i,k) / tri(4,i,k-1)
1541	tri(4,i,k) = tri(1,i,k) - tri(3,i,k-1) * tri(5,i,k)
1542	ENDDO
1543	ENDDO
1544
1545	END SUBROUTINE split_1dd
1546
1547
1548	SUBROUTINE substi_1dd( ar, tri )
1549
1550	!------------------------------------------------------------------------------!
1551	! Substitution (Forward and Backward) (Thomas algorithm)
1552	!------------------------------------------------------------------------------!
1553
1554	IMPLICIT NONE
1555
1556	INTEGER :: i, k
1557
1558	REAL, DIMENSION(0:nx,nz) :: ar
1559	REAL, DIMENSION(0:nx,0:nz-1) :: ar1
1560	REAL, DIMENSION(5,0:nx,0:nz-1) :: tri
1561
1562	!
1563	!-- Forward substitution
1564	DO i = 0, nx
1565	ar1(i,0) = ar(i,1)
1566	ENDDO
1567	DO k = 1, nz-1
1568	DO i = 0, nx
1569	ar1(i,k) = ar(i,k+1) - tri(5,i,k) * ar1(i,k-1)
1570	ENDDO
1571	ENDDO
1572
1573	!
1574	!-- Backward substitution
1575	!-- Note, the add of 1.0E-20 in the denominator is due to avoid divisions
1576	!-- by zero appearing if the pressure bc is set to neumann at the top of
1577	!-- the model domain.
1578	DO i = 0, nx
1579	ar(i,nz) = ar1(i,nz-1) / ( tri(4,i,nz-1) + 1.0E-20 )
1580	ENDDO
1581	DO k = nz-2, 0, -1
1582	DO i = 0, nx
1583	ar(i,k+1) = ( ar1(i,k) - tri(3,i,k) * ar(i,k+2) ) &
1584	/ tri(4,i,k)
1585	ENDDO
1586	ENDDO
1587
1588	!
1589	!-- Indices i=0, j=0 correspond to horizontally averaged pressure.
1590	!-- The respective values of ar should be zero at all k-levels if
1591	!-- acceleration of horizontally averaged vertical velocity is zero.
1592	IF ( ibc_p_b == 1 .AND. ibc_p_t == 1 ) THEN
1593	IF ( j == 0 ) THEN
1594	DO k = 1, nz
1595	ar(0,k) = 0.0
1596	ENDDO
1597	ENDIF
1598	ENDIF
1599
1600	END SUBROUTINE substi_1dd
1601
1602	END SUBROUTINE tridia_1dd
1603
1604	#endif
1605	#endif
1606	END MODULE poisfft_mod

Note: See TracBrowser for help on using the repository browser.

Download in other formats:

| Impressum | ©Leibniz Universität Hannover |