Home

Context Navigation

source: palm/trunk/SOURCE/poisfft.f90 @ 990

Last change on this file since 990 was 941, checked in by raasch, 12 years ago
last commit documented
Property svn:keywords set to `Id`
File size: 47.2 KB

Line
1	MODULE poisfft_mod
2
3	!------------------------------------------------------------------------------!
4	! Current revisions:
5	! -----------------
6	!
7	!
8	! Former revisions:
9	! -----------------
10	! $Id: poisfft.f90 941 2012-07-09 14:50:21Z maronga $
11	!
12	! 940 2012-07-09 14:31:00Z raasch
13	! special handling of tri-array as an argument in tridia_1dd routines switched
14	! off because it caused segmentation faults with intel 12.1 compiler
15	!
16	! 877 2012-04-03 11:21:44Z suehring
17	! Bugfix: Avoid divisions by zero in case of using a 'neumann' bc for the
18	! pressure at the top of the model domain.
19	!
20	! 809 2012-01-30 13:32:58Z maronga
21	! Bugfix: replaced .AND. and .NOT. with && and ! in the preprocessor directives
22	!
23	! 807 2012-01-25 11:53:51Z maronga
24	! New cpp directive "__check" implemented which is used by check_namelist_files
25	! (most of the code is unneeded by check_namelist_files).
26	!
27	! 763 2011-10-06 09:32:09Z suehring
28	! Comment added concerning the last change.
29	!
30	! 761 2011-10-05 17:58:52Z suehring
31	! Bugfix: Avoid divisions by zero in case of using a 'neumann' bc for the
32	! pressure at the top of the model domain.
33	!
34	! 696 2011-03-18 07:03:49Z raasch
35	! work_fftx removed from PRIVATE clauses in fftx_tr_xy and tr_yx_fftx
36	!
37	! 683 2011-02-09 14:25:15Z raasch
38	! openMP parallelization for 2d-domain-decomposition
39	!
40	! 667 2010-12-23 12:06:00Z suehring/gryschka
41	! ddzu replaced by ddzu_pres due to changes in zu(0)
42	!
43	! 622 2010-12-10 08:08:13Z raasch
44	! optional barriers included in order to speed up collective operations
45	!
46	! 377 2009-09-04 11:09:00Z raasch
47	! __lcmuk changed to __lc to avoid problems with Intel compiler on sgi-ice
48	!
49	! 164 2008-05-15 08:46:15Z raasch
50	! Arguments removed from transpose routines
51	!
52	! 128 2007-10-26 13:11:14Z raasch
53	! Bugfix: wavenumber calculation for even nx in routines maketri
54	!
55	! 85 2007-05-11 09:35:14Z raasch
56	! Bugfix: work_fft*_vec removed from some PRIVATE-declarations
57	!
58	! 76 2007-03-29 00:58:32Z raasch
59	! Tridiagonal coefficients adjusted for Neumann boundary conditions both at
60	! the bottom and the top.
61	!
62	! RCS Log replace by Id keyword, revision history cleaned up
63	!
64	! Revision 1.24 2006/08/04 15:00:24 raasch
65	! Default setting of the thread number tn in case of not using OpenMP
66	!
67	! Revision 1.23 2006/02/23 12:48:38 raasch
68	! Additional compiler directive in routine tridia_1dd for preventing loop
69	! exchange on NEC-SX6
70	!
71	! Revision 1.20 2004/04/30 12:38:09 raasch
72	! Parts of former poisfft_hybrid moved to this subroutine,
73	! former subroutine changed to a module, renaming of FFT-subroutines and
74	! -module, FFTs completely substituted by calls of fft_x and fft_y,
75	! NAG fft used in the non-parallel case completely removed, l in maketri
76	! is now a 1d-array, variables passed by modules instead of using parameter
77	! lists, enlarged transposition arrays introduced
78	!
79	! Revision 1.1 1997/07/24 11:24:14 raasch
80	! Initial revision
81	!
82	!
83	! Description:
84	! ------------
85	! See below.
86	!------------------------------------------------------------------------------!
87
88	!--------------------------------------------------------------------------!
89	! poisfft !
90	! !
91	! Original version: Stephan Siano (pois3d) !
92	! !
93	! Institute of Meteorology and Climatology, University of Hannover !
94	! Germany !
95	! !
96	! Version as of July 23,1996 !
97	! !
98	! !
99	! Version for parallel computers: Siegfried Raasch !
100	! !
101	! Version as of July 03,1997 !
102	! !
103	! Solves the Poisson equation with a 2D spectral method !
104	! d^2 p / dx^2 + d^2 p / dy^2 + d^2 p / dz^2 = s !
105	! !
106	! Input: !
107	! real ar contains in the (nnx,nny,nnz) elements, !
108	! starting from the element (1,nys,nxl), the !
109	! values for s !
110	! real work Temporary array !
111	! !
112	! Output: !
113	! real ar contains the solution for p !
114	!--------------------------------------------------------------------------!
115
116	USE fft_xy
117	USE indices
118	USE transpose_indices
119
120	IMPLICIT NONE
121
122	PRIVATE
123
124	#if ! defined ( __check )
125	PUBLIC poisfft, poisfft_init
126
127	INTERFACE poisfft
128	MODULE PROCEDURE poisfft
129	END INTERFACE poisfft
130
131	INTERFACE poisfft_init
132	MODULE PROCEDURE poisfft_init
133	END INTERFACE poisfft_init
134	#else
135	PUBLIC poisfft_init
136
137	INTERFACE poisfft_init
138	MODULE PROCEDURE poisfft_init
139	END INTERFACE poisfft_init
140	#endif
141
142	CONTAINS
143
144	SUBROUTINE poisfft_init
145
146	CALL fft_init
147
148	END SUBROUTINE poisfft_init
149
150	#if ! defined ( __check )
151	SUBROUTINE poisfft( ar, work )
152
153	USE cpulog
154	USE interfaces
155	USE pegrid
156
157	IMPLICIT NONE
158
159	REAL, DIMENSION(1:nza,nys:nyna,nxl:nxra) :: ar, work
160
161
162	CALL cpu_log( log_point_s(3), 'poisfft', 'start' )
163
164	!
165	!-- Two-dimensional Fourier Transformation in x- and y-direction.
166	#if defined( __parallel )
167	IF ( pdims(2) == 1 ) THEN
168
169	!
170	!-- 1d-domain-decomposition along x:
171	!-- FFT along y and transposition y --> x
172	CALL ffty_tr_yx( ar, work, ar )
173
174	!
175	!-- FFT along x, solving the tridiagonal system and backward FFT
176	CALL fftx_tri_fftx( ar )
177
178	!
179	!-- Transposition x --> y and backward FFT along y
180	CALL tr_xy_ffty( ar, work, ar )
181
182	ELSEIF ( pdims(1) == 1 ) THEN
183
184	!
185	!-- 1d-domain-decomposition along y:
186	!-- FFT along x and transposition x --> y
187	CALL fftx_tr_xy( ar, work, ar )
188
189	!
190	!-- FFT along y, solving the tridiagonal system and backward FFT
191	CALL ffty_tri_ffty( ar )
192
193	!
194	!-- Transposition y --> x and backward FFT along x
195	CALL tr_yx_fftx( ar, work, ar )
196
197	ELSE
198
199	!
200	!-- 2d-domain-decomposition
201	!-- Transposition z --> x
202	CALL cpu_log( log_point_s(5), 'transpo forward', 'start' )
203	CALL transpose_zx( ar, work, ar )
204	CALL cpu_log( log_point_s(5), 'transpo forward', 'pause' )
205
206	CALL cpu_log( log_point_s(4), 'fft_x', 'start' )
207	CALL fftxp( ar, 'forward' )
208	CALL cpu_log( log_point_s(4), 'fft_x', 'pause' )
209
210	!
211	!-- Transposition x --> y
212	CALL cpu_log( log_point_s(5), 'transpo forward', 'continue' )
213	CALL transpose_xy( ar, work, ar )
214	CALL cpu_log( log_point_s(5), 'transpo forward', 'pause' )
215
216	CALL cpu_log( log_point_s(7), 'fft_y', 'start' )
217	CALL fftyp( ar, 'forward' )
218	CALL cpu_log( log_point_s(7), 'fft_y', 'pause' )
219
220	!
221	!-- Transposition y --> z
222	CALL cpu_log( log_point_s(5), 'transpo forward', 'continue' )
223	CALL transpose_yz( ar, work, ar )
224	CALL cpu_log( log_point_s(5), 'transpo forward', 'stop' )
225
226	!
227	!-- Solve the Poisson equation in z-direction in cartesian space.
228	CALL cpu_log( log_point_s(6), 'tridia', 'start' )
229	CALL tridia( ar )
230	CALL cpu_log( log_point_s(6), 'tridia', 'stop' )
231
232	!
233	!-- Inverse Fourier Transformation
234	!-- Transposition z --> y
235	CALL cpu_log( log_point_s(8), 'transpo invers', 'start' )
236	CALL transpose_zy( ar, work, ar )
237	CALL cpu_log( log_point_s(8), 'transpo invers', 'pause' )
238
239	CALL cpu_log( log_point_s(7), 'fft_y', 'continue' )
240	CALL fftyp( ar, 'backward' )
241	CALL cpu_log( log_point_s(7), 'fft_y', 'stop' )
242
243	!
244	!-- Transposition y --> x
245	CALL cpu_log( log_point_s(8), 'transpo invers', 'continue' )
246	CALL transpose_yx( ar, work, ar )
247	CALL cpu_log( log_point_s(8), 'transpo invers', 'pause' )
248
249	CALL cpu_log( log_point_s(4), 'fft_x', 'continue' )
250	CALL fftxp( ar, 'backward' )
251	CALL cpu_log( log_point_s(4), 'fft_x', 'stop' )
252
253	!
254	!-- Transposition x --> z
255	CALL cpu_log( log_point_s(8), 'transpo invers', 'continue' )
256	CALL transpose_xz( ar, work, ar )
257	CALL cpu_log( log_point_s(8), 'transpo invers', 'stop' )
258
259	ENDIF
260
261	#else
262
263	!
264	!-- Two-dimensional Fourier Transformation along x- and y-direction.
265	CALL cpu_log( log_point_s(4), 'fft_x', 'start' )
266	CALL fftx( ar, 'forward' )
267	CALL cpu_log( log_point_s(4), 'fft_x', 'pause' )
268	CALL cpu_log( log_point_s(7), 'fft_y', 'start' )
269	CALL ffty( ar, 'forward' )
270	CALL cpu_log( log_point_s(7), 'fft_y', 'pause' )
271
272	!
273	!-- Solve the Poisson equation in z-direction in cartesian space.
274	CALL cpu_log( log_point_s(6), 'tridia', 'start' )
275	CALL tridia( ar )
276	CALL cpu_log( log_point_s(6), 'tridia', 'stop' )
277
278	!
279	!-- Inverse Fourier Transformation.
280	CALL cpu_log( log_point_s(7), 'fft_y', 'continue' )
281	CALL ffty( ar, 'backward' )
282	CALL cpu_log( log_point_s(7), 'fft_y', 'stop' )
283	CALL cpu_log( log_point_s(4), 'fft_x', 'continue' )
284	CALL fftx( ar, 'backward' )
285	CALL cpu_log( log_point_s(4), 'fft_x', 'stop' )
286
287	#endif
288
289	CALL cpu_log( log_point_s(3), 'poisfft', 'stop' )
290
291	END SUBROUTINE poisfft
292
293
294
295	SUBROUTINE tridia( ar )
296
297	!------------------------------------------------------------------------------!
298	! solves the linear system of equations:
299	!
300	! -(4 pi^2(i^2/(dx^2nnx^2)+j^2/(dy^2nny^2))+
301	! 1/(dzu(k)dzw(k))+1/(dzu(k-1)dzw(k)))*p(i,j,k)+
302	! 1/(dzu(k)dzw(k))p(i,j,k+1)+1/(dzu(k-1)dzw(k))p(i,j,k-1)=d(i,j,k)
303	!
304	! by using the Thomas algorithm
305	!------------------------------------------------------------------------------!
306
307	USE arrays_3d
308
309	IMPLICIT NONE
310
311	INTEGER :: i, j, k, nnyh
312
313	REAL, DIMENSION(nxl_z:nxr_z,0:nz-1) :: ar1
314	REAL, DIMENSION(5,nxl_z:nxr_z,0:nz-1) :: tri
315
316	#if defined( __parallel )
317	REAL :: ar(nxl_z:nxr_za,nys_z:nyn_za,1:nza)
318	#else
319	REAL :: ar(1:nz,nys_z:nyn_z,nxl_z:nxr_z)
320	#endif
321
322
323	nnyh = (ny+1) / 2
324
325	!
326	!-- Define constant elements of the tridiagonal matrix.
327	!$OMP PARALLEL PRIVATE ( k, i )
328	!$OMP DO
329	DO k = 0, nz-1
330	DO i = nxl_z, nxr_z
331	tri(2,i,k) = ddzu_pres(k+1) * ddzw(k+1)
332	tri(3,i,k) = ddzu_pres(k+2) * ddzw(k+1)
333	ENDDO
334	ENDDO
335	!$OMP END PARALLEL
336
337	#if defined( __parallel )
338	!
339	!-- Repeat for all y-levels.
340	!$OMP PARALLEL FIRSTPRIVATE( tri ) PRIVATE ( ar1, j )
341	!$OMP DO
342	DO j = nys_z, nyn_z
343	IF ( j <= nnyh ) THEN
344	CALL maketri( tri, j )
345	ELSE
346	CALL maketri( tri, ny+1-j )
347	ENDIF
348	CALL split( tri )
349	CALL substi( ar, ar1, tri, j )
350	ENDDO
351	!$OMP END PARALLEL
352	#else
353	!
354	!-- First y-level.
355	CALL maketri( tri, nys_z )
356	CALL split( tri )
357	CALL substi( ar, ar1, tri, 0 )
358
359	!
360	!-- Further y-levels.
361	DO j = 1, nnyh - 1
362	CALL maketri( tri, j )
363	CALL split( tri )
364	CALL substi( ar, ar1, tri, j )
365	CALL substi( ar, ar1, tri, ny+1-j )
366	ENDDO
367	CALL maketri( tri, nnyh )
368	CALL split( tri )
369	CALL substi( ar, ar1, tri, nnyh+nys )
370	#endif
371
372	CONTAINS
373
374	SUBROUTINE maketri( tri, j )
375
376	!------------------------------------------------------------------------------!
377	! Computes the i- and j-dependent component of the matrix
378	!------------------------------------------------------------------------------!
379
380	USE arrays_3d
381	USE constants
382	USE control_parameters
383	USE grid_variables
384
385	IMPLICIT NONE
386
387	INTEGER :: i, j, k, nnxh
388	REAL :: a, c
389	REAL :: ll(nxl_z:nxr_z)
390	REAL :: tri(5,nxl_z:nxr_z,0:nz-1)
391
392
393	nnxh = ( nx + 1 ) / 2
394
395	!
396	!-- Provide the tridiagonal matrix for solution of the Poisson equation in
397	!-- Fourier space. The coefficients are computed following the method of
398	!-- Schmidt et al. (DFVLR-Mitteilung 84-15), which departs from Stephan
399	!-- Siano's original version by discretizing the Poisson equation,
400	!-- before it is Fourier-transformed
401	#if defined( __parallel )
402	DO i = nxl_z, nxr_z
403	IF ( i >= 0 .AND. i <= nnxh ) THEN
404	ll(i) = 2.0 * ( 1.0 - COS( ( 2.0 * pi * i ) / &
405	FLOAT( nx+1 ) ) ) / ( dx * dx ) + &
406	2.0 * ( 1.0 - COS( ( 2.0 * pi * j ) / &
407	FLOAT( ny+1 ) ) ) / ( dy * dy )
408	ELSE
409	ll(i) = 2.0 * ( 1.0 - COS( ( 2.0 * pi * ( nx+1-i ) ) / &
410	FLOAT( nx+1 ) ) ) / ( dx * dx ) + &
411	2.0 * ( 1.0 - COS( ( 2.0 * pi * j ) / &
412	FLOAT( ny+1 ) ) ) / ( dy * dy )
413	ENDIF
414	DO k = 0,nz-1
415	a = -1.0 * ddzu_pres(k+2) * ddzw(k+1)
416	c = -1.0 * ddzu_pres(k+1) * ddzw(k+1)
417	tri(1,i,k) = a + c - ll(i)
418	ENDDO
419	ENDDO
420	#else
421	DO i = 0, nnxh
422	ll(i) = 2.0 * ( 1.0 - COS( ( 2.0 * pi * i ) / FLOAT( nx+1 ) ) ) / &
423	( dx * dx ) + &
424	2.0 * ( 1.0 - COS( ( 2.0 * pi * j ) / FLOAT( ny+1 ) ) ) / &
425	( dy * dy )
426	DO k = 0, nz-1
427	a = -1.0 * ddzu_pres(k+2) * ddzw(k+1)
428	c = -1.0 * ddzu_pres(k+1) * ddzw(k+1)
429	tri(1,i,k) = a + c - ll(i)
430	IF ( i >= 1 .and. i < nnxh ) THEN
431	tri(1,nx+1-i,k) = tri(1,i,k)
432	ENDIF
433	ENDDO
434	ENDDO
435	#endif
436	IF ( ibc_p_b == 1 .OR. ibc_p_b == 2 ) THEN
437	DO i = nxl_z, nxr_z
438	tri(1,i,0) = tri(1,i,0) + tri(2,i,0)
439	ENDDO
440	ENDIF
441	IF ( ibc_p_t == 1 ) THEN
442	DO i = nxl_z, nxr_z
443	tri(1,i,nz-1) = tri(1,i,nz-1) + tri(3,i,nz-1)
444	ENDDO
445	ENDIF
446
447	END SUBROUTINE maketri
448
449
450	SUBROUTINE substi( ar, ar1, tri, j )
451
452	!------------------------------------------------------------------------------!
453	! Substitution (Forward and Backward) (Thomas algorithm)
454	!------------------------------------------------------------------------------!
455
456	USE control_parameters
457
458	IMPLICIT NONE
459
460	INTEGER :: i, j, k
461	REAL :: ar1(nxl_z:nxr_z,0:nz-1)
462	REAL :: tri(5,nxl_z:nxr_z,0:nz-1)
463	#if defined( __parallel )
464	REAL :: ar(nxl_z:nxr_za,nys_z:nyn_za,1:nza)
465	#else
466	REAL :: ar(1:nz,nys_z:nyn_z,nxl_z:nxr_z)
467	#endif
468
469	!
470	!-- Forward substitution.
471	DO i = nxl_z, nxr_z
472	#if defined( __parallel )
473	ar1(i,0) = ar(i,j,1)
474	#else
475	ar1(i,0) = ar(1,j,i)
476	#endif
477	ENDDO
478	DO k = 1, nz - 1
479	DO i = nxl_z, nxr_z
480	#if defined( __parallel )
481	ar1(i,k) = ar(i,j,k+1) - tri(5,i,k) * ar1(i,k-1)
482	#else
483	ar1(i,k) = ar(k+1,j,i) - tri(5,i,k) * ar1(i,k-1)
484	#endif
485	ENDDO
486	ENDDO
487
488	!
489	!-- Backward substitution
490	!-- Note, the 1.0E-20 in the denominator is due to avoid divisions
491	!-- by zero appearing if the pressure bc is set to neumann at the top of
492	!-- the model domain.
493	DO i = nxl_z, nxr_z
494	#if defined( __parallel )
495	ar(i,j,nz) = ar1(i,nz-1) / ( tri(4,i,nz-1) + 1.0E-20 )
496	#else
497	ar(nz,j,i) = ar1(i,nz-1) / ( tri(4,i,nz-1) + 1.0E-20 )
498	#endif
499	ENDDO
500	DO k = nz-2, 0, -1
501	DO i = nxl_z, nxr_z
502	#if defined( __parallel )
503	ar(i,j,k+1) = ( ar1(i,k) - tri(3,i,k) * ar(i,j,k+2) ) &
504	/ tri(4,i,k)
505	#else
506	ar(k+1,j,i) = ( ar1(i,k) - tri(3,i,k) * ar(k+2,j,i) ) &
507	/ tri(4,i,k)
508	#endif
509	ENDDO
510	ENDDO
511
512	!
513	!-- Indices i=0, j=0 correspond to horizontally averaged pressure.
514	!-- The respective values of ar should be zero at all k-levels if
515	!-- acceleration of horizontally averaged vertical velocity is zero.
516	IF ( ibc_p_b == 1 .AND. ibc_p_t == 1 ) THEN
517	IF ( j == 0 .AND. nxl_z == 0 ) THEN
518	#if defined( __parallel )
519	DO k = 1, nz
520	ar(nxl_z,j,k) = 0.0
521	ENDDO
522	#else
523	DO k = 1, nz
524	ar(k,j,nxl_z) = 0.0
525	ENDDO
526	#endif
527	ENDIF
528	ENDIF
529
530	END SUBROUTINE substi
531
532
533	SUBROUTINE split( tri )
534
535	!------------------------------------------------------------------------------!
536	! Splitting of the tridiagonal matrix (Thomas algorithm)
537	!------------------------------------------------------------------------------!
538
539	IMPLICIT NONE
540
541	INTEGER :: i, k
542	REAL :: tri(5,nxl_z:nxr_z,0:nz-1)
543
544	!
545	!-- Splitting.
546	DO i = nxl_z, nxr_z
547	tri(4,i,0) = tri(1,i,0)
548	ENDDO
549	DO k = 1, nz-1
550	DO i = nxl_z, nxr_z
551	tri(5,i,k) = tri(2,i,k) / tri(4,i,k-1)
552	tri(4,i,k) = tri(1,i,k) - tri(3,i,k-1) * tri(5,i,k)
553	ENDDO
554	ENDDO
555
556	END SUBROUTINE split
557
558	END SUBROUTINE tridia
559
560
561	#if defined( __parallel )
562	SUBROUTINE fftxp( ar, direction )
563
564	!------------------------------------------------------------------------------!
565	! Fourier-transformation along x-direction Parallelized version
566	!------------------------------------------------------------------------------!
567
568	IMPLICIT NONE
569
570	CHARACTER (LEN=*) :: direction
571	INTEGER :: j, k
572	REAL :: ar(0:nxa,nys_x:nyn_xa,nzb_x:nzt_xa)
573
574	!
575	!-- Performing the fft with one of the methods implemented
576	!$OMP PARALLEL PRIVATE ( j, k )
577	!$OMP DO
578	DO k = nzb_x, nzt_x
579	DO j = nys_x, nyn_x
580	CALL fft_x( ar(0:nx,j,k), direction )
581	ENDDO
582	ENDDO
583	!$OMP END PARALLEL
584
585	END SUBROUTINE fftxp
586
587	#else
588	SUBROUTINE fftx( ar, direction )
589
590	!------------------------------------------------------------------------------!
591	! Fourier-transformation along x-direction Non parallel version
592	!------------------------------------------------------------------------------!
593
594	IMPLICIT NONE
595
596	CHARACTER (LEN=*) :: direction
597	INTEGER :: i, j, k
598	REAL :: ar(1:nz,0:ny,0:nx)
599
600	!
601	!-- Performing the fft with one of the methods implemented
602	!$OMP PARALLEL PRIVATE ( j, k )
603	!$OMP DO
604	DO k = 1, nz
605	DO j = 0, ny
606	CALL fft_x( ar(k,j,0:nx), direction )
607	ENDDO
608	ENDDO
609	!$OMP END PARALLEL
610
611	END SUBROUTINE fftx
612	#endif
613
614
615	#if defined( __parallel )
616	SUBROUTINE fftyp( ar, direction )
617
618	!------------------------------------------------------------------------------!
619	! Fourier-transformation along y-direction Parallelized version
620	!------------------------------------------------------------------------------!
621
622	IMPLICIT NONE
623
624	CHARACTER (LEN=*) :: direction
625	INTEGER :: i, k
626	REAL :: ar(0:nya,nxl_y:nxr_ya,nzb_y:nzt_ya)
627
628	!
629	!-- Performing the fft with one of the methods implemented
630	!$OMP PARALLEL PRIVATE ( i, k )
631	!$OMP DO
632	DO k = nzb_y, nzt_y
633	DO i = nxl_y, nxr_y
634	CALL fft_y( ar(0:ny,i,k), direction )
635	ENDDO
636	ENDDO
637	!$OMP END PARALLEL
638
639	END SUBROUTINE fftyp
640
641	#else
642	SUBROUTINE ffty( ar, direction )
643
644	!------------------------------------------------------------------------------!
645	! Fourier-transformation along y-direction Non parallel version
646	!------------------------------------------------------------------------------!
647
648	IMPLICIT NONE
649
650	CHARACTER (LEN=*) :: direction
651	INTEGER :: i, k
652	REAL :: ar(1:nz,0:ny,0:nx)
653
654	!
655	!-- Performing the fft with one of the methods implemented
656	!$OMP PARALLEL PRIVATE ( i, k )
657	!$OMP DO
658	DO k = 1, nz
659	DO i = 0, nx
660	CALL fft_y( ar(k,0:ny,i), direction )
661	ENDDO
662	ENDDO
663	!$OMP END PARALLEL
664
665	END SUBROUTINE ffty
666	#endif
667
668	#if defined( __parallel )
669	SUBROUTINE ffty_tr_yx( f_in, work, f_out )
670
671	!------------------------------------------------------------------------------!
672	! Fourier-transformation along y with subsequent transposition y --> x for
673	! a 1d-decomposition along x
674	!
675	! ATTENTION: The performance of this routine is much faster on the NEC-SX6,
676	! if the first index of work_ffty_vec is odd. Otherwise
677	! memory bank conflicts may occur (especially if the index is a
678	! multiple of 128). That's why work_ffty_vec is dimensioned as
679	! 0:ny+1.
680	! Of course, this will not work if users are using an odd number
681	! of gridpoints along y.
682	!------------------------------------------------------------------------------!
683
684	USE control_parameters
685	USE cpulog
686	USE indices
687	USE interfaces
688	USE pegrid
689	USE transpose_indices
690
691	IMPLICIT NONE
692
693	INTEGER :: i, iend, iouter, ir, j, k
694	INTEGER, PARAMETER :: stridex = 4
695
696	REAL, DIMENSION(0:ny,stridex) :: work_ffty
697	#if defined( __nec )
698	REAL, DIMENSION(0:ny+1,1:nz,nxl:nxr) :: work_ffty_vec
699	#endif
700	REAL, DIMENSION(1:nza,0:nya,nxl:nxra) :: f_in
701	REAL, DIMENSION(nnx,1:nza,nys_x:nyn_xa,pdims(1)) :: f_out
702	REAL, DIMENSION(nxl:nxra,1:nza,0:nya) :: work
703
704	!
705	!-- Carry out the FFT along y, where all data are present due to the
706	!-- 1d-decomposition along x. Resort the data in a way that x becomes
707	!-- the first index.
708	CALL cpu_log( log_point_s(7), 'fft_y', 'start' )
709
710	IF ( host(1:3) == 'nec' ) THEN
711	#if defined( __nec )
712	!
713	!-- Code optimized for vector processors
714	!$OMP PARALLEL PRIVATE ( i, j, k )
715	!$OMP DO
716	DO i = nxl, nxr
717
718	DO j = 0, ny
719	DO k = 1, nz
720	work_ffty_vec(j,k,i) = f_in(k,j,i)
721	ENDDO
722	ENDDO
723
724	CALL fft_y_m( work_ffty_vec(:,:,i), ny+1, 'forward' )
725
726	ENDDO
727
728	!$OMP DO
729	DO k = 1, nz
730	DO j = 0, ny
731	DO i = nxl, nxr
732	work(i,k,j) = work_ffty_vec(j,k,i)
733	ENDDO
734	ENDDO
735	ENDDO
736	!$OMP END PARALLEL
737	#endif
738
739	ELSE
740
741	!
742	!-- Cache optimized code.
743	!-- The i-(x-)direction is split into a strided outer loop and an inner
744	!-- loop for better cache performance
745	!$OMP PARALLEL PRIVATE (i,iend,iouter,ir,j,k,work_ffty)
746	!$OMP DO
747	DO iouter = nxl, nxr, stridex
748
749	iend = MIN( iouter+stridex-1, nxr ) ! Upper bound for inner i loop
750
751	DO k = 1, nz
752
753	DO i = iouter, iend
754
755	ir = i-iouter+1 ! counter within a stride
756	DO j = 0, ny
757	work_ffty(j,ir) = f_in(k,j,i)
758	ENDDO
759	!
760	!-- FFT along y
761	CALL fft_y( work_ffty(:,ir), 'forward' )
762
763	ENDDO
764
765	!
766	!-- Resort
767	DO j = 0, ny
768	DO i = iouter, iend
769	work(i,k,j) = work_ffty(j,i-iouter+1)
770	ENDDO
771	ENDDO
772
773	ENDDO
774
775	ENDDO
776	!$OMP END PARALLEL
777
778	ENDIF
779	CALL cpu_log( log_point_s(7), 'fft_y', 'pause' )
780
781	!
782	!-- Transpose array
783	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
784	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
785	CALL MPI_ALLTOALL( work(nxl,1,0), sendrecvcount_xy, MPI_REAL, &
786	f_out(1,1,nys_x,1), sendrecvcount_xy, MPI_REAL, &
787	comm1dx, ierr )
788	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
789
790	END SUBROUTINE ffty_tr_yx
791
792
793	SUBROUTINE tr_xy_ffty( f_in, work, f_out )
794
795	!------------------------------------------------------------------------------!
796	! Transposition x --> y with a subsequent backward Fourier transformation for
797	! a 1d-decomposition along x
798	!------------------------------------------------------------------------------!
799
800	USE control_parameters
801	USE cpulog
802	USE indices
803	USE interfaces
804	USE pegrid
805	USE transpose_indices
806
807	IMPLICIT NONE
808
809	INTEGER :: i, iend, iouter, ir, j, k
810	INTEGER, PARAMETER :: stridex = 4
811
812	REAL, DIMENSION(0:ny,stridex) :: work_ffty
813	#if defined( __nec )
814	REAL, DIMENSION(0:ny+1,1:nz,nxl:nxr) :: work_ffty_vec
815	#endif
816	REAL, DIMENSION(nnx,1:nza,nys_x:nyn_xa,pdims(1)) :: f_in
817	REAL, DIMENSION(1:nza,0:nya,nxl:nxra) :: f_out
818	REAL, DIMENSION(nxl:nxra,1:nza,0:nya) :: work
819
820	!
821	!-- Transpose array
822	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
823	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
824	CALL MPI_ALLTOALL( f_in(1,1,nys_x,1), sendrecvcount_xy, MPI_REAL, &
825	work(nxl,1,0), sendrecvcount_xy, MPI_REAL, &
826	comm1dx, ierr )
827	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
828
829	!
830	!-- Resort the data in a way that y becomes the first index and carry out the
831	!-- backward fft along y.
832	CALL cpu_log( log_point_s(7), 'fft_y', 'continue' )
833
834	IF ( host(1:3) == 'nec' ) THEN
835	#if defined( __nec )
836	!
837	!-- Code optimized for vector processors
838	!$OMP PARALLEL PRIVATE ( i, j, k )
839	!$OMP DO
840	DO k = 1, nz
841	DO j = 0, ny
842	DO i = nxl, nxr
843	work_ffty_vec(j,k,i) = work(i,k,j)
844	ENDDO
845	ENDDO
846	ENDDO
847
848	!$OMP DO
849	DO i = nxl, nxr
850
851	CALL fft_y_m( work_ffty_vec(:,:,i), ny+1, 'backward' )
852
853	DO j = 0, ny
854	DO k = 1, nz
855	f_out(k,j,i) = work_ffty_vec(j,k,i)
856	ENDDO
857	ENDDO
858
859	ENDDO
860	!$OMP END PARALLEL
861	#endif
862
863	ELSE
864
865	!
866	!-- Cache optimized code.
867	!-- The i-(x-)direction is split into a strided outer loop and an inner
868	!-- loop for better cache performance
869	!$OMP PARALLEL PRIVATE ( i, iend, iouter, ir, j, k, work_ffty )
870	!$OMP DO
871	DO iouter = nxl, nxr, stridex
872
873	iend = MIN( iouter+stridex-1, nxr ) ! Upper bound for inner i loop
874
875	DO k = 1, nz
876	!
877	!-- Resort
878	DO j = 0, ny
879	DO i = iouter, iend
880	work_ffty(j,i-iouter+1) = work(i,k,j)
881	ENDDO
882	ENDDO
883
884	DO i = iouter, iend
885
886	!
887	!-- FFT along y
888	ir = i-iouter+1 ! counter within a stride
889	CALL fft_y( work_ffty(:,ir), 'backward' )
890
891	DO j = 0, ny
892	f_out(k,j,i) = work_ffty(j,ir)
893	ENDDO
894	ENDDO
895
896	ENDDO
897
898	ENDDO
899	!$OMP END PARALLEL
900
901	ENDIF
902
903	CALL cpu_log( log_point_s(7), 'fft_y', 'stop' )
904
905	END SUBROUTINE tr_xy_ffty
906
907
908	SUBROUTINE fftx_tri_fftx( ar )
909
910	!------------------------------------------------------------------------------!
911	! FFT along x, solution of the tridiagonal system and backward FFT for
912	! a 1d-decomposition along x
913	!
914	! WARNING: this subroutine may still not work for hybrid parallelization
915	! with OpenMP (for possible necessary changes see the original
916	! routine poisfft_hybrid, developed by Klaus Ketelsen, May 2002)
917	!------------------------------------------------------------------------------!
918
919	USE control_parameters
920	USE cpulog
921	USE grid_variables
922	USE indices
923	USE interfaces
924	USE pegrid
925	USE transpose_indices
926
927	IMPLICIT NONE
928
929	character(len=3) :: myth_char
930
931	INTEGER :: i, j, k, m, n, omp_get_thread_num, tn
932
933	REAL, DIMENSION(0:nx) :: work_fftx
934	REAL, DIMENSION(0:nx,1:nz) :: work_trix
935	REAL, DIMENSION(nnx,1:nza,nys_x:nyn_xa,pdims(1)) :: ar
936	REAL, DIMENSION(:,:,:,:), ALLOCATABLE :: tri
937
938
939	CALL cpu_log( log_point_s(33), 'fft_x + tridia', 'start' )
940
941	ALLOCATE( tri(5,0:nx,0:nz-1,0:threads_per_task-1) )
942
943	tn = 0 ! Default thread number in case of one thread
944	!$OMP PARALLEL DO PRIVATE ( i, j, k, m, n, tn, work_fftx, work_trix )
945	DO j = nys_x, nyn_x
946
947	!$ tn = omp_get_thread_num()
948
949	IF ( host(1:3) == 'nec' ) THEN
950	!
951	!-- Code optimized for vector processors
952	DO k = 1, nz
953
954	m = 0
955	DO n = 1, pdims(1)
956	DO i = 1, nnx_pe( n-1 ) ! WARN: pcoord(i) should be used!!
957	work_trix(m,k) = ar(i,k,j,n)
958	m = m + 1
959	ENDDO
960	ENDDO
961
962	ENDDO
963
964	CALL fft_x_m( work_trix, 'forward' )
965
966	ELSE
967	!
968	!-- Cache optimized code
969	DO k = 1, nz
970
971	m = 0
972	DO n = 1, pdims(1)
973	DO i = 1, nnx_pe( n-1 ) ! WARN: pcoord(i) should be used!!
974	work_fftx(m) = ar(i,k,j,n)
975	m = m + 1
976	ENDDO
977	ENDDO
978
979	CALL fft_x( work_fftx, 'forward' )
980
981	DO i = 0, nx
982	work_trix(i,k) = work_fftx(i)
983	ENDDO
984
985	ENDDO
986
987	ENDIF
988
989	!
990	!-- Solve the linear equation system
991	CALL tridia_1dd( ddx2, ddy2, nx, ny, j, work_trix, tri(:,:,:,tn) )
992
993	IF ( host(1:3) == 'nec' ) THEN
994	!
995	!-- Code optimized for vector processors
996	CALL fft_x_m( work_trix, 'backward' )
997
998	DO k = 1, nz
999
1000	m = 0
1001	DO n = 1, pdims(1)
1002	DO i = 1, nnx_pe( n-1 ) ! WARN: pcoord(i) should be used!!
1003	ar(i,k,j,n) = work_trix(m,k)
1004	m = m + 1
1005	ENDDO
1006	ENDDO
1007
1008	ENDDO
1009
1010	ELSE
1011	!
1012	!-- Cache optimized code
1013	DO k = 1, nz
1014
1015	DO i = 0, nx
1016	work_fftx(i) = work_trix(i,k)
1017	ENDDO
1018
1019	CALL fft_x( work_fftx, 'backward' )
1020
1021	m = 0
1022	DO n = 1, pdims(1)
1023	DO i = 1, nnx_pe( n-1 ) ! WARN: pcoord(i) should be used!!
1024	ar(i,k,j,n) = work_fftx(m)
1025	m = m + 1
1026	ENDDO
1027	ENDDO
1028
1029	ENDDO
1030
1031	ENDIF
1032
1033	ENDDO
1034
1035	DEALLOCATE( tri )
1036
1037	CALL cpu_log( log_point_s(33), 'fft_x + tridia', 'stop' )
1038
1039	END SUBROUTINE fftx_tri_fftx
1040
1041
1042	SUBROUTINE fftx_tr_xy( f_in, work, f_out )
1043
1044	!------------------------------------------------------------------------------!
1045	! Fourier-transformation along x with subsequent transposition x --> y for
1046	! a 1d-decomposition along y
1047	!
1048	! ATTENTION: The NEC-branch of this routine may significantly profit from
1049	! further optimizations. So far, performance is much worse than
1050	! for routine ffty_tr_yx (more than three times slower).
1051	!------------------------------------------------------------------------------!
1052
1053	USE control_parameters
1054	USE cpulog
1055	USE indices
1056	USE interfaces
1057	USE pegrid
1058	USE transpose_indices
1059
1060	IMPLICIT NONE
1061
1062	INTEGER :: i, j, k
1063
1064	REAL, DIMENSION(0:nx,1:nz,nys:nyn) :: work_fftx
1065	REAL, DIMENSION(1:nza,nys:nyna,0:nxa) :: f_in
1066	REAL, DIMENSION(nny,1:nza,nxl_y:nxr_ya,pdims(2)) :: f_out
1067	REAL, DIMENSION(nys:nyna,1:nza,0:nxa) :: work
1068
1069	!
1070	!-- Carry out the FFT along x, where all data are present due to the
1071	!-- 1d-decomposition along y. Resort the data in a way that y becomes
1072	!-- the first index.
1073	CALL cpu_log( log_point_s(4), 'fft_x', 'start' )
1074
1075	IF ( host(1:3) == 'nec' ) THEN
1076	!
1077	!-- Code for vector processors
1078	!$OMP PARALLEL PRIVATE ( i, j, k )
1079	!$OMP DO
1080	DO i = 0, nx
1081
1082	DO j = nys, nyn
1083	DO k = 1, nz
1084	work_fftx(i,k,j) = f_in(k,j,i)
1085	ENDDO
1086	ENDDO
1087
1088	ENDDO
1089
1090	!$OMP DO
1091	DO j = nys, nyn
1092
1093	CALL fft_x_m( work_fftx(:,:,j), 'forward' )
1094
1095	DO k = 1, nz
1096	DO i = 0, nx
1097	work(j,k,i) = work_fftx(i,k,j)
1098	ENDDO
1099	ENDDO
1100
1101	ENDDO
1102	!$OMP END PARALLEL
1103
1104	ELSE
1105
1106	!
1107	!-- Cache optimized code (there might be still a potential for better
1108	!-- optimization).
1109	!$OMP PARALLEL PRIVATE (i,j,k)
1110	!$OMP DO
1111	DO i = 0, nx
1112
1113	DO j = nys, nyn
1114	DO k = 1, nz
1115	work_fftx(i,k,j) = f_in(k,j,i)
1116	ENDDO
1117	ENDDO
1118
1119	ENDDO
1120
1121	!$OMP DO
1122	DO j = nys, nyn
1123	DO k = 1, nz
1124
1125	CALL fft_x( work_fftx(0:nx,k,j), 'forward' )
1126
1127	DO i = 0, nx
1128	work(j,k,i) = work_fftx(i,k,j)
1129	ENDDO
1130	ENDDO
1131
1132	ENDDO
1133	!$OMP END PARALLEL
1134
1135	ENDIF
1136	CALL cpu_log( log_point_s(4), 'fft_x', 'pause' )
1137
1138	!
1139	!-- Transpose array
1140	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
1141	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
1142	CALL MPI_ALLTOALL( work(nys,1,0), sendrecvcount_xy, MPI_REAL, &
1143	f_out(1,1,nxl_y,1), sendrecvcount_xy, MPI_REAL, &
1144	comm1dy, ierr )
1145	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
1146
1147	END SUBROUTINE fftx_tr_xy
1148
1149
1150	SUBROUTINE tr_yx_fftx( f_in, work, f_out )
1151
1152	!------------------------------------------------------------------------------!
1153	! Transposition y --> x with a subsequent backward Fourier transformation for
1154	! a 1d-decomposition along x
1155	!------------------------------------------------------------------------------!
1156
1157	USE control_parameters
1158	USE cpulog
1159	USE indices
1160	USE interfaces
1161	USE pegrid
1162	USE transpose_indices
1163
1164	IMPLICIT NONE
1165
1166	INTEGER :: i, j, k
1167
1168	REAL, DIMENSION(0:nx,1:nz,nys:nyn) :: work_fftx
1169	REAL, DIMENSION(nny,1:nza,nxl_y:nxr_ya,pdims(2)) :: f_in
1170	REAL, DIMENSION(1:nza,nys:nyna,0:nxa) :: f_out
1171	REAL, DIMENSION(nys:nyna,1:nza,0:nxa) :: work
1172
1173	!
1174	!-- Transpose array
1175	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
1176	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
1177	CALL MPI_ALLTOALL( f_in(1,1,nxl_y,1), sendrecvcount_xy, MPI_REAL, &
1178	work(nys,1,0), sendrecvcount_xy, MPI_REAL, &
1179	comm1dy, ierr )
1180	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
1181
1182	!
1183	!-- Carry out the FFT along x, where all data are present due to the
1184	!-- 1d-decomposition along y. Resort the data in a way that y becomes
1185	!-- the first index.
1186	CALL cpu_log( log_point_s(4), 'fft_x', 'continue' )
1187
1188	IF ( host(1:3) == 'nec' ) THEN
1189	!
1190	!-- Code optimized for vector processors
1191	!$OMP PARALLEL PRIVATE ( i, j, k )
1192	!$OMP DO
1193	DO j = nys, nyn
1194
1195	DO k = 1, nz
1196	DO i = 0, nx
1197	work_fftx(i,k,j) = work(j,k,i)
1198	ENDDO
1199	ENDDO
1200
1201	CALL fft_x_m( work_fftx(:,:,j), 'backward' )
1202
1203	ENDDO
1204
1205	!$OMP DO
1206	DO i = 0, nx
1207	DO j = nys, nyn
1208	DO k = 1, nz
1209	f_out(k,j,i) = work_fftx(i,k,j)
1210	ENDDO
1211	ENDDO
1212	ENDDO
1213	!$OMP END PARALLEL
1214
1215	ELSE
1216
1217	!
1218	!-- Cache optimized code (there might be still a potential for better
1219	!-- optimization).
1220	!$OMP PARALLEL PRIVATE (i,j,k)
1221	!$OMP DO
1222	DO j = nys, nyn
1223	DO k = 1, nz
1224
1225	DO i = 0, nx
1226	work_fftx(i,k,j) = work(j,k,i)
1227	ENDDO
1228
1229	CALL fft_x( work_fftx(0:nx,k,j), 'backward' )
1230
1231	ENDDO
1232	ENDDO
1233
1234	!$OMP DO
1235	DO i = 0, nx
1236	DO j = nys, nyn
1237	DO k = 1, nz
1238	f_out(k,j,i) = work_fftx(i,k,j)
1239	ENDDO
1240	ENDDO
1241	ENDDO
1242	!$OMP END PARALLEL
1243
1244	ENDIF
1245	CALL cpu_log( log_point_s(4), 'fft_x', 'stop' )
1246
1247	END SUBROUTINE tr_yx_fftx
1248
1249
1250	SUBROUTINE ffty_tri_ffty( ar )
1251
1252	!------------------------------------------------------------------------------!
1253	! FFT along y, solution of the tridiagonal system and backward FFT for
1254	! a 1d-decomposition along y
1255	!
1256	! WARNING: this subroutine may still not work for hybrid parallelization
1257	! with OpenMP (for possible necessary changes see the original
1258	! routine poisfft_hybrid, developed by Klaus Ketelsen, May 2002)
1259	!------------------------------------------------------------------------------!
1260
1261	USE control_parameters
1262	USE cpulog
1263	USE grid_variables
1264	USE indices
1265	USE interfaces
1266	USE pegrid
1267	USE transpose_indices
1268
1269	IMPLICIT NONE
1270
1271	INTEGER :: i, j, k, m, n, omp_get_thread_num, tn
1272
1273	REAL, DIMENSION(0:ny) :: work_ffty
1274	REAL, DIMENSION(0:ny,1:nz) :: work_triy
1275	REAL, DIMENSION(nny,1:nza,nxl_y:nxr_ya,pdims(2)) :: ar
1276	REAL, DIMENSION(:,:,:,:), ALLOCATABLE :: tri
1277
1278
1279	CALL cpu_log( log_point_s(39), 'fft_y + tridia', 'start' )
1280
1281	ALLOCATE( tri(5,0:ny,0:nz-1,0:threads_per_task-1) )
1282
1283	tn = 0 ! Default thread number in case of one thread
1284	!$OMP PARALLEL DO PRIVATE ( i, j, k, m, n, tn, work_ffty, work_triy )
1285	DO i = nxl_y, nxr_y
1286
1287	!$ tn = omp_get_thread_num()
1288
1289	IF ( host(1:3) == 'nec' ) THEN
1290	!
1291	!-- Code optimized for vector processors
1292	DO k = 1, nz
1293
1294	m = 0
1295	DO n = 1, pdims(2)
1296	DO j = 1, nny_pe( n-1 ) ! WARN: pcoord(j) should be used!!
1297	work_triy(m,k) = ar(j,k,i,n)
1298	m = m + 1
1299	ENDDO
1300	ENDDO
1301
1302	ENDDO
1303
1304	CALL fft_y_m( work_triy, ny, 'forward' )
1305
1306	ELSE
1307	!
1308	!-- Cache optimized code
1309	DO k = 1, nz
1310
1311	m = 0
1312	DO n = 1, pdims(2)
1313	DO j = 1, nny_pe( n-1 ) ! WARN: pcoord(j) should be used!!
1314	work_ffty(m) = ar(j,k,i,n)
1315	m = m + 1
1316	ENDDO
1317	ENDDO
1318
1319	CALL fft_y( work_ffty, 'forward' )
1320
1321	DO j = 0, ny
1322	work_triy(j,k) = work_ffty(j)
1323	ENDDO
1324
1325	ENDDO
1326
1327	ENDIF
1328
1329	!
1330	!-- Solve the linear equation system
1331	CALL tridia_1dd( ddy2, ddx2, ny, nx, i, work_triy, tri(:,:,:,tn) )
1332
1333	IF ( host(1:3) == 'nec' ) THEN
1334	!
1335	!-- Code optimized for vector processors
1336	CALL fft_y_m( work_triy, ny, 'backward' )
1337
1338	DO k = 1, nz
1339
1340	m = 0
1341	DO n = 1, pdims(2)
1342	DO j = 1, nny_pe( n-1 ) ! WARN: pcoord(j) should be used!!
1343	ar(j,k,i,n) = work_triy(m,k)
1344	m = m + 1
1345	ENDDO
1346	ENDDO
1347
1348	ENDDO
1349
1350	ELSE
1351	!
1352	!-- Cache optimized code
1353	DO k = 1, nz
1354
1355	DO j = 0, ny
1356	work_ffty(j) = work_triy(j,k)
1357	ENDDO
1358
1359	CALL fft_y( work_ffty, 'backward' )
1360
1361	m = 0
1362	DO n = 1, pdims(2)
1363	DO j = 1, nny_pe( n-1 ) ! WARN: pcoord(j) should be used!!
1364	ar(j,k,i,n) = work_ffty(m)
1365	m = m + 1
1366	ENDDO
1367	ENDDO
1368
1369	ENDDO
1370
1371	ENDIF
1372
1373	ENDDO
1374
1375	DEALLOCATE( tri )
1376
1377	CALL cpu_log( log_point_s(39), 'fft_y + tridia', 'stop' )
1378
1379	END SUBROUTINE ffty_tri_ffty
1380
1381
1382	SUBROUTINE tridia_1dd( ddx2, ddy2, nx, ny, j, ar, tri )
1383
1384	!------------------------------------------------------------------------------!
1385	! Solves the linear system of equations for a 1d-decomposition along x (see
1386	! tridia)
1387	!
1388	! Attention: when using the intel compilers older than 12.0, array tri must
1389	! be passed as an argument to the contained subroutines. Otherwise
1390	! addres faults will occur. This feature can be activated with
1391	! cpp-switch __intel11
1392	! On NEC, tri should not be passed (except for routine substi_1dd)
1393	! because this causes very bad performance.
1394	!------------------------------------------------------------------------------!
1395
1396	USE arrays_3d
1397	USE control_parameters
1398
1399	USE pegrid
1400
1401	IMPLICIT NONE
1402
1403	INTEGER :: i, j, k, nnyh, nx, ny, omp_get_thread_num, tn
1404
1405	REAL :: ddx2, ddy2
1406
1407	REAL, DIMENSION(0:nx,1:nz) :: ar
1408	REAL, DIMENSION(0:nx,0:nz-1) :: ar1
1409	REAL, DIMENSION(5,0:nx,0:nz-1) :: tri
1410
1411
1412	nnyh = ( ny + 1 ) / 2
1413
1414	!
1415	!-- Define constant elements of the tridiagonal matrix.
1416	!-- The compiler on SX6 does loop exchange. If 0:nx is a high power of 2,
1417	!-- the exchanged loops create bank conflicts. The following directive
1418	!-- prohibits loop exchange and the loops perform much better.
1419	! tn = omp_get_thread_num()
1420	! WRITE( 120+tn, * ) '+++ id=',myid,' nx=',nx,' thread=', omp_get_thread_num()
1421	! CALL local_flush( 120+tn )
1422	!CDIR NOLOOPCHG
1423	DO k = 0, nz-1
1424	DO i = 0,nx
1425	tri(2,i,k) = ddzu_pres(k+1) * ddzw(k+1)
1426	tri(3,i,k) = ddzu_pres(k+2) * ddzw(k+1)
1427	ENDDO
1428	ENDDO
1429	! WRITE( 120+tn, * ) '+++ id=',myid,' end of first tridia loop thread=', omp_get_thread_num()
1430	! CALL local_flush( 120+tn )
1431
1432	IF ( j <= nnyh ) THEN
1433	#if defined( __intel11 )
1434	CALL maketri_1dd( j, tri )
1435	#else
1436	CALL maketri_1dd( j )
1437	#endif
1438	ELSE
1439	#if defined( __intel11 )
1440	CALL maketri_1dd( ny+1-j, tri )
1441	#else
1442	CALL maketri_1dd( ny+1-j )
1443	#endif
1444	ENDIF
1445	#if defined( __intel11 )
1446	CALL split_1dd( tri )
1447	#else
1448	CALL split_1dd
1449	#endif
1450	CALL substi_1dd( ar, tri )
1451
1452	CONTAINS
1453
1454	#if defined( __intel11 )
1455	SUBROUTINE maketri_1dd( j, tri )
1456	#else
1457	SUBROUTINE maketri_1dd( j )
1458	#endif
1459
1460	!------------------------------------------------------------------------------!
1461	! computes the i- and j-dependent component of the matrix
1462	!------------------------------------------------------------------------------!
1463
1464	USE constants
1465
1466	IMPLICIT NONE
1467
1468	INTEGER :: i, j, k, nnxh
1469	REAL :: a, c
1470
1471	REAL, DIMENSION(0:nx) :: l
1472
1473	#if defined( __intel11 )
1474	REAL, DIMENSION(5,0:nx,0:nz-1) :: tri
1475	#endif
1476
1477
1478	nnxh = ( nx + 1 ) / 2
1479	!
1480	!-- Provide the tridiagonal matrix for solution of the Poisson equation in
1481	!-- Fourier space. The coefficients are computed following the method of
1482	!-- Schmidt et al. (DFVLR-Mitteilung 84-15), which departs from Stephan
1483	!-- Siano's original version by discretizing the Poisson equation,
1484	!-- before it is Fourier-transformed
1485	DO i = 0, nx
1486	IF ( i >= 0 .AND. i <= nnxh ) THEN
1487	l(i) = 2.0 * ( 1.0 - COS( ( 2.0 * pi * i ) / &
1488	FLOAT( nx+1 ) ) ) * ddx2 + &
1489	2.0 * ( 1.0 - COS( ( 2.0 * pi * j ) / &
1490	FLOAT( ny+1 ) ) ) * ddy2
1491	ELSE
1492	l(i) = 2.0 * ( 1.0 - COS( ( 2.0 * pi * ( nx+1-i ) ) / &
1493	FLOAT( nx+1 ) ) ) * ddx2 + &
1494	2.0 * ( 1.0 - COS( ( 2.0 * pi * j ) / &
1495	FLOAT( ny+1 ) ) ) * ddy2
1496	ENDIF
1497	ENDDO
1498
1499	DO k = 0, nz-1
1500	DO i = 0, nx
1501	a = -1.0 * ddzu_pres(k+2) * ddzw(k+1)
1502	c = -1.0 * ddzu_pres(k+1) * ddzw(k+1)
1503	tri(1,i,k) = a + c - l(i)
1504	ENDDO
1505	ENDDO
1506	IF ( ibc_p_b == 1 .OR. ibc_p_b == 2 ) THEN
1507	DO i = 0, nx
1508	tri(1,i,0) = tri(1,i,0) + tri(2,i,0)
1509	ENDDO
1510	ENDIF
1511	IF ( ibc_p_t == 1 ) THEN
1512	DO i = 0, nx
1513	tri(1,i,nz-1) = tri(1,i,nz-1) + tri(3,i,nz-1)
1514	ENDDO
1515	ENDIF
1516
1517	END SUBROUTINE maketri_1dd
1518
1519
1520	#if defined( __intel11 )
1521	SUBROUTINE split_1dd( tri )
1522	#else
1523	SUBROUTINE split_1dd
1524	#endif
1525
1526	!------------------------------------------------------------------------------!
1527	! Splitting of the tridiagonal matrix (Thomas algorithm)
1528	!------------------------------------------------------------------------------!
1529
1530	IMPLICIT NONE
1531
1532	INTEGER :: i, k
1533
1534	#if defined( __intel11 )
1535	REAL, DIMENSION(5,0:nx,0:nz-1) :: tri
1536	#endif
1537
1538
1539	!
1540	!-- Splitting
1541	DO i = 0, nx
1542	tri(4,i,0) = tri(1,i,0)
1543	ENDDO
1544	DO k = 1, nz-1
1545	DO i = 0, nx
1546	tri(5,i,k) = tri(2,i,k) / tri(4,i,k-1)
1547	tri(4,i,k) = tri(1,i,k) - tri(3,i,k-1) * tri(5,i,k)
1548	ENDDO
1549	ENDDO
1550
1551	END SUBROUTINE split_1dd
1552
1553
1554	SUBROUTINE substi_1dd( ar, tri )
1555
1556	!------------------------------------------------------------------------------!
1557	! Substitution (Forward and Backward) (Thomas algorithm)
1558	!------------------------------------------------------------------------------!
1559
1560	IMPLICIT NONE
1561
1562	INTEGER :: i, k
1563
1564	REAL, DIMENSION(0:nx,nz) :: ar
1565	REAL, DIMENSION(0:nx,0:nz-1) :: ar1
1566	REAL, DIMENSION(5,0:nx,0:nz-1) :: tri
1567
1568	!
1569	!-- Forward substitution
1570	DO i = 0, nx
1571	ar1(i,0) = ar(i,1)
1572	ENDDO
1573	DO k = 1, nz-1
1574	DO i = 0, nx
1575	ar1(i,k) = ar(i,k+1) - tri(5,i,k) * ar1(i,k-1)
1576	ENDDO
1577	ENDDO
1578
1579	!
1580	!-- Backward substitution
1581	!-- Note, the add of 1.0E-20 in the denominator is due to avoid divisions
1582	!-- by zero appearing if the pressure bc is set to neumann at the top of
1583	!-- the model domain.
1584	DO i = 0, nx
1585	ar(i,nz) = ar1(i,nz-1) / ( tri(4,i,nz-1) + 1.0E-20 )
1586	ENDDO
1587	DO k = nz-2, 0, -1
1588	DO i = 0, nx
1589	ar(i,k+1) = ( ar1(i,k) - tri(3,i,k) * ar(i,k+2) ) &
1590	/ tri(4,i,k)
1591	ENDDO
1592	ENDDO
1593
1594	!
1595	!-- Indices i=0, j=0 correspond to horizontally averaged pressure.
1596	!-- The respective values of ar should be zero at all k-levels if
1597	!-- acceleration of horizontally averaged vertical velocity is zero.
1598	IF ( ibc_p_b == 1 .AND. ibc_p_t == 1 ) THEN
1599	IF ( j == 0 ) THEN
1600	DO k = 1, nz
1601	ar(0,k) = 0.0
1602	ENDDO
1603	ENDIF
1604	ENDIF
1605
1606	END SUBROUTINE substi_1dd
1607
1608	END SUBROUTINE tridia_1dd
1609
1610	#endif
1611	#endif
1612	END MODULE poisfft_mod

Note: See TracBrowser for help on using the repository browser.

Download in other formats:

| Impressum | ©Leibniz Universität Hannover |