Home

Context Navigation

source: palm/trunk/SOURCE/poisfft.f90 @ 940

Last change on this file since 940 was 940, checked in by raasch, 12 years ago
temperature equation can be switched off; bugfix of tridia_1dd for current Intel (12.1) compilers
Property svn:keywords set to `Id`
File size: 47.1 KB

Line
1	MODULE poisfft_mod
2
3	!------------------------------------------------------------------------------!
4	! Current revisions:
5	! -----------------
6	! special handling of tri-array as an argument in tridia_1dd routines switched
7	! off because it caused segmentation faults with intel 12.1 compiler
8	!
9	! Former revisions:
10	! -----------------
11	! $Id: poisfft.f90 940 2012-07-09 14:31:00Z raasch $
12	!
13	! 877 2012-04-03 11:21:44Z suehring
14	! Bugfix: Avoid divisions by zero in case of using a 'neumann' bc for the
15	! pressure at the top of the model domain.
16	!
17	! 809 2012-01-30 13:32:58Z maronga
18	! Bugfix: replaced .AND. and .NOT. with && and ! in the preprocessor directives
19	!
20	! 807 2012-01-25 11:53:51Z maronga
21	! New cpp directive "__check" implemented which is used by check_namelist_files
22	! (most of the code is unneeded by check_namelist_files).
23	!
24	! 763 2011-10-06 09:32:09Z suehring
25	! Comment added concerning the last change.
26	!
27	! 761 2011-10-05 17:58:52Z suehring
28	! Bugfix: Avoid divisions by zero in case of using a 'neumann' bc for the
29	! pressure at the top of the model domain.
30	!
31	! 696 2011-03-18 07:03:49Z raasch
32	! work_fftx removed from PRIVATE clauses in fftx_tr_xy and tr_yx_fftx
33	!
34	! 683 2011-02-09 14:25:15Z raasch
35	! openMP parallelization for 2d-domain-decomposition
36	!
37	! 667 2010-12-23 12:06:00Z suehring/gryschka
38	! ddzu replaced by ddzu_pres due to changes in zu(0)
39	!
40	! 622 2010-12-10 08:08:13Z raasch
41	! optional barriers included in order to speed up collective operations
42	!
43	! 377 2009-09-04 11:09:00Z raasch
44	! __lcmuk changed to __lc to avoid problems with Intel compiler on sgi-ice
45	!
46	! 164 2008-05-15 08:46:15Z raasch
47	! Arguments removed from transpose routines
48	!
49	! 128 2007-10-26 13:11:14Z raasch
50	! Bugfix: wavenumber calculation for even nx in routines maketri
51	!
52	! 85 2007-05-11 09:35:14Z raasch
53	! Bugfix: work_fft*_vec removed from some PRIVATE-declarations
54	!
55	! 76 2007-03-29 00:58:32Z raasch
56	! Tridiagonal coefficients adjusted for Neumann boundary conditions both at
57	! the bottom and the top.
58	!
59	! RCS Log replace by Id keyword, revision history cleaned up
60	!
61	! Revision 1.24 2006/08/04 15:00:24 raasch
62	! Default setting of the thread number tn in case of not using OpenMP
63	!
64	! Revision 1.23 2006/02/23 12:48:38 raasch
65	! Additional compiler directive in routine tridia_1dd for preventing loop
66	! exchange on NEC-SX6
67	!
68	! Revision 1.20 2004/04/30 12:38:09 raasch
69	! Parts of former poisfft_hybrid moved to this subroutine,
70	! former subroutine changed to a module, renaming of FFT-subroutines and
71	! -module, FFTs completely substituted by calls of fft_x and fft_y,
72	! NAG fft used in the non-parallel case completely removed, l in maketri
73	! is now a 1d-array, variables passed by modules instead of using parameter
74	! lists, enlarged transposition arrays introduced
75	!
76	! Revision 1.1 1997/07/24 11:24:14 raasch
77	! Initial revision
78	!
79	!
80	! Description:
81	! ------------
82	! See below.
83	!------------------------------------------------------------------------------!
84
85	!--------------------------------------------------------------------------!
86	! poisfft !
87	! !
88	! Original version: Stephan Siano (pois3d) !
89	! !
90	! Institute of Meteorology and Climatology, University of Hannover !
91	! Germany !
92	! !
93	! Version as of July 23,1996 !
94	! !
95	! !
96	! Version for parallel computers: Siegfried Raasch !
97	! !
98	! Version as of July 03,1997 !
99	! !
100	! Solves the Poisson equation with a 2D spectral method !
101	! d^2 p / dx^2 + d^2 p / dy^2 + d^2 p / dz^2 = s !
102	! !
103	! Input: !
104	! real ar contains in the (nnx,nny,nnz) elements, !
105	! starting from the element (1,nys,nxl), the !
106	! values for s !
107	! real work Temporary array !
108	! !
109	! Output: !
110	! real ar contains the solution for p !
111	!--------------------------------------------------------------------------!
112
113	USE fft_xy
114	USE indices
115	USE transpose_indices
116
117	IMPLICIT NONE
118
119	PRIVATE
120
121	#if ! defined ( __check )
122	PUBLIC poisfft, poisfft_init
123
124	INTERFACE poisfft
125	MODULE PROCEDURE poisfft
126	END INTERFACE poisfft
127
128	INTERFACE poisfft_init
129	MODULE PROCEDURE poisfft_init
130	END INTERFACE poisfft_init
131	#else
132	PUBLIC poisfft_init
133
134	INTERFACE poisfft_init
135	MODULE PROCEDURE poisfft_init
136	END INTERFACE poisfft_init
137	#endif
138
139	CONTAINS
140
141	SUBROUTINE poisfft_init
142
143	CALL fft_init
144
145	END SUBROUTINE poisfft_init
146
147	#if ! defined ( __check )
148	SUBROUTINE poisfft( ar, work )
149
150	USE cpulog
151	USE interfaces
152	USE pegrid
153
154	IMPLICIT NONE
155
156	REAL, DIMENSION(1:nza,nys:nyna,nxl:nxra) :: ar, work
157
158
159	CALL cpu_log( log_point_s(3), 'poisfft', 'start' )
160
161	!
162	!-- Two-dimensional Fourier Transformation in x- and y-direction.
163	#if defined( __parallel )
164	IF ( pdims(2) == 1 ) THEN
165
166	!
167	!-- 1d-domain-decomposition along x:
168	!-- FFT along y and transposition y --> x
169	CALL ffty_tr_yx( ar, work, ar )
170
171	!
172	!-- FFT along x, solving the tridiagonal system and backward FFT
173	CALL fftx_tri_fftx( ar )
174
175	!
176	!-- Transposition x --> y and backward FFT along y
177	CALL tr_xy_ffty( ar, work, ar )
178
179	ELSEIF ( pdims(1) == 1 ) THEN
180
181	!
182	!-- 1d-domain-decomposition along y:
183	!-- FFT along x and transposition x --> y
184	CALL fftx_tr_xy( ar, work, ar )
185
186	!
187	!-- FFT along y, solving the tridiagonal system and backward FFT
188	CALL ffty_tri_ffty( ar )
189
190	!
191	!-- Transposition y --> x and backward FFT along x
192	CALL tr_yx_fftx( ar, work, ar )
193
194	ELSE
195
196	!
197	!-- 2d-domain-decomposition
198	!-- Transposition z --> x
199	CALL cpu_log( log_point_s(5), 'transpo forward', 'start' )
200	CALL transpose_zx( ar, work, ar )
201	CALL cpu_log( log_point_s(5), 'transpo forward', 'pause' )
202
203	CALL cpu_log( log_point_s(4), 'fft_x', 'start' )
204	CALL fftxp( ar, 'forward' )
205	CALL cpu_log( log_point_s(4), 'fft_x', 'pause' )
206
207	!
208	!-- Transposition x --> y
209	CALL cpu_log( log_point_s(5), 'transpo forward', 'continue' )
210	CALL transpose_xy( ar, work, ar )
211	CALL cpu_log( log_point_s(5), 'transpo forward', 'pause' )
212
213	CALL cpu_log( log_point_s(7), 'fft_y', 'start' )
214	CALL fftyp( ar, 'forward' )
215	CALL cpu_log( log_point_s(7), 'fft_y', 'pause' )
216
217	!
218	!-- Transposition y --> z
219	CALL cpu_log( log_point_s(5), 'transpo forward', 'continue' )
220	CALL transpose_yz( ar, work, ar )
221	CALL cpu_log( log_point_s(5), 'transpo forward', 'stop' )
222
223	!
224	!-- Solve the Poisson equation in z-direction in cartesian space.
225	CALL cpu_log( log_point_s(6), 'tridia', 'start' )
226	CALL tridia( ar )
227	CALL cpu_log( log_point_s(6), 'tridia', 'stop' )
228
229	!
230	!-- Inverse Fourier Transformation
231	!-- Transposition z --> y
232	CALL cpu_log( log_point_s(8), 'transpo invers', 'start' )
233	CALL transpose_zy( ar, work, ar )
234	CALL cpu_log( log_point_s(8), 'transpo invers', 'pause' )
235
236	CALL cpu_log( log_point_s(7), 'fft_y', 'continue' )
237	CALL fftyp( ar, 'backward' )
238	CALL cpu_log( log_point_s(7), 'fft_y', 'stop' )
239
240	!
241	!-- Transposition y --> x
242	CALL cpu_log( log_point_s(8), 'transpo invers', 'continue' )
243	CALL transpose_yx( ar, work, ar )
244	CALL cpu_log( log_point_s(8), 'transpo invers', 'pause' )
245
246	CALL cpu_log( log_point_s(4), 'fft_x', 'continue' )
247	CALL fftxp( ar, 'backward' )
248	CALL cpu_log( log_point_s(4), 'fft_x', 'stop' )
249
250	!
251	!-- Transposition x --> z
252	CALL cpu_log( log_point_s(8), 'transpo invers', 'continue' )
253	CALL transpose_xz( ar, work, ar )
254	CALL cpu_log( log_point_s(8), 'transpo invers', 'stop' )
255
256	ENDIF
257
258	#else
259
260	!
261	!-- Two-dimensional Fourier Transformation along x- and y-direction.
262	CALL cpu_log( log_point_s(4), 'fft_x', 'start' )
263	CALL fftx( ar, 'forward' )
264	CALL cpu_log( log_point_s(4), 'fft_x', 'pause' )
265	CALL cpu_log( log_point_s(7), 'fft_y', 'start' )
266	CALL ffty( ar, 'forward' )
267	CALL cpu_log( log_point_s(7), 'fft_y', 'pause' )
268
269	!
270	!-- Solve the Poisson equation in z-direction in cartesian space.
271	CALL cpu_log( log_point_s(6), 'tridia', 'start' )
272	CALL tridia( ar )
273	CALL cpu_log( log_point_s(6), 'tridia', 'stop' )
274
275	!
276	!-- Inverse Fourier Transformation.
277	CALL cpu_log( log_point_s(7), 'fft_y', 'continue' )
278	CALL ffty( ar, 'backward' )
279	CALL cpu_log( log_point_s(7), 'fft_y', 'stop' )
280	CALL cpu_log( log_point_s(4), 'fft_x', 'continue' )
281	CALL fftx( ar, 'backward' )
282	CALL cpu_log( log_point_s(4), 'fft_x', 'stop' )
283
284	#endif
285
286	CALL cpu_log( log_point_s(3), 'poisfft', 'stop' )
287
288	END SUBROUTINE poisfft
289
290
291
292	SUBROUTINE tridia( ar )
293
294	!------------------------------------------------------------------------------!
295	! solves the linear system of equations:
296	!
297	! -(4 pi^2(i^2/(dx^2nnx^2)+j^2/(dy^2nny^2))+
298	! 1/(dzu(k)dzw(k))+1/(dzu(k-1)dzw(k)))*p(i,j,k)+
299	! 1/(dzu(k)dzw(k))p(i,j,k+1)+1/(dzu(k-1)dzw(k))p(i,j,k-1)=d(i,j,k)
300	!
301	! by using the Thomas algorithm
302	!------------------------------------------------------------------------------!
303
304	USE arrays_3d
305
306	IMPLICIT NONE
307
308	INTEGER :: i, j, k, nnyh
309
310	REAL, DIMENSION(nxl_z:nxr_z,0:nz-1) :: ar1
311	REAL, DIMENSION(5,nxl_z:nxr_z,0:nz-1) :: tri
312
313	#if defined( __parallel )
314	REAL :: ar(nxl_z:nxr_za,nys_z:nyn_za,1:nza)
315	#else
316	REAL :: ar(1:nz,nys_z:nyn_z,nxl_z:nxr_z)
317	#endif
318
319
320	nnyh = (ny+1) / 2
321
322	!
323	!-- Define constant elements of the tridiagonal matrix.
324	!$OMP PARALLEL PRIVATE ( k, i )
325	!$OMP DO
326	DO k = 0, nz-1
327	DO i = nxl_z, nxr_z
328	tri(2,i,k) = ddzu_pres(k+1) * ddzw(k+1)
329	tri(3,i,k) = ddzu_pres(k+2) * ddzw(k+1)
330	ENDDO
331	ENDDO
332	!$OMP END PARALLEL
333
334	#if defined( __parallel )
335	!
336	!-- Repeat for all y-levels.
337	!$OMP PARALLEL FIRSTPRIVATE( tri ) PRIVATE ( ar1, j )
338	!$OMP DO
339	DO j = nys_z, nyn_z
340	IF ( j <= nnyh ) THEN
341	CALL maketri( tri, j )
342	ELSE
343	CALL maketri( tri, ny+1-j )
344	ENDIF
345	CALL split( tri )
346	CALL substi( ar, ar1, tri, j )
347	ENDDO
348	!$OMP END PARALLEL
349	#else
350	!
351	!-- First y-level.
352	CALL maketri( tri, nys_z )
353	CALL split( tri )
354	CALL substi( ar, ar1, tri, 0 )
355
356	!
357	!-- Further y-levels.
358	DO j = 1, nnyh - 1
359	CALL maketri( tri, j )
360	CALL split( tri )
361	CALL substi( ar, ar1, tri, j )
362	CALL substi( ar, ar1, tri, ny+1-j )
363	ENDDO
364	CALL maketri( tri, nnyh )
365	CALL split( tri )
366	CALL substi( ar, ar1, tri, nnyh+nys )
367	#endif
368
369	CONTAINS
370
371	SUBROUTINE maketri( tri, j )
372
373	!------------------------------------------------------------------------------!
374	! Computes the i- and j-dependent component of the matrix
375	!------------------------------------------------------------------------------!
376
377	USE arrays_3d
378	USE constants
379	USE control_parameters
380	USE grid_variables
381
382	IMPLICIT NONE
383
384	INTEGER :: i, j, k, nnxh
385	REAL :: a, c
386	REAL :: ll(nxl_z:nxr_z)
387	REAL :: tri(5,nxl_z:nxr_z,0:nz-1)
388
389
390	nnxh = ( nx + 1 ) / 2
391
392	!
393	!-- Provide the tridiagonal matrix for solution of the Poisson equation in
394	!-- Fourier space. The coefficients are computed following the method of
395	!-- Schmidt et al. (DFVLR-Mitteilung 84-15), which departs from Stephan
396	!-- Siano's original version by discretizing the Poisson equation,
397	!-- before it is Fourier-transformed
398	#if defined( __parallel )
399	DO i = nxl_z, nxr_z
400	IF ( i >= 0 .AND. i <= nnxh ) THEN
401	ll(i) = 2.0 * ( 1.0 - COS( ( 2.0 * pi * i ) / &
402	FLOAT( nx+1 ) ) ) / ( dx * dx ) + &
403	2.0 * ( 1.0 - COS( ( 2.0 * pi * j ) / &
404	FLOAT( ny+1 ) ) ) / ( dy * dy )
405	ELSE
406	ll(i) = 2.0 * ( 1.0 - COS( ( 2.0 * pi * ( nx+1-i ) ) / &
407	FLOAT( nx+1 ) ) ) / ( dx * dx ) + &
408	2.0 * ( 1.0 - COS( ( 2.0 * pi * j ) / &
409	FLOAT( ny+1 ) ) ) / ( dy * dy )
410	ENDIF
411	DO k = 0,nz-1
412	a = -1.0 * ddzu_pres(k+2) * ddzw(k+1)
413	c = -1.0 * ddzu_pres(k+1) * ddzw(k+1)
414	tri(1,i,k) = a + c - ll(i)
415	ENDDO
416	ENDDO
417	#else
418	DO i = 0, nnxh
419	ll(i) = 2.0 * ( 1.0 - COS( ( 2.0 * pi * i ) / FLOAT( nx+1 ) ) ) / &
420	( dx * dx ) + &
421	2.0 * ( 1.0 - COS( ( 2.0 * pi * j ) / FLOAT( ny+1 ) ) ) / &
422	( dy * dy )
423	DO k = 0, nz-1
424	a = -1.0 * ddzu_pres(k+2) * ddzw(k+1)
425	c = -1.0 * ddzu_pres(k+1) * ddzw(k+1)
426	tri(1,i,k) = a + c - ll(i)
427	IF ( i >= 1 .and. i < nnxh ) THEN
428	tri(1,nx+1-i,k) = tri(1,i,k)
429	ENDIF
430	ENDDO
431	ENDDO
432	#endif
433	IF ( ibc_p_b == 1 .OR. ibc_p_b == 2 ) THEN
434	DO i = nxl_z, nxr_z
435	tri(1,i,0) = tri(1,i,0) + tri(2,i,0)
436	ENDDO
437	ENDIF
438	IF ( ibc_p_t == 1 ) THEN
439	DO i = nxl_z, nxr_z
440	tri(1,i,nz-1) = tri(1,i,nz-1) + tri(3,i,nz-1)
441	ENDDO
442	ENDIF
443
444	END SUBROUTINE maketri
445
446
447	SUBROUTINE substi( ar, ar1, tri, j )
448
449	!------------------------------------------------------------------------------!
450	! Substitution (Forward and Backward) (Thomas algorithm)
451	!------------------------------------------------------------------------------!
452
453	USE control_parameters
454
455	IMPLICIT NONE
456
457	INTEGER :: i, j, k
458	REAL :: ar1(nxl_z:nxr_z,0:nz-1)
459	REAL :: tri(5,nxl_z:nxr_z,0:nz-1)
460	#if defined( __parallel )
461	REAL :: ar(nxl_z:nxr_za,nys_z:nyn_za,1:nza)
462	#else
463	REAL :: ar(1:nz,nys_z:nyn_z,nxl_z:nxr_z)
464	#endif
465
466	!
467	!-- Forward substitution.
468	DO i = nxl_z, nxr_z
469	#if defined( __parallel )
470	ar1(i,0) = ar(i,j,1)
471	#else
472	ar1(i,0) = ar(1,j,i)
473	#endif
474	ENDDO
475	DO k = 1, nz - 1
476	DO i = nxl_z, nxr_z
477	#if defined( __parallel )
478	ar1(i,k) = ar(i,j,k+1) - tri(5,i,k) * ar1(i,k-1)
479	#else
480	ar1(i,k) = ar(k+1,j,i) - tri(5,i,k) * ar1(i,k-1)
481	#endif
482	ENDDO
483	ENDDO
484
485	!
486	!-- Backward substitution
487	!-- Note, the 1.0E-20 in the denominator is due to avoid divisions
488	!-- by zero appearing if the pressure bc is set to neumann at the top of
489	!-- the model domain.
490	DO i = nxl_z, nxr_z
491	#if defined( __parallel )
492	ar(i,j,nz) = ar1(i,nz-1) / ( tri(4,i,nz-1) + 1.0E-20 )
493	#else
494	ar(nz,j,i) = ar1(i,nz-1) / ( tri(4,i,nz-1) + 1.0E-20 )
495	#endif
496	ENDDO
497	DO k = nz-2, 0, -1
498	DO i = nxl_z, nxr_z
499	#if defined( __parallel )
500	ar(i,j,k+1) = ( ar1(i,k) - tri(3,i,k) * ar(i,j,k+2) ) &
501	/ tri(4,i,k)
502	#else
503	ar(k+1,j,i) = ( ar1(i,k) - tri(3,i,k) * ar(k+2,j,i) ) &
504	/ tri(4,i,k)
505	#endif
506	ENDDO
507	ENDDO
508
509	!
510	!-- Indices i=0, j=0 correspond to horizontally averaged pressure.
511	!-- The respective values of ar should be zero at all k-levels if
512	!-- acceleration of horizontally averaged vertical velocity is zero.
513	IF ( ibc_p_b == 1 .AND. ibc_p_t == 1 ) THEN
514	IF ( j == 0 .AND. nxl_z == 0 ) THEN
515	#if defined( __parallel )
516	DO k = 1, nz
517	ar(nxl_z,j,k) = 0.0
518	ENDDO
519	#else
520	DO k = 1, nz
521	ar(k,j,nxl_z) = 0.0
522	ENDDO
523	#endif
524	ENDIF
525	ENDIF
526
527	END SUBROUTINE substi
528
529
530	SUBROUTINE split( tri )
531
532	!------------------------------------------------------------------------------!
533	! Splitting of the tridiagonal matrix (Thomas algorithm)
534	!------------------------------------------------------------------------------!
535
536	IMPLICIT NONE
537
538	INTEGER :: i, k
539	REAL :: tri(5,nxl_z:nxr_z,0:nz-1)
540
541	!
542	!-- Splitting.
543	DO i = nxl_z, nxr_z
544	tri(4,i,0) = tri(1,i,0)
545	ENDDO
546	DO k = 1, nz-1
547	DO i = nxl_z, nxr_z
548	tri(5,i,k) = tri(2,i,k) / tri(4,i,k-1)
549	tri(4,i,k) = tri(1,i,k) - tri(3,i,k-1) * tri(5,i,k)
550	ENDDO
551	ENDDO
552
553	END SUBROUTINE split
554
555	END SUBROUTINE tridia
556
557
558	#if defined( __parallel )
559	SUBROUTINE fftxp( ar, direction )
560
561	!------------------------------------------------------------------------------!
562	! Fourier-transformation along x-direction Parallelized version
563	!------------------------------------------------------------------------------!
564
565	IMPLICIT NONE
566
567	CHARACTER (LEN=*) :: direction
568	INTEGER :: j, k
569	REAL :: ar(0:nxa,nys_x:nyn_xa,nzb_x:nzt_xa)
570
571	!
572	!-- Performing the fft with one of the methods implemented
573	!$OMP PARALLEL PRIVATE ( j, k )
574	!$OMP DO
575	DO k = nzb_x, nzt_x
576	DO j = nys_x, nyn_x
577	CALL fft_x( ar(0:nx,j,k), direction )
578	ENDDO
579	ENDDO
580	!$OMP END PARALLEL
581
582	END SUBROUTINE fftxp
583
584	#else
585	SUBROUTINE fftx( ar, direction )
586
587	!------------------------------------------------------------------------------!
588	! Fourier-transformation along x-direction Non parallel version
589	!------------------------------------------------------------------------------!
590
591	IMPLICIT NONE
592
593	CHARACTER (LEN=*) :: direction
594	INTEGER :: i, j, k
595	REAL :: ar(1:nz,0:ny,0:nx)
596
597	!
598	!-- Performing the fft with one of the methods implemented
599	!$OMP PARALLEL PRIVATE ( j, k )
600	!$OMP DO
601	DO k = 1, nz
602	DO j = 0, ny
603	CALL fft_x( ar(k,j,0:nx), direction )
604	ENDDO
605	ENDDO
606	!$OMP END PARALLEL
607
608	END SUBROUTINE fftx
609	#endif
610
611
612	#if defined( __parallel )
613	SUBROUTINE fftyp( ar, direction )
614
615	!------------------------------------------------------------------------------!
616	! Fourier-transformation along y-direction Parallelized version
617	!------------------------------------------------------------------------------!
618
619	IMPLICIT NONE
620
621	CHARACTER (LEN=*) :: direction
622	INTEGER :: i, k
623	REAL :: ar(0:nya,nxl_y:nxr_ya,nzb_y:nzt_ya)
624
625	!
626	!-- Performing the fft with one of the methods implemented
627	!$OMP PARALLEL PRIVATE ( i, k )
628	!$OMP DO
629	DO k = nzb_y, nzt_y
630	DO i = nxl_y, nxr_y
631	CALL fft_y( ar(0:ny,i,k), direction )
632	ENDDO
633	ENDDO
634	!$OMP END PARALLEL
635
636	END SUBROUTINE fftyp
637
638	#else
639	SUBROUTINE ffty( ar, direction )
640
641	!------------------------------------------------------------------------------!
642	! Fourier-transformation along y-direction Non parallel version
643	!------------------------------------------------------------------------------!
644
645	IMPLICIT NONE
646
647	CHARACTER (LEN=*) :: direction
648	INTEGER :: i, k
649	REAL :: ar(1:nz,0:ny,0:nx)
650
651	!
652	!-- Performing the fft with one of the methods implemented
653	!$OMP PARALLEL PRIVATE ( i, k )
654	!$OMP DO
655	DO k = 1, nz
656	DO i = 0, nx
657	CALL fft_y( ar(k,0:ny,i), direction )
658	ENDDO
659	ENDDO
660	!$OMP END PARALLEL
661
662	END SUBROUTINE ffty
663	#endif
664
665	#if defined( __parallel )
666	SUBROUTINE ffty_tr_yx( f_in, work, f_out )
667
668	!------------------------------------------------------------------------------!
669	! Fourier-transformation along y with subsequent transposition y --> x for
670	! a 1d-decomposition along x
671	!
672	! ATTENTION: The performance of this routine is much faster on the NEC-SX6,
673	! if the first index of work_ffty_vec is odd. Otherwise
674	! memory bank conflicts may occur (especially if the index is a
675	! multiple of 128). That's why work_ffty_vec is dimensioned as
676	! 0:ny+1.
677	! Of course, this will not work if users are using an odd number
678	! of gridpoints along y.
679	!------------------------------------------------------------------------------!
680
681	USE control_parameters
682	USE cpulog
683	USE indices
684	USE interfaces
685	USE pegrid
686	USE transpose_indices
687
688	IMPLICIT NONE
689
690	INTEGER :: i, iend, iouter, ir, j, k
691	INTEGER, PARAMETER :: stridex = 4
692
693	REAL, DIMENSION(0:ny,stridex) :: work_ffty
694	#if defined( __nec )
695	REAL, DIMENSION(0:ny+1,1:nz,nxl:nxr) :: work_ffty_vec
696	#endif
697	REAL, DIMENSION(1:nza,0:nya,nxl:nxra) :: f_in
698	REAL, DIMENSION(nnx,1:nza,nys_x:nyn_xa,pdims(1)) :: f_out
699	REAL, DIMENSION(nxl:nxra,1:nza,0:nya) :: work
700
701	!
702	!-- Carry out the FFT along y, where all data are present due to the
703	!-- 1d-decomposition along x. Resort the data in a way that x becomes
704	!-- the first index.
705	CALL cpu_log( log_point_s(7), 'fft_y', 'start' )
706
707	IF ( host(1:3) == 'nec' ) THEN
708	#if defined( __nec )
709	!
710	!-- Code optimized for vector processors
711	!$OMP PARALLEL PRIVATE ( i, j, k )
712	!$OMP DO
713	DO i = nxl, nxr
714
715	DO j = 0, ny
716	DO k = 1, nz
717	work_ffty_vec(j,k,i) = f_in(k,j,i)
718	ENDDO
719	ENDDO
720
721	CALL fft_y_m( work_ffty_vec(:,:,i), ny+1, 'forward' )
722
723	ENDDO
724
725	!$OMP DO
726	DO k = 1, nz
727	DO j = 0, ny
728	DO i = nxl, nxr
729	work(i,k,j) = work_ffty_vec(j,k,i)
730	ENDDO
731	ENDDO
732	ENDDO
733	!$OMP END PARALLEL
734	#endif
735
736	ELSE
737
738	!
739	!-- Cache optimized code.
740	!-- The i-(x-)direction is split into a strided outer loop and an inner
741	!-- loop for better cache performance
742	!$OMP PARALLEL PRIVATE (i,iend,iouter,ir,j,k,work_ffty)
743	!$OMP DO
744	DO iouter = nxl, nxr, stridex
745
746	iend = MIN( iouter+stridex-1, nxr ) ! Upper bound for inner i loop
747
748	DO k = 1, nz
749
750	DO i = iouter, iend
751
752	ir = i-iouter+1 ! counter within a stride
753	DO j = 0, ny
754	work_ffty(j,ir) = f_in(k,j,i)
755	ENDDO
756	!
757	!-- FFT along y
758	CALL fft_y( work_ffty(:,ir), 'forward' )
759
760	ENDDO
761
762	!
763	!-- Resort
764	DO j = 0, ny
765	DO i = iouter, iend
766	work(i,k,j) = work_ffty(j,i-iouter+1)
767	ENDDO
768	ENDDO
769
770	ENDDO
771
772	ENDDO
773	!$OMP END PARALLEL
774
775	ENDIF
776	CALL cpu_log( log_point_s(7), 'fft_y', 'pause' )
777
778	!
779	!-- Transpose array
780	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
781	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
782	CALL MPI_ALLTOALL( work(nxl,1,0), sendrecvcount_xy, MPI_REAL, &
783	f_out(1,1,nys_x,1), sendrecvcount_xy, MPI_REAL, &
784	comm1dx, ierr )
785	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
786
787	END SUBROUTINE ffty_tr_yx
788
789
790	SUBROUTINE tr_xy_ffty( f_in, work, f_out )
791
792	!------------------------------------------------------------------------------!
793	! Transposition x --> y with a subsequent backward Fourier transformation for
794	! a 1d-decomposition along x
795	!------------------------------------------------------------------------------!
796
797	USE control_parameters
798	USE cpulog
799	USE indices
800	USE interfaces
801	USE pegrid
802	USE transpose_indices
803
804	IMPLICIT NONE
805
806	INTEGER :: i, iend, iouter, ir, j, k
807	INTEGER, PARAMETER :: stridex = 4
808
809	REAL, DIMENSION(0:ny,stridex) :: work_ffty
810	#if defined( __nec )
811	REAL, DIMENSION(0:ny+1,1:nz,nxl:nxr) :: work_ffty_vec
812	#endif
813	REAL, DIMENSION(nnx,1:nza,nys_x:nyn_xa,pdims(1)) :: f_in
814	REAL, DIMENSION(1:nza,0:nya,nxl:nxra) :: f_out
815	REAL, DIMENSION(nxl:nxra,1:nza,0:nya) :: work
816
817	!
818	!-- Transpose array
819	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
820	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
821	CALL MPI_ALLTOALL( f_in(1,1,nys_x,1), sendrecvcount_xy, MPI_REAL, &
822	work(nxl,1,0), sendrecvcount_xy, MPI_REAL, &
823	comm1dx, ierr )
824	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
825
826	!
827	!-- Resort the data in a way that y becomes the first index and carry out the
828	!-- backward fft along y.
829	CALL cpu_log( log_point_s(7), 'fft_y', 'continue' )
830
831	IF ( host(1:3) == 'nec' ) THEN
832	#if defined( __nec )
833	!
834	!-- Code optimized for vector processors
835	!$OMP PARALLEL PRIVATE ( i, j, k )
836	!$OMP DO
837	DO k = 1, nz
838	DO j = 0, ny
839	DO i = nxl, nxr
840	work_ffty_vec(j,k,i) = work(i,k,j)
841	ENDDO
842	ENDDO
843	ENDDO
844
845	!$OMP DO
846	DO i = nxl, nxr
847
848	CALL fft_y_m( work_ffty_vec(:,:,i), ny+1, 'backward' )
849
850	DO j = 0, ny
851	DO k = 1, nz
852	f_out(k,j,i) = work_ffty_vec(j,k,i)
853	ENDDO
854	ENDDO
855
856	ENDDO
857	!$OMP END PARALLEL
858	#endif
859
860	ELSE
861
862	!
863	!-- Cache optimized code.
864	!-- The i-(x-)direction is split into a strided outer loop and an inner
865	!-- loop for better cache performance
866	!$OMP PARALLEL PRIVATE ( i, iend, iouter, ir, j, k, work_ffty )
867	!$OMP DO
868	DO iouter = nxl, nxr, stridex
869
870	iend = MIN( iouter+stridex-1, nxr ) ! Upper bound for inner i loop
871
872	DO k = 1, nz
873	!
874	!-- Resort
875	DO j = 0, ny
876	DO i = iouter, iend
877	work_ffty(j,i-iouter+1) = work(i,k,j)
878	ENDDO
879	ENDDO
880
881	DO i = iouter, iend
882
883	!
884	!-- FFT along y
885	ir = i-iouter+1 ! counter within a stride
886	CALL fft_y( work_ffty(:,ir), 'backward' )
887
888	DO j = 0, ny
889	f_out(k,j,i) = work_ffty(j,ir)
890	ENDDO
891	ENDDO
892
893	ENDDO
894
895	ENDDO
896	!$OMP END PARALLEL
897
898	ENDIF
899
900	CALL cpu_log( log_point_s(7), 'fft_y', 'stop' )
901
902	END SUBROUTINE tr_xy_ffty
903
904
905	SUBROUTINE fftx_tri_fftx( ar )
906
907	!------------------------------------------------------------------------------!
908	! FFT along x, solution of the tridiagonal system and backward FFT for
909	! a 1d-decomposition along x
910	!
911	! WARNING: this subroutine may still not work for hybrid parallelization
912	! with OpenMP (for possible necessary changes see the original
913	! routine poisfft_hybrid, developed by Klaus Ketelsen, May 2002)
914	!------------------------------------------------------------------------------!
915
916	USE control_parameters
917	USE cpulog
918	USE grid_variables
919	USE indices
920	USE interfaces
921	USE pegrid
922	USE transpose_indices
923
924	IMPLICIT NONE
925
926	character(len=3) :: myth_char
927
928	INTEGER :: i, j, k, m, n, omp_get_thread_num, tn
929
930	REAL, DIMENSION(0:nx) :: work_fftx
931	REAL, DIMENSION(0:nx,1:nz) :: work_trix
932	REAL, DIMENSION(nnx,1:nza,nys_x:nyn_xa,pdims(1)) :: ar
933	REAL, DIMENSION(:,:,:,:), ALLOCATABLE :: tri
934
935
936	CALL cpu_log( log_point_s(33), 'fft_x + tridia', 'start' )
937
938	ALLOCATE( tri(5,0:nx,0:nz-1,0:threads_per_task-1) )
939
940	tn = 0 ! Default thread number in case of one thread
941	!$OMP PARALLEL DO PRIVATE ( i, j, k, m, n, tn, work_fftx, work_trix )
942	DO j = nys_x, nyn_x
943
944	!$ tn = omp_get_thread_num()
945
946	IF ( host(1:3) == 'nec' ) THEN
947	!
948	!-- Code optimized for vector processors
949	DO k = 1, nz
950
951	m = 0
952	DO n = 1, pdims(1)
953	DO i = 1, nnx_pe( n-1 ) ! WARN: pcoord(i) should be used!!
954	work_trix(m,k) = ar(i,k,j,n)
955	m = m + 1
956	ENDDO
957	ENDDO
958
959	ENDDO
960
961	CALL fft_x_m( work_trix, 'forward' )
962
963	ELSE
964	!
965	!-- Cache optimized code
966	DO k = 1, nz
967
968	m = 0
969	DO n = 1, pdims(1)
970	DO i = 1, nnx_pe( n-1 ) ! WARN: pcoord(i) should be used!!
971	work_fftx(m) = ar(i,k,j,n)
972	m = m + 1
973	ENDDO
974	ENDDO
975
976	CALL fft_x( work_fftx, 'forward' )
977
978	DO i = 0, nx
979	work_trix(i,k) = work_fftx(i)
980	ENDDO
981
982	ENDDO
983
984	ENDIF
985
986	!
987	!-- Solve the linear equation system
988	CALL tridia_1dd( ddx2, ddy2, nx, ny, j, work_trix, tri(:,:,:,tn) )
989
990	IF ( host(1:3) == 'nec' ) THEN
991	!
992	!-- Code optimized for vector processors
993	CALL fft_x_m( work_trix, 'backward' )
994
995	DO k = 1, nz
996
997	m = 0
998	DO n = 1, pdims(1)
999	DO i = 1, nnx_pe( n-1 ) ! WARN: pcoord(i) should be used!!
1000	ar(i,k,j,n) = work_trix(m,k)
1001	m = m + 1
1002	ENDDO
1003	ENDDO
1004
1005	ENDDO
1006
1007	ELSE
1008	!
1009	!-- Cache optimized code
1010	DO k = 1, nz
1011
1012	DO i = 0, nx
1013	work_fftx(i) = work_trix(i,k)
1014	ENDDO
1015
1016	CALL fft_x( work_fftx, 'backward' )
1017
1018	m = 0
1019	DO n = 1, pdims(1)
1020	DO i = 1, nnx_pe( n-1 ) ! WARN: pcoord(i) should be used!!
1021	ar(i,k,j,n) = work_fftx(m)
1022	m = m + 1
1023	ENDDO
1024	ENDDO
1025
1026	ENDDO
1027
1028	ENDIF
1029
1030	ENDDO
1031
1032	DEALLOCATE( tri )
1033
1034	CALL cpu_log( log_point_s(33), 'fft_x + tridia', 'stop' )
1035
1036	END SUBROUTINE fftx_tri_fftx
1037
1038
1039	SUBROUTINE fftx_tr_xy( f_in, work, f_out )
1040
1041	!------------------------------------------------------------------------------!
1042	! Fourier-transformation along x with subsequent transposition x --> y for
1043	! a 1d-decomposition along y
1044	!
1045	! ATTENTION: The NEC-branch of this routine may significantly profit from
1046	! further optimizations. So far, performance is much worse than
1047	! for routine ffty_tr_yx (more than three times slower).
1048	!------------------------------------------------------------------------------!
1049
1050	USE control_parameters
1051	USE cpulog
1052	USE indices
1053	USE interfaces
1054	USE pegrid
1055	USE transpose_indices
1056
1057	IMPLICIT NONE
1058
1059	INTEGER :: i, j, k
1060
1061	REAL, DIMENSION(0:nx,1:nz,nys:nyn) :: work_fftx
1062	REAL, DIMENSION(1:nza,nys:nyna,0:nxa) :: f_in
1063	REAL, DIMENSION(nny,1:nza,nxl_y:nxr_ya,pdims(2)) :: f_out
1064	REAL, DIMENSION(nys:nyna,1:nza,0:nxa) :: work
1065
1066	!
1067	!-- Carry out the FFT along x, where all data are present due to the
1068	!-- 1d-decomposition along y. Resort the data in a way that y becomes
1069	!-- the first index.
1070	CALL cpu_log( log_point_s(4), 'fft_x', 'start' )
1071
1072	IF ( host(1:3) == 'nec' ) THEN
1073	!
1074	!-- Code for vector processors
1075	!$OMP PARALLEL PRIVATE ( i, j, k )
1076	!$OMP DO
1077	DO i = 0, nx
1078
1079	DO j = nys, nyn
1080	DO k = 1, nz
1081	work_fftx(i,k,j) = f_in(k,j,i)
1082	ENDDO
1083	ENDDO
1084
1085	ENDDO
1086
1087	!$OMP DO
1088	DO j = nys, nyn
1089
1090	CALL fft_x_m( work_fftx(:,:,j), 'forward' )
1091
1092	DO k = 1, nz
1093	DO i = 0, nx
1094	work(j,k,i) = work_fftx(i,k,j)
1095	ENDDO
1096	ENDDO
1097
1098	ENDDO
1099	!$OMP END PARALLEL
1100
1101	ELSE
1102
1103	!
1104	!-- Cache optimized code (there might be still a potential for better
1105	!-- optimization).
1106	!$OMP PARALLEL PRIVATE (i,j,k)
1107	!$OMP DO
1108	DO i = 0, nx
1109
1110	DO j = nys, nyn
1111	DO k = 1, nz
1112	work_fftx(i,k,j) = f_in(k,j,i)
1113	ENDDO
1114	ENDDO
1115
1116	ENDDO
1117
1118	!$OMP DO
1119	DO j = nys, nyn
1120	DO k = 1, nz
1121
1122	CALL fft_x( work_fftx(0:nx,k,j), 'forward' )
1123
1124	DO i = 0, nx
1125	work(j,k,i) = work_fftx(i,k,j)
1126	ENDDO
1127	ENDDO
1128
1129	ENDDO
1130	!$OMP END PARALLEL
1131
1132	ENDIF
1133	CALL cpu_log( log_point_s(4), 'fft_x', 'pause' )
1134
1135	!
1136	!-- Transpose array
1137	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
1138	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
1139	CALL MPI_ALLTOALL( work(nys,1,0), sendrecvcount_xy, MPI_REAL, &
1140	f_out(1,1,nxl_y,1), sendrecvcount_xy, MPI_REAL, &
1141	comm1dy, ierr )
1142	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
1143
1144	END SUBROUTINE fftx_tr_xy
1145
1146
1147	SUBROUTINE tr_yx_fftx( f_in, work, f_out )
1148
1149	!------------------------------------------------------------------------------!
1150	! Transposition y --> x with a subsequent backward Fourier transformation for
1151	! a 1d-decomposition along x
1152	!------------------------------------------------------------------------------!
1153
1154	USE control_parameters
1155	USE cpulog
1156	USE indices
1157	USE interfaces
1158	USE pegrid
1159	USE transpose_indices
1160
1161	IMPLICIT NONE
1162
1163	INTEGER :: i, j, k
1164
1165	REAL, DIMENSION(0:nx,1:nz,nys:nyn) :: work_fftx
1166	REAL, DIMENSION(nny,1:nza,nxl_y:nxr_ya,pdims(2)) :: f_in
1167	REAL, DIMENSION(1:nza,nys:nyna,0:nxa) :: f_out
1168	REAL, DIMENSION(nys:nyna,1:nza,0:nxa) :: work
1169
1170	!
1171	!-- Transpose array
1172	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
1173	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
1174	CALL MPI_ALLTOALL( f_in(1,1,nxl_y,1), sendrecvcount_xy, MPI_REAL, &
1175	work(nys,1,0), sendrecvcount_xy, MPI_REAL, &
1176	comm1dy, ierr )
1177	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
1178
1179	!
1180	!-- Carry out the FFT along x, where all data are present due to the
1181	!-- 1d-decomposition along y. Resort the data in a way that y becomes
1182	!-- the first index.
1183	CALL cpu_log( log_point_s(4), 'fft_x', 'continue' )
1184
1185	IF ( host(1:3) == 'nec' ) THEN
1186	!
1187	!-- Code optimized for vector processors
1188	!$OMP PARALLEL PRIVATE ( i, j, k )
1189	!$OMP DO
1190	DO j = nys, nyn
1191
1192	DO k = 1, nz
1193	DO i = 0, nx
1194	work_fftx(i,k,j) = work(j,k,i)
1195	ENDDO
1196	ENDDO
1197
1198	CALL fft_x_m( work_fftx(:,:,j), 'backward' )
1199
1200	ENDDO
1201
1202	!$OMP DO
1203	DO i = 0, nx
1204	DO j = nys, nyn
1205	DO k = 1, nz
1206	f_out(k,j,i) = work_fftx(i,k,j)
1207	ENDDO
1208	ENDDO
1209	ENDDO
1210	!$OMP END PARALLEL
1211
1212	ELSE
1213
1214	!
1215	!-- Cache optimized code (there might be still a potential for better
1216	!-- optimization).
1217	!$OMP PARALLEL PRIVATE (i,j,k)
1218	!$OMP DO
1219	DO j = nys, nyn
1220	DO k = 1, nz
1221
1222	DO i = 0, nx
1223	work_fftx(i,k,j) = work(j,k,i)
1224	ENDDO
1225
1226	CALL fft_x( work_fftx(0:nx,k,j), 'backward' )
1227
1228	ENDDO
1229	ENDDO
1230
1231	!$OMP DO
1232	DO i = 0, nx
1233	DO j = nys, nyn
1234	DO k = 1, nz
1235	f_out(k,j,i) = work_fftx(i,k,j)
1236	ENDDO
1237	ENDDO
1238	ENDDO
1239	!$OMP END PARALLEL
1240
1241	ENDIF
1242	CALL cpu_log( log_point_s(4), 'fft_x', 'stop' )
1243
1244	END SUBROUTINE tr_yx_fftx
1245
1246
1247	SUBROUTINE ffty_tri_ffty( ar )
1248
1249	!------------------------------------------------------------------------------!
1250	! FFT along y, solution of the tridiagonal system and backward FFT for
1251	! a 1d-decomposition along y
1252	!
1253	! WARNING: this subroutine may still not work for hybrid parallelization
1254	! with OpenMP (for possible necessary changes see the original
1255	! routine poisfft_hybrid, developed by Klaus Ketelsen, May 2002)
1256	!------------------------------------------------------------------------------!
1257
1258	USE control_parameters
1259	USE cpulog
1260	USE grid_variables
1261	USE indices
1262	USE interfaces
1263	USE pegrid
1264	USE transpose_indices
1265
1266	IMPLICIT NONE
1267
1268	INTEGER :: i, j, k, m, n, omp_get_thread_num, tn
1269
1270	REAL, DIMENSION(0:ny) :: work_ffty
1271	REAL, DIMENSION(0:ny,1:nz) :: work_triy
1272	REAL, DIMENSION(nny,1:nza,nxl_y:nxr_ya,pdims(2)) :: ar
1273	REAL, DIMENSION(:,:,:,:), ALLOCATABLE :: tri
1274
1275
1276	CALL cpu_log( log_point_s(39), 'fft_y + tridia', 'start' )
1277
1278	ALLOCATE( tri(5,0:ny,0:nz-1,0:threads_per_task-1) )
1279
1280	tn = 0 ! Default thread number in case of one thread
1281	!$OMP PARALLEL DO PRIVATE ( i, j, k, m, n, tn, work_ffty, work_triy )
1282	DO i = nxl_y, nxr_y
1283
1284	!$ tn = omp_get_thread_num()
1285
1286	IF ( host(1:3) == 'nec' ) THEN
1287	!
1288	!-- Code optimized for vector processors
1289	DO k = 1, nz
1290
1291	m = 0
1292	DO n = 1, pdims(2)
1293	DO j = 1, nny_pe( n-1 ) ! WARN: pcoord(j) should be used!!
1294	work_triy(m,k) = ar(j,k,i,n)
1295	m = m + 1
1296	ENDDO
1297	ENDDO
1298
1299	ENDDO
1300
1301	CALL fft_y_m( work_triy, ny, 'forward' )
1302
1303	ELSE
1304	!
1305	!-- Cache optimized code
1306	DO k = 1, nz
1307
1308	m = 0
1309	DO n = 1, pdims(2)
1310	DO j = 1, nny_pe( n-1 ) ! WARN: pcoord(j) should be used!!
1311	work_ffty(m) = ar(j,k,i,n)
1312	m = m + 1
1313	ENDDO
1314	ENDDO
1315
1316	CALL fft_y( work_ffty, 'forward' )
1317
1318	DO j = 0, ny
1319	work_triy(j,k) = work_ffty(j)
1320	ENDDO
1321
1322	ENDDO
1323
1324	ENDIF
1325
1326	!
1327	!-- Solve the linear equation system
1328	CALL tridia_1dd( ddy2, ddx2, ny, nx, i, work_triy, tri(:,:,:,tn) )
1329
1330	IF ( host(1:3) == 'nec' ) THEN
1331	!
1332	!-- Code optimized for vector processors
1333	CALL fft_y_m( work_triy, ny, 'backward' )
1334
1335	DO k = 1, nz
1336
1337	m = 0
1338	DO n = 1, pdims(2)
1339	DO j = 1, nny_pe( n-1 ) ! WARN: pcoord(j) should be used!!
1340	ar(j,k,i,n) = work_triy(m,k)
1341	m = m + 1
1342	ENDDO
1343	ENDDO
1344
1345	ENDDO
1346
1347	ELSE
1348	!
1349	!-- Cache optimized code
1350	DO k = 1, nz
1351
1352	DO j = 0, ny
1353	work_ffty(j) = work_triy(j,k)
1354	ENDDO
1355
1356	CALL fft_y( work_ffty, 'backward' )
1357
1358	m = 0
1359	DO n = 1, pdims(2)
1360	DO j = 1, nny_pe( n-1 ) ! WARN: pcoord(j) should be used!!
1361	ar(j,k,i,n) = work_ffty(m)
1362	m = m + 1
1363	ENDDO
1364	ENDDO
1365
1366	ENDDO
1367
1368	ENDIF
1369
1370	ENDDO
1371
1372	DEALLOCATE( tri )
1373
1374	CALL cpu_log( log_point_s(39), 'fft_y + tridia', 'stop' )
1375
1376	END SUBROUTINE ffty_tri_ffty
1377
1378
1379	SUBROUTINE tridia_1dd( ddx2, ddy2, nx, ny, j, ar, tri )
1380
1381	!------------------------------------------------------------------------------!
1382	! Solves the linear system of equations for a 1d-decomposition along x (see
1383	! tridia)
1384	!
1385	! Attention: when using the intel compilers older than 12.0, array tri must
1386	! be passed as an argument to the contained subroutines. Otherwise
1387	! addres faults will occur. This feature can be activated with
1388	! cpp-switch __intel11
1389	! On NEC, tri should not be passed (except for routine substi_1dd)
1390	! because this causes very bad performance.
1391	!------------------------------------------------------------------------------!
1392
1393	USE arrays_3d
1394	USE control_parameters
1395
1396	USE pegrid
1397
1398	IMPLICIT NONE
1399
1400	INTEGER :: i, j, k, nnyh, nx, ny, omp_get_thread_num, tn
1401
1402	REAL :: ddx2, ddy2
1403
1404	REAL, DIMENSION(0:nx,1:nz) :: ar
1405	REAL, DIMENSION(0:nx,0:nz-1) :: ar1
1406	REAL, DIMENSION(5,0:nx,0:nz-1) :: tri
1407
1408
1409	nnyh = ( ny + 1 ) / 2
1410
1411	!
1412	!-- Define constant elements of the tridiagonal matrix.
1413	!-- The compiler on SX6 does loop exchange. If 0:nx is a high power of 2,
1414	!-- the exchanged loops create bank conflicts. The following directive
1415	!-- prohibits loop exchange and the loops perform much better.
1416	! tn = omp_get_thread_num()
1417	! WRITE( 120+tn, * ) '+++ id=',myid,' nx=',nx,' thread=', omp_get_thread_num()
1418	! CALL local_flush( 120+tn )
1419	!CDIR NOLOOPCHG
1420	DO k = 0, nz-1
1421	DO i = 0,nx
1422	tri(2,i,k) = ddzu_pres(k+1) * ddzw(k+1)
1423	tri(3,i,k) = ddzu_pres(k+2) * ddzw(k+1)
1424	ENDDO
1425	ENDDO
1426	! WRITE( 120+tn, * ) '+++ id=',myid,' end of first tridia loop thread=', omp_get_thread_num()
1427	! CALL local_flush( 120+tn )
1428
1429	IF ( j <= nnyh ) THEN
1430	#if defined( __intel11 )
1431	CALL maketri_1dd( j, tri )
1432	#else
1433	CALL maketri_1dd( j )
1434	#endif
1435	ELSE
1436	#if defined( __intel11 )
1437	CALL maketri_1dd( ny+1-j, tri )
1438	#else
1439	CALL maketri_1dd( ny+1-j )
1440	#endif
1441	ENDIF
1442	#if defined( __intel11 )
1443	CALL split_1dd( tri )
1444	#else
1445	CALL split_1dd
1446	#endif
1447	CALL substi_1dd( ar, tri )
1448
1449	CONTAINS
1450
1451	#if defined( __intel11 )
1452	SUBROUTINE maketri_1dd( j, tri )
1453	#else
1454	SUBROUTINE maketri_1dd( j )
1455	#endif
1456
1457	!------------------------------------------------------------------------------!
1458	! computes the i- and j-dependent component of the matrix
1459	!------------------------------------------------------------------------------!
1460
1461	USE constants
1462
1463	IMPLICIT NONE
1464
1465	INTEGER :: i, j, k, nnxh
1466	REAL :: a, c
1467
1468	REAL, DIMENSION(0:nx) :: l
1469
1470	#if defined( __intel11 )
1471	REAL, DIMENSION(5,0:nx,0:nz-1) :: tri
1472	#endif
1473
1474
1475	nnxh = ( nx + 1 ) / 2
1476	!
1477	!-- Provide the tridiagonal matrix for solution of the Poisson equation in
1478	!-- Fourier space. The coefficients are computed following the method of
1479	!-- Schmidt et al. (DFVLR-Mitteilung 84-15), which departs from Stephan
1480	!-- Siano's original version by discretizing the Poisson equation,
1481	!-- before it is Fourier-transformed
1482	DO i = 0, nx
1483	IF ( i >= 0 .AND. i <= nnxh ) THEN
1484	l(i) = 2.0 * ( 1.0 - COS( ( 2.0 * pi * i ) / &
1485	FLOAT( nx+1 ) ) ) * ddx2 + &
1486	2.0 * ( 1.0 - COS( ( 2.0 * pi * j ) / &
1487	FLOAT( ny+1 ) ) ) * ddy2
1488	ELSE
1489	l(i) = 2.0 * ( 1.0 - COS( ( 2.0 * pi * ( nx+1-i ) ) / &
1490	FLOAT( nx+1 ) ) ) * ddx2 + &
1491	2.0 * ( 1.0 - COS( ( 2.0 * pi * j ) / &
1492	FLOAT( ny+1 ) ) ) * ddy2
1493	ENDIF
1494	ENDDO
1495
1496	DO k = 0, nz-1
1497	DO i = 0, nx
1498	a = -1.0 * ddzu_pres(k+2) * ddzw(k+1)
1499	c = -1.0 * ddzu_pres(k+1) * ddzw(k+1)
1500	tri(1,i,k) = a + c - l(i)
1501	ENDDO
1502	ENDDO
1503	IF ( ibc_p_b == 1 .OR. ibc_p_b == 2 ) THEN
1504	DO i = 0, nx
1505	tri(1,i,0) = tri(1,i,0) + tri(2,i,0)
1506	ENDDO
1507	ENDIF
1508	IF ( ibc_p_t == 1 ) THEN
1509	DO i = 0, nx
1510	tri(1,i,nz-1) = tri(1,i,nz-1) + tri(3,i,nz-1)
1511	ENDDO
1512	ENDIF
1513
1514	END SUBROUTINE maketri_1dd
1515
1516
1517	#if defined( __intel11 )
1518	SUBROUTINE split_1dd( tri )
1519	#else
1520	SUBROUTINE split_1dd
1521	#endif
1522
1523	!------------------------------------------------------------------------------!
1524	! Splitting of the tridiagonal matrix (Thomas algorithm)
1525	!------------------------------------------------------------------------------!
1526
1527	IMPLICIT NONE
1528
1529	INTEGER :: i, k
1530
1531	#if defined( __intel11 )
1532	REAL, DIMENSION(5,0:nx,0:nz-1) :: tri
1533	#endif
1534
1535
1536	!
1537	!-- Splitting
1538	DO i = 0, nx
1539	tri(4,i,0) = tri(1,i,0)
1540	ENDDO
1541	DO k = 1, nz-1
1542	DO i = 0, nx
1543	tri(5,i,k) = tri(2,i,k) / tri(4,i,k-1)
1544	tri(4,i,k) = tri(1,i,k) - tri(3,i,k-1) * tri(5,i,k)
1545	ENDDO
1546	ENDDO
1547
1548	END SUBROUTINE split_1dd
1549
1550
1551	SUBROUTINE substi_1dd( ar, tri )
1552
1553	!------------------------------------------------------------------------------!
1554	! Substitution (Forward and Backward) (Thomas algorithm)
1555	!------------------------------------------------------------------------------!
1556
1557	IMPLICIT NONE
1558
1559	INTEGER :: i, k
1560
1561	REAL, DIMENSION(0:nx,nz) :: ar
1562	REAL, DIMENSION(0:nx,0:nz-1) :: ar1
1563	REAL, DIMENSION(5,0:nx,0:nz-1) :: tri
1564
1565	!
1566	!-- Forward substitution
1567	DO i = 0, nx
1568	ar1(i,0) = ar(i,1)
1569	ENDDO
1570	DO k = 1, nz-1
1571	DO i = 0, nx
1572	ar1(i,k) = ar(i,k+1) - tri(5,i,k) * ar1(i,k-1)
1573	ENDDO
1574	ENDDO
1575
1576	!
1577	!-- Backward substitution
1578	!-- Note, the add of 1.0E-20 in the denominator is due to avoid divisions
1579	!-- by zero appearing if the pressure bc is set to neumann at the top of
1580	!-- the model domain.
1581	DO i = 0, nx
1582	ar(i,nz) = ar1(i,nz-1) / ( tri(4,i,nz-1) + 1.0E-20 )
1583	ENDDO
1584	DO k = nz-2, 0, -1
1585	DO i = 0, nx
1586	ar(i,k+1) = ( ar1(i,k) - tri(3,i,k) * ar(i,k+2) ) &
1587	/ tri(4,i,k)
1588	ENDDO
1589	ENDDO
1590
1591	!
1592	!-- Indices i=0, j=0 correspond to horizontally averaged pressure.
1593	!-- The respective values of ar should be zero at all k-levels if
1594	!-- acceleration of horizontally averaged vertical velocity is zero.
1595	IF ( ibc_p_b == 1 .AND. ibc_p_t == 1 ) THEN
1596	IF ( j == 0 ) THEN
1597	DO k = 1, nz
1598	ar(0,k) = 0.0
1599	ENDDO
1600	ENDIF
1601	ENDIF
1602
1603	END SUBROUTINE substi_1dd
1604
1605	END SUBROUTINE tridia_1dd
1606
1607	#endif
1608	#endif
1609	END MODULE poisfft_mod

Note: See TracBrowser for help on using the repository browser.

Download in other formats:

| Impressum | ©Leibniz Universität Hannover |