Home

Context Navigation

source: palm/trunk/SOURCE/poisfft.f90 @ 1017

Last change on this file since 1017 was 1014, checked in by raasch, 12 years ago
last commit documented
Property svn:keywords set to `Id`
File size: 46.9 KB

Line
1	MODULE poisfft_mod
2
3	!------------------------------------------------------------------------------!
4	! Current revisions:
5	! -----------------
6	!
7	!
8	! Former revisions:
9	! -----------------
10	! $Id: poisfft.f90 1014 2012-09-21 07:09:03Z raasch $
11	!
12	! 2012-09-21 07:03:55Z raasch
13	! FLOAT type conversion replaced by REAL
14	!
15	! 1003 2012-09-14 14:35:53Z raasch
16	! indices nxa, nya, etc. replaced by nx, ny, etc.
17	!
18	! 940 2012-07-09 14:31:00Z raasch
19	! special handling of tri-array as an argument in tridia_1dd routines switched
20	! off because it caused segmentation faults with intel 12.1 compiler
21	!
22	! 877 2012-04-03 11:21:44Z suehring
23	! Bugfix: Avoid divisions by zero in case of using a 'neumann' bc for the
24	! pressure at the top of the model domain.
25	!
26	! 809 2012-01-30 13:32:58Z maronga
27	! Bugfix: replaced .AND. and .NOT. with && and ! in the preprocessor directives
28	!
29	! 807 2012-01-25 11:53:51Z maronga
30	! New cpp directive "__check" implemented which is used by check_namelist_files
31	! (most of the code is unneeded by check_namelist_files).
32	!
33	! 763 2011-10-06 09:32:09Z suehring
34	! Comment added concerning the last change.
35	!
36	! 761 2011-10-05 17:58:52Z suehring
37	! Bugfix: Avoid divisions by zero in case of using a 'neumann' bc for the
38	! pressure at the top of the model domain.
39	!
40	! 696 2011-03-18 07:03:49Z raasch
41	! work_fftx removed from PRIVATE clauses in fftx_tr_xy and tr_yx_fftx
42	!
43	! 683 2011-02-09 14:25:15Z raasch
44	! openMP parallelization for 2d-domain-decomposition
45	!
46	! 667 2010-12-23 12:06:00Z suehring/gryschka
47	! ddzu replaced by ddzu_pres due to changes in zu(0)
48	!
49	! 622 2010-12-10 08:08:13Z raasch
50	! optional barriers included in order to speed up collective operations
51	!
52	! 377 2009-09-04 11:09:00Z raasch
53	! __lcmuk changed to __lc to avoid problems with Intel compiler on sgi-ice
54	!
55	! 164 2008-05-15 08:46:15Z raasch
56	! Arguments removed from transpose routines
57	!
58	! 128 2007-10-26 13:11:14Z raasch
59	! Bugfix: wavenumber calculation for even nx in routines maketri
60	!
61	! 85 2007-05-11 09:35:14Z raasch
62	! Bugfix: work_fft*_vec removed from some PRIVATE-declarations
63	!
64	! 76 2007-03-29 00:58:32Z raasch
65	! Tridiagonal coefficients adjusted for Neumann boundary conditions both at
66	! the bottom and the top.
67	!
68	! RCS Log replace by Id keyword, revision history cleaned up
69	!
70	! Revision 1.24 2006/08/04 15:00:24 raasch
71	! Default setting of the thread number tn in case of not using OpenMP
72	!
73	! Revision 1.23 2006/02/23 12:48:38 raasch
74	! Additional compiler directive in routine tridia_1dd for preventing loop
75	! exchange on NEC-SX6
76	!
77	! Revision 1.20 2004/04/30 12:38:09 raasch
78	! Parts of former poisfft_hybrid moved to this subroutine,
79	! former subroutine changed to a module, renaming of FFT-subroutines and
80	! -module, FFTs completely substituted by calls of fft_x and fft_y,
81	! NAG fft used in the non-parallel case completely removed, l in maketri
82	! is now a 1d-array, variables passed by modules instead of using parameter
83	! lists, enlarged transposition arrays introduced
84	!
85	! Revision 1.1 1997/07/24 11:24:14 raasch
86	! Initial revision
87	!
88	!
89	! Description:
90	! ------------
91	! See below.
92	!------------------------------------------------------------------------------!
93
94	!--------------------------------------------------------------------------!
95	! poisfft !
96	! !
97	! Original version: Stephan Siano (pois3d) !
98	! !
99	! Institute of Meteorology and Climatology, University of Hannover !
100	! Germany !
101	! !
102	! Version as of July 23,1996 !
103	! !
104	! !
105	! Version for parallel computers: Siegfried Raasch !
106	! !
107	! Version as of July 03,1997 !
108	! !
109	! Solves the Poisson equation with a 2D spectral method !
110	! d^2 p / dx^2 + d^2 p / dy^2 + d^2 p / dz^2 = s !
111	! !
112	! Input: !
113	! real ar contains in the (nnx,nny,nnz) elements, !
114	! starting from the element (1,nys,nxl), the !
115	! values for s !
116	! real work Temporary array !
117	! !
118	! Output: !
119	! real ar contains the solution for p !
120	!--------------------------------------------------------------------------!
121
122	USE fft_xy
123	USE indices
124	USE transpose_indices
125
126	IMPLICIT NONE
127
128	PRIVATE
129
130	#if ! defined ( __check )
131	PUBLIC poisfft, poisfft_init
132
133	INTERFACE poisfft
134	MODULE PROCEDURE poisfft
135	END INTERFACE poisfft
136
137	INTERFACE poisfft_init
138	MODULE PROCEDURE poisfft_init
139	END INTERFACE poisfft_init
140	#else
141	PUBLIC poisfft_init
142
143	INTERFACE poisfft_init
144	MODULE PROCEDURE poisfft_init
145	END INTERFACE poisfft_init
146	#endif
147
148	CONTAINS
149
150	SUBROUTINE poisfft_init
151
152	CALL fft_init
153
154	END SUBROUTINE poisfft_init
155
156	#if ! defined ( __check )
157	SUBROUTINE poisfft( ar, work )
158
159	USE cpulog
160	USE interfaces
161	USE pegrid
162
163	IMPLICIT NONE
164
165	REAL, DIMENSION(1:nz,nys:nyn,nxl:nxr) :: ar, work
166
167
168	CALL cpu_log( log_point_s(3), 'poisfft', 'start' )
169
170	!
171	!-- Two-dimensional Fourier Transformation in x- and y-direction.
172	#if defined( __parallel )
173	IF ( pdims(2) == 1 ) THEN
174
175	!
176	!-- 1d-domain-decomposition along x:
177	!-- FFT along y and transposition y --> x
178	CALL ffty_tr_yx( ar, work, ar )
179
180	!
181	!-- FFT along x, solving the tridiagonal system and backward FFT
182	CALL fftx_tri_fftx( ar )
183
184	!
185	!-- Transposition x --> y and backward FFT along y
186	CALL tr_xy_ffty( ar, work, ar )
187
188	ELSEIF ( pdims(1) == 1 ) THEN
189
190	!
191	!-- 1d-domain-decomposition along y:
192	!-- FFT along x and transposition x --> y
193	CALL fftx_tr_xy( ar, work, ar )
194
195	!
196	!-- FFT along y, solving the tridiagonal system and backward FFT
197	CALL ffty_tri_ffty( ar )
198
199	!
200	!-- Transposition y --> x and backward FFT along x
201	CALL tr_yx_fftx( ar, work, ar )
202
203	ELSE
204
205	!
206	!-- 2d-domain-decomposition
207	!-- Transposition z --> x
208	CALL cpu_log( log_point_s(5), 'transpo forward', 'start' )
209	CALL transpose_zx( ar, work, ar )
210	CALL cpu_log( log_point_s(5), 'transpo forward', 'pause' )
211
212	CALL cpu_log( log_point_s(4), 'fft_x', 'start' )
213	CALL fftxp( ar, 'forward' )
214	CALL cpu_log( log_point_s(4), 'fft_x', 'pause' )
215
216	!
217	!-- Transposition x --> y
218	CALL cpu_log( log_point_s(5), 'transpo forward', 'continue' )
219	CALL transpose_xy( ar, work, ar )
220	CALL cpu_log( log_point_s(5), 'transpo forward', 'pause' )
221
222	CALL cpu_log( log_point_s(7), 'fft_y', 'start' )
223	CALL fftyp( ar, 'forward' )
224	CALL cpu_log( log_point_s(7), 'fft_y', 'pause' )
225
226	!
227	!-- Transposition y --> z
228	CALL cpu_log( log_point_s(5), 'transpo forward', 'continue' )
229	CALL transpose_yz( ar, work, ar )
230	CALL cpu_log( log_point_s(5), 'transpo forward', 'stop' )
231
232	!
233	!-- Solve the Poisson equation in z-direction in cartesian space.
234	CALL cpu_log( log_point_s(6), 'tridia', 'start' )
235	CALL tridia( ar )
236	CALL cpu_log( log_point_s(6), 'tridia', 'stop' )
237
238	!
239	!-- Inverse Fourier Transformation
240	!-- Transposition z --> y
241	CALL cpu_log( log_point_s(8), 'transpo invers', 'start' )
242	CALL transpose_zy( ar, work, ar )
243	CALL cpu_log( log_point_s(8), 'transpo invers', 'pause' )
244
245	CALL cpu_log( log_point_s(7), 'fft_y', 'continue' )
246	CALL fftyp( ar, 'backward' )
247	CALL cpu_log( log_point_s(7), 'fft_y', 'stop' )
248
249	!
250	!-- Transposition y --> x
251	CALL cpu_log( log_point_s(8), 'transpo invers', 'continue' )
252	CALL transpose_yx( ar, work, ar )
253	CALL cpu_log( log_point_s(8), 'transpo invers', 'pause' )
254
255	CALL cpu_log( log_point_s(4), 'fft_x', 'continue' )
256	CALL fftxp( ar, 'backward' )
257	CALL cpu_log( log_point_s(4), 'fft_x', 'stop' )
258
259	!
260	!-- Transposition x --> z
261	CALL cpu_log( log_point_s(8), 'transpo invers', 'continue' )
262	CALL transpose_xz( ar, work, ar )
263	CALL cpu_log( log_point_s(8), 'transpo invers', 'stop' )
264
265	ENDIF
266
267	#else
268
269	!
270	!-- Two-dimensional Fourier Transformation along x- and y-direction.
271	CALL cpu_log( log_point_s(4), 'fft_x', 'start' )
272	CALL fftx( ar, 'forward' )
273	CALL cpu_log( log_point_s(4), 'fft_x', 'pause' )
274	CALL cpu_log( log_point_s(7), 'fft_y', 'start' )
275	CALL ffty( ar, 'forward' )
276	CALL cpu_log( log_point_s(7), 'fft_y', 'pause' )
277
278	!
279	!-- Solve the Poisson equation in z-direction in cartesian space.
280	CALL cpu_log( log_point_s(6), 'tridia', 'start' )
281	CALL tridia( ar )
282	CALL cpu_log( log_point_s(6), 'tridia', 'stop' )
283
284	!
285	!-- Inverse Fourier Transformation.
286	CALL cpu_log( log_point_s(7), 'fft_y', 'continue' )
287	CALL ffty( ar, 'backward' )
288	CALL cpu_log( log_point_s(7), 'fft_y', 'stop' )
289	CALL cpu_log( log_point_s(4), 'fft_x', 'continue' )
290	CALL fftx( ar, 'backward' )
291	CALL cpu_log( log_point_s(4), 'fft_x', 'stop' )
292
293	#endif
294
295	CALL cpu_log( log_point_s(3), 'poisfft', 'stop' )
296
297	END SUBROUTINE poisfft
298
299
300
301	SUBROUTINE tridia( ar )
302
303	!------------------------------------------------------------------------------!
304	! solves the linear system of equations:
305	!
306	! -(4 pi^2(i^2/(dx^2nnx^2)+j^2/(dy^2nny^2))+
307	! 1/(dzu(k)dzw(k))+1/(dzu(k-1)dzw(k)))*p(i,j,k)+
308	! 1/(dzu(k)dzw(k))p(i,j,k+1)+1/(dzu(k-1)dzw(k))p(i,j,k-1)=d(i,j,k)
309	!
310	! by using the Thomas algorithm
311	!------------------------------------------------------------------------------!
312
313	USE arrays_3d
314
315	IMPLICIT NONE
316
317	INTEGER :: i, j, k, nnyh
318
319	REAL, DIMENSION(nxl_z:nxr_z,0:nz-1) :: ar1
320	REAL, DIMENSION(5,nxl_z:nxr_z,0:nz-1) :: tri
321
322	#if defined( __parallel )
323	REAL :: ar(nxl_z:nxr_z,nys_z:nyn_z,1:nz)
324	#else
325	REAL :: ar(1:nz,nys_z:nyn_z,nxl_z:nxr_z)
326	#endif
327
328
329	nnyh = (ny+1) / 2
330
331	!
332	!-- Define constant elements of the tridiagonal matrix.
333	!$OMP PARALLEL PRIVATE ( k, i )
334	!$OMP DO
335	DO k = 0, nz-1
336	DO i = nxl_z, nxr_z
337	tri(2,i,k) = ddzu_pres(k+1) * ddzw(k+1)
338	tri(3,i,k) = ddzu_pres(k+2) * ddzw(k+1)
339	ENDDO
340	ENDDO
341	!$OMP END PARALLEL
342
343	#if defined( __parallel )
344	!
345	!-- Repeat for all y-levels.
346	!$OMP PARALLEL FIRSTPRIVATE( tri ) PRIVATE ( ar1, j )
347	!$OMP DO
348	DO j = nys_z, nyn_z
349	IF ( j <= nnyh ) THEN
350	CALL maketri( tri, j )
351	ELSE
352	CALL maketri( tri, ny+1-j )
353	ENDIF
354	CALL split( tri )
355	CALL substi( ar, ar1, tri, j )
356	ENDDO
357	!$OMP END PARALLEL
358	#else
359	!
360	!-- First y-level.
361	CALL maketri( tri, nys_z )
362	CALL split( tri )
363	CALL substi( ar, ar1, tri, 0 )
364
365	!
366	!-- Further y-levels.
367	DO j = 1, nnyh - 1
368	CALL maketri( tri, j )
369	CALL split( tri )
370	CALL substi( ar, ar1, tri, j )
371	CALL substi( ar, ar1, tri, ny+1-j )
372	ENDDO
373	CALL maketri( tri, nnyh )
374	CALL split( tri )
375	CALL substi( ar, ar1, tri, nnyh+nys )
376	#endif
377
378	CONTAINS
379
380	SUBROUTINE maketri( tri, j )
381
382	!------------------------------------------------------------------------------!
383	! Computes the i- and j-dependent component of the matrix
384	!------------------------------------------------------------------------------!
385
386	USE arrays_3d
387	USE constants
388	USE control_parameters
389	USE grid_variables
390
391	IMPLICIT NONE
392
393	INTEGER :: i, j, k, nnxh
394	REAL :: a, c
395	REAL :: ll(nxl_z:nxr_z)
396	REAL :: tri(5,nxl_z:nxr_z,0:nz-1)
397
398
399	nnxh = ( nx + 1 ) / 2
400
401	!
402	!-- Provide the tridiagonal matrix for solution of the Poisson equation in
403	!-- Fourier space. The coefficients are computed following the method of
404	!-- Schmidt et al. (DFVLR-Mitteilung 84-15), which departs from Stephan
405	!-- Siano's original version by discretizing the Poisson equation,
406	!-- before it is Fourier-transformed
407	#if defined( __parallel )
408	DO i = nxl_z, nxr_z
409	IF ( i >= 0 .AND. i <= nnxh ) THEN
410	ll(i) = 2.0 * ( 1.0 - COS( ( 2.0 * pi * i ) / &
411	REAL( nx+1 ) ) ) / ( dx * dx ) + &
412	2.0 * ( 1.0 - COS( ( 2.0 * pi * j ) / &
413	REAL( ny+1 ) ) ) / ( dy * dy )
414	ELSE
415	ll(i) = 2.0 * ( 1.0 - COS( ( 2.0 * pi * ( nx+1-i ) ) / &
416	REAL( nx+1 ) ) ) / ( dx * dx ) + &
417	2.0 * ( 1.0 - COS( ( 2.0 * pi * j ) / &
418	REAL( ny+1 ) ) ) / ( dy * dy )
419	ENDIF
420	DO k = 0,nz-1
421	a = -1.0 * ddzu_pres(k+2) * ddzw(k+1)
422	c = -1.0 * ddzu_pres(k+1) * ddzw(k+1)
423	tri(1,i,k) = a + c - ll(i)
424	ENDDO
425	ENDDO
426	#else
427	DO i = 0, nnxh
428	ll(i) = 2.0 * ( 1.0 - COS( ( 2.0 * pi * i ) / REAL( nx+1 ) ) ) / &
429	( dx * dx ) + &
430	2.0 * ( 1.0 - COS( ( 2.0 * pi * j ) / REAL( ny+1 ) ) ) / &
431	( dy * dy )
432	DO k = 0, nz-1
433	a = -1.0 * ddzu_pres(k+2) * ddzw(k+1)
434	c = -1.0 * ddzu_pres(k+1) * ddzw(k+1)
435	tri(1,i,k) = a + c - ll(i)
436	IF ( i >= 1 .and. i < nnxh ) THEN
437	tri(1,nx+1-i,k) = tri(1,i,k)
438	ENDIF
439	ENDDO
440	ENDDO
441	#endif
442	IF ( ibc_p_b == 1 .OR. ibc_p_b == 2 ) THEN
443	DO i = nxl_z, nxr_z
444	tri(1,i,0) = tri(1,i,0) + tri(2,i,0)
445	ENDDO
446	ENDIF
447	IF ( ibc_p_t == 1 ) THEN
448	DO i = nxl_z, nxr_z
449	tri(1,i,nz-1) = tri(1,i,nz-1) + tri(3,i,nz-1)
450	ENDDO
451	ENDIF
452
453	END SUBROUTINE maketri
454
455
456	SUBROUTINE substi( ar, ar1, tri, j )
457
458	!------------------------------------------------------------------------------!
459	! Substitution (Forward and Backward) (Thomas algorithm)
460	!------------------------------------------------------------------------------!
461
462	USE control_parameters
463
464	IMPLICIT NONE
465
466	INTEGER :: i, j, k
467	REAL :: ar1(nxl_z:nxr_z,0:nz-1)
468	REAL :: tri(5,nxl_z:nxr_z,0:nz-1)
469	#if defined( __parallel )
470	REAL :: ar(nxl_z:nxr_z,nys_z:nyn_z,1:nz)
471	#else
472	REAL :: ar(1:nz,nys_z:nyn_z,nxl_z:nxr_z)
473	#endif
474
475	!
476	!-- Forward substitution.
477	DO i = nxl_z, nxr_z
478	#if defined( __parallel )
479	ar1(i,0) = ar(i,j,1)
480	#else
481	ar1(i,0) = ar(1,j,i)
482	#endif
483	ENDDO
484	DO k = 1, nz - 1
485	DO i = nxl_z, nxr_z
486	#if defined( __parallel )
487	ar1(i,k) = ar(i,j,k+1) - tri(5,i,k) * ar1(i,k-1)
488	#else
489	ar1(i,k) = ar(k+1,j,i) - tri(5,i,k) * ar1(i,k-1)
490	#endif
491	ENDDO
492	ENDDO
493
494	!
495	!-- Backward substitution
496	!-- Note, the 1.0E-20 in the denominator is due to avoid divisions
497	!-- by zero appearing if the pressure bc is set to neumann at the top of
498	!-- the model domain.
499	DO i = nxl_z, nxr_z
500	#if defined( __parallel )
501	ar(i,j,nz) = ar1(i,nz-1) / ( tri(4,i,nz-1) + 1.0E-20 )
502	#else
503	ar(nz,j,i) = ar1(i,nz-1) / ( tri(4,i,nz-1) + 1.0E-20 )
504	#endif
505	ENDDO
506	DO k = nz-2, 0, -1
507	DO i = nxl_z, nxr_z
508	#if defined( __parallel )
509	ar(i,j,k+1) = ( ar1(i,k) - tri(3,i,k) * ar(i,j,k+2) ) &
510	/ tri(4,i,k)
511	#else
512	ar(k+1,j,i) = ( ar1(i,k) - tri(3,i,k) * ar(k+2,j,i) ) &
513	/ tri(4,i,k)
514	#endif
515	ENDDO
516	ENDDO
517
518	!
519	!-- Indices i=0, j=0 correspond to horizontally averaged pressure.
520	!-- The respective values of ar should be zero at all k-levels if
521	!-- acceleration of horizontally averaged vertical velocity is zero.
522	IF ( ibc_p_b == 1 .AND. ibc_p_t == 1 ) THEN
523	IF ( j == 0 .AND. nxl_z == 0 ) THEN
524	#if defined( __parallel )
525	DO k = 1, nz
526	ar(nxl_z,j,k) = 0.0
527	ENDDO
528	#else
529	DO k = 1, nz
530	ar(k,j,nxl_z) = 0.0
531	ENDDO
532	#endif
533	ENDIF
534	ENDIF
535
536	END SUBROUTINE substi
537
538
539	SUBROUTINE split( tri )
540
541	!------------------------------------------------------------------------------!
542	! Splitting of the tridiagonal matrix (Thomas algorithm)
543	!------------------------------------------------------------------------------!
544
545	IMPLICIT NONE
546
547	INTEGER :: i, k
548	REAL :: tri(5,nxl_z:nxr_z,0:nz-1)
549
550	!
551	!-- Splitting.
552	DO i = nxl_z, nxr_z
553	tri(4,i,0) = tri(1,i,0)
554	ENDDO
555	DO k = 1, nz-1
556	DO i = nxl_z, nxr_z
557	tri(5,i,k) = tri(2,i,k) / tri(4,i,k-1)
558	tri(4,i,k) = tri(1,i,k) - tri(3,i,k-1) * tri(5,i,k)
559	ENDDO
560	ENDDO
561
562	END SUBROUTINE split
563
564	END SUBROUTINE tridia
565
566
567	#if defined( __parallel )
568	SUBROUTINE fftxp( ar, direction )
569
570	!------------------------------------------------------------------------------!
571	! Fourier-transformation along x-direction Parallelized version
572	!------------------------------------------------------------------------------!
573
574	IMPLICIT NONE
575
576	CHARACTER (LEN=*) :: direction
577	INTEGER :: j, k
578	REAL :: ar(0:nx,nys_x:nyn_x,nzb_x:nzt_x)
579
580	!
581	!-- Performing the fft with one of the methods implemented
582	!$OMP PARALLEL PRIVATE ( j, k )
583	!$OMP DO
584	DO k = nzb_x, nzt_x
585	DO j = nys_x, nyn_x
586	CALL fft_x( ar(0:nx,j,k), direction )
587	ENDDO
588	ENDDO
589	!$OMP END PARALLEL
590
591	END SUBROUTINE fftxp
592
593	#else
594	SUBROUTINE fftx( ar, direction )
595
596	!------------------------------------------------------------------------------!
597	! Fourier-transformation along x-direction Non parallel version
598	!------------------------------------------------------------------------------!
599
600	IMPLICIT NONE
601
602	CHARACTER (LEN=*) :: direction
603	INTEGER :: i, j, k
604	REAL :: ar(1:nz,0:ny,0:nx)
605
606	!
607	!-- Performing the fft with one of the methods implemented
608	!$OMP PARALLEL PRIVATE ( j, k )
609	!$OMP DO
610	DO k = 1, nz
611	DO j = 0, ny
612	CALL fft_x( ar(k,j,0:nx), direction )
613	ENDDO
614	ENDDO
615	!$OMP END PARALLEL
616
617	END SUBROUTINE fftx
618	#endif
619
620
621	#if defined( __parallel )
622	SUBROUTINE fftyp( ar, direction )
623
624	!------------------------------------------------------------------------------!
625	! Fourier-transformation along y-direction Parallelized version
626	!------------------------------------------------------------------------------!
627
628	IMPLICIT NONE
629
630	CHARACTER (LEN=*) :: direction
631	INTEGER :: i, k
632	REAL :: ar(0:ny,nxl_y:nxr_y,nzb_y:nzt_y)
633
634	!
635	!-- Performing the fft with one of the methods implemented
636	!$OMP PARALLEL PRIVATE ( i, k )
637	!$OMP DO
638	DO k = nzb_y, nzt_y
639	DO i = nxl_y, nxr_y
640	CALL fft_y( ar(0:ny,i,k), direction )
641	ENDDO
642	ENDDO
643	!$OMP END PARALLEL
644
645	END SUBROUTINE fftyp
646
647	#else
648	SUBROUTINE ffty( ar, direction )
649
650	!------------------------------------------------------------------------------!
651	! Fourier-transformation along y-direction Non parallel version
652	!------------------------------------------------------------------------------!
653
654	IMPLICIT NONE
655
656	CHARACTER (LEN=*) :: direction
657	INTEGER :: i, k
658	REAL :: ar(1:nz,0:ny,0:nx)
659
660	!
661	!-- Performing the fft with one of the methods implemented
662	!$OMP PARALLEL PRIVATE ( i, k )
663	!$OMP DO
664	DO k = 1, nz
665	DO i = 0, nx
666	CALL fft_y( ar(k,0:ny,i), direction )
667	ENDDO
668	ENDDO
669	!$OMP END PARALLEL
670
671	END SUBROUTINE ffty
672	#endif
673
674	#if defined( __parallel )
675	SUBROUTINE ffty_tr_yx( f_in, work, f_out )
676
677	!------------------------------------------------------------------------------!
678	! Fourier-transformation along y with subsequent transposition y --> x for
679	! a 1d-decomposition along x
680	!
681	! ATTENTION: The performance of this routine is much faster on the NEC-SX6,
682	! if the first index of work_ffty_vec is odd. Otherwise
683	! memory bank conflicts may occur (especially if the index is a
684	! multiple of 128). That's why work_ffty_vec is dimensioned as
685	! 0:ny+1.
686	! Of course, this will not work if users are using an odd number
687	! of gridpoints along y.
688	!------------------------------------------------------------------------------!
689
690	USE control_parameters
691	USE cpulog
692	USE indices
693	USE interfaces
694	USE pegrid
695	USE transpose_indices
696
697	IMPLICIT NONE
698
699	INTEGER :: i, iend, iouter, ir, j, k
700	INTEGER, PARAMETER :: stridex = 4
701
702	REAL, DIMENSION(0:ny,stridex) :: work_ffty
703	#if defined( __nec )
704	REAL, DIMENSION(0:ny+1,1:nz,nxl:nxr) :: work_ffty_vec
705	#endif
706	REAL, DIMENSION(1:nz,0:ny,nxl:nxr) :: f_in
707	REAL, DIMENSION(nnx,1:nz,nys_x:nyn_x,pdims(1)) :: f_out
708	REAL, DIMENSION(nxl:nxr,1:nz,0:ny) :: work
709
710	!
711	!-- Carry out the FFT along y, where all data are present due to the
712	!-- 1d-decomposition along x. Resort the data in a way that x becomes
713	!-- the first index.
714	CALL cpu_log( log_point_s(7), 'fft_y', 'start' )
715
716	IF ( host(1:3) == 'nec' ) THEN
717	#if defined( __nec )
718	!
719	!-- Code optimized for vector processors
720	!$OMP PARALLEL PRIVATE ( i, j, k )
721	!$OMP DO
722	DO i = nxl, nxr
723
724	DO j = 0, ny
725	DO k = 1, nz
726	work_ffty_vec(j,k,i) = f_in(k,j,i)
727	ENDDO
728	ENDDO
729
730	CALL fft_y_m( work_ffty_vec(:,:,i), ny+1, 'forward' )
731
732	ENDDO
733
734	!$OMP DO
735	DO k = 1, nz
736	DO j = 0, ny
737	DO i = nxl, nxr
738	work(i,k,j) = work_ffty_vec(j,k,i)
739	ENDDO
740	ENDDO
741	ENDDO
742	!$OMP END PARALLEL
743	#endif
744
745	ELSE
746
747	!
748	!-- Cache optimized code.
749	!-- The i-(x-)direction is split into a strided outer loop and an inner
750	!-- loop for better cache performance
751	!$OMP PARALLEL PRIVATE (i,iend,iouter,ir,j,k,work_ffty)
752	!$OMP DO
753	DO iouter = nxl, nxr, stridex
754
755	iend = MIN( iouter+stridex-1, nxr ) ! Upper bound for inner i loop
756
757	DO k = 1, nz
758
759	DO i = iouter, iend
760
761	ir = i-iouter+1 ! counter within a stride
762	DO j = 0, ny
763	work_ffty(j,ir) = f_in(k,j,i)
764	ENDDO
765	!
766	!-- FFT along y
767	CALL fft_y( work_ffty(:,ir), 'forward' )
768
769	ENDDO
770
771	!
772	!-- Resort
773	DO j = 0, ny
774	DO i = iouter, iend
775	work(i,k,j) = work_ffty(j,i-iouter+1)
776	ENDDO
777	ENDDO
778
779	ENDDO
780
781	ENDDO
782	!$OMP END PARALLEL
783
784	ENDIF
785	CALL cpu_log( log_point_s(7), 'fft_y', 'pause' )
786
787	!
788	!-- Transpose array
789	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
790	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
791	CALL MPI_ALLTOALL( work(nxl,1,0), sendrecvcount_xy, MPI_REAL, &
792	f_out(1,1,nys_x,1), sendrecvcount_xy, MPI_REAL, &
793	comm1dx, ierr )
794	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
795
796	END SUBROUTINE ffty_tr_yx
797
798
799	SUBROUTINE tr_xy_ffty( f_in, work, f_out )
800
801	!------------------------------------------------------------------------------!
802	! Transposition x --> y with a subsequent backward Fourier transformation for
803	! a 1d-decomposition along x
804	!------------------------------------------------------------------------------!
805
806	USE control_parameters
807	USE cpulog
808	USE indices
809	USE interfaces
810	USE pegrid
811	USE transpose_indices
812
813	IMPLICIT NONE
814
815	INTEGER :: i, iend, iouter, ir, j, k
816	INTEGER, PARAMETER :: stridex = 4
817
818	REAL, DIMENSION(0:ny,stridex) :: work_ffty
819	#if defined( __nec )
820	REAL, DIMENSION(0:ny+1,1:nz,nxl:nxr) :: work_ffty_vec
821	#endif
822	REAL, DIMENSION(nnx,1:nz,nys_x:nyn_x,pdims(1)) :: f_in
823	REAL, DIMENSION(1:nz,0:ny,nxl:nxr) :: f_out
824	REAL, DIMENSION(nxl:nxr,1:nz,0:ny) :: work
825
826	!
827	!-- Transpose array
828	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
829	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
830	CALL MPI_ALLTOALL( f_in(1,1,nys_x,1), sendrecvcount_xy, MPI_REAL, &
831	work(nxl,1,0), sendrecvcount_xy, MPI_REAL, &
832	comm1dx, ierr )
833	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
834
835	!
836	!-- Resort the data in a way that y becomes the first index and carry out the
837	!-- backward fft along y.
838	CALL cpu_log( log_point_s(7), 'fft_y', 'continue' )
839
840	IF ( host(1:3) == 'nec' ) THEN
841	#if defined( __nec )
842	!
843	!-- Code optimized for vector processors
844	!$OMP PARALLEL PRIVATE ( i, j, k )
845	!$OMP DO
846	DO k = 1, nz
847	DO j = 0, ny
848	DO i = nxl, nxr
849	work_ffty_vec(j,k,i) = work(i,k,j)
850	ENDDO
851	ENDDO
852	ENDDO
853
854	!$OMP DO
855	DO i = nxl, nxr
856
857	CALL fft_y_m( work_ffty_vec(:,:,i), ny+1, 'backward' )
858
859	DO j = 0, ny
860	DO k = 1, nz
861	f_out(k,j,i) = work_ffty_vec(j,k,i)
862	ENDDO
863	ENDDO
864
865	ENDDO
866	!$OMP END PARALLEL
867	#endif
868
869	ELSE
870
871	!
872	!-- Cache optimized code.
873	!-- The i-(x-)direction is split into a strided outer loop and an inner
874	!-- loop for better cache performance
875	!$OMP PARALLEL PRIVATE ( i, iend, iouter, ir, j, k, work_ffty )
876	!$OMP DO
877	DO iouter = nxl, nxr, stridex
878
879	iend = MIN( iouter+stridex-1, nxr ) ! Upper bound for inner i loop
880
881	DO k = 1, nz
882	!
883	!-- Resort
884	DO j = 0, ny
885	DO i = iouter, iend
886	work_ffty(j,i-iouter+1) = work(i,k,j)
887	ENDDO
888	ENDDO
889
890	DO i = iouter, iend
891
892	!
893	!-- FFT along y
894	ir = i-iouter+1 ! counter within a stride
895	CALL fft_y( work_ffty(:,ir), 'backward' )
896
897	DO j = 0, ny
898	f_out(k,j,i) = work_ffty(j,ir)
899	ENDDO
900	ENDDO
901
902	ENDDO
903
904	ENDDO
905	!$OMP END PARALLEL
906
907	ENDIF
908
909	CALL cpu_log( log_point_s(7), 'fft_y', 'stop' )
910
911	END SUBROUTINE tr_xy_ffty
912
913
914	SUBROUTINE fftx_tri_fftx( ar )
915
916	!------------------------------------------------------------------------------!
917	! FFT along x, solution of the tridiagonal system and backward FFT for
918	! a 1d-decomposition along x
919	!
920	! WARNING: this subroutine may still not work for hybrid parallelization
921	! with OpenMP (for possible necessary changes see the original
922	! routine poisfft_hybrid, developed by Klaus Ketelsen, May 2002)
923	!------------------------------------------------------------------------------!
924
925	USE control_parameters
926	USE cpulog
927	USE grid_variables
928	USE indices
929	USE interfaces
930	USE pegrid
931	USE transpose_indices
932
933	IMPLICIT NONE
934
935	character(len=3) :: myth_char
936
937	INTEGER :: i, j, k, m, n, omp_get_thread_num, tn
938
939	REAL, DIMENSION(0:nx) :: work_fftx
940	REAL, DIMENSION(0:nx,1:nz) :: work_trix
941	REAL, DIMENSION(nnx,1:nz,nys_x:nyn_x,pdims(1)) :: ar
942	REAL, DIMENSION(:,:,:,:), ALLOCATABLE :: tri
943
944
945	CALL cpu_log( log_point_s(33), 'fft_x + tridia', 'start' )
946
947	ALLOCATE( tri(5,0:nx,0:nz-1,0:threads_per_task-1) )
948
949	tn = 0 ! Default thread number in case of one thread
950	!$OMP PARALLEL DO PRIVATE ( i, j, k, m, n, tn, work_fftx, work_trix )
951	DO j = nys_x, nyn_x
952
953	!$ tn = omp_get_thread_num()
954
955	IF ( host(1:3) == 'nec' ) THEN
956	!
957	!-- Code optimized for vector processors
958	DO k = 1, nz
959
960	m = 0
961	DO n = 1, pdims(1)
962	DO i = 1, nnx
963	work_trix(m,k) = ar(i,k,j,n)
964	m = m + 1
965	ENDDO
966	ENDDO
967
968	ENDDO
969
970	CALL fft_x_m( work_trix, 'forward' )
971
972	ELSE
973	!
974	!-- Cache optimized code
975	DO k = 1, nz
976
977	m = 0
978	DO n = 1, pdims(1)
979	DO i = 1, nnx
980	work_fftx(m) = ar(i,k,j,n)
981	m = m + 1
982	ENDDO
983	ENDDO
984
985	CALL fft_x( work_fftx, 'forward' )
986
987	DO i = 0, nx
988	work_trix(i,k) = work_fftx(i)
989	ENDDO
990
991	ENDDO
992
993	ENDIF
994
995	!
996	!-- Solve the linear equation system
997	CALL tridia_1dd( ddx2, ddy2, nx, ny, j, work_trix, tri(:,:,:,tn) )
998
999	IF ( host(1:3) == 'nec' ) THEN
1000	!
1001	!-- Code optimized for vector processors
1002	CALL fft_x_m( work_trix, 'backward' )
1003
1004	DO k = 1, nz
1005
1006	m = 0
1007	DO n = 1, pdims(1)
1008	DO i = 1, nnx
1009	ar(i,k,j,n) = work_trix(m,k)
1010	m = m + 1
1011	ENDDO
1012	ENDDO
1013
1014	ENDDO
1015
1016	ELSE
1017	!
1018	!-- Cache optimized code
1019	DO k = 1, nz
1020
1021	DO i = 0, nx
1022	work_fftx(i) = work_trix(i,k)
1023	ENDDO
1024
1025	CALL fft_x( work_fftx, 'backward' )
1026
1027	m = 0
1028	DO n = 1, pdims(1)
1029	DO i = 1, nnx
1030	ar(i,k,j,n) = work_fftx(m)
1031	m = m + 1
1032	ENDDO
1033	ENDDO
1034
1035	ENDDO
1036
1037	ENDIF
1038
1039	ENDDO
1040
1041	DEALLOCATE( tri )
1042
1043	CALL cpu_log( log_point_s(33), 'fft_x + tridia', 'stop' )
1044
1045	END SUBROUTINE fftx_tri_fftx
1046
1047
1048	SUBROUTINE fftx_tr_xy( f_in, work, f_out )
1049
1050	!------------------------------------------------------------------------------!
1051	! Fourier-transformation along x with subsequent transposition x --> y for
1052	! a 1d-decomposition along y
1053	!
1054	! ATTENTION: The NEC-branch of this routine may significantly profit from
1055	! further optimizations. So far, performance is much worse than
1056	! for routine ffty_tr_yx (more than three times slower).
1057	!------------------------------------------------------------------------------!
1058
1059	USE control_parameters
1060	USE cpulog
1061	USE indices
1062	USE interfaces
1063	USE pegrid
1064	USE transpose_indices
1065
1066	IMPLICIT NONE
1067
1068	INTEGER :: i, j, k
1069
1070	REAL, DIMENSION(0:nx,1:nz,nys:nyn) :: work_fftx
1071	REAL, DIMENSION(1:nz,nys:nyn,0:nx) :: f_in
1072	REAL, DIMENSION(nny,1:nz,nxl_y:nxr_y,pdims(2)) :: f_out
1073	REAL, DIMENSION(nys:nyn,1:nz,0:nx) :: work
1074
1075	!
1076	!-- Carry out the FFT along x, where all data are present due to the
1077	!-- 1d-decomposition along y. Resort the data in a way that y becomes
1078	!-- the first index.
1079	CALL cpu_log( log_point_s(4), 'fft_x', 'start' )
1080
1081	IF ( host(1:3) == 'nec' ) THEN
1082	!
1083	!-- Code for vector processors
1084	!$OMP PARALLEL PRIVATE ( i, j, k )
1085	!$OMP DO
1086	DO i = 0, nx
1087
1088	DO j = nys, nyn
1089	DO k = 1, nz
1090	work_fftx(i,k,j) = f_in(k,j,i)
1091	ENDDO
1092	ENDDO
1093
1094	ENDDO
1095
1096	!$OMP DO
1097	DO j = nys, nyn
1098
1099	CALL fft_x_m( work_fftx(:,:,j), 'forward' )
1100
1101	DO k = 1, nz
1102	DO i = 0, nx
1103	work(j,k,i) = work_fftx(i,k,j)
1104	ENDDO
1105	ENDDO
1106
1107	ENDDO
1108	!$OMP END PARALLEL
1109
1110	ELSE
1111
1112	!
1113	!-- Cache optimized code (there might be still a potential for better
1114	!-- optimization).
1115	!$OMP PARALLEL PRIVATE (i,j,k)
1116	!$OMP DO
1117	DO i = 0, nx
1118
1119	DO j = nys, nyn
1120	DO k = 1, nz
1121	work_fftx(i,k,j) = f_in(k,j,i)
1122	ENDDO
1123	ENDDO
1124
1125	ENDDO
1126
1127	!$OMP DO
1128	DO j = nys, nyn
1129	DO k = 1, nz
1130
1131	CALL fft_x( work_fftx(0:nx,k,j), 'forward' )
1132
1133	DO i = 0, nx
1134	work(j,k,i) = work_fftx(i,k,j)
1135	ENDDO
1136	ENDDO
1137
1138	ENDDO
1139	!$OMP END PARALLEL
1140
1141	ENDIF
1142	CALL cpu_log( log_point_s(4), 'fft_x', 'pause' )
1143
1144	!
1145	!-- Transpose array
1146	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
1147	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
1148	CALL MPI_ALLTOALL( work(nys,1,0), sendrecvcount_xy, MPI_REAL, &
1149	f_out(1,1,nxl_y,1), sendrecvcount_xy, MPI_REAL, &
1150	comm1dy, ierr )
1151	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
1152
1153	END SUBROUTINE fftx_tr_xy
1154
1155
1156	SUBROUTINE tr_yx_fftx( f_in, work, f_out )
1157
1158	!------------------------------------------------------------------------------!
1159	! Transposition y --> x with a subsequent backward Fourier transformation for
1160	! a 1d-decomposition along x
1161	!------------------------------------------------------------------------------!
1162
1163	USE control_parameters
1164	USE cpulog
1165	USE indices
1166	USE interfaces
1167	USE pegrid
1168	USE transpose_indices
1169
1170	IMPLICIT NONE
1171
1172	INTEGER :: i, j, k
1173
1174	REAL, DIMENSION(0:nx,1:nz,nys:nyn) :: work_fftx
1175	REAL, DIMENSION(nny,1:nz,nxl_y:nxr_y,pdims(2)) :: f_in
1176	REAL, DIMENSION(1:nz,nys:nyn,0:nx) :: f_out
1177	REAL, DIMENSION(nys:nyn,1:nz,0:nx) :: work
1178
1179	!
1180	!-- Transpose array
1181	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
1182	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
1183	CALL MPI_ALLTOALL( f_in(1,1,nxl_y,1), sendrecvcount_xy, MPI_REAL, &
1184	work(nys,1,0), sendrecvcount_xy, MPI_REAL, &
1185	comm1dy, ierr )
1186	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
1187
1188	!
1189	!-- Carry out the FFT along x, where all data are present due to the
1190	!-- 1d-decomposition along y. Resort the data in a way that y becomes
1191	!-- the first index.
1192	CALL cpu_log( log_point_s(4), 'fft_x', 'continue' )
1193
1194	IF ( host(1:3) == 'nec' ) THEN
1195	!
1196	!-- Code optimized for vector processors
1197	!$OMP PARALLEL PRIVATE ( i, j, k )
1198	!$OMP DO
1199	DO j = nys, nyn
1200
1201	DO k = 1, nz
1202	DO i = 0, nx
1203	work_fftx(i,k,j) = work(j,k,i)
1204	ENDDO
1205	ENDDO
1206
1207	CALL fft_x_m( work_fftx(:,:,j), 'backward' )
1208
1209	ENDDO
1210
1211	!$OMP DO
1212	DO i = 0, nx
1213	DO j = nys, nyn
1214	DO k = 1, nz
1215	f_out(k,j,i) = work_fftx(i,k,j)
1216	ENDDO
1217	ENDDO
1218	ENDDO
1219	!$OMP END PARALLEL
1220
1221	ELSE
1222
1223	!
1224	!-- Cache optimized code (there might be still a potential for better
1225	!-- optimization).
1226	!$OMP PARALLEL PRIVATE (i,j,k)
1227	!$OMP DO
1228	DO j = nys, nyn
1229	DO k = 1, nz
1230
1231	DO i = 0, nx
1232	work_fftx(i,k,j) = work(j,k,i)
1233	ENDDO
1234
1235	CALL fft_x( work_fftx(0:nx,k,j), 'backward' )
1236
1237	ENDDO
1238	ENDDO
1239
1240	!$OMP DO
1241	DO i = 0, nx
1242	DO j = nys, nyn
1243	DO k = 1, nz
1244	f_out(k,j,i) = work_fftx(i,k,j)
1245	ENDDO
1246	ENDDO
1247	ENDDO
1248	!$OMP END PARALLEL
1249
1250	ENDIF
1251	CALL cpu_log( log_point_s(4), 'fft_x', 'stop' )
1252
1253	END SUBROUTINE tr_yx_fftx
1254
1255
1256	SUBROUTINE ffty_tri_ffty( ar )
1257
1258	!------------------------------------------------------------------------------!
1259	! FFT along y, solution of the tridiagonal system and backward FFT for
1260	! a 1d-decomposition along y
1261	!
1262	! WARNING: this subroutine may still not work for hybrid parallelization
1263	! with OpenMP (for possible necessary changes see the original
1264	! routine poisfft_hybrid, developed by Klaus Ketelsen, May 2002)
1265	!------------------------------------------------------------------------------!
1266
1267	USE control_parameters
1268	USE cpulog
1269	USE grid_variables
1270	USE indices
1271	USE interfaces
1272	USE pegrid
1273	USE transpose_indices
1274
1275	IMPLICIT NONE
1276
1277	INTEGER :: i, j, k, m, n, omp_get_thread_num, tn
1278
1279	REAL, DIMENSION(0:ny) :: work_ffty
1280	REAL, DIMENSION(0:ny,1:nz) :: work_triy
1281	REAL, DIMENSION(nny,1:nz,nxl_y:nxr_y,pdims(2)) :: ar
1282	REAL, DIMENSION(:,:,:,:), ALLOCATABLE :: tri
1283
1284
1285	CALL cpu_log( log_point_s(39), 'fft_y + tridia', 'start' )
1286
1287	ALLOCATE( tri(5,0:ny,0:nz-1,0:threads_per_task-1) )
1288
1289	tn = 0 ! Default thread number in case of one thread
1290	!$OMP PARALLEL DO PRIVATE ( i, j, k, m, n, tn, work_ffty, work_triy )
1291	DO i = nxl_y, nxr_y
1292
1293	!$ tn = omp_get_thread_num()
1294
1295	IF ( host(1:3) == 'nec' ) THEN
1296	!
1297	!-- Code optimized for vector processors
1298	DO k = 1, nz
1299
1300	m = 0
1301	DO n = 1, pdims(2)
1302	DO j = 1, nny
1303	work_triy(m,k) = ar(j,k,i,n)
1304	m = m + 1
1305	ENDDO
1306	ENDDO
1307
1308	ENDDO
1309
1310	CALL fft_y_m( work_triy, ny, 'forward' )
1311
1312	ELSE
1313	!
1314	!-- Cache optimized code
1315	DO k = 1, nz
1316
1317	m = 0
1318	DO n = 1, pdims(2)
1319	DO j = 1, nny
1320	work_ffty(m) = ar(j,k,i,n)
1321	m = m + 1
1322	ENDDO
1323	ENDDO
1324
1325	CALL fft_y( work_ffty, 'forward' )
1326
1327	DO j = 0, ny
1328	work_triy(j,k) = work_ffty(j)
1329	ENDDO
1330
1331	ENDDO
1332
1333	ENDIF
1334
1335	!
1336	!-- Solve the linear equation system
1337	CALL tridia_1dd( ddy2, ddx2, ny, nx, i, work_triy, tri(:,:,:,tn) )
1338
1339	IF ( host(1:3) == 'nec' ) THEN
1340	!
1341	!-- Code optimized for vector processors
1342	CALL fft_y_m( work_triy, ny, 'backward' )
1343
1344	DO k = 1, nz
1345
1346	m = 0
1347	DO n = 1, pdims(2)
1348	DO j = 1, nny
1349	ar(j,k,i,n) = work_triy(m,k)
1350	m = m + 1
1351	ENDDO
1352	ENDDO
1353
1354	ENDDO
1355
1356	ELSE
1357	!
1358	!-- Cache optimized code
1359	DO k = 1, nz
1360
1361	DO j = 0, ny
1362	work_ffty(j) = work_triy(j,k)
1363	ENDDO
1364
1365	CALL fft_y( work_ffty, 'backward' )
1366
1367	m = 0
1368	DO n = 1, pdims(2)
1369	DO j = 1, nny
1370	ar(j,k,i,n) = work_ffty(m)
1371	m = m + 1
1372	ENDDO
1373	ENDDO
1374
1375	ENDDO
1376
1377	ENDIF
1378
1379	ENDDO
1380
1381	DEALLOCATE( tri )
1382
1383	CALL cpu_log( log_point_s(39), 'fft_y + tridia', 'stop' )
1384
1385	END SUBROUTINE ffty_tri_ffty
1386
1387
1388	SUBROUTINE tridia_1dd( ddx2, ddy2, nx, ny, j, ar, tri )
1389
1390	!------------------------------------------------------------------------------!
1391	! Solves the linear system of equations for a 1d-decomposition along x (see
1392	! tridia)
1393	!
1394	! Attention: when using the intel compilers older than 12.0, array tri must
1395	! be passed as an argument to the contained subroutines. Otherwise
1396	! addres faults will occur. This feature can be activated with
1397	! cpp-switch __intel11
1398	! On NEC, tri should not be passed (except for routine substi_1dd)
1399	! because this causes very bad performance.
1400	!------------------------------------------------------------------------------!
1401
1402	USE arrays_3d
1403	USE control_parameters
1404
1405	USE pegrid
1406
1407	IMPLICIT NONE
1408
1409	INTEGER :: i, j, k, nnyh, nx, ny, omp_get_thread_num, tn
1410
1411	REAL :: ddx2, ddy2
1412
1413	REAL, DIMENSION(0:nx,1:nz) :: ar
1414	REAL, DIMENSION(0:nx,0:nz-1) :: ar1
1415	REAL, DIMENSION(5,0:nx,0:nz-1) :: tri
1416
1417
1418	nnyh = ( ny + 1 ) / 2
1419
1420	!
1421	!-- Define constant elements of the tridiagonal matrix.
1422	!-- The compiler on SX6 does loop exchange. If 0:nx is a high power of 2,
1423	!-- the exchanged loops create bank conflicts. The following directive
1424	!-- prohibits loop exchange and the loops perform much better.
1425	! tn = omp_get_thread_num()
1426	! WRITE( 120+tn, * ) '+++ id=',myid,' nx=',nx,' thread=', omp_get_thread_num()
1427	! CALL local_flush( 120+tn )
1428	!CDIR NOLOOPCHG
1429	DO k = 0, nz-1
1430	DO i = 0,nx
1431	tri(2,i,k) = ddzu_pres(k+1) * ddzw(k+1)
1432	tri(3,i,k) = ddzu_pres(k+2) * ddzw(k+1)
1433	ENDDO
1434	ENDDO
1435	! WRITE( 120+tn, * ) '+++ id=',myid,' end of first tridia loop thread=', omp_get_thread_num()
1436	! CALL local_flush( 120+tn )
1437
1438	IF ( j <= nnyh ) THEN
1439	#if defined( __intel11 )
1440	CALL maketri_1dd( j, tri )
1441	#else
1442	CALL maketri_1dd( j )
1443	#endif
1444	ELSE
1445	#if defined( __intel11 )
1446	CALL maketri_1dd( ny+1-j, tri )
1447	#else
1448	CALL maketri_1dd( ny+1-j )
1449	#endif
1450	ENDIF
1451	#if defined( __intel11 )
1452	CALL split_1dd( tri )
1453	#else
1454	CALL split_1dd
1455	#endif
1456	CALL substi_1dd( ar, tri )
1457
1458	CONTAINS
1459
1460	#if defined( __intel11 )
1461	SUBROUTINE maketri_1dd( j, tri )
1462	#else
1463	SUBROUTINE maketri_1dd( j )
1464	#endif
1465
1466	!------------------------------------------------------------------------------!
1467	! computes the i- and j-dependent component of the matrix
1468	!------------------------------------------------------------------------------!
1469
1470	USE constants
1471
1472	IMPLICIT NONE
1473
1474	INTEGER :: i, j, k, nnxh
1475	REAL :: a, c
1476
1477	REAL, DIMENSION(0:nx) :: l
1478
1479	#if defined( __intel11 )
1480	REAL, DIMENSION(5,0:nx,0:nz-1) :: tri
1481	#endif
1482
1483
1484	nnxh = ( nx + 1 ) / 2
1485	!
1486	!-- Provide the tridiagonal matrix for solution of the Poisson equation in
1487	!-- Fourier space. The coefficients are computed following the method of
1488	!-- Schmidt et al. (DFVLR-Mitteilung 84-15), which departs from Stephan
1489	!-- Siano's original version by discretizing the Poisson equation,
1490	!-- before it is Fourier-transformed
1491	DO i = 0, nx
1492	IF ( i >= 0 .AND. i <= nnxh ) THEN
1493	l(i) = 2.0 * ( 1.0 - COS( ( 2.0 * pi * i ) / &
1494	REAL( nx+1 ) ) ) * ddx2 + &
1495	2.0 * ( 1.0 - COS( ( 2.0 * pi * j ) / &
1496	REAL( ny+1 ) ) ) * ddy2
1497	ELSE
1498	l(i) = 2.0 * ( 1.0 - COS( ( 2.0 * pi * ( nx+1-i ) ) / &
1499	REAL( nx+1 ) ) ) * ddx2 + &
1500	2.0 * ( 1.0 - COS( ( 2.0 * pi * j ) / &
1501	REAL( ny+1 ) ) ) * ddy2
1502	ENDIF
1503	ENDDO
1504
1505	DO k = 0, nz-1
1506	DO i = 0, nx
1507	a = -1.0 * ddzu_pres(k+2) * ddzw(k+1)
1508	c = -1.0 * ddzu_pres(k+1) * ddzw(k+1)
1509	tri(1,i,k) = a + c - l(i)
1510	ENDDO
1511	ENDDO
1512	IF ( ibc_p_b == 1 .OR. ibc_p_b == 2 ) THEN
1513	DO i = 0, nx
1514	tri(1,i,0) = tri(1,i,0) + tri(2,i,0)
1515	ENDDO
1516	ENDIF
1517	IF ( ibc_p_t == 1 ) THEN
1518	DO i = 0, nx
1519	tri(1,i,nz-1) = tri(1,i,nz-1) + tri(3,i,nz-1)
1520	ENDDO
1521	ENDIF
1522
1523	END SUBROUTINE maketri_1dd
1524
1525
1526	#if defined( __intel11 )
1527	SUBROUTINE split_1dd( tri )
1528	#else
1529	SUBROUTINE split_1dd
1530	#endif
1531
1532	!------------------------------------------------------------------------------!
1533	! Splitting of the tridiagonal matrix (Thomas algorithm)
1534	!------------------------------------------------------------------------------!
1535
1536	IMPLICIT NONE
1537
1538	INTEGER :: i, k
1539
1540	#if defined( __intel11 )
1541	REAL, DIMENSION(5,0:nx,0:nz-1) :: tri
1542	#endif
1543
1544
1545	!
1546	!-- Splitting
1547	DO i = 0, nx
1548	tri(4,i,0) = tri(1,i,0)
1549	ENDDO
1550	DO k = 1, nz-1
1551	DO i = 0, nx
1552	tri(5,i,k) = tri(2,i,k) / tri(4,i,k-1)
1553	tri(4,i,k) = tri(1,i,k) - tri(3,i,k-1) * tri(5,i,k)
1554	ENDDO
1555	ENDDO
1556
1557	END SUBROUTINE split_1dd
1558
1559
1560	SUBROUTINE substi_1dd( ar, tri )
1561
1562	!------------------------------------------------------------------------------!
1563	! Substitution (Forward and Backward) (Thomas algorithm)
1564	!------------------------------------------------------------------------------!
1565
1566	IMPLICIT NONE
1567
1568	INTEGER :: i, k
1569
1570	REAL, DIMENSION(0:nx,nz) :: ar
1571	REAL, DIMENSION(0:nx,0:nz-1) :: ar1
1572	REAL, DIMENSION(5,0:nx,0:nz-1) :: tri
1573
1574	!
1575	!-- Forward substitution
1576	DO i = 0, nx
1577	ar1(i,0) = ar(i,1)
1578	ENDDO
1579	DO k = 1, nz-1
1580	DO i = 0, nx
1581	ar1(i,k) = ar(i,k+1) - tri(5,i,k) * ar1(i,k-1)
1582	ENDDO
1583	ENDDO
1584
1585	!
1586	!-- Backward substitution
1587	!-- Note, the add of 1.0E-20 in the denominator is due to avoid divisions
1588	!-- by zero appearing if the pressure bc is set to neumann at the top of
1589	!-- the model domain.
1590	DO i = 0, nx
1591	ar(i,nz) = ar1(i,nz-1) / ( tri(4,i,nz-1) + 1.0E-20 )
1592	ENDDO
1593	DO k = nz-2, 0, -1
1594	DO i = 0, nx
1595	ar(i,k+1) = ( ar1(i,k) - tri(3,i,k) * ar(i,k+2) ) &
1596	/ tri(4,i,k)
1597	ENDDO
1598	ENDDO
1599
1600	!
1601	!-- Indices i=0, j=0 correspond to horizontally averaged pressure.
1602	!-- The respective values of ar should be zero at all k-levels if
1603	!-- acceleration of horizontally averaged vertical velocity is zero.
1604	IF ( ibc_p_b == 1 .AND. ibc_p_t == 1 ) THEN
1605	IF ( j == 0 ) THEN
1606	DO k = 1, nz
1607	ar(0,k) = 0.0
1608	ENDDO
1609	ENDIF
1610	ENDIF
1611
1612	END SUBROUTINE substi_1dd
1613
1614	END SUBROUTINE tridia_1dd
1615
1616	#endif
1617	#endif
1618	END MODULE poisfft_mod

Note: See TracBrowser for help on using the repository browser.

Download in other formats:

| Impressum | ©Leibniz Universität Hannover |