Home

Context Navigation

source: palm/trunk/SOURCE/poisfft.f90 @ 1011

Last change on this file since 1011 was 1004, checked in by raasch, 12 years ago
last commit documented
Property svn:keywords set to `Id`
File size: 46.8 KB

Line
1	MODULE poisfft_mod
2
3	!------------------------------------------------------------------------------!
4	! Current revisions:
5	! -----------------
6	!
7	!
8	! Former revisions:
9	! -----------------
10	! $Id: poisfft.f90 1004 2012-09-14 14:56:50Z raasch $
11	!
12	! 1003 2012-09-14 14:35:53Z raasch
13	! indices nxa, nya, etc. replaced by nx, ny, etc.
14	!
15	! 940 2012-07-09 14:31:00Z raasch
16	! special handling of tri-array as an argument in tridia_1dd routines switched
17	! off because it caused segmentation faults with intel 12.1 compiler
18	!
19	! 877 2012-04-03 11:21:44Z suehring
20	! Bugfix: Avoid divisions by zero in case of using a 'neumann' bc for the
21	! pressure at the top of the model domain.
22	!
23	! 809 2012-01-30 13:32:58Z maronga
24	! Bugfix: replaced .AND. and .NOT. with && and ! in the preprocessor directives
25	!
26	! 807 2012-01-25 11:53:51Z maronga
27	! New cpp directive "__check" implemented which is used by check_namelist_files
28	! (most of the code is unneeded by check_namelist_files).
29	!
30	! 763 2011-10-06 09:32:09Z suehring
31	! Comment added concerning the last change.
32	!
33	! 761 2011-10-05 17:58:52Z suehring
34	! Bugfix: Avoid divisions by zero in case of using a 'neumann' bc for the
35	! pressure at the top of the model domain.
36	!
37	! 696 2011-03-18 07:03:49Z raasch
38	! work_fftx removed from PRIVATE clauses in fftx_tr_xy and tr_yx_fftx
39	!
40	! 683 2011-02-09 14:25:15Z raasch
41	! openMP parallelization for 2d-domain-decomposition
42	!
43	! 667 2010-12-23 12:06:00Z suehring/gryschka
44	! ddzu replaced by ddzu_pres due to changes in zu(0)
45	!
46	! 622 2010-12-10 08:08:13Z raasch
47	! optional barriers included in order to speed up collective operations
48	!
49	! 377 2009-09-04 11:09:00Z raasch
50	! __lcmuk changed to __lc to avoid problems with Intel compiler on sgi-ice
51	!
52	! 164 2008-05-15 08:46:15Z raasch
53	! Arguments removed from transpose routines
54	!
55	! 128 2007-10-26 13:11:14Z raasch
56	! Bugfix: wavenumber calculation for even nx in routines maketri
57	!
58	! 85 2007-05-11 09:35:14Z raasch
59	! Bugfix: work_fft*_vec removed from some PRIVATE-declarations
60	!
61	! 76 2007-03-29 00:58:32Z raasch
62	! Tridiagonal coefficients adjusted for Neumann boundary conditions both at
63	! the bottom and the top.
64	!
65	! RCS Log replace by Id keyword, revision history cleaned up
66	!
67	! Revision 1.24 2006/08/04 15:00:24 raasch
68	! Default setting of the thread number tn in case of not using OpenMP
69	!
70	! Revision 1.23 2006/02/23 12:48:38 raasch
71	! Additional compiler directive in routine tridia_1dd for preventing loop
72	! exchange on NEC-SX6
73	!
74	! Revision 1.20 2004/04/30 12:38:09 raasch
75	! Parts of former poisfft_hybrid moved to this subroutine,
76	! former subroutine changed to a module, renaming of FFT-subroutines and
77	! -module, FFTs completely substituted by calls of fft_x and fft_y,
78	! NAG fft used in the non-parallel case completely removed, l in maketri
79	! is now a 1d-array, variables passed by modules instead of using parameter
80	! lists, enlarged transposition arrays introduced
81	!
82	! Revision 1.1 1997/07/24 11:24:14 raasch
83	! Initial revision
84	!
85	!
86	! Description:
87	! ------------
88	! See below.
89	!------------------------------------------------------------------------------!
90
91	!--------------------------------------------------------------------------!
92	! poisfft !
93	! !
94	! Original version: Stephan Siano (pois3d) !
95	! !
96	! Institute of Meteorology and Climatology, University of Hannover !
97	! Germany !
98	! !
99	! Version as of July 23,1996 !
100	! !
101	! !
102	! Version for parallel computers: Siegfried Raasch !
103	! !
104	! Version as of July 03,1997 !
105	! !
106	! Solves the Poisson equation with a 2D spectral method !
107	! d^2 p / dx^2 + d^2 p / dy^2 + d^2 p / dz^2 = s !
108	! !
109	! Input: !
110	! real ar contains in the (nnx,nny,nnz) elements, !
111	! starting from the element (1,nys,nxl), the !
112	! values for s !
113	! real work Temporary array !
114	! !
115	! Output: !
116	! real ar contains the solution for p !
117	!--------------------------------------------------------------------------!
118
119	USE fft_xy
120	USE indices
121	USE transpose_indices
122
123	IMPLICIT NONE
124
125	PRIVATE
126
127	#if ! defined ( __check )
128	PUBLIC poisfft, poisfft_init
129
130	INTERFACE poisfft
131	MODULE PROCEDURE poisfft
132	END INTERFACE poisfft
133
134	INTERFACE poisfft_init
135	MODULE PROCEDURE poisfft_init
136	END INTERFACE poisfft_init
137	#else
138	PUBLIC poisfft_init
139
140	INTERFACE poisfft_init
141	MODULE PROCEDURE poisfft_init
142	END INTERFACE poisfft_init
143	#endif
144
145	CONTAINS
146
147	SUBROUTINE poisfft_init
148
149	CALL fft_init
150
151	END SUBROUTINE poisfft_init
152
153	#if ! defined ( __check )
154	SUBROUTINE poisfft( ar, work )
155
156	USE cpulog
157	USE interfaces
158	USE pegrid
159
160	IMPLICIT NONE
161
162	REAL, DIMENSION(1:nz,nys:nyn,nxl:nxr) :: ar, work
163
164
165	CALL cpu_log( log_point_s(3), 'poisfft', 'start' )
166
167	!
168	!-- Two-dimensional Fourier Transformation in x- and y-direction.
169	#if defined( __parallel )
170	IF ( pdims(2) == 1 ) THEN
171
172	!
173	!-- 1d-domain-decomposition along x:
174	!-- FFT along y and transposition y --> x
175	CALL ffty_tr_yx( ar, work, ar )
176
177	!
178	!-- FFT along x, solving the tridiagonal system and backward FFT
179	CALL fftx_tri_fftx( ar )
180
181	!
182	!-- Transposition x --> y and backward FFT along y
183	CALL tr_xy_ffty( ar, work, ar )
184
185	ELSEIF ( pdims(1) == 1 ) THEN
186
187	!
188	!-- 1d-domain-decomposition along y:
189	!-- FFT along x and transposition x --> y
190	CALL fftx_tr_xy( ar, work, ar )
191
192	!
193	!-- FFT along y, solving the tridiagonal system and backward FFT
194	CALL ffty_tri_ffty( ar )
195
196	!
197	!-- Transposition y --> x and backward FFT along x
198	CALL tr_yx_fftx( ar, work, ar )
199
200	ELSE
201
202	!
203	!-- 2d-domain-decomposition
204	!-- Transposition z --> x
205	CALL cpu_log( log_point_s(5), 'transpo forward', 'start' )
206	CALL transpose_zx( ar, work, ar )
207	CALL cpu_log( log_point_s(5), 'transpo forward', 'pause' )
208
209	CALL cpu_log( log_point_s(4), 'fft_x', 'start' )
210	CALL fftxp( ar, 'forward' )
211	CALL cpu_log( log_point_s(4), 'fft_x', 'pause' )
212
213	!
214	!-- Transposition x --> y
215	CALL cpu_log( log_point_s(5), 'transpo forward', 'continue' )
216	CALL transpose_xy( ar, work, ar )
217	CALL cpu_log( log_point_s(5), 'transpo forward', 'pause' )
218
219	CALL cpu_log( log_point_s(7), 'fft_y', 'start' )
220	CALL fftyp( ar, 'forward' )
221	CALL cpu_log( log_point_s(7), 'fft_y', 'pause' )
222
223	!
224	!-- Transposition y --> z
225	CALL cpu_log( log_point_s(5), 'transpo forward', 'continue' )
226	CALL transpose_yz( ar, work, ar )
227	CALL cpu_log( log_point_s(5), 'transpo forward', 'stop' )
228
229	!
230	!-- Solve the Poisson equation in z-direction in cartesian space.
231	CALL cpu_log( log_point_s(6), 'tridia', 'start' )
232	CALL tridia( ar )
233	CALL cpu_log( log_point_s(6), 'tridia', 'stop' )
234
235	!
236	!-- Inverse Fourier Transformation
237	!-- Transposition z --> y
238	CALL cpu_log( log_point_s(8), 'transpo invers', 'start' )
239	CALL transpose_zy( ar, work, ar )
240	CALL cpu_log( log_point_s(8), 'transpo invers', 'pause' )
241
242	CALL cpu_log( log_point_s(7), 'fft_y', 'continue' )
243	CALL fftyp( ar, 'backward' )
244	CALL cpu_log( log_point_s(7), 'fft_y', 'stop' )
245
246	!
247	!-- Transposition y --> x
248	CALL cpu_log( log_point_s(8), 'transpo invers', 'continue' )
249	CALL transpose_yx( ar, work, ar )
250	CALL cpu_log( log_point_s(8), 'transpo invers', 'pause' )
251
252	CALL cpu_log( log_point_s(4), 'fft_x', 'continue' )
253	CALL fftxp( ar, 'backward' )
254	CALL cpu_log( log_point_s(4), 'fft_x', 'stop' )
255
256	!
257	!-- Transposition x --> z
258	CALL cpu_log( log_point_s(8), 'transpo invers', 'continue' )
259	CALL transpose_xz( ar, work, ar )
260	CALL cpu_log( log_point_s(8), 'transpo invers', 'stop' )
261
262	ENDIF
263
264	#else
265
266	!
267	!-- Two-dimensional Fourier Transformation along x- and y-direction.
268	CALL cpu_log( log_point_s(4), 'fft_x', 'start' )
269	CALL fftx( ar, 'forward' )
270	CALL cpu_log( log_point_s(4), 'fft_x', 'pause' )
271	CALL cpu_log( log_point_s(7), 'fft_y', 'start' )
272	CALL ffty( ar, 'forward' )
273	CALL cpu_log( log_point_s(7), 'fft_y', 'pause' )
274
275	!
276	!-- Solve the Poisson equation in z-direction in cartesian space.
277	CALL cpu_log( log_point_s(6), 'tridia', 'start' )
278	CALL tridia( ar )
279	CALL cpu_log( log_point_s(6), 'tridia', 'stop' )
280
281	!
282	!-- Inverse Fourier Transformation.
283	CALL cpu_log( log_point_s(7), 'fft_y', 'continue' )
284	CALL ffty( ar, 'backward' )
285	CALL cpu_log( log_point_s(7), 'fft_y', 'stop' )
286	CALL cpu_log( log_point_s(4), 'fft_x', 'continue' )
287	CALL fftx( ar, 'backward' )
288	CALL cpu_log( log_point_s(4), 'fft_x', 'stop' )
289
290	#endif
291
292	CALL cpu_log( log_point_s(3), 'poisfft', 'stop' )
293
294	END SUBROUTINE poisfft
295
296
297
298	SUBROUTINE tridia( ar )
299
300	!------------------------------------------------------------------------------!
301	! solves the linear system of equations:
302	!
303	! -(4 pi^2(i^2/(dx^2nnx^2)+j^2/(dy^2nny^2))+
304	! 1/(dzu(k)dzw(k))+1/(dzu(k-1)dzw(k)))*p(i,j,k)+
305	! 1/(dzu(k)dzw(k))p(i,j,k+1)+1/(dzu(k-1)dzw(k))p(i,j,k-1)=d(i,j,k)
306	!
307	! by using the Thomas algorithm
308	!------------------------------------------------------------------------------!
309
310	USE arrays_3d
311
312	IMPLICIT NONE
313
314	INTEGER :: i, j, k, nnyh
315
316	REAL, DIMENSION(nxl_z:nxr_z,0:nz-1) :: ar1
317	REAL, DIMENSION(5,nxl_z:nxr_z,0:nz-1) :: tri
318
319	#if defined( __parallel )
320	REAL :: ar(nxl_z:nxr_z,nys_z:nyn_z,1:nz)
321	#else
322	REAL :: ar(1:nz,nys_z:nyn_z,nxl_z:nxr_z)
323	#endif
324
325
326	nnyh = (ny+1) / 2
327
328	!
329	!-- Define constant elements of the tridiagonal matrix.
330	!$OMP PARALLEL PRIVATE ( k, i )
331	!$OMP DO
332	DO k = 0, nz-1
333	DO i = nxl_z, nxr_z
334	tri(2,i,k) = ddzu_pres(k+1) * ddzw(k+1)
335	tri(3,i,k) = ddzu_pres(k+2) * ddzw(k+1)
336	ENDDO
337	ENDDO
338	!$OMP END PARALLEL
339
340	#if defined( __parallel )
341	!
342	!-- Repeat for all y-levels.
343	!$OMP PARALLEL FIRSTPRIVATE( tri ) PRIVATE ( ar1, j )
344	!$OMP DO
345	DO j = nys_z, nyn_z
346	IF ( j <= nnyh ) THEN
347	CALL maketri( tri, j )
348	ELSE
349	CALL maketri( tri, ny+1-j )
350	ENDIF
351	CALL split( tri )
352	CALL substi( ar, ar1, tri, j )
353	ENDDO
354	!$OMP END PARALLEL
355	#else
356	!
357	!-- First y-level.
358	CALL maketri( tri, nys_z )
359	CALL split( tri )
360	CALL substi( ar, ar1, tri, 0 )
361
362	!
363	!-- Further y-levels.
364	DO j = 1, nnyh - 1
365	CALL maketri( tri, j )
366	CALL split( tri )
367	CALL substi( ar, ar1, tri, j )
368	CALL substi( ar, ar1, tri, ny+1-j )
369	ENDDO
370	CALL maketri( tri, nnyh )
371	CALL split( tri )
372	CALL substi( ar, ar1, tri, nnyh+nys )
373	#endif
374
375	CONTAINS
376
377	SUBROUTINE maketri( tri, j )
378
379	!------------------------------------------------------------------------------!
380	! Computes the i- and j-dependent component of the matrix
381	!------------------------------------------------------------------------------!
382
383	USE arrays_3d
384	USE constants
385	USE control_parameters
386	USE grid_variables
387
388	IMPLICIT NONE
389
390	INTEGER :: i, j, k, nnxh
391	REAL :: a, c
392	REAL :: ll(nxl_z:nxr_z)
393	REAL :: tri(5,nxl_z:nxr_z,0:nz-1)
394
395
396	nnxh = ( nx + 1 ) / 2
397
398	!
399	!-- Provide the tridiagonal matrix for solution of the Poisson equation in
400	!-- Fourier space. The coefficients are computed following the method of
401	!-- Schmidt et al. (DFVLR-Mitteilung 84-15), which departs from Stephan
402	!-- Siano's original version by discretizing the Poisson equation,
403	!-- before it is Fourier-transformed
404	#if defined( __parallel )
405	DO i = nxl_z, nxr_z
406	IF ( i >= 0 .AND. i <= nnxh ) THEN
407	ll(i) = 2.0 * ( 1.0 - COS( ( 2.0 * pi * i ) / &
408	FLOAT( nx+1 ) ) ) / ( dx * dx ) + &
409	2.0 * ( 1.0 - COS( ( 2.0 * pi * j ) / &
410	FLOAT( ny+1 ) ) ) / ( dy * dy )
411	ELSE
412	ll(i) = 2.0 * ( 1.0 - COS( ( 2.0 * pi * ( nx+1-i ) ) / &
413	FLOAT( nx+1 ) ) ) / ( dx * dx ) + &
414	2.0 * ( 1.0 - COS( ( 2.0 * pi * j ) / &
415	FLOAT( ny+1 ) ) ) / ( dy * dy )
416	ENDIF
417	DO k = 0,nz-1
418	a = -1.0 * ddzu_pres(k+2) * ddzw(k+1)
419	c = -1.0 * ddzu_pres(k+1) * ddzw(k+1)
420	tri(1,i,k) = a + c - ll(i)
421	ENDDO
422	ENDDO
423	#else
424	DO i = 0, nnxh
425	ll(i) = 2.0 * ( 1.0 - COS( ( 2.0 * pi * i ) / FLOAT( nx+1 ) ) ) / &
426	( dx * dx ) + &
427	2.0 * ( 1.0 - COS( ( 2.0 * pi * j ) / FLOAT( ny+1 ) ) ) / &
428	( dy * dy )
429	DO k = 0, nz-1
430	a = -1.0 * ddzu_pres(k+2) * ddzw(k+1)
431	c = -1.0 * ddzu_pres(k+1) * ddzw(k+1)
432	tri(1,i,k) = a + c - ll(i)
433	IF ( i >= 1 .and. i < nnxh ) THEN
434	tri(1,nx+1-i,k) = tri(1,i,k)
435	ENDIF
436	ENDDO
437	ENDDO
438	#endif
439	IF ( ibc_p_b == 1 .OR. ibc_p_b == 2 ) THEN
440	DO i = nxl_z, nxr_z
441	tri(1,i,0) = tri(1,i,0) + tri(2,i,0)
442	ENDDO
443	ENDIF
444	IF ( ibc_p_t == 1 ) THEN
445	DO i = nxl_z, nxr_z
446	tri(1,i,nz-1) = tri(1,i,nz-1) + tri(3,i,nz-1)
447	ENDDO
448	ENDIF
449
450	END SUBROUTINE maketri
451
452
453	SUBROUTINE substi( ar, ar1, tri, j )
454
455	!------------------------------------------------------------------------------!
456	! Substitution (Forward and Backward) (Thomas algorithm)
457	!------------------------------------------------------------------------------!
458
459	USE control_parameters
460
461	IMPLICIT NONE
462
463	INTEGER :: i, j, k
464	REAL :: ar1(nxl_z:nxr_z,0:nz-1)
465	REAL :: tri(5,nxl_z:nxr_z,0:nz-1)
466	#if defined( __parallel )
467	REAL :: ar(nxl_z:nxr_z,nys_z:nyn_z,1:nz)
468	#else
469	REAL :: ar(1:nz,nys_z:nyn_z,nxl_z:nxr_z)
470	#endif
471
472	!
473	!-- Forward substitution.
474	DO i = nxl_z, nxr_z
475	#if defined( __parallel )
476	ar1(i,0) = ar(i,j,1)
477	#else
478	ar1(i,0) = ar(1,j,i)
479	#endif
480	ENDDO
481	DO k = 1, nz - 1
482	DO i = nxl_z, nxr_z
483	#if defined( __parallel )
484	ar1(i,k) = ar(i,j,k+1) - tri(5,i,k) * ar1(i,k-1)
485	#else
486	ar1(i,k) = ar(k+1,j,i) - tri(5,i,k) * ar1(i,k-1)
487	#endif
488	ENDDO
489	ENDDO
490
491	!
492	!-- Backward substitution
493	!-- Note, the 1.0E-20 in the denominator is due to avoid divisions
494	!-- by zero appearing if the pressure bc is set to neumann at the top of
495	!-- the model domain.
496	DO i = nxl_z, nxr_z
497	#if defined( __parallel )
498	ar(i,j,nz) = ar1(i,nz-1) / ( tri(4,i,nz-1) + 1.0E-20 )
499	#else
500	ar(nz,j,i) = ar1(i,nz-1) / ( tri(4,i,nz-1) + 1.0E-20 )
501	#endif
502	ENDDO
503	DO k = nz-2, 0, -1
504	DO i = nxl_z, nxr_z
505	#if defined( __parallel )
506	ar(i,j,k+1) = ( ar1(i,k) - tri(3,i,k) * ar(i,j,k+2) ) &
507	/ tri(4,i,k)
508	#else
509	ar(k+1,j,i) = ( ar1(i,k) - tri(3,i,k) * ar(k+2,j,i) ) &
510	/ tri(4,i,k)
511	#endif
512	ENDDO
513	ENDDO
514
515	!
516	!-- Indices i=0, j=0 correspond to horizontally averaged pressure.
517	!-- The respective values of ar should be zero at all k-levels if
518	!-- acceleration of horizontally averaged vertical velocity is zero.
519	IF ( ibc_p_b == 1 .AND. ibc_p_t == 1 ) THEN
520	IF ( j == 0 .AND. nxl_z == 0 ) THEN
521	#if defined( __parallel )
522	DO k = 1, nz
523	ar(nxl_z,j,k) = 0.0
524	ENDDO
525	#else
526	DO k = 1, nz
527	ar(k,j,nxl_z) = 0.0
528	ENDDO
529	#endif
530	ENDIF
531	ENDIF
532
533	END SUBROUTINE substi
534
535
536	SUBROUTINE split( tri )
537
538	!------------------------------------------------------------------------------!
539	! Splitting of the tridiagonal matrix (Thomas algorithm)
540	!------------------------------------------------------------------------------!
541
542	IMPLICIT NONE
543
544	INTEGER :: i, k
545	REAL :: tri(5,nxl_z:nxr_z,0:nz-1)
546
547	!
548	!-- Splitting.
549	DO i = nxl_z, nxr_z
550	tri(4,i,0) = tri(1,i,0)
551	ENDDO
552	DO k = 1, nz-1
553	DO i = nxl_z, nxr_z
554	tri(5,i,k) = tri(2,i,k) / tri(4,i,k-1)
555	tri(4,i,k) = tri(1,i,k) - tri(3,i,k-1) * tri(5,i,k)
556	ENDDO
557	ENDDO
558
559	END SUBROUTINE split
560
561	END SUBROUTINE tridia
562
563
564	#if defined( __parallel )
565	SUBROUTINE fftxp( ar, direction )
566
567	!------------------------------------------------------------------------------!
568	! Fourier-transformation along x-direction Parallelized version
569	!------------------------------------------------------------------------------!
570
571	IMPLICIT NONE
572
573	CHARACTER (LEN=*) :: direction
574	INTEGER :: j, k
575	REAL :: ar(0:nx,nys_x:nyn_x,nzb_x:nzt_x)
576
577	!
578	!-- Performing the fft with one of the methods implemented
579	!$OMP PARALLEL PRIVATE ( j, k )
580	!$OMP DO
581	DO k = nzb_x, nzt_x
582	DO j = nys_x, nyn_x
583	CALL fft_x( ar(0:nx,j,k), direction )
584	ENDDO
585	ENDDO
586	!$OMP END PARALLEL
587
588	END SUBROUTINE fftxp
589
590	#else
591	SUBROUTINE fftx( ar, direction )
592
593	!------------------------------------------------------------------------------!
594	! Fourier-transformation along x-direction Non parallel version
595	!------------------------------------------------------------------------------!
596
597	IMPLICIT NONE
598
599	CHARACTER (LEN=*) :: direction
600	INTEGER :: i, j, k
601	REAL :: ar(1:nz,0:ny,0:nx)
602
603	!
604	!-- Performing the fft with one of the methods implemented
605	!$OMP PARALLEL PRIVATE ( j, k )
606	!$OMP DO
607	DO k = 1, nz
608	DO j = 0, ny
609	CALL fft_x( ar(k,j,0:nx), direction )
610	ENDDO
611	ENDDO
612	!$OMP END PARALLEL
613
614	END SUBROUTINE fftx
615	#endif
616
617
618	#if defined( __parallel )
619	SUBROUTINE fftyp( ar, direction )
620
621	!------------------------------------------------------------------------------!
622	! Fourier-transformation along y-direction Parallelized version
623	!------------------------------------------------------------------------------!
624
625	IMPLICIT NONE
626
627	CHARACTER (LEN=*) :: direction
628	INTEGER :: i, k
629	REAL :: ar(0:ny,nxl_y:nxr_y,nzb_y:nzt_y)
630
631	!
632	!-- Performing the fft with one of the methods implemented
633	!$OMP PARALLEL PRIVATE ( i, k )
634	!$OMP DO
635	DO k = nzb_y, nzt_y
636	DO i = nxl_y, nxr_y
637	CALL fft_y( ar(0:ny,i,k), direction )
638	ENDDO
639	ENDDO
640	!$OMP END PARALLEL
641
642	END SUBROUTINE fftyp
643
644	#else
645	SUBROUTINE ffty( ar, direction )
646
647	!------------------------------------------------------------------------------!
648	! Fourier-transformation along y-direction Non parallel version
649	!------------------------------------------------------------------------------!
650
651	IMPLICIT NONE
652
653	CHARACTER (LEN=*) :: direction
654	INTEGER :: i, k
655	REAL :: ar(1:nz,0:ny,0:nx)
656
657	!
658	!-- Performing the fft with one of the methods implemented
659	!$OMP PARALLEL PRIVATE ( i, k )
660	!$OMP DO
661	DO k = 1, nz
662	DO i = 0, nx
663	CALL fft_y( ar(k,0:ny,i), direction )
664	ENDDO
665	ENDDO
666	!$OMP END PARALLEL
667
668	END SUBROUTINE ffty
669	#endif
670
671	#if defined( __parallel )
672	SUBROUTINE ffty_tr_yx( f_in, work, f_out )
673
674	!------------------------------------------------------------------------------!
675	! Fourier-transformation along y with subsequent transposition y --> x for
676	! a 1d-decomposition along x
677	!
678	! ATTENTION: The performance of this routine is much faster on the NEC-SX6,
679	! if the first index of work_ffty_vec is odd. Otherwise
680	! memory bank conflicts may occur (especially if the index is a
681	! multiple of 128). That's why work_ffty_vec is dimensioned as
682	! 0:ny+1.
683	! Of course, this will not work if users are using an odd number
684	! of gridpoints along y.
685	!------------------------------------------------------------------------------!
686
687	USE control_parameters
688	USE cpulog
689	USE indices
690	USE interfaces
691	USE pegrid
692	USE transpose_indices
693
694	IMPLICIT NONE
695
696	INTEGER :: i, iend, iouter, ir, j, k
697	INTEGER, PARAMETER :: stridex = 4
698
699	REAL, DIMENSION(0:ny,stridex) :: work_ffty
700	#if defined( __nec )
701	REAL, DIMENSION(0:ny+1,1:nz,nxl:nxr) :: work_ffty_vec
702	#endif
703	REAL, DIMENSION(1:nz,0:ny,nxl:nxr) :: f_in
704	REAL, DIMENSION(nnx,1:nz,nys_x:nyn_x,pdims(1)) :: f_out
705	REAL, DIMENSION(nxl:nxr,1:nz,0:ny) :: work
706
707	!
708	!-- Carry out the FFT along y, where all data are present due to the
709	!-- 1d-decomposition along x. Resort the data in a way that x becomes
710	!-- the first index.
711	CALL cpu_log( log_point_s(7), 'fft_y', 'start' )
712
713	IF ( host(1:3) == 'nec' ) THEN
714	#if defined( __nec )
715	!
716	!-- Code optimized for vector processors
717	!$OMP PARALLEL PRIVATE ( i, j, k )
718	!$OMP DO
719	DO i = nxl, nxr
720
721	DO j = 0, ny
722	DO k = 1, nz
723	work_ffty_vec(j,k,i) = f_in(k,j,i)
724	ENDDO
725	ENDDO
726
727	CALL fft_y_m( work_ffty_vec(:,:,i), ny+1, 'forward' )
728
729	ENDDO
730
731	!$OMP DO
732	DO k = 1, nz
733	DO j = 0, ny
734	DO i = nxl, nxr
735	work(i,k,j) = work_ffty_vec(j,k,i)
736	ENDDO
737	ENDDO
738	ENDDO
739	!$OMP END PARALLEL
740	#endif
741
742	ELSE
743
744	!
745	!-- Cache optimized code.
746	!-- The i-(x-)direction is split into a strided outer loop and an inner
747	!-- loop for better cache performance
748	!$OMP PARALLEL PRIVATE (i,iend,iouter,ir,j,k,work_ffty)
749	!$OMP DO
750	DO iouter = nxl, nxr, stridex
751
752	iend = MIN( iouter+stridex-1, nxr ) ! Upper bound for inner i loop
753
754	DO k = 1, nz
755
756	DO i = iouter, iend
757
758	ir = i-iouter+1 ! counter within a stride
759	DO j = 0, ny
760	work_ffty(j,ir) = f_in(k,j,i)
761	ENDDO
762	!
763	!-- FFT along y
764	CALL fft_y( work_ffty(:,ir), 'forward' )
765
766	ENDDO
767
768	!
769	!-- Resort
770	DO j = 0, ny
771	DO i = iouter, iend
772	work(i,k,j) = work_ffty(j,i-iouter+1)
773	ENDDO
774	ENDDO
775
776	ENDDO
777
778	ENDDO
779	!$OMP END PARALLEL
780
781	ENDIF
782	CALL cpu_log( log_point_s(7), 'fft_y', 'pause' )
783
784	!
785	!-- Transpose array
786	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
787	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
788	CALL MPI_ALLTOALL( work(nxl,1,0), sendrecvcount_xy, MPI_REAL, &
789	f_out(1,1,nys_x,1), sendrecvcount_xy, MPI_REAL, &
790	comm1dx, ierr )
791	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
792
793	END SUBROUTINE ffty_tr_yx
794
795
796	SUBROUTINE tr_xy_ffty( f_in, work, f_out )
797
798	!------------------------------------------------------------------------------!
799	! Transposition x --> y with a subsequent backward Fourier transformation for
800	! a 1d-decomposition along x
801	!------------------------------------------------------------------------------!
802
803	USE control_parameters
804	USE cpulog
805	USE indices
806	USE interfaces
807	USE pegrid
808	USE transpose_indices
809
810	IMPLICIT NONE
811
812	INTEGER :: i, iend, iouter, ir, j, k
813	INTEGER, PARAMETER :: stridex = 4
814
815	REAL, DIMENSION(0:ny,stridex) :: work_ffty
816	#if defined( __nec )
817	REAL, DIMENSION(0:ny+1,1:nz,nxl:nxr) :: work_ffty_vec
818	#endif
819	REAL, DIMENSION(nnx,1:nz,nys_x:nyn_x,pdims(1)) :: f_in
820	REAL, DIMENSION(1:nz,0:ny,nxl:nxr) :: f_out
821	REAL, DIMENSION(nxl:nxr,1:nz,0:ny) :: work
822
823	!
824	!-- Transpose array
825	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
826	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
827	CALL MPI_ALLTOALL( f_in(1,1,nys_x,1), sendrecvcount_xy, MPI_REAL, &
828	work(nxl,1,0), sendrecvcount_xy, MPI_REAL, &
829	comm1dx, ierr )
830	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
831
832	!
833	!-- Resort the data in a way that y becomes the first index and carry out the
834	!-- backward fft along y.
835	CALL cpu_log( log_point_s(7), 'fft_y', 'continue' )
836
837	IF ( host(1:3) == 'nec' ) THEN
838	#if defined( __nec )
839	!
840	!-- Code optimized for vector processors
841	!$OMP PARALLEL PRIVATE ( i, j, k )
842	!$OMP DO
843	DO k = 1, nz
844	DO j = 0, ny
845	DO i = nxl, nxr
846	work_ffty_vec(j,k,i) = work(i,k,j)
847	ENDDO
848	ENDDO
849	ENDDO
850
851	!$OMP DO
852	DO i = nxl, nxr
853
854	CALL fft_y_m( work_ffty_vec(:,:,i), ny+1, 'backward' )
855
856	DO j = 0, ny
857	DO k = 1, nz
858	f_out(k,j,i) = work_ffty_vec(j,k,i)
859	ENDDO
860	ENDDO
861
862	ENDDO
863	!$OMP END PARALLEL
864	#endif
865
866	ELSE
867
868	!
869	!-- Cache optimized code.
870	!-- The i-(x-)direction is split into a strided outer loop and an inner
871	!-- loop for better cache performance
872	!$OMP PARALLEL PRIVATE ( i, iend, iouter, ir, j, k, work_ffty )
873	!$OMP DO
874	DO iouter = nxl, nxr, stridex
875
876	iend = MIN( iouter+stridex-1, nxr ) ! Upper bound for inner i loop
877
878	DO k = 1, nz
879	!
880	!-- Resort
881	DO j = 0, ny
882	DO i = iouter, iend
883	work_ffty(j,i-iouter+1) = work(i,k,j)
884	ENDDO
885	ENDDO
886
887	DO i = iouter, iend
888
889	!
890	!-- FFT along y
891	ir = i-iouter+1 ! counter within a stride
892	CALL fft_y( work_ffty(:,ir), 'backward' )
893
894	DO j = 0, ny
895	f_out(k,j,i) = work_ffty(j,ir)
896	ENDDO
897	ENDDO
898
899	ENDDO
900
901	ENDDO
902	!$OMP END PARALLEL
903
904	ENDIF
905
906	CALL cpu_log( log_point_s(7), 'fft_y', 'stop' )
907
908	END SUBROUTINE tr_xy_ffty
909
910
911	SUBROUTINE fftx_tri_fftx( ar )
912
913	!------------------------------------------------------------------------------!
914	! FFT along x, solution of the tridiagonal system and backward FFT for
915	! a 1d-decomposition along x
916	!
917	! WARNING: this subroutine may still not work for hybrid parallelization
918	! with OpenMP (for possible necessary changes see the original
919	! routine poisfft_hybrid, developed by Klaus Ketelsen, May 2002)
920	!------------------------------------------------------------------------------!
921
922	USE control_parameters
923	USE cpulog
924	USE grid_variables
925	USE indices
926	USE interfaces
927	USE pegrid
928	USE transpose_indices
929
930	IMPLICIT NONE
931
932	character(len=3) :: myth_char
933
934	INTEGER :: i, j, k, m, n, omp_get_thread_num, tn
935
936	REAL, DIMENSION(0:nx) :: work_fftx
937	REAL, DIMENSION(0:nx,1:nz) :: work_trix
938	REAL, DIMENSION(nnx,1:nz,nys_x:nyn_x,pdims(1)) :: ar
939	REAL, DIMENSION(:,:,:,:), ALLOCATABLE :: tri
940
941
942	CALL cpu_log( log_point_s(33), 'fft_x + tridia', 'start' )
943
944	ALLOCATE( tri(5,0:nx,0:nz-1,0:threads_per_task-1) )
945
946	tn = 0 ! Default thread number in case of one thread
947	!$OMP PARALLEL DO PRIVATE ( i, j, k, m, n, tn, work_fftx, work_trix )
948	DO j = nys_x, nyn_x
949
950	!$ tn = omp_get_thread_num()
951
952	IF ( host(1:3) == 'nec' ) THEN
953	!
954	!-- Code optimized for vector processors
955	DO k = 1, nz
956
957	m = 0
958	DO n = 1, pdims(1)
959	DO i = 1, nnx
960	work_trix(m,k) = ar(i,k,j,n)
961	m = m + 1
962	ENDDO
963	ENDDO
964
965	ENDDO
966
967	CALL fft_x_m( work_trix, 'forward' )
968
969	ELSE
970	!
971	!-- Cache optimized code
972	DO k = 1, nz
973
974	m = 0
975	DO n = 1, pdims(1)
976	DO i = 1, nnx
977	work_fftx(m) = ar(i,k,j,n)
978	m = m + 1
979	ENDDO
980	ENDDO
981
982	CALL fft_x( work_fftx, 'forward' )
983
984	DO i = 0, nx
985	work_trix(i,k) = work_fftx(i)
986	ENDDO
987
988	ENDDO
989
990	ENDIF
991
992	!
993	!-- Solve the linear equation system
994	CALL tridia_1dd( ddx2, ddy2, nx, ny, j, work_trix, tri(:,:,:,tn) )
995
996	IF ( host(1:3) == 'nec' ) THEN
997	!
998	!-- Code optimized for vector processors
999	CALL fft_x_m( work_trix, 'backward' )
1000
1001	DO k = 1, nz
1002
1003	m = 0
1004	DO n = 1, pdims(1)
1005	DO i = 1, nnx
1006	ar(i,k,j,n) = work_trix(m,k)
1007	m = m + 1
1008	ENDDO
1009	ENDDO
1010
1011	ENDDO
1012
1013	ELSE
1014	!
1015	!-- Cache optimized code
1016	DO k = 1, nz
1017
1018	DO i = 0, nx
1019	work_fftx(i) = work_trix(i,k)
1020	ENDDO
1021
1022	CALL fft_x( work_fftx, 'backward' )
1023
1024	m = 0
1025	DO n = 1, pdims(1)
1026	DO i = 1, nnx
1027	ar(i,k,j,n) = work_fftx(m)
1028	m = m + 1
1029	ENDDO
1030	ENDDO
1031
1032	ENDDO
1033
1034	ENDIF
1035
1036	ENDDO
1037
1038	DEALLOCATE( tri )
1039
1040	CALL cpu_log( log_point_s(33), 'fft_x + tridia', 'stop' )
1041
1042	END SUBROUTINE fftx_tri_fftx
1043
1044
1045	SUBROUTINE fftx_tr_xy( f_in, work, f_out )
1046
1047	!------------------------------------------------------------------------------!
1048	! Fourier-transformation along x with subsequent transposition x --> y for
1049	! a 1d-decomposition along y
1050	!
1051	! ATTENTION: The NEC-branch of this routine may significantly profit from
1052	! further optimizations. So far, performance is much worse than
1053	! for routine ffty_tr_yx (more than three times slower).
1054	!------------------------------------------------------------------------------!
1055
1056	USE control_parameters
1057	USE cpulog
1058	USE indices
1059	USE interfaces
1060	USE pegrid
1061	USE transpose_indices
1062
1063	IMPLICIT NONE
1064
1065	INTEGER :: i, j, k
1066
1067	REAL, DIMENSION(0:nx,1:nz,nys:nyn) :: work_fftx
1068	REAL, DIMENSION(1:nz,nys:nyn,0:nx) :: f_in
1069	REAL, DIMENSION(nny,1:nz,nxl_y:nxr_y,pdims(2)) :: f_out
1070	REAL, DIMENSION(nys:nyn,1:nz,0:nx) :: work
1071
1072	!
1073	!-- Carry out the FFT along x, where all data are present due to the
1074	!-- 1d-decomposition along y. Resort the data in a way that y becomes
1075	!-- the first index.
1076	CALL cpu_log( log_point_s(4), 'fft_x', 'start' )
1077
1078	IF ( host(1:3) == 'nec' ) THEN
1079	!
1080	!-- Code for vector processors
1081	!$OMP PARALLEL PRIVATE ( i, j, k )
1082	!$OMP DO
1083	DO i = 0, nx
1084
1085	DO j = nys, nyn
1086	DO k = 1, nz
1087	work_fftx(i,k,j) = f_in(k,j,i)
1088	ENDDO
1089	ENDDO
1090
1091	ENDDO
1092
1093	!$OMP DO
1094	DO j = nys, nyn
1095
1096	CALL fft_x_m( work_fftx(:,:,j), 'forward' )
1097
1098	DO k = 1, nz
1099	DO i = 0, nx
1100	work(j,k,i) = work_fftx(i,k,j)
1101	ENDDO
1102	ENDDO
1103
1104	ENDDO
1105	!$OMP END PARALLEL
1106
1107	ELSE
1108
1109	!
1110	!-- Cache optimized code (there might be still a potential for better
1111	!-- optimization).
1112	!$OMP PARALLEL PRIVATE (i,j,k)
1113	!$OMP DO
1114	DO i = 0, nx
1115
1116	DO j = nys, nyn
1117	DO k = 1, nz
1118	work_fftx(i,k,j) = f_in(k,j,i)
1119	ENDDO
1120	ENDDO
1121
1122	ENDDO
1123
1124	!$OMP DO
1125	DO j = nys, nyn
1126	DO k = 1, nz
1127
1128	CALL fft_x( work_fftx(0:nx,k,j), 'forward' )
1129
1130	DO i = 0, nx
1131	work(j,k,i) = work_fftx(i,k,j)
1132	ENDDO
1133	ENDDO
1134
1135	ENDDO
1136	!$OMP END PARALLEL
1137
1138	ENDIF
1139	CALL cpu_log( log_point_s(4), 'fft_x', 'pause' )
1140
1141	!
1142	!-- Transpose array
1143	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
1144	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
1145	CALL MPI_ALLTOALL( work(nys,1,0), sendrecvcount_xy, MPI_REAL, &
1146	f_out(1,1,nxl_y,1), sendrecvcount_xy, MPI_REAL, &
1147	comm1dy, ierr )
1148	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
1149
1150	END SUBROUTINE fftx_tr_xy
1151
1152
1153	SUBROUTINE tr_yx_fftx( f_in, work, f_out )
1154
1155	!------------------------------------------------------------------------------!
1156	! Transposition y --> x with a subsequent backward Fourier transformation for
1157	! a 1d-decomposition along x
1158	!------------------------------------------------------------------------------!
1159
1160	USE control_parameters
1161	USE cpulog
1162	USE indices
1163	USE interfaces
1164	USE pegrid
1165	USE transpose_indices
1166
1167	IMPLICIT NONE
1168
1169	INTEGER :: i, j, k
1170
1171	REAL, DIMENSION(0:nx,1:nz,nys:nyn) :: work_fftx
1172	REAL, DIMENSION(nny,1:nz,nxl_y:nxr_y,pdims(2)) :: f_in
1173	REAL, DIMENSION(1:nz,nys:nyn,0:nx) :: f_out
1174	REAL, DIMENSION(nys:nyn,1:nz,0:nx) :: work
1175
1176	!
1177	!-- Transpose array
1178	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
1179	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
1180	CALL MPI_ALLTOALL( f_in(1,1,nxl_y,1), sendrecvcount_xy, MPI_REAL, &
1181	work(nys,1,0), sendrecvcount_xy, MPI_REAL, &
1182	comm1dy, ierr )
1183	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
1184
1185	!
1186	!-- Carry out the FFT along x, where all data are present due to the
1187	!-- 1d-decomposition along y. Resort the data in a way that y becomes
1188	!-- the first index.
1189	CALL cpu_log( log_point_s(4), 'fft_x', 'continue' )
1190
1191	IF ( host(1:3) == 'nec' ) THEN
1192	!
1193	!-- Code optimized for vector processors
1194	!$OMP PARALLEL PRIVATE ( i, j, k )
1195	!$OMP DO
1196	DO j = nys, nyn
1197
1198	DO k = 1, nz
1199	DO i = 0, nx
1200	work_fftx(i,k,j) = work(j,k,i)
1201	ENDDO
1202	ENDDO
1203
1204	CALL fft_x_m( work_fftx(:,:,j), 'backward' )
1205
1206	ENDDO
1207
1208	!$OMP DO
1209	DO i = 0, nx
1210	DO j = nys, nyn
1211	DO k = 1, nz
1212	f_out(k,j,i) = work_fftx(i,k,j)
1213	ENDDO
1214	ENDDO
1215	ENDDO
1216	!$OMP END PARALLEL
1217
1218	ELSE
1219
1220	!
1221	!-- Cache optimized code (there might be still a potential for better
1222	!-- optimization).
1223	!$OMP PARALLEL PRIVATE (i,j,k)
1224	!$OMP DO
1225	DO j = nys, nyn
1226	DO k = 1, nz
1227
1228	DO i = 0, nx
1229	work_fftx(i,k,j) = work(j,k,i)
1230	ENDDO
1231
1232	CALL fft_x( work_fftx(0:nx,k,j), 'backward' )
1233
1234	ENDDO
1235	ENDDO
1236
1237	!$OMP DO
1238	DO i = 0, nx
1239	DO j = nys, nyn
1240	DO k = 1, nz
1241	f_out(k,j,i) = work_fftx(i,k,j)
1242	ENDDO
1243	ENDDO
1244	ENDDO
1245	!$OMP END PARALLEL
1246
1247	ENDIF
1248	CALL cpu_log( log_point_s(4), 'fft_x', 'stop' )
1249
1250	END SUBROUTINE tr_yx_fftx
1251
1252
1253	SUBROUTINE ffty_tri_ffty( ar )
1254
1255	!------------------------------------------------------------------------------!
1256	! FFT along y, solution of the tridiagonal system and backward FFT for
1257	! a 1d-decomposition along y
1258	!
1259	! WARNING: this subroutine may still not work for hybrid parallelization
1260	! with OpenMP (for possible necessary changes see the original
1261	! routine poisfft_hybrid, developed by Klaus Ketelsen, May 2002)
1262	!------------------------------------------------------------------------------!
1263
1264	USE control_parameters
1265	USE cpulog
1266	USE grid_variables
1267	USE indices
1268	USE interfaces
1269	USE pegrid
1270	USE transpose_indices
1271
1272	IMPLICIT NONE
1273
1274	INTEGER :: i, j, k, m, n, omp_get_thread_num, tn
1275
1276	REAL, DIMENSION(0:ny) :: work_ffty
1277	REAL, DIMENSION(0:ny,1:nz) :: work_triy
1278	REAL, DIMENSION(nny,1:nz,nxl_y:nxr_y,pdims(2)) :: ar
1279	REAL, DIMENSION(:,:,:,:), ALLOCATABLE :: tri
1280
1281
1282	CALL cpu_log( log_point_s(39), 'fft_y + tridia', 'start' )
1283
1284	ALLOCATE( tri(5,0:ny,0:nz-1,0:threads_per_task-1) )
1285
1286	tn = 0 ! Default thread number in case of one thread
1287	!$OMP PARALLEL DO PRIVATE ( i, j, k, m, n, tn, work_ffty, work_triy )
1288	DO i = nxl_y, nxr_y
1289
1290	!$ tn = omp_get_thread_num()
1291
1292	IF ( host(1:3) == 'nec' ) THEN
1293	!
1294	!-- Code optimized for vector processors
1295	DO k = 1, nz
1296
1297	m = 0
1298	DO n = 1, pdims(2)
1299	DO j = 1, nny
1300	work_triy(m,k) = ar(j,k,i,n)
1301	m = m + 1
1302	ENDDO
1303	ENDDO
1304
1305	ENDDO
1306
1307	CALL fft_y_m( work_triy, ny, 'forward' )
1308
1309	ELSE
1310	!
1311	!-- Cache optimized code
1312	DO k = 1, nz
1313
1314	m = 0
1315	DO n = 1, pdims(2)
1316	DO j = 1, nny
1317	work_ffty(m) = ar(j,k,i,n)
1318	m = m + 1
1319	ENDDO
1320	ENDDO
1321
1322	CALL fft_y( work_ffty, 'forward' )
1323
1324	DO j = 0, ny
1325	work_triy(j,k) = work_ffty(j)
1326	ENDDO
1327
1328	ENDDO
1329
1330	ENDIF
1331
1332	!
1333	!-- Solve the linear equation system
1334	CALL tridia_1dd( ddy2, ddx2, ny, nx, i, work_triy, tri(:,:,:,tn) )
1335
1336	IF ( host(1:3) == 'nec' ) THEN
1337	!
1338	!-- Code optimized for vector processors
1339	CALL fft_y_m( work_triy, ny, 'backward' )
1340
1341	DO k = 1, nz
1342
1343	m = 0
1344	DO n = 1, pdims(2)
1345	DO j = 1, nny
1346	ar(j,k,i,n) = work_triy(m,k)
1347	m = m + 1
1348	ENDDO
1349	ENDDO
1350
1351	ENDDO
1352
1353	ELSE
1354	!
1355	!-- Cache optimized code
1356	DO k = 1, nz
1357
1358	DO j = 0, ny
1359	work_ffty(j) = work_triy(j,k)
1360	ENDDO
1361
1362	CALL fft_y( work_ffty, 'backward' )
1363
1364	m = 0
1365	DO n = 1, pdims(2)
1366	DO j = 1, nny
1367	ar(j,k,i,n) = work_ffty(m)
1368	m = m + 1
1369	ENDDO
1370	ENDDO
1371
1372	ENDDO
1373
1374	ENDIF
1375
1376	ENDDO
1377
1378	DEALLOCATE( tri )
1379
1380	CALL cpu_log( log_point_s(39), 'fft_y + tridia', 'stop' )
1381
1382	END SUBROUTINE ffty_tri_ffty
1383
1384
1385	SUBROUTINE tridia_1dd( ddx2, ddy2, nx, ny, j, ar, tri )
1386
1387	!------------------------------------------------------------------------------!
1388	! Solves the linear system of equations for a 1d-decomposition along x (see
1389	! tridia)
1390	!
1391	! Attention: when using the intel compilers older than 12.0, array tri must
1392	! be passed as an argument to the contained subroutines. Otherwise
1393	! addres faults will occur. This feature can be activated with
1394	! cpp-switch __intel11
1395	! On NEC, tri should not be passed (except for routine substi_1dd)
1396	! because this causes very bad performance.
1397	!------------------------------------------------------------------------------!
1398
1399	USE arrays_3d
1400	USE control_parameters
1401
1402	USE pegrid
1403
1404	IMPLICIT NONE
1405
1406	INTEGER :: i, j, k, nnyh, nx, ny, omp_get_thread_num, tn
1407
1408	REAL :: ddx2, ddy2
1409
1410	REAL, DIMENSION(0:nx,1:nz) :: ar
1411	REAL, DIMENSION(0:nx,0:nz-1) :: ar1
1412	REAL, DIMENSION(5,0:nx,0:nz-1) :: tri
1413
1414
1415	nnyh = ( ny + 1 ) / 2
1416
1417	!
1418	!-- Define constant elements of the tridiagonal matrix.
1419	!-- The compiler on SX6 does loop exchange. If 0:nx is a high power of 2,
1420	!-- the exchanged loops create bank conflicts. The following directive
1421	!-- prohibits loop exchange and the loops perform much better.
1422	! tn = omp_get_thread_num()
1423	! WRITE( 120+tn, * ) '+++ id=',myid,' nx=',nx,' thread=', omp_get_thread_num()
1424	! CALL local_flush( 120+tn )
1425	!CDIR NOLOOPCHG
1426	DO k = 0, nz-1
1427	DO i = 0,nx
1428	tri(2,i,k) = ddzu_pres(k+1) * ddzw(k+1)
1429	tri(3,i,k) = ddzu_pres(k+2) * ddzw(k+1)
1430	ENDDO
1431	ENDDO
1432	! WRITE( 120+tn, * ) '+++ id=',myid,' end of first tridia loop thread=', omp_get_thread_num()
1433	! CALL local_flush( 120+tn )
1434
1435	IF ( j <= nnyh ) THEN
1436	#if defined( __intel11 )
1437	CALL maketri_1dd( j, tri )
1438	#else
1439	CALL maketri_1dd( j )
1440	#endif
1441	ELSE
1442	#if defined( __intel11 )
1443	CALL maketri_1dd( ny+1-j, tri )
1444	#else
1445	CALL maketri_1dd( ny+1-j )
1446	#endif
1447	ENDIF
1448	#if defined( __intel11 )
1449	CALL split_1dd( tri )
1450	#else
1451	CALL split_1dd
1452	#endif
1453	CALL substi_1dd( ar, tri )
1454
1455	CONTAINS
1456
1457	#if defined( __intel11 )
1458	SUBROUTINE maketri_1dd( j, tri )
1459	#else
1460	SUBROUTINE maketri_1dd( j )
1461	#endif
1462
1463	!------------------------------------------------------------------------------!
1464	! computes the i- and j-dependent component of the matrix
1465	!------------------------------------------------------------------------------!
1466
1467	USE constants
1468
1469	IMPLICIT NONE
1470
1471	INTEGER :: i, j, k, nnxh
1472	REAL :: a, c
1473
1474	REAL, DIMENSION(0:nx) :: l
1475
1476	#if defined( __intel11 )
1477	REAL, DIMENSION(5,0:nx,0:nz-1) :: tri
1478	#endif
1479
1480
1481	nnxh = ( nx + 1 ) / 2
1482	!
1483	!-- Provide the tridiagonal matrix for solution of the Poisson equation in
1484	!-- Fourier space. The coefficients are computed following the method of
1485	!-- Schmidt et al. (DFVLR-Mitteilung 84-15), which departs from Stephan
1486	!-- Siano's original version by discretizing the Poisson equation,
1487	!-- before it is Fourier-transformed
1488	DO i = 0, nx
1489	IF ( i >= 0 .AND. i <= nnxh ) THEN
1490	l(i) = 2.0 * ( 1.0 - COS( ( 2.0 * pi * i ) / &
1491	FLOAT( nx+1 ) ) ) * ddx2 + &
1492	2.0 * ( 1.0 - COS( ( 2.0 * pi * j ) / &
1493	FLOAT( ny+1 ) ) ) * ddy2
1494	ELSE
1495	l(i) = 2.0 * ( 1.0 - COS( ( 2.0 * pi * ( nx+1-i ) ) / &
1496	FLOAT( nx+1 ) ) ) * ddx2 + &
1497	2.0 * ( 1.0 - COS( ( 2.0 * pi * j ) / &
1498	FLOAT( ny+1 ) ) ) * ddy2
1499	ENDIF
1500	ENDDO
1501
1502	DO k = 0, nz-1
1503	DO i = 0, nx
1504	a = -1.0 * ddzu_pres(k+2) * ddzw(k+1)
1505	c = -1.0 * ddzu_pres(k+1) * ddzw(k+1)
1506	tri(1,i,k) = a + c - l(i)
1507	ENDDO
1508	ENDDO
1509	IF ( ibc_p_b == 1 .OR. ibc_p_b == 2 ) THEN
1510	DO i = 0, nx
1511	tri(1,i,0) = tri(1,i,0) + tri(2,i,0)
1512	ENDDO
1513	ENDIF
1514	IF ( ibc_p_t == 1 ) THEN
1515	DO i = 0, nx
1516	tri(1,i,nz-1) = tri(1,i,nz-1) + tri(3,i,nz-1)
1517	ENDDO
1518	ENDIF
1519
1520	END SUBROUTINE maketri_1dd
1521
1522
1523	#if defined( __intel11 )
1524	SUBROUTINE split_1dd( tri )
1525	#else
1526	SUBROUTINE split_1dd
1527	#endif
1528
1529	!------------------------------------------------------------------------------!
1530	! Splitting of the tridiagonal matrix (Thomas algorithm)
1531	!------------------------------------------------------------------------------!
1532
1533	IMPLICIT NONE
1534
1535	INTEGER :: i, k
1536
1537	#if defined( __intel11 )
1538	REAL, DIMENSION(5,0:nx,0:nz-1) :: tri
1539	#endif
1540
1541
1542	!
1543	!-- Splitting
1544	DO i = 0, nx
1545	tri(4,i,0) = tri(1,i,0)
1546	ENDDO
1547	DO k = 1, nz-1
1548	DO i = 0, nx
1549	tri(5,i,k) = tri(2,i,k) / tri(4,i,k-1)
1550	tri(4,i,k) = tri(1,i,k) - tri(3,i,k-1) * tri(5,i,k)
1551	ENDDO
1552	ENDDO
1553
1554	END SUBROUTINE split_1dd
1555
1556
1557	SUBROUTINE substi_1dd( ar, tri )
1558
1559	!------------------------------------------------------------------------------!
1560	! Substitution (Forward and Backward) (Thomas algorithm)
1561	!------------------------------------------------------------------------------!
1562
1563	IMPLICIT NONE
1564
1565	INTEGER :: i, k
1566
1567	REAL, DIMENSION(0:nx,nz) :: ar
1568	REAL, DIMENSION(0:nx,0:nz-1) :: ar1
1569	REAL, DIMENSION(5,0:nx,0:nz-1) :: tri
1570
1571	!
1572	!-- Forward substitution
1573	DO i = 0, nx
1574	ar1(i,0) = ar(i,1)
1575	ENDDO
1576	DO k = 1, nz-1
1577	DO i = 0, nx
1578	ar1(i,k) = ar(i,k+1) - tri(5,i,k) * ar1(i,k-1)
1579	ENDDO
1580	ENDDO
1581
1582	!
1583	!-- Backward substitution
1584	!-- Note, the add of 1.0E-20 in the denominator is due to avoid divisions
1585	!-- by zero appearing if the pressure bc is set to neumann at the top of
1586	!-- the model domain.
1587	DO i = 0, nx
1588	ar(i,nz) = ar1(i,nz-1) / ( tri(4,i,nz-1) + 1.0E-20 )
1589	ENDDO
1590	DO k = nz-2, 0, -1
1591	DO i = 0, nx
1592	ar(i,k+1) = ( ar1(i,k) - tri(3,i,k) * ar(i,k+2) ) &
1593	/ tri(4,i,k)
1594	ENDDO
1595	ENDDO
1596
1597	!
1598	!-- Indices i=0, j=0 correspond to horizontally averaged pressure.
1599	!-- The respective values of ar should be zero at all k-levels if
1600	!-- acceleration of horizontally averaged vertical velocity is zero.
1601	IF ( ibc_p_b == 1 .AND. ibc_p_t == 1 ) THEN
1602	IF ( j == 0 ) THEN
1603	DO k = 1, nz
1604	ar(0,k) = 0.0
1605	ENDDO
1606	ENDIF
1607	ENDIF
1608
1609	END SUBROUTINE substi_1dd
1610
1611	END SUBROUTINE tridia_1dd
1612
1613	#endif
1614	#endif
1615	END MODULE poisfft_mod

Note: See TracBrowser for help on using the repository browser.

Download in other formats:

| Impressum | ©Leibniz Universität Hannover |