Home

Context Navigation

source: palm/trunk/SOURCE/poisfft.f90 @ 696

Last change on this file since 696 was 696, checked in by raasch, 13 years ago
adjustments for openmp usage on ibmkisti (mrun, subjob); OpenMP-bugfixes: work_fftx removed from PRIVATE clauses in fftx_tr_xy and tr_yx_fftx (poisfft); Bugfix: Summation of Wicker-Skamarock scheme fluxes and variances for all threads (flow_statistics)
Property svn:keywords set to `Id`
File size: 45.6 KB

Line
1	MODULE poisfft_mod
2
3	!------------------------------------------------------------------------------!
4	! Current revisions:
5	! -----------------
6	! work_fftx removed from PRIVATE clauses in fftx_tr_xy and tr_yx_fftx
7	!
8	! Former revisions:
9	! -----------------
10	! $Id: poisfft.f90 696 2011-03-18 07:03:49Z raasch $
11	!
12	! 683 2011-02-09 14:25:15Z raasch
13	! openMP parallelization for 2d-domain-decomposition
14	!
15	! 667 2010-12-23 12:06:00Z suehring/gryschka
16	! ddzu replaced by ddzu_pres due to changes in zu(0)
17	!
18	! 622 2010-12-10 08:08:13Z raasch
19	! optional barriers included in order to speed up collective operations
20	!
21	! 377 2009-09-04 11:09:00Z raasch
22	! __lcmuk changed to __lc to avoid problems with Intel compiler on sgi-ice
23	!
24	! 164 2008-05-15 08:46:15Z raasch
25	! Arguments removed from transpose routines
26	!
27	! 128 2007-10-26 13:11:14Z raasch
28	! Bugfix: wavenumber calculation for even nx in routines maketri
29	!
30	! 85 2007-05-11 09:35:14Z raasch
31	! Bugfix: work_fft*_vec removed from some PRIVATE-declarations
32	!
33	! 76 2007-03-29 00:58:32Z raasch
34	! Tridiagonal coefficients adjusted for Neumann boundary conditions both at
35	! the bottom and the top.
36	!
37	! RCS Log replace by Id keyword, revision history cleaned up
38	!
39	! Revision 1.24 2006/08/04 15:00:24 raasch
40	! Default setting of the thread number tn in case of not using OpenMP
41	!
42	! Revision 1.23 2006/02/23 12:48:38 raasch
43	! Additional compiler directive in routine tridia_1dd for preventing loop
44	! exchange on NEC-SX6
45	!
46	! Revision 1.20 2004/04/30 12:38:09 raasch
47	! Parts of former poisfft_hybrid moved to this subroutine,
48	! former subroutine changed to a module, renaming of FFT-subroutines and
49	! -module, FFTs completely substituted by calls of fft_x and fft_y,
50	! NAG fft used in the non-parallel case completely removed, l in maketri
51	! is now a 1d-array, variables passed by modules instead of using parameter
52	! lists, enlarged transposition arrays introduced
53	!
54	! Revision 1.1 1997/07/24 11:24:14 raasch
55	! Initial revision
56	!
57	!
58	! Description:
59	! ------------
60	! See below.
61	!------------------------------------------------------------------------------!
62
63	!--------------------------------------------------------------------------!
64	! poisfft !
65	! !
66	! Original version: Stephan Siano (pois3d) !
67	! !
68	! Institute of Meteorology and Climatology, University of Hannover !
69	! Germany !
70	! !
71	! Version as of July 23,1996 !
72	! !
73	! !
74	! Version for parallel computers: Siegfried Raasch !
75	! !
76	! Version as of July 03,1997 !
77	! !
78	! Solves the Poisson equation with a 2D spectral method !
79	! d^2 p / dx^2 + d^2 p / dy^2 + d^2 p / dz^2 = s !
80	! !
81	! Input: !
82	! real ar contains in the (nnx,nny,nnz) elements, !
83	! starting from the element (1,nys,nxl), the !
84	! values for s !
85	! real work Temporary array !
86	! !
87	! Output: !
88	! real ar contains the solution for p !
89	!--------------------------------------------------------------------------!
90
91	USE fft_xy
92	USE indices
93	USE transpose_indices
94
95	IMPLICIT NONE
96
97	PRIVATE
98	PUBLIC poisfft, poisfft_init
99
100	INTERFACE poisfft
101	MODULE PROCEDURE poisfft
102	END INTERFACE poisfft
103
104	INTERFACE poisfft_init
105	MODULE PROCEDURE poisfft_init
106	END INTERFACE poisfft_init
107
108	CONTAINS
109
110	SUBROUTINE poisfft_init
111
112	CALL fft_init
113
114	END SUBROUTINE poisfft_init
115
116
117	SUBROUTINE poisfft( ar, work )
118
119	USE cpulog
120	USE interfaces
121	USE pegrid
122
123	IMPLICIT NONE
124
125	REAL, DIMENSION(1:nza,nys:nyna,nxl:nxra) :: ar, work
126
127
128	CALL cpu_log( log_point_s(3), 'poisfft', 'start' )
129
130	!
131	!-- Two-dimensional Fourier Transformation in x- and y-direction.
132	#if defined( __parallel )
133	IF ( pdims(2) == 1 ) THEN
134
135	!
136	!-- 1d-domain-decomposition along x:
137	!-- FFT along y and transposition y --> x
138	CALL ffty_tr_yx( ar, work, ar )
139
140	!
141	!-- FFT along x, solving the tridiagonal system and backward FFT
142	CALL fftx_tri_fftx( ar )
143
144	!
145	!-- Transposition x --> y and backward FFT along y
146	CALL tr_xy_ffty( ar, work, ar )
147
148	ELSEIF ( pdims(1) == 1 ) THEN
149
150	!
151	!-- 1d-domain-decomposition along y:
152	!-- FFT along x and transposition x --> y
153	CALL fftx_tr_xy( ar, work, ar )
154
155	!
156	!-- FFT along y, solving the tridiagonal system and backward FFT
157	CALL ffty_tri_ffty( ar )
158
159	!
160	!-- Transposition y --> x and backward FFT along x
161	CALL tr_yx_fftx( ar, work, ar )
162
163	ELSE
164
165	!
166	!-- 2d-domain-decomposition
167	!-- Transposition z --> x
168	CALL cpu_log( log_point_s(5), 'transpo forward', 'start' )
169	CALL transpose_zx( ar, work, ar )
170	CALL cpu_log( log_point_s(5), 'transpo forward', 'pause' )
171
172	CALL cpu_log( log_point_s(4), 'fft_x', 'start' )
173	CALL fftxp( ar, 'forward' )
174	CALL cpu_log( log_point_s(4), 'fft_x', 'pause' )
175
176	!
177	!-- Transposition x --> y
178	CALL cpu_log( log_point_s(5), 'transpo forward', 'continue' )
179	CALL transpose_xy( ar, work, ar )
180	CALL cpu_log( log_point_s(5), 'transpo forward', 'pause' )
181
182	CALL cpu_log( log_point_s(7), 'fft_y', 'start' )
183	CALL fftyp( ar, 'forward' )
184	CALL cpu_log( log_point_s(7), 'fft_y', 'pause' )
185
186	!
187	!-- Transposition y --> z
188	CALL cpu_log( log_point_s(5), 'transpo forward', 'continue' )
189	CALL transpose_yz( ar, work, ar )
190	CALL cpu_log( log_point_s(5), 'transpo forward', 'stop' )
191
192	!
193	!-- Solve the Poisson equation in z-direction in cartesian space.
194	CALL cpu_log( log_point_s(6), 'tridia', 'start' )
195	CALL tridia( ar )
196	CALL cpu_log( log_point_s(6), 'tridia', 'stop' )
197
198	!
199	!-- Inverse Fourier Transformation
200	!-- Transposition z --> y
201	CALL cpu_log( log_point_s(8), 'transpo invers', 'start' )
202	CALL transpose_zy( ar, work, ar )
203	CALL cpu_log( log_point_s(8), 'transpo invers', 'pause' )
204
205	CALL cpu_log( log_point_s(7), 'fft_y', 'continue' )
206	CALL fftyp( ar, 'backward' )
207	CALL cpu_log( log_point_s(7), 'fft_y', 'stop' )
208
209	!
210	!-- Transposition y --> x
211	CALL cpu_log( log_point_s(8), 'transpo invers', 'continue' )
212	CALL transpose_yx( ar, work, ar )
213	CALL cpu_log( log_point_s(8), 'transpo invers', 'pause' )
214
215	CALL cpu_log( log_point_s(4), 'fft_x', 'continue' )
216	CALL fftxp( ar, 'backward' )
217	CALL cpu_log( log_point_s(4), 'fft_x', 'stop' )
218
219	!
220	!-- Transposition x --> z
221	CALL cpu_log( log_point_s(8), 'transpo invers', 'continue' )
222	CALL transpose_xz( ar, work, ar )
223	CALL cpu_log( log_point_s(8), 'transpo invers', 'stop' )
224
225	ENDIF
226
227	#else
228
229	!
230	!-- Two-dimensional Fourier Transformation along x- and y-direction.
231	CALL cpu_log( log_point_s(4), 'fft_x', 'start' )
232	CALL fftx( ar, 'forward' )
233	CALL cpu_log( log_point_s(4), 'fft_x', 'pause' )
234	CALL cpu_log( log_point_s(7), 'fft_y', 'start' )
235	CALL ffty( ar, 'forward' )
236	CALL cpu_log( log_point_s(7), 'fft_y', 'pause' )
237
238	!
239	!-- Solve the Poisson equation in z-direction in cartesian space.
240	CALL cpu_log( log_point_s(6), 'tridia', 'start' )
241	CALL tridia( ar )
242	CALL cpu_log( log_point_s(6), 'tridia', 'stop' )
243
244	!
245	!-- Inverse Fourier Transformation.
246	CALL cpu_log( log_point_s(7), 'fft_y', 'continue' )
247	CALL ffty( ar, 'backward' )
248	CALL cpu_log( log_point_s(7), 'fft_y', 'stop' )
249	CALL cpu_log( log_point_s(4), 'fft_x', 'continue' )
250	CALL fftx( ar, 'backward' )
251	CALL cpu_log( log_point_s(4), 'fft_x', 'stop' )
252
253	#endif
254
255	CALL cpu_log( log_point_s(3), 'poisfft', 'stop' )
256
257	END SUBROUTINE poisfft
258
259
260
261	SUBROUTINE tridia( ar )
262
263	!------------------------------------------------------------------------------!
264	! solves the linear system of equations:
265	!
266	! -(4 pi^2(i^2/(dx^2nnx^2)+j^2/(dy^2nny^2))+
267	! 1/(dzu(k)dzw(k))+1/(dzu(k-1)dzw(k)))*p(i,j,k)+
268	! 1/(dzu(k)dzw(k))p(i,j,k+1)+1/(dzu(k-1)dzw(k))p(i,j,k-1)=d(i,j,k)
269	!
270	! by using the Thomas algorithm
271	!------------------------------------------------------------------------------!
272
273	USE arrays_3d
274
275	IMPLICIT NONE
276
277	INTEGER :: i, j, k, nnyh
278
279	REAL, DIMENSION(nxl_z:nxr_z,0:nz-1) :: ar1
280	REAL, DIMENSION(5,nxl_z:nxr_z,0:nz-1) :: tri
281
282	#if defined( __parallel )
283	REAL :: ar(nxl_z:nxr_za,nys_z:nyn_za,1:nza)
284	#else
285	REAL :: ar(1:nz,nys_z:nyn_z,nxl_z:nxr_z)
286	#endif
287
288
289	nnyh = (ny+1) / 2
290
291	!
292	!-- Define constant elements of the tridiagonal matrix.
293	!$OMP PARALLEL PRIVATE ( k, i )
294	!$OMP DO
295	DO k = 0, nz-1
296	DO i = nxl_z, nxr_z
297	tri(2,i,k) = ddzu_pres(k+1) * ddzw(k+1)
298	tri(3,i,k) = ddzu_pres(k+2) * ddzw(k+1)
299	ENDDO
300	ENDDO
301	!$OMP END PARALLEL
302
303	#if defined( __parallel )
304	!
305	!-- Repeat for all y-levels.
306	!$OMP PARALLEL FIRSTPRIVATE( tri ) PRIVATE ( ar1, j )
307	!$OMP DO
308	DO j = nys_z, nyn_z
309	IF ( j <= nnyh ) THEN
310	CALL maketri( tri, j )
311	ELSE
312	CALL maketri( tri, ny+1-j )
313	ENDIF
314	CALL split( tri )
315	CALL substi( ar, ar1, tri, j )
316	ENDDO
317	!$OMP END PARALLEL
318	#else
319	!
320	!-- First y-level.
321	CALL maketri( tri, nys_z )
322	CALL split( tri )
323	CALL substi( ar, ar1, tri, 0 )
324
325	!
326	!-- Further y-levels.
327	DO j = 1, nnyh - 1
328	CALL maketri( tri, j )
329	CALL split( tri )
330	CALL substi( ar, ar1, tri, j )
331	CALL substi( ar, ar1, tri, ny+1-j )
332	ENDDO
333	CALL maketri( tri, nnyh )
334	CALL split( tri )
335	CALL substi( ar, ar1, tri, nnyh+nys )
336	#endif
337
338	CONTAINS
339
340	SUBROUTINE maketri( tri, j )
341
342	!------------------------------------------------------------------------------!
343	! Computes the i- and j-dependent component of the matrix
344	!------------------------------------------------------------------------------!
345
346	USE arrays_3d
347	USE constants
348	USE control_parameters
349	USE grid_variables
350
351	IMPLICIT NONE
352
353	INTEGER :: i, j, k, nnxh
354	REAL :: a, c
355	REAL :: ll(nxl_z:nxr_z)
356	REAL :: tri(5,nxl_z:nxr_z,0:nz-1)
357
358
359	nnxh = ( nx + 1 ) / 2
360
361	!
362	!-- Provide the tridiagonal matrix for solution of the Poisson equation in
363	!-- Fourier space. The coefficients are computed following the method of
364	!-- Schmidt et al. (DFVLR-Mitteilung 84-15), which departs from Stephan
365	!-- Siano's original version by discretizing the Poisson equation,
366	!-- before it is Fourier-transformed
367	#if defined( __parallel )
368	DO i = nxl_z, nxr_z
369	IF ( i >= 0 .AND. i <= nnxh ) THEN
370	ll(i) = 2.0 * ( 1.0 - COS( ( 2.0 * pi * i ) / &
371	FLOAT( nx+1 ) ) ) / ( dx * dx ) + &
372	2.0 * ( 1.0 - COS( ( 2.0 * pi * j ) / &
373	FLOAT( ny+1 ) ) ) / ( dy * dy )
374	ELSE
375	ll(i) = 2.0 * ( 1.0 - COS( ( 2.0 * pi * ( nx+1-i ) ) / &
376	FLOAT( nx+1 ) ) ) / ( dx * dx ) + &
377	2.0 * ( 1.0 - COS( ( 2.0 * pi * j ) / &
378	FLOAT( ny+1 ) ) ) / ( dy * dy )
379	ENDIF
380	DO k = 0,nz-1
381	a = -1.0 * ddzu_pres(k+2) * ddzw(k+1)
382	c = -1.0 * ddzu_pres(k+1) * ddzw(k+1)
383	tri(1,i,k) = a + c - ll(i)
384	ENDDO
385	ENDDO
386	#else
387	DO i = 0, nnxh
388	ll(i) = 2.0 * ( 1.0 - COS( ( 2.0 * pi * i ) / FLOAT( nx+1 ) ) ) / &
389	( dx * dx ) + &
390	2.0 * ( 1.0 - COS( ( 2.0 * pi * j ) / FLOAT( ny+1 ) ) ) / &
391	( dy * dy )
392	DO k = 0, nz-1
393	a = -1.0 * ddzu_pres(k+2) * ddzw(k+1)
394	c = -1.0 * ddzu_pres(k+1) * ddzw(k+1)
395	tri(1,i,k) = a + c - ll(i)
396	IF ( i >= 1 .and. i < nnxh ) THEN
397	tri(1,nx+1-i,k) = tri(1,i,k)
398	ENDIF
399	ENDDO
400	ENDDO
401	#endif
402	IF ( ibc_p_b == 1 .OR. ibc_p_b == 2 ) THEN
403	DO i = nxl_z, nxr_z
404	tri(1,i,0) = tri(1,i,0) + tri(2,i,0)
405	ENDDO
406	ENDIF
407	IF ( ibc_p_t == 1 ) THEN
408	DO i = nxl_z, nxr_z
409	tri(1,i,nz-1) = tri(1,i,nz-1) + tri(3,i,nz-1)
410	ENDDO
411	ENDIF
412
413	END SUBROUTINE maketri
414
415
416	SUBROUTINE substi( ar, ar1, tri, j )
417
418	!------------------------------------------------------------------------------!
419	! Substitution (Forward and Backward) (Thomas algorithm)
420	!------------------------------------------------------------------------------!
421
422	USE control_parameters
423
424	IMPLICIT NONE
425
426	INTEGER :: i, j, k
427	REAL :: ar1(nxl_z:nxr_z,0:nz-1)
428	REAL :: tri(5,nxl_z:nxr_z,0:nz-1)
429	#if defined( __parallel )
430	REAL :: ar(nxl_z:nxr_za,nys_z:nyn_za,1:nza)
431	#else
432	REAL :: ar(1:nz,nys_z:nyn_z,nxl_z:nxr_z)
433	#endif
434
435	!
436	!-- Forward substitution.
437	DO i = nxl_z, nxr_z
438	#if defined( __parallel )
439	ar1(i,0) = ar(i,j,1)
440	#else
441	ar1(i,0) = ar(1,j,i)
442	#endif
443	ENDDO
444	DO k = 1, nz - 1
445	DO i = nxl_z, nxr_z
446	#if defined( __parallel )
447	ar1(i,k) = ar(i,j,k+1) - tri(5,i,k) * ar1(i,k-1)
448	#else
449	ar1(i,k) = ar(k+1,j,i) - tri(5,i,k) * ar1(i,k-1)
450	#endif
451	ENDDO
452	ENDDO
453
454	!
455	!-- Backward substitution.
456	DO i = nxl_z, nxr_z
457	#if defined( __parallel )
458	ar(i,j,nz) = ar1(i,nz-1) / tri(4,i,nz-1)
459	#else
460	ar(nz,j,i) = ar1(i,nz-1) / tri(4,i,nz-1)
461	#endif
462	ENDDO
463	DO k = nz-2, 0, -1
464	DO i = nxl_z, nxr_z
465	#if defined( __parallel )
466	ar(i,j,k+1) = ( ar1(i,k) - tri(3,i,k) * ar(i,j,k+2) ) &
467	/ tri(4,i,k)
468	#else
469	ar(k+1,j,i) = ( ar1(i,k) - tri(3,i,k) * ar(k+2,j,i) ) &
470	/ tri(4,i,k)
471	#endif
472	ENDDO
473	ENDDO
474
475	!
476	!-- Indices i=0, j=0 correspond to horizontally averaged pressure.
477	!-- The respective values of ar should be zero at all k-levels if
478	!-- acceleration of horizontally averaged vertical velocity is zero.
479	IF ( ibc_p_b == 1 .AND. ibc_p_t == 1 ) THEN
480	IF ( j == 0 .AND. nxl_z == 0 ) THEN
481	#if defined( __parallel )
482	DO k = 1, nz
483	ar(nxl_z,j,k) = 0.0
484	ENDDO
485	#else
486	DO k = 1, nz
487	ar(k,j,nxl_z) = 0.0
488	ENDDO
489	#endif
490	ENDIF
491	ENDIF
492
493	END SUBROUTINE substi
494
495
496	SUBROUTINE split( tri )
497
498	!------------------------------------------------------------------------------!
499	! Splitting of the tridiagonal matrix (Thomas algorithm)
500	!------------------------------------------------------------------------------!
501
502	IMPLICIT NONE
503
504	INTEGER :: i, k
505	REAL :: tri(5,nxl_z:nxr_z,0:nz-1)
506
507	!
508	!-- Splitting.
509	DO i = nxl_z, nxr_z
510	tri(4,i,0) = tri(1,i,0)
511	ENDDO
512	DO k = 1, nz-1
513	DO i = nxl_z, nxr_z
514	tri(5,i,k) = tri(2,i,k) / tri(4,i,k-1)
515	tri(4,i,k) = tri(1,i,k) - tri(3,i,k-1) * tri(5,i,k)
516	ENDDO
517	ENDDO
518
519	END SUBROUTINE split
520
521	END SUBROUTINE tridia
522
523
524	#if defined( __parallel )
525	SUBROUTINE fftxp( ar, direction )
526
527	!------------------------------------------------------------------------------!
528	! Fourier-transformation along x-direction Parallelized version
529	!------------------------------------------------------------------------------!
530
531	IMPLICIT NONE
532
533	CHARACTER (LEN=*) :: direction
534	INTEGER :: j, k
535	REAL :: ar(0:nxa,nys_x:nyn_xa,nzb_x:nzt_xa)
536
537	!
538	!-- Performing the fft with one of the methods implemented
539	!$OMP PARALLEL PRIVATE ( j, k )
540	!$OMP DO
541	DO k = nzb_x, nzt_x
542	DO j = nys_x, nyn_x
543	CALL fft_x( ar(0:nx,j,k), direction )
544	ENDDO
545	ENDDO
546	!$OMP END PARALLEL
547
548	END SUBROUTINE fftxp
549
550	#else
551	SUBROUTINE fftx( ar, direction )
552
553	!------------------------------------------------------------------------------!
554	! Fourier-transformation along x-direction Non parallel version
555	!------------------------------------------------------------------------------!
556
557	IMPLICIT NONE
558
559	CHARACTER (LEN=*) :: direction
560	INTEGER :: i, j, k
561	REAL :: ar(1:nz,0:ny,0:nx)
562
563	!
564	!-- Performing the fft with one of the methods implemented
565	!$OMP PARALLEL PRIVATE ( j, k )
566	!$OMP DO
567	DO k = 1, nz
568	DO j = 0, ny
569	CALL fft_x( ar(k,j,0:nx), direction )
570	ENDDO
571	ENDDO
572	!$OMP END PARALLEL
573
574	END SUBROUTINE fftx
575	#endif
576
577
578	#if defined( __parallel )
579	SUBROUTINE fftyp( ar, direction )
580
581	!------------------------------------------------------------------------------!
582	! Fourier-transformation along y-direction Parallelized version
583	!------------------------------------------------------------------------------!
584
585	IMPLICIT NONE
586
587	CHARACTER (LEN=*) :: direction
588	INTEGER :: i, k
589	REAL :: ar(0:nya,nxl_y:nxr_ya,nzb_y:nzt_ya)
590
591	!
592	!-- Performing the fft with one of the methods implemented
593	!$OMP PARALLEL PRIVATE ( i, k )
594	!$OMP DO
595	DO k = nzb_y, nzt_y
596	DO i = nxl_y, nxr_y
597	CALL fft_y( ar(0:ny,i,k), direction )
598	ENDDO
599	ENDDO
600	!$OMP END PARALLEL
601
602	END SUBROUTINE fftyp
603
604	#else
605	SUBROUTINE ffty( ar, direction )
606
607	!------------------------------------------------------------------------------!
608	! Fourier-transformation along y-direction Non parallel version
609	!------------------------------------------------------------------------------!
610
611	IMPLICIT NONE
612
613	CHARACTER (LEN=*) :: direction
614	INTEGER :: i, k
615	REAL :: ar(1:nz,0:ny,0:nx)
616
617	!
618	!-- Performing the fft with one of the methods implemented
619	!$OMP PARALLEL PRIVATE ( i, k )
620	!$OMP DO
621	DO k = 1, nz
622	DO i = 0, nx
623	CALL fft_y( ar(k,0:ny,i), direction )
624	ENDDO
625	ENDDO
626	!$OMP END PARALLEL
627
628	END SUBROUTINE ffty
629	#endif
630
631	#if defined( __parallel )
632	SUBROUTINE ffty_tr_yx( f_in, work, f_out )
633
634	!------------------------------------------------------------------------------!
635	! Fourier-transformation along y with subsequent transposition y --> x for
636	! a 1d-decomposition along x
637	!
638	! ATTENTION: The performance of this routine is much faster on the NEC-SX6,
639	! if the first index of work_ffty_vec is odd. Otherwise
640	! memory bank conflicts may occur (especially if the index is a
641	! multiple of 128). That's why work_ffty_vec is dimensioned as
642	! 0:ny+1.
643	! Of course, this will not work if users are using an odd number
644	! of gridpoints along y.
645	!------------------------------------------------------------------------------!
646
647	USE control_parameters
648	USE cpulog
649	USE indices
650	USE interfaces
651	USE pegrid
652	USE transpose_indices
653
654	IMPLICIT NONE
655
656	INTEGER :: i, iend, iouter, ir, j, k
657	INTEGER, PARAMETER :: stridex = 4
658
659	REAL, DIMENSION(0:ny,stridex) :: work_ffty
660	#if defined( __nec )
661	REAL, DIMENSION(0:ny+1,1:nz,nxl:nxr) :: work_ffty_vec
662	#endif
663	REAL, DIMENSION(1:nza,0:nya,nxl:nxra) :: f_in
664	REAL, DIMENSION(nnx,1:nza,nys_x:nyn_xa,pdims(1)) :: f_out
665	REAL, DIMENSION(nxl:nxra,1:nza,0:nya) :: work
666
667	!
668	!-- Carry out the FFT along y, where all data are present due to the
669	!-- 1d-decomposition along x. Resort the data in a way that x becomes
670	!-- the first index.
671	CALL cpu_log( log_point_s(7), 'fft_y', 'start' )
672
673	IF ( host(1:3) == 'nec' ) THEN
674	#if defined( __nec )
675	!
676	!-- Code optimized for vector processors
677	!$OMP PARALLEL PRIVATE ( i, j, k )
678	!$OMP DO
679	DO i = nxl, nxr
680
681	DO j = 0, ny
682	DO k = 1, nz
683	work_ffty_vec(j,k,i) = f_in(k,j,i)
684	ENDDO
685	ENDDO
686
687	CALL fft_y_m( work_ffty_vec(:,:,i), ny+1, 'forward' )
688
689	ENDDO
690
691	!$OMP DO
692	DO k = 1, nz
693	DO j = 0, ny
694	DO i = nxl, nxr
695	work(i,k,j) = work_ffty_vec(j,k,i)
696	ENDDO
697	ENDDO
698	ENDDO
699	!$OMP END PARALLEL
700	#endif
701
702	ELSE
703
704	!
705	!-- Cache optimized code.
706	!-- The i-(x-)direction is split into a strided outer loop and an inner
707	!-- loop for better cache performance
708	!$OMP PARALLEL PRIVATE (i,iend,iouter,ir,j,k,work_ffty)
709	!$OMP DO
710	DO iouter = nxl, nxr, stridex
711
712	iend = MIN( iouter+stridex-1, nxr ) ! Upper bound for inner i loop
713
714	DO k = 1, nz
715
716	DO i = iouter, iend
717
718	ir = i-iouter+1 ! counter within a stride
719	DO j = 0, ny
720	work_ffty(j,ir) = f_in(k,j,i)
721	ENDDO
722	!
723	!-- FFT along y
724	CALL fft_y( work_ffty(:,ir), 'forward' )
725
726	ENDDO
727
728	!
729	!-- Resort
730	DO j = 0, ny
731	DO i = iouter, iend
732	work(i,k,j) = work_ffty(j,i-iouter+1)
733	ENDDO
734	ENDDO
735
736	ENDDO
737
738	ENDDO
739	!$OMP END PARALLEL
740
741	ENDIF
742	CALL cpu_log( log_point_s(7), 'fft_y', 'pause' )
743
744	!
745	!-- Transpose array
746	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
747	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
748	CALL MPI_ALLTOALL( work(nxl,1,0), sendrecvcount_xy, MPI_REAL, &
749	f_out(1,1,nys_x,1), sendrecvcount_xy, MPI_REAL, &
750	comm1dx, ierr )
751	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
752
753	END SUBROUTINE ffty_tr_yx
754
755
756	SUBROUTINE tr_xy_ffty( f_in, work, f_out )
757
758	!------------------------------------------------------------------------------!
759	! Transposition x --> y with a subsequent backward Fourier transformation for
760	! a 1d-decomposition along x
761	!------------------------------------------------------------------------------!
762
763	USE control_parameters
764	USE cpulog
765	USE indices
766	USE interfaces
767	USE pegrid
768	USE transpose_indices
769
770	IMPLICIT NONE
771
772	INTEGER :: i, iend, iouter, ir, j, k
773	INTEGER, PARAMETER :: stridex = 4
774
775	REAL, DIMENSION(0:ny,stridex) :: work_ffty
776	#if defined( __nec )
777	REAL, DIMENSION(0:ny+1,1:nz,nxl:nxr) :: work_ffty_vec
778	#endif
779	REAL, DIMENSION(nnx,1:nza,nys_x:nyn_xa,pdims(1)) :: f_in
780	REAL, DIMENSION(1:nza,0:nya,nxl:nxra) :: f_out
781	REAL, DIMENSION(nxl:nxra,1:nza,0:nya) :: work
782
783	!
784	!-- Transpose array
785	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
786	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
787	CALL MPI_ALLTOALL( f_in(1,1,nys_x,1), sendrecvcount_xy, MPI_REAL, &
788	work(nxl,1,0), sendrecvcount_xy, MPI_REAL, &
789	comm1dx, ierr )
790	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
791
792	!
793	!-- Resort the data in a way that y becomes the first index and carry out the
794	!-- backward fft along y.
795	CALL cpu_log( log_point_s(7), 'fft_y', 'continue' )
796
797	IF ( host(1:3) == 'nec' ) THEN
798	#if defined( __nec )
799	!
800	!-- Code optimized for vector processors
801	!$OMP PARALLEL PRIVATE ( i, j, k )
802	!$OMP DO
803	DO k = 1, nz
804	DO j = 0, ny
805	DO i = nxl, nxr
806	work_ffty_vec(j,k,i) = work(i,k,j)
807	ENDDO
808	ENDDO
809	ENDDO
810
811	!$OMP DO
812	DO i = nxl, nxr
813
814	CALL fft_y_m( work_ffty_vec(:,:,i), ny+1, 'backward' )
815
816	DO j = 0, ny
817	DO k = 1, nz
818	f_out(k,j,i) = work_ffty_vec(j,k,i)
819	ENDDO
820	ENDDO
821
822	ENDDO
823	!$OMP END PARALLEL
824	#endif
825
826	ELSE
827
828	!
829	!-- Cache optimized code.
830	!-- The i-(x-)direction is split into a strided outer loop and an inner
831	!-- loop for better cache performance
832	!$OMP PARALLEL PRIVATE ( i, iend, iouter, ir, j, k, work_ffty )
833	!$OMP DO
834	DO iouter = nxl, nxr, stridex
835
836	iend = MIN( iouter+stridex-1, nxr ) ! Upper bound for inner i loop
837
838	DO k = 1, nz
839	!
840	!-- Resort
841	DO j = 0, ny
842	DO i = iouter, iend
843	work_ffty(j,i-iouter+1) = work(i,k,j)
844	ENDDO
845	ENDDO
846
847	DO i = iouter, iend
848
849	!
850	!-- FFT along y
851	ir = i-iouter+1 ! counter within a stride
852	CALL fft_y( work_ffty(:,ir), 'backward' )
853
854	DO j = 0, ny
855	f_out(k,j,i) = work_ffty(j,ir)
856	ENDDO
857	ENDDO
858
859	ENDDO
860
861	ENDDO
862	!$OMP END PARALLEL
863
864	ENDIF
865
866	CALL cpu_log( log_point_s(7), 'fft_y', 'stop' )
867
868	END SUBROUTINE tr_xy_ffty
869
870
871	SUBROUTINE fftx_tri_fftx( ar )
872
873	!------------------------------------------------------------------------------!
874	! FFT along x, solution of the tridiagonal system and backward FFT for
875	! a 1d-decomposition along x
876	!
877	! WARNING: this subroutine may still not work for hybrid parallelization
878	! with OpenMP (for possible necessary changes see the original
879	! routine poisfft_hybrid, developed by Klaus Ketelsen, May 2002)
880	!------------------------------------------------------------------------------!
881
882	USE control_parameters
883	USE cpulog
884	USE grid_variables
885	USE indices
886	USE interfaces
887	USE pegrid
888	USE transpose_indices
889
890	IMPLICIT NONE
891
892	character(len=3) :: myth_char
893
894	INTEGER :: i, j, k, m, n, omp_get_thread_num, tn
895
896	REAL, DIMENSION(0:nx) :: work_fftx
897	REAL, DIMENSION(0:nx,1:nz) :: work_trix
898	REAL, DIMENSION(nnx,1:nza,nys_x:nyn_xa,pdims(1)) :: ar
899	REAL, DIMENSION(:,:,:,:), ALLOCATABLE :: tri
900
901
902	CALL cpu_log( log_point_s(33), 'fft_x + tridia', 'start' )
903
904	ALLOCATE( tri(5,0:nx,0:nz-1,0:threads_per_task-1) )
905
906	tn = 0 ! Default thread number in case of one thread
907	!$OMP PARALLEL DO PRIVATE ( i, j, k, m, n, tn, work_fftx, work_trix )
908	DO j = nys_x, nyn_x
909
910	!$ tn = omp_get_thread_num()
911
912	IF ( host(1:3) == 'nec' ) THEN
913	!
914	!-- Code optimized for vector processors
915	DO k = 1, nz
916
917	m = 0
918	DO n = 1, pdims(1)
919	DO i = 1, nnx_pe( n-1 ) ! WARN: pcoord(i) should be used!!
920	work_trix(m,k) = ar(i,k,j,n)
921	m = m + 1
922	ENDDO
923	ENDDO
924
925	ENDDO
926
927	CALL fft_x_m( work_trix, 'forward' )
928
929	ELSE
930	!
931	!-- Cache optimized code
932	DO k = 1, nz
933
934	m = 0
935	DO n = 1, pdims(1)
936	DO i = 1, nnx_pe( n-1 ) ! WARN: pcoord(i) should be used!!
937	work_fftx(m) = ar(i,k,j,n)
938	m = m + 1
939	ENDDO
940	ENDDO
941
942	CALL fft_x( work_fftx, 'forward' )
943
944	DO i = 0, nx
945	work_trix(i,k) = work_fftx(i)
946	ENDDO
947
948	ENDDO
949
950	ENDIF
951
952	!
953	!-- Solve the linear equation system
954	CALL tridia_1dd( ddx2, ddy2, nx, ny, j, work_trix, tri(:,:,:,tn) )
955
956	IF ( host(1:3) == 'nec' ) THEN
957	!
958	!-- Code optimized for vector processors
959	CALL fft_x_m( work_trix, 'backward' )
960
961	DO k = 1, nz
962
963	m = 0
964	DO n = 1, pdims(1)
965	DO i = 1, nnx_pe( n-1 ) ! WARN: pcoord(i) should be used!!
966	ar(i,k,j,n) = work_trix(m,k)
967	m = m + 1
968	ENDDO
969	ENDDO
970
971	ENDDO
972
973	ELSE
974	!
975	!-- Cache optimized code
976	DO k = 1, nz
977
978	DO i = 0, nx
979	work_fftx(i) = work_trix(i,k)
980	ENDDO
981
982	CALL fft_x( work_fftx, 'backward' )
983
984	m = 0
985	DO n = 1, pdims(1)
986	DO i = 1, nnx_pe( n-1 ) ! WARN: pcoord(i) should be used!!
987	ar(i,k,j,n) = work_fftx(m)
988	m = m + 1
989	ENDDO
990	ENDDO
991
992	ENDDO
993
994	ENDIF
995
996	ENDDO
997
998	DEALLOCATE( tri )
999
1000	CALL cpu_log( log_point_s(33), 'fft_x + tridia', 'stop' )
1001
1002	END SUBROUTINE fftx_tri_fftx
1003
1004
1005	SUBROUTINE fftx_tr_xy( f_in, work, f_out )
1006
1007	!------------------------------------------------------------------------------!
1008	! Fourier-transformation along x with subsequent transposition x --> y for
1009	! a 1d-decomposition along y
1010	!
1011	! ATTENTION: The NEC-branch of this routine may significantly profit from
1012	! further optimizations. So far, performance is much worse than
1013	! for routine ffty_tr_yx (more than three times slower).
1014	!------------------------------------------------------------------------------!
1015
1016	USE control_parameters
1017	USE cpulog
1018	USE indices
1019	USE interfaces
1020	USE pegrid
1021	USE transpose_indices
1022
1023	IMPLICIT NONE
1024
1025	INTEGER :: i, j, k
1026
1027	REAL, DIMENSION(0:nx,1:nz,nys:nyn) :: work_fftx
1028	REAL, DIMENSION(1:nza,nys:nyna,0:nxa) :: f_in
1029	REAL, DIMENSION(nny,1:nza,nxl_y:nxr_ya,pdims(2)) :: f_out
1030	REAL, DIMENSION(nys:nyna,1:nza,0:nxa) :: work
1031
1032	!
1033	!-- Carry out the FFT along x, where all data are present due to the
1034	!-- 1d-decomposition along y. Resort the data in a way that y becomes
1035	!-- the first index.
1036	CALL cpu_log( log_point_s(4), 'fft_x', 'start' )
1037
1038	IF ( host(1:3) == 'nec' ) THEN
1039	!
1040	!-- Code for vector processors
1041	!$OMP PARALLEL PRIVATE ( i, j, k )
1042	!$OMP DO
1043	DO i = 0, nx
1044
1045	DO j = nys, nyn
1046	DO k = 1, nz
1047	work_fftx(i,k,j) = f_in(k,j,i)
1048	ENDDO
1049	ENDDO
1050
1051	ENDDO
1052
1053	!$OMP DO
1054	DO j = nys, nyn
1055
1056	CALL fft_x_m( work_fftx(:,:,j), 'forward' )
1057
1058	DO k = 1, nz
1059	DO i = 0, nx
1060	work(j,k,i) = work_fftx(i,k,j)
1061	ENDDO
1062	ENDDO
1063
1064	ENDDO
1065	!$OMP END PARALLEL
1066
1067	ELSE
1068
1069	!
1070	!-- Cache optimized code (there might be still a potential for better
1071	!-- optimization).
1072	!$OMP PARALLEL PRIVATE (i,j,k)
1073	!$OMP DO
1074	DO i = 0, nx
1075
1076	DO j = nys, nyn
1077	DO k = 1, nz
1078	work_fftx(i,k,j) = f_in(k,j,i)
1079	ENDDO
1080	ENDDO
1081
1082	ENDDO
1083
1084	!$OMP DO
1085	DO j = nys, nyn
1086	DO k = 1, nz
1087
1088	CALL fft_x( work_fftx(0:nx,k,j), 'forward' )
1089
1090	DO i = 0, nx
1091	work(j,k,i) = work_fftx(i,k,j)
1092	ENDDO
1093	ENDDO
1094
1095	ENDDO
1096	!$OMP END PARALLEL
1097
1098	ENDIF
1099	CALL cpu_log( log_point_s(4), 'fft_x', 'pause' )
1100
1101	!
1102	!-- Transpose array
1103	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
1104	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
1105	CALL MPI_ALLTOALL( work(nys,1,0), sendrecvcount_xy, MPI_REAL, &
1106	f_out(1,1,nxl_y,1), sendrecvcount_xy, MPI_REAL, &
1107	comm1dy, ierr )
1108	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
1109
1110	END SUBROUTINE fftx_tr_xy
1111
1112
1113	SUBROUTINE tr_yx_fftx( f_in, work, f_out )
1114
1115	!------------------------------------------------------------------------------!
1116	! Transposition y --> x with a subsequent backward Fourier transformation for
1117	! a 1d-decomposition along x
1118	!------------------------------------------------------------------------------!
1119
1120	USE control_parameters
1121	USE cpulog
1122	USE indices
1123	USE interfaces
1124	USE pegrid
1125	USE transpose_indices
1126
1127	IMPLICIT NONE
1128
1129	INTEGER :: i, j, k
1130
1131	REAL, DIMENSION(0:nx,1:nz,nys:nyn) :: work_fftx
1132	REAL, DIMENSION(nny,1:nza,nxl_y:nxr_ya,pdims(2)) :: f_in
1133	REAL, DIMENSION(1:nza,nys:nyna,0:nxa) :: f_out
1134	REAL, DIMENSION(nys:nyna,1:nza,0:nxa) :: work
1135
1136	!
1137	!-- Transpose array
1138	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
1139	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
1140	CALL MPI_ALLTOALL( f_in(1,1,nxl_y,1), sendrecvcount_xy, MPI_REAL, &
1141	work(nys,1,0), sendrecvcount_xy, MPI_REAL, &
1142	comm1dy, ierr )
1143	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
1144
1145	!
1146	!-- Carry out the FFT along x, where all data are present due to the
1147	!-- 1d-decomposition along y. Resort the data in a way that y becomes
1148	!-- the first index.
1149	CALL cpu_log( log_point_s(4), 'fft_x', 'continue' )
1150
1151	IF ( host(1:3) == 'nec' ) THEN
1152	!
1153	!-- Code optimized for vector processors
1154	!$OMP PARALLEL PRIVATE ( i, j, k )
1155	!$OMP DO
1156	DO j = nys, nyn
1157
1158	DO k = 1, nz
1159	DO i = 0, nx
1160	work_fftx(i,k,j) = work(j,k,i)
1161	ENDDO
1162	ENDDO
1163
1164	CALL fft_x_m( work_fftx(:,:,j), 'backward' )
1165
1166	ENDDO
1167
1168	!$OMP DO
1169	DO i = 0, nx
1170	DO j = nys, nyn
1171	DO k = 1, nz
1172	f_out(k,j,i) = work_fftx(i,k,j)
1173	ENDDO
1174	ENDDO
1175	ENDDO
1176	!$OMP END PARALLEL
1177
1178	ELSE
1179
1180	!
1181	!-- Cache optimized code (there might be still a potential for better
1182	!-- optimization).
1183	!$OMP PARALLEL PRIVATE (i,j,k)
1184	!$OMP DO
1185	DO j = nys, nyn
1186	DO k = 1, nz
1187
1188	DO i = 0, nx
1189	work_fftx(i,k,j) = work(j,k,i)
1190	ENDDO
1191
1192	CALL fft_x( work_fftx(0:nx,k,j), 'backward' )
1193
1194	ENDDO
1195	ENDDO
1196
1197	!$OMP DO
1198	DO i = 0, nx
1199	DO j = nys, nyn
1200	DO k = 1, nz
1201	f_out(k,j,i) = work_fftx(i,k,j)
1202	ENDDO
1203	ENDDO
1204	ENDDO
1205	!$OMP END PARALLEL
1206
1207	ENDIF
1208	CALL cpu_log( log_point_s(4), 'fft_x', 'stop' )
1209
1210	END SUBROUTINE tr_yx_fftx
1211
1212
1213	SUBROUTINE ffty_tri_ffty( ar )
1214
1215	!------------------------------------------------------------------------------!
1216	! FFT along y, solution of the tridiagonal system and backward FFT for
1217	! a 1d-decomposition along y
1218	!
1219	! WARNING: this subroutine may still not work for hybrid parallelization
1220	! with OpenMP (for possible necessary changes see the original
1221	! routine poisfft_hybrid, developed by Klaus Ketelsen, May 2002)
1222	!------------------------------------------------------------------------------!
1223
1224	USE control_parameters
1225	USE cpulog
1226	USE grid_variables
1227	USE indices
1228	USE interfaces
1229	USE pegrid
1230	USE transpose_indices
1231
1232	IMPLICIT NONE
1233
1234	INTEGER :: i, j, k, m, n, omp_get_thread_num, tn
1235
1236	REAL, DIMENSION(0:ny) :: work_ffty
1237	REAL, DIMENSION(0:ny,1:nz) :: work_triy
1238	REAL, DIMENSION(nny,1:nza,nxl_y:nxr_ya,pdims(2)) :: ar
1239	REAL, DIMENSION(:,:,:,:), ALLOCATABLE :: tri
1240
1241
1242	CALL cpu_log( log_point_s(39), 'fft_y + tridia', 'start' )
1243
1244	ALLOCATE( tri(5,0:ny,0:nz-1,0:threads_per_task-1) )
1245
1246	tn = 0 ! Default thread number in case of one thread
1247	!$OMP PARALLEL DO PRIVATE ( i, j, k, m, n, tn, work_ffty, work_triy )
1248	DO i = nxl_y, nxr_y
1249
1250	!$ tn = omp_get_thread_num()
1251
1252	IF ( host(1:3) == 'nec' ) THEN
1253	!
1254	!-- Code optimized for vector processors
1255	DO k = 1, nz
1256
1257	m = 0
1258	DO n = 1, pdims(2)
1259	DO j = 1, nny_pe( n-1 ) ! WARN: pcoord(j) should be used!!
1260	work_triy(m,k) = ar(j,k,i,n)
1261	m = m + 1
1262	ENDDO
1263	ENDDO
1264
1265	ENDDO
1266
1267	CALL fft_y_m( work_triy, ny, 'forward' )
1268
1269	ELSE
1270	!
1271	!-- Cache optimized code
1272	DO k = 1, nz
1273
1274	m = 0
1275	DO n = 1, pdims(2)
1276	DO j = 1, nny_pe( n-1 ) ! WARN: pcoord(j) should be used!!
1277	work_ffty(m) = ar(j,k,i,n)
1278	m = m + 1
1279	ENDDO
1280	ENDDO
1281
1282	CALL fft_y( work_ffty, 'forward' )
1283
1284	DO j = 0, ny
1285	work_triy(j,k) = work_ffty(j)
1286	ENDDO
1287
1288	ENDDO
1289
1290	ENDIF
1291
1292	!
1293	!-- Solve the linear equation system
1294	CALL tridia_1dd( ddy2, ddx2, ny, nx, i, work_triy, tri(:,:,:,tn) )
1295
1296	IF ( host(1:3) == 'nec' ) THEN
1297	!
1298	!-- Code optimized for vector processors
1299	CALL fft_y_m( work_triy, ny, 'backward' )
1300
1301	DO k = 1, nz
1302
1303	m = 0
1304	DO n = 1, pdims(2)
1305	DO j = 1, nny_pe( n-1 ) ! WARN: pcoord(j) should be used!!
1306	ar(j,k,i,n) = work_triy(m,k)
1307	m = m + 1
1308	ENDDO
1309	ENDDO
1310
1311	ENDDO
1312
1313	ELSE
1314	!
1315	!-- Cache optimized code
1316	DO k = 1, nz
1317
1318	DO j = 0, ny
1319	work_ffty(j) = work_triy(j,k)
1320	ENDDO
1321
1322	CALL fft_y( work_ffty, 'backward' )
1323
1324	m = 0
1325	DO n = 1, pdims(2)
1326	DO j = 1, nny_pe( n-1 ) ! WARN: pcoord(j) should be used!!
1327	ar(j,k,i,n) = work_ffty(m)
1328	m = m + 1
1329	ENDDO
1330	ENDDO
1331
1332	ENDDO
1333
1334	ENDIF
1335
1336	ENDDO
1337
1338	DEALLOCATE( tri )
1339
1340	CALL cpu_log( log_point_s(39), 'fft_y + tridia', 'stop' )
1341
1342	END SUBROUTINE ffty_tri_ffty
1343
1344
1345	SUBROUTINE tridia_1dd( ddx2, ddy2, nx, ny, j, ar, tri )
1346
1347	!------------------------------------------------------------------------------!
1348	! Solves the linear system of equations for a 1d-decomposition along x (see
1349	! tridia)
1350	!
1351	! Attention: when using the intel compiler, array tri must be passed as an
1352	! argument to the contained subroutines. Otherwise addres faults
1353	! will occur.
1354	! On NEC, tri should not be passed (except for routine substi_1dd)
1355	! because this causes very bad performance.
1356	!------------------------------------------------------------------------------!
1357
1358	USE arrays_3d
1359	USE control_parameters
1360
1361	USE pegrid
1362
1363	IMPLICIT NONE
1364
1365	INTEGER :: i, j, k, nnyh, nx, ny, omp_get_thread_num, tn
1366
1367	REAL :: ddx2, ddy2
1368
1369	REAL, DIMENSION(0:nx,1:nz) :: ar
1370	REAL, DIMENSION(0:nx,0:nz-1) :: ar1
1371	REAL, DIMENSION(5,0:nx,0:nz-1) :: tri
1372
1373
1374	nnyh = ( ny + 1 ) / 2
1375
1376	!
1377	!-- Define constant elements of the tridiagonal matrix.
1378	!-- The compiler on SX6 does loop exchange. If 0:nx is a high power of 2,
1379	!-- the exchanged loops create bank conflicts. The following directive
1380	!-- prohibits loop exchange and the loops perform much better.
1381	! tn = omp_get_thread_num()
1382	! WRITE( 120+tn, * ) '+++ id=',myid,' nx=',nx,' thread=', omp_get_thread_num()
1383	! CALL local_flush( 120+tn )
1384	!CDIR NOLOOPCHG
1385	DO k = 0, nz-1
1386	DO i = 0,nx
1387	tri(2,i,k) = ddzu_pres(k+1) * ddzw(k+1)
1388	tri(3,i,k) = ddzu_pres(k+2) * ddzw(k+1)
1389	ENDDO
1390	ENDDO
1391	! WRITE( 120+tn, * ) '+++ id=',myid,' end of first tridia loop thread=', omp_get_thread_num()
1392	! CALL local_flush( 120+tn )
1393
1394	IF ( j <= nnyh ) THEN
1395	#if defined( __lc )
1396	CALL maketri_1dd( j, tri )
1397	#else
1398	CALL maketri_1dd( j )
1399	#endif
1400	ELSE
1401	#if defined( __lc )
1402	CALL maketri_1dd( ny+1-j, tri )
1403	#else
1404	CALL maketri_1dd( ny+1-j )
1405	#endif
1406	ENDIF
1407	#if defined( __lc )
1408	CALL split_1dd( tri )
1409	#else
1410	CALL split_1dd
1411	#endif
1412	CALL substi_1dd( ar, tri )
1413
1414	CONTAINS
1415
1416	#if defined( __lc )
1417	SUBROUTINE maketri_1dd( j, tri )
1418	#else
1419	SUBROUTINE maketri_1dd( j )
1420	#endif
1421
1422	!------------------------------------------------------------------------------!
1423	! computes the i- and j-dependent component of the matrix
1424	!------------------------------------------------------------------------------!
1425
1426	USE constants
1427
1428	IMPLICIT NONE
1429
1430	INTEGER :: i, j, k, nnxh
1431	REAL :: a, c
1432
1433	REAL, DIMENSION(0:nx) :: l
1434
1435	#if defined( __lc )
1436	REAL, DIMENSION(5,0:nx,0:nz-1) :: tri
1437	#endif
1438
1439
1440	nnxh = ( nx + 1 ) / 2
1441	!
1442	!-- Provide the tridiagonal matrix for solution of the Poisson equation in
1443	!-- Fourier space. The coefficients are computed following the method of
1444	!-- Schmidt et al. (DFVLR-Mitteilung 84-15), which departs from Stephan
1445	!-- Siano's original version by discretizing the Poisson equation,
1446	!-- before it is Fourier-transformed
1447	DO i = 0, nx
1448	IF ( i >= 0 .AND. i <= nnxh ) THEN
1449	l(i) = 2.0 * ( 1.0 - COS( ( 2.0 * pi * i ) / &
1450	FLOAT( nx+1 ) ) ) * ddx2 + &
1451	2.0 * ( 1.0 - COS( ( 2.0 * pi * j ) / &
1452	FLOAT( ny+1 ) ) ) * ddy2
1453	ELSE
1454	l(i) = 2.0 * ( 1.0 - COS( ( 2.0 * pi * ( nx+1-i ) ) / &
1455	FLOAT( nx+1 ) ) ) * ddx2 + &
1456	2.0 * ( 1.0 - COS( ( 2.0 * pi * j ) / &
1457	FLOAT( ny+1 ) ) ) * ddy2
1458	ENDIF
1459	ENDDO
1460
1461	DO k = 0, nz-1
1462	DO i = 0, nx
1463	a = -1.0 * ddzu_pres(k+2) * ddzw(k+1)
1464	c = -1.0 * ddzu_pres(k+1) * ddzw(k+1)
1465	tri(1,i,k) = a + c - l(i)
1466	ENDDO
1467	ENDDO
1468	IF ( ibc_p_b == 1 .OR. ibc_p_b == 2 ) THEN
1469	DO i = 0, nx
1470	tri(1,i,0) = tri(1,i,0) + tri(2,i,0)
1471	ENDDO
1472	ENDIF
1473	IF ( ibc_p_t == 1 ) THEN
1474	DO i = 0, nx
1475	tri(1,i,nz-1) = tri(1,i,nz-1) + tri(3,i,nz-1)
1476	ENDDO
1477	ENDIF
1478
1479	END SUBROUTINE maketri_1dd
1480
1481
1482	#if defined( __lc )
1483	SUBROUTINE split_1dd( tri )
1484	#else
1485	SUBROUTINE split_1dd
1486	#endif
1487
1488	!------------------------------------------------------------------------------!
1489	! Splitting of the tridiagonal matrix (Thomas algorithm)
1490	!------------------------------------------------------------------------------!
1491
1492	IMPLICIT NONE
1493
1494	INTEGER :: i, k
1495
1496	#if defined( __lc )
1497	REAL, DIMENSION(5,0:nx,0:nz-1) :: tri
1498	#endif
1499
1500
1501	!
1502	!-- Splitting
1503	DO i = 0, nx
1504	tri(4,i,0) = tri(1,i,0)
1505	ENDDO
1506	DO k = 1, nz-1
1507	DO i = 0, nx
1508	tri(5,i,k) = tri(2,i,k) / tri(4,i,k-1)
1509	tri(4,i,k) = tri(1,i,k) - tri(3,i,k-1) * tri(5,i,k)
1510	ENDDO
1511	ENDDO
1512
1513	END SUBROUTINE split_1dd
1514
1515
1516	SUBROUTINE substi_1dd( ar, tri )
1517
1518	!------------------------------------------------------------------------------!
1519	! Substitution (Forward and Backward) (Thomas algorithm)
1520	!------------------------------------------------------------------------------!
1521
1522	IMPLICIT NONE
1523
1524	INTEGER :: i, k
1525
1526	REAL, DIMENSION(0:nx,nz) :: ar
1527	REAL, DIMENSION(0:nx,0:nz-1) :: ar1
1528	REAL, DIMENSION(5,0:nx,0:nz-1) :: tri
1529
1530	!
1531	!-- Forward substitution
1532	DO i = 0, nx
1533	ar1(i,0) = ar(i,1)
1534	ENDDO
1535	DO k = 1, nz-1
1536	DO i = 0, nx
1537	ar1(i,k) = ar(i,k+1) - tri(5,i,k) * ar1(i,k-1)
1538	ENDDO
1539	ENDDO
1540
1541	!
1542	!-- Backward substitution
1543	DO i = 0, nx
1544	ar(i,nz) = ar1(i,nz-1) / tri(4,i,nz-1)
1545	ENDDO
1546	DO k = nz-2, 0, -1
1547	DO i = 0, nx
1548	ar(i,k+1) = ( ar1(i,k) - tri(3,i,k) * ar(i,k+2) ) &
1549	/ tri(4,i,k)
1550	ENDDO
1551	ENDDO
1552
1553	!
1554	!-- Indices i=0, j=0 correspond to horizontally averaged pressure.
1555	!-- The respective values of ar should be zero at all k-levels if
1556	!-- acceleration of horizontally averaged vertical velocity is zero.
1557	IF ( ibc_p_b == 1 .AND. ibc_p_t == 1 ) THEN
1558	IF ( j == 0 ) THEN
1559	DO k = 1, nz
1560	ar(0,k) = 0.0
1561	ENDDO
1562	ENDIF
1563	ENDIF
1564
1565	END SUBROUTINE substi_1dd
1566
1567	END SUBROUTINE tridia_1dd
1568
1569	#endif
1570
1571	END MODULE poisfft_mod

Note: See TracBrowser for help on using the repository browser.

Download in other formats:

| Impressum | ©Leibniz Universität Hannover |