Home

Context Navigation

poisfft.f90 @ 683

Last change on this file since 683 was 683, checked in by raasch, 13 years ago

New:
---

optional exchange of ghost points in synchronous mode via MPI_SENDRCV,
steered by d3par parameter synchronous_exchange
(cpu_statistics, exchange_horiz, modules, parin)

openMP-parallelization of pressure solver (fft-method) for 2d-domain-decomposition
(poisfft, transpose)

Changed:

Errors:

mpt bugfix for netCDF4 usage (mrun)

Property svn:keywords set to Id

File size: 45.5 KB

Line
1	MODULE poisfft_mod
2
3	!------------------------------------------------------------------------------!
4	! Current revisions:
5	! -----------------
6	! openMP parallelization for 2d-domain-decomposition
7	!
8	! Former revisions:
9	! -----------------
10	! $Id: poisfft.f90 683 2011-02-09 14:25:15Z raasch $
11	!
12	! 667 2010-12-23 12:06:00Z suehring/gryschka
13	! ddzu replaced by ddzu_pres due to changes in zu(0)
14	!
15	! 622 2010-12-10 08:08:13Z raasch
16	! optional barriers included in order to speed up collective operations
17	!
18	! 377 2009-09-04 11:09:00Z raasch
19	! __lcmuk changed to __lc to avoid problems with Intel compiler on sgi-ice
20	!
21	! 164 2008-05-15 08:46:15Z raasch
22	! Arguments removed from transpose routines
23	!
24	! 128 2007-10-26 13:11:14Z raasch
25	! Bugfix: wavenumber calculation for even nx in routines maketri
26	!
27	! 85 2007-05-11 09:35:14Z raasch
28	! Bugfix: work_fft*_vec removed from some PRIVATE-declarations
29	!
30	! 76 2007-03-29 00:58:32Z raasch
31	! Tridiagonal coefficients adjusted for Neumann boundary conditions both at
32	! the bottom and the top.
33	!
34	! RCS Log replace by Id keyword, revision history cleaned up
35	!
36	! Revision 1.24 2006/08/04 15:00:24 raasch
37	! Default setting of the thread number tn in case of not using OpenMP
38	!
39	! Revision 1.23 2006/02/23 12:48:38 raasch
40	! Additional compiler directive in routine tridia_1dd for preventing loop
41	! exchange on NEC-SX6
42	!
43	! Revision 1.20 2004/04/30 12:38:09 raasch
44	! Parts of former poisfft_hybrid moved to this subroutine,
45	! former subroutine changed to a module, renaming of FFT-subroutines and
46	! -module, FFTs completely substituted by calls of fft_x and fft_y,
47	! NAG fft used in the non-parallel case completely removed, l in maketri
48	! is now a 1d-array, variables passed by modules instead of using parameter
49	! lists, enlarged transposition arrays introduced
50	!
51	! Revision 1.1 1997/07/24 11:24:14 raasch
52	! Initial revision
53	!
54	!
55	! Description:
56	! ------------
57	! See below.
58	!------------------------------------------------------------------------------!
59
60	!--------------------------------------------------------------------------!
61	! poisfft !
62	! !
63	! Original version: Stephan Siano (pois3d) !
64	! !
65	! Institute of Meteorology and Climatology, University of Hannover !
66	! Germany !
67	! !
68	! Version as of July 23,1996 !
69	! !
70	! !
71	! Version for parallel computers: Siegfried Raasch !
72	! !
73	! Version as of July 03,1997 !
74	! !
75	! Solves the Poisson equation with a 2D spectral method !
76	! d^2 p / dx^2 + d^2 p / dy^2 + d^2 p / dz^2 = s !
77	! !
78	! Input: !
79	! real ar contains in the (nnx,nny,nnz) elements, !
80	! starting from the element (1,nys,nxl), the !
81	! values for s !
82	! real work Temporary array !
83	! !
84	! Output: !
85	! real ar contains the solution for p !
86	!--------------------------------------------------------------------------!
87
88	USE fft_xy
89	USE indices
90	USE transpose_indices
91
92	IMPLICIT NONE
93
94	PRIVATE
95	PUBLIC poisfft, poisfft_init
96
97	INTERFACE poisfft
98	MODULE PROCEDURE poisfft
99	END INTERFACE poisfft
100
101	INTERFACE poisfft_init
102	MODULE PROCEDURE poisfft_init
103	END INTERFACE poisfft_init
104
105	CONTAINS
106
107	SUBROUTINE poisfft_init
108
109	CALL fft_init
110
111	END SUBROUTINE poisfft_init
112
113
114	SUBROUTINE poisfft( ar, work )
115
116	USE cpulog
117	USE interfaces
118	USE pegrid
119
120	IMPLICIT NONE
121
122	REAL, DIMENSION(1:nza,nys:nyna,nxl:nxra) :: ar, work
123
124
125	CALL cpu_log( log_point_s(3), 'poisfft', 'start' )
126
127	!
128	!-- Two-dimensional Fourier Transformation in x- and y-direction.
129	#if defined( __parallel )
130	IF ( pdims(2) == 1 ) THEN
131
132	!
133	!-- 1d-domain-decomposition along x:
134	!-- FFT along y and transposition y --> x
135	CALL ffty_tr_yx( ar, work, ar )
136
137	!
138	!-- FFT along x, solving the tridiagonal system and backward FFT
139	CALL fftx_tri_fftx( ar )
140
141	!
142	!-- Transposition x --> y and backward FFT along y
143	CALL tr_xy_ffty( ar, work, ar )
144
145	ELSEIF ( pdims(1) == 1 ) THEN
146
147	!
148	!-- 1d-domain-decomposition along y:
149	!-- FFT along x and transposition x --> y
150	CALL fftx_tr_xy( ar, work, ar )
151
152	!
153	!-- FFT along y, solving the tridiagonal system and backward FFT
154	CALL ffty_tri_ffty( ar )
155
156	!
157	!-- Transposition y --> x and backward FFT along x
158	CALL tr_yx_fftx( ar, work, ar )
159
160	ELSE
161
162	!
163	!-- 2d-domain-decomposition
164	!-- Transposition z --> x
165	CALL cpu_log( log_point_s(5), 'transpo forward', 'start' )
166	CALL transpose_zx( ar, work, ar )
167	CALL cpu_log( log_point_s(5), 'transpo forward', 'pause' )
168
169	CALL cpu_log( log_point_s(4), 'fft_x', 'start' )
170	CALL fftxp( ar, 'forward' )
171	CALL cpu_log( log_point_s(4), 'fft_x', 'pause' )
172
173	!
174	!-- Transposition x --> y
175	CALL cpu_log( log_point_s(5), 'transpo forward', 'continue' )
176	CALL transpose_xy( ar, work, ar )
177	CALL cpu_log( log_point_s(5), 'transpo forward', 'pause' )
178
179	CALL cpu_log( log_point_s(7), 'fft_y', 'start' )
180	CALL fftyp( ar, 'forward' )
181	CALL cpu_log( log_point_s(7), 'fft_y', 'pause' )
182
183	!
184	!-- Transposition y --> z
185	CALL cpu_log( log_point_s(5), 'transpo forward', 'continue' )
186	CALL transpose_yz( ar, work, ar )
187	CALL cpu_log( log_point_s(5), 'transpo forward', 'stop' )
188
189	!
190	!-- Solve the Poisson equation in z-direction in cartesian space.
191	CALL cpu_log( log_point_s(6), 'tridia', 'start' )
192	CALL tridia( ar )
193	CALL cpu_log( log_point_s(6), 'tridia', 'stop' )
194
195	!
196	!-- Inverse Fourier Transformation
197	!-- Transposition z --> y
198	CALL cpu_log( log_point_s(8), 'transpo invers', 'start' )
199	CALL transpose_zy( ar, work, ar )
200	CALL cpu_log( log_point_s(8), 'transpo invers', 'pause' )
201
202	CALL cpu_log( log_point_s(7), 'fft_y', 'continue' )
203	CALL fftyp( ar, 'backward' )
204	CALL cpu_log( log_point_s(7), 'fft_y', 'stop' )
205
206	!
207	!-- Transposition y --> x
208	CALL cpu_log( log_point_s(8), 'transpo invers', 'continue' )
209	CALL transpose_yx( ar, work, ar )
210	CALL cpu_log( log_point_s(8), 'transpo invers', 'pause' )
211
212	CALL cpu_log( log_point_s(4), 'fft_x', 'continue' )
213	CALL fftxp( ar, 'backward' )
214	CALL cpu_log( log_point_s(4), 'fft_x', 'stop' )
215
216	!
217	!-- Transposition x --> z
218	CALL cpu_log( log_point_s(8), 'transpo invers', 'continue' )
219	CALL transpose_xz( ar, work, ar )
220	CALL cpu_log( log_point_s(8), 'transpo invers', 'stop' )
221
222	ENDIF
223
224	#else
225
226	!
227	!-- Two-dimensional Fourier Transformation along x- and y-direction.
228	CALL cpu_log( log_point_s(4), 'fft_x', 'start' )
229	CALL fftx( ar, 'forward' )
230	CALL cpu_log( log_point_s(4), 'fft_x', 'pause' )
231	CALL cpu_log( log_point_s(7), 'fft_y', 'start' )
232	CALL ffty( ar, 'forward' )
233	CALL cpu_log( log_point_s(7), 'fft_y', 'pause' )
234
235	!
236	!-- Solve the Poisson equation in z-direction in cartesian space.
237	CALL cpu_log( log_point_s(6), 'tridia', 'start' )
238	CALL tridia( ar )
239	CALL cpu_log( log_point_s(6), 'tridia', 'stop' )
240
241	!
242	!-- Inverse Fourier Transformation.
243	CALL cpu_log( log_point_s(7), 'fft_y', 'continue' )
244	CALL ffty( ar, 'backward' )
245	CALL cpu_log( log_point_s(7), 'fft_y', 'stop' )
246	CALL cpu_log( log_point_s(4), 'fft_x', 'continue' )
247	CALL fftx( ar, 'backward' )
248	CALL cpu_log( log_point_s(4), 'fft_x', 'stop' )
249
250	#endif
251
252	CALL cpu_log( log_point_s(3), 'poisfft', 'stop' )
253
254	END SUBROUTINE poisfft
255
256
257
258	SUBROUTINE tridia( ar )
259
260	!------------------------------------------------------------------------------!
261	! solves the linear system of equations:
262	!
263	! -(4 pi^2(i^2/(dx^2nnx^2)+j^2/(dy^2nny^2))+
264	! 1/(dzu(k)dzw(k))+1/(dzu(k-1)dzw(k)))*p(i,j,k)+
265	! 1/(dzu(k)dzw(k))p(i,j,k+1)+1/(dzu(k-1)dzw(k))p(i,j,k-1)=d(i,j,k)
266	!
267	! by using the Thomas algorithm
268	!------------------------------------------------------------------------------!
269
270	USE arrays_3d
271
272	IMPLICIT NONE
273
274	INTEGER :: i, j, k, nnyh
275
276	REAL, DIMENSION(nxl_z:nxr_z,0:nz-1) :: ar1
277	REAL, DIMENSION(5,nxl_z:nxr_z,0:nz-1) :: tri
278
279	#if defined( __parallel )
280	REAL :: ar(nxl_z:nxr_za,nys_z:nyn_za,1:nza)
281	#else
282	REAL :: ar(1:nz,nys_z:nyn_z,nxl_z:nxr_z)
283	#endif
284
285
286	nnyh = (ny+1) / 2
287
288	!
289	!-- Define constant elements of the tridiagonal matrix.
290	!$OMP PARALLEL PRIVATE ( k, i )
291	!$OMP DO
292	DO k = 0, nz-1
293	DO i = nxl_z, nxr_z
294	tri(2,i,k) = ddzu_pres(k+1) * ddzw(k+1)
295	tri(3,i,k) = ddzu_pres(k+2) * ddzw(k+1)
296	ENDDO
297	ENDDO
298	!$OMP END PARALLEL
299
300	#if defined( __parallel )
301	!
302	!-- Repeat for all y-levels.
303	!$OMP PARALLEL FIRSTPRIVATE( tri ) PRIVATE ( ar1, j )
304	!$OMP DO
305	DO j = nys_z, nyn_z
306	IF ( j <= nnyh ) THEN
307	CALL maketri( tri, j )
308	ELSE
309	CALL maketri( tri, ny+1-j )
310	ENDIF
311	CALL split( tri )
312	CALL substi( ar, ar1, tri, j )
313	ENDDO
314	!$OMP END PARALLEL
315	#else
316	!
317	!-- First y-level.
318	CALL maketri( tri, nys_z )
319	CALL split( tri )
320	CALL substi( ar, ar1, tri, 0 )
321
322	!
323	!-- Further y-levels.
324	DO j = 1, nnyh - 1
325	CALL maketri( tri, j )
326	CALL split( tri )
327	CALL substi( ar, ar1, tri, j )
328	CALL substi( ar, ar1, tri, ny+1-j )
329	ENDDO
330	CALL maketri( tri, nnyh )
331	CALL split( tri )
332	CALL substi( ar, ar1, tri, nnyh+nys )
333	#endif
334
335	CONTAINS
336
337	SUBROUTINE maketri( tri, j )
338
339	!------------------------------------------------------------------------------!
340	! Computes the i- and j-dependent component of the matrix
341	!------------------------------------------------------------------------------!
342
343	USE arrays_3d
344	USE constants
345	USE control_parameters
346	USE grid_variables
347
348	IMPLICIT NONE
349
350	INTEGER :: i, j, k, nnxh
351	REAL :: a, c
352	REAL :: ll(nxl_z:nxr_z)
353	REAL :: tri(5,nxl_z:nxr_z,0:nz-1)
354
355
356	nnxh = ( nx + 1 ) / 2
357
358	!
359	!-- Provide the tridiagonal matrix for solution of the Poisson equation in
360	!-- Fourier space. The coefficients are computed following the method of
361	!-- Schmidt et al. (DFVLR-Mitteilung 84-15), which departs from Stephan
362	!-- Siano's original version by discretizing the Poisson equation,
363	!-- before it is Fourier-transformed
364	#if defined( __parallel )
365	DO i = nxl_z, nxr_z
366	IF ( i >= 0 .AND. i <= nnxh ) THEN
367	ll(i) = 2.0 * ( 1.0 - COS( ( 2.0 * pi * i ) / &
368	FLOAT( nx+1 ) ) ) / ( dx * dx ) + &
369	2.0 * ( 1.0 - COS( ( 2.0 * pi * j ) / &
370	FLOAT( ny+1 ) ) ) / ( dy * dy )
371	ELSE
372	ll(i) = 2.0 * ( 1.0 - COS( ( 2.0 * pi * ( nx+1-i ) ) / &
373	FLOAT( nx+1 ) ) ) / ( dx * dx ) + &
374	2.0 * ( 1.0 - COS( ( 2.0 * pi * j ) / &
375	FLOAT( ny+1 ) ) ) / ( dy * dy )
376	ENDIF
377	DO k = 0,nz-1
378	a = -1.0 * ddzu_pres(k+2) * ddzw(k+1)
379	c = -1.0 * ddzu_pres(k+1) * ddzw(k+1)
380	tri(1,i,k) = a + c - ll(i)
381	ENDDO
382	ENDDO
383	#else
384	DO i = 0, nnxh
385	ll(i) = 2.0 * ( 1.0 - COS( ( 2.0 * pi * i ) / FLOAT( nx+1 ) ) ) / &
386	( dx * dx ) + &
387	2.0 * ( 1.0 - COS( ( 2.0 * pi * j ) / FLOAT( ny+1 ) ) ) / &
388	( dy * dy )
389	DO k = 0, nz-1
390	a = -1.0 * ddzu_pres(k+2) * ddzw(k+1)
391	c = -1.0 * ddzu_pres(k+1) * ddzw(k+1)
392	tri(1,i,k) = a + c - ll(i)
393	IF ( i >= 1 .and. i < nnxh ) THEN
394	tri(1,nx+1-i,k) = tri(1,i,k)
395	ENDIF
396	ENDDO
397	ENDDO
398	#endif
399	IF ( ibc_p_b == 1 .OR. ibc_p_b == 2 ) THEN
400	DO i = nxl_z, nxr_z
401	tri(1,i,0) = tri(1,i,0) + tri(2,i,0)
402	ENDDO
403	ENDIF
404	IF ( ibc_p_t == 1 ) THEN
405	DO i = nxl_z, nxr_z
406	tri(1,i,nz-1) = tri(1,i,nz-1) + tri(3,i,nz-1)
407	ENDDO
408	ENDIF
409
410	END SUBROUTINE maketri
411
412
413	SUBROUTINE substi( ar, ar1, tri, j )
414
415	!------------------------------------------------------------------------------!
416	! Substitution (Forward and Backward) (Thomas algorithm)
417	!------------------------------------------------------------------------------!
418
419	USE control_parameters
420
421	IMPLICIT NONE
422
423	INTEGER :: i, j, k
424	REAL :: ar1(nxl_z:nxr_z,0:nz-1)
425	REAL :: tri(5,nxl_z:nxr_z,0:nz-1)
426	#if defined( __parallel )
427	REAL :: ar(nxl_z:nxr_za,nys_z:nyn_za,1:nza)
428	#else
429	REAL :: ar(1:nz,nys_z:nyn_z,nxl_z:nxr_z)
430	#endif
431
432	!
433	!-- Forward substitution.
434	DO i = nxl_z, nxr_z
435	#if defined( __parallel )
436	ar1(i,0) = ar(i,j,1)
437	#else
438	ar1(i,0) = ar(1,j,i)
439	#endif
440	ENDDO
441	DO k = 1, nz - 1
442	DO i = nxl_z, nxr_z
443	#if defined( __parallel )
444	ar1(i,k) = ar(i,j,k+1) - tri(5,i,k) * ar1(i,k-1)
445	#else
446	ar1(i,k) = ar(k+1,j,i) - tri(5,i,k) * ar1(i,k-1)
447	#endif
448	ENDDO
449	ENDDO
450
451	!
452	!-- Backward substitution.
453	DO i = nxl_z, nxr_z
454	#if defined( __parallel )
455	ar(i,j,nz) = ar1(i,nz-1) / tri(4,i,nz-1)
456	#else
457	ar(nz,j,i) = ar1(i,nz-1) / tri(4,i,nz-1)
458	#endif
459	ENDDO
460	DO k = nz-2, 0, -1
461	DO i = nxl_z, nxr_z
462	#if defined( __parallel )
463	ar(i,j,k+1) = ( ar1(i,k) - tri(3,i,k) * ar(i,j,k+2) ) &
464	/ tri(4,i,k)
465	#else
466	ar(k+1,j,i) = ( ar1(i,k) - tri(3,i,k) * ar(k+2,j,i) ) &
467	/ tri(4,i,k)
468	#endif
469	ENDDO
470	ENDDO
471
472	!
473	!-- Indices i=0, j=0 correspond to horizontally averaged pressure.
474	!-- The respective values of ar should be zero at all k-levels if
475	!-- acceleration of horizontally averaged vertical velocity is zero.
476	IF ( ibc_p_b == 1 .AND. ibc_p_t == 1 ) THEN
477	IF ( j == 0 .AND. nxl_z == 0 ) THEN
478	#if defined( __parallel )
479	DO k = 1, nz
480	ar(nxl_z,j,k) = 0.0
481	ENDDO
482	#else
483	DO k = 1, nz
484	ar(k,j,nxl_z) = 0.0
485	ENDDO
486	#endif
487	ENDIF
488	ENDIF
489
490	END SUBROUTINE substi
491
492
493	SUBROUTINE split( tri )
494
495	!------------------------------------------------------------------------------!
496	! Splitting of the tridiagonal matrix (Thomas algorithm)
497	!------------------------------------------------------------------------------!
498
499	IMPLICIT NONE
500
501	INTEGER :: i, k
502	REAL :: tri(5,nxl_z:nxr_z,0:nz-1)
503
504	!
505	!-- Splitting.
506	DO i = nxl_z, nxr_z
507	tri(4,i,0) = tri(1,i,0)
508	ENDDO
509	DO k = 1, nz-1
510	DO i = nxl_z, nxr_z
511	tri(5,i,k) = tri(2,i,k) / tri(4,i,k-1)
512	tri(4,i,k) = tri(1,i,k) - tri(3,i,k-1) * tri(5,i,k)
513	ENDDO
514	ENDDO
515
516	END SUBROUTINE split
517
518	END SUBROUTINE tridia
519
520
521	#if defined( __parallel )
522	SUBROUTINE fftxp( ar, direction )
523
524	!------------------------------------------------------------------------------!
525	! Fourier-transformation along x-direction Parallelized version
526	!------------------------------------------------------------------------------!
527
528	IMPLICIT NONE
529
530	CHARACTER (LEN=*) :: direction
531	INTEGER :: j, k
532	REAL :: ar(0:nxa,nys_x:nyn_xa,nzb_x:nzt_xa)
533
534	!
535	!-- Performing the fft with one of the methods implemented
536	!$OMP PARALLEL PRIVATE ( j, k )
537	!$OMP DO
538	DO k = nzb_x, nzt_x
539	DO j = nys_x, nyn_x
540	CALL fft_x( ar(0:nx,j,k), direction )
541	ENDDO
542	ENDDO
543	!$OMP END PARALLEL
544
545	END SUBROUTINE fftxp
546
547	#else
548	SUBROUTINE fftx( ar, direction )
549
550	!------------------------------------------------------------------------------!
551	! Fourier-transformation along x-direction Non parallel version
552	!------------------------------------------------------------------------------!
553
554	IMPLICIT NONE
555
556	CHARACTER (LEN=*) :: direction
557	INTEGER :: i, j, k
558	REAL :: ar(1:nz,0:ny,0:nx)
559
560	!
561	!-- Performing the fft with one of the methods implemented
562	!$OMP PARALLEL PRIVATE ( j, k )
563	!$OMP DO
564	DO k = 1, nz
565	DO j = 0, ny
566	CALL fft_x( ar(k,j,0:nx), direction )
567	ENDDO
568	ENDDO
569	!$OMP END PARALLEL
570
571	END SUBROUTINE fftx
572	#endif
573
574
575	#if defined( __parallel )
576	SUBROUTINE fftyp( ar, direction )
577
578	!------------------------------------------------------------------------------!
579	! Fourier-transformation along y-direction Parallelized version
580	!------------------------------------------------------------------------------!
581
582	IMPLICIT NONE
583
584	CHARACTER (LEN=*) :: direction
585	INTEGER :: i, k
586	REAL :: ar(0:nya,nxl_y:nxr_ya,nzb_y:nzt_ya)
587
588	!
589	!-- Performing the fft with one of the methods implemented
590	!$OMP PARALLEL PRIVATE ( i, k )
591	!$OMP DO
592	DO k = nzb_y, nzt_y
593	DO i = nxl_y, nxr_y
594	CALL fft_y( ar(0:ny,i,k), direction )
595	ENDDO
596	ENDDO
597	!$OMP END PARALLEL
598
599	END SUBROUTINE fftyp
600
601	#else
602	SUBROUTINE ffty( ar, direction )
603
604	!------------------------------------------------------------------------------!
605	! Fourier-transformation along y-direction Non parallel version
606	!------------------------------------------------------------------------------!
607
608	IMPLICIT NONE
609
610	CHARACTER (LEN=*) :: direction
611	INTEGER :: i, k
612	REAL :: ar(1:nz,0:ny,0:nx)
613
614	!
615	!-- Performing the fft with one of the methods implemented
616	!$OMP PARALLEL PRIVATE ( i, k )
617	!$OMP DO
618	DO k = 1, nz
619	DO i = 0, nx
620	CALL fft_y( ar(k,0:ny,i), direction )
621	ENDDO
622	ENDDO
623	!$OMP END PARALLEL
624
625	END SUBROUTINE ffty
626	#endif
627
628	#if defined( __parallel )
629	SUBROUTINE ffty_tr_yx( f_in, work, f_out )
630
631	!------------------------------------------------------------------------------!
632	! Fourier-transformation along y with subsequent transposition y --> x for
633	! a 1d-decomposition along x
634	!
635	! ATTENTION: The performance of this routine is much faster on the NEC-SX6,
636	! if the first index of work_ffty_vec is odd. Otherwise
637	! memory bank conflicts may occur (especially if the index is a
638	! multiple of 128). That's why work_ffty_vec is dimensioned as
639	! 0:ny+1.
640	! Of course, this will not work if users are using an odd number
641	! of gridpoints along y.
642	!------------------------------------------------------------------------------!
643
644	USE control_parameters
645	USE cpulog
646	USE indices
647	USE interfaces
648	USE pegrid
649	USE transpose_indices
650
651	IMPLICIT NONE
652
653	INTEGER :: i, iend, iouter, ir, j, k
654	INTEGER, PARAMETER :: stridex = 4
655
656	REAL, DIMENSION(0:ny,stridex) :: work_ffty
657	#if defined( __nec )
658	REAL, DIMENSION(0:ny+1,1:nz,nxl:nxr) :: work_ffty_vec
659	#endif
660	REAL, DIMENSION(1:nza,0:nya,nxl:nxra) :: f_in
661	REAL, DIMENSION(nnx,1:nza,nys_x:nyn_xa,pdims(1)) :: f_out
662	REAL, DIMENSION(nxl:nxra,1:nza,0:nya) :: work
663
664	!
665	!-- Carry out the FFT along y, where all data are present due to the
666	!-- 1d-decomposition along x. Resort the data in a way that x becomes
667	!-- the first index.
668	CALL cpu_log( log_point_s(7), 'fft_y', 'start' )
669
670	IF ( host(1:3) == 'nec' ) THEN
671	#if defined( __nec )
672	!
673	!-- Code optimized for vector processors
674	!$OMP PARALLEL PRIVATE ( i, j, k )
675	!$OMP DO
676	DO i = nxl, nxr
677
678	DO j = 0, ny
679	DO k = 1, nz
680	work_ffty_vec(j,k,i) = f_in(k,j,i)
681	ENDDO
682	ENDDO
683
684	CALL fft_y_m( work_ffty_vec(:,:,i), ny+1, 'forward' )
685
686	ENDDO
687
688	!$OMP DO
689	DO k = 1, nz
690	DO j = 0, ny
691	DO i = nxl, nxr
692	work(i,k,j) = work_ffty_vec(j,k,i)
693	ENDDO
694	ENDDO
695	ENDDO
696	!$OMP END PARALLEL
697	#endif
698
699	ELSE
700
701	!
702	!-- Cache optimized code.
703	!-- The i-(x-)direction is split into a strided outer loop and an inner
704	!-- loop for better cache performance
705	!$OMP PARALLEL PRIVATE (i,iend,iouter,ir,j,k,work_ffty)
706	!$OMP DO
707	DO iouter = nxl, nxr, stridex
708
709	iend = MIN( iouter+stridex-1, nxr ) ! Upper bound for inner i loop
710
711	DO k = 1, nz
712
713	DO i = iouter, iend
714
715	ir = i-iouter+1 ! counter within a stride
716	DO j = 0, ny
717	work_ffty(j,ir) = f_in(k,j,i)
718	ENDDO
719	!
720	!-- FFT along y
721	CALL fft_y( work_ffty(:,ir), 'forward' )
722
723	ENDDO
724
725	!
726	!-- Resort
727	DO j = 0, ny
728	DO i = iouter, iend
729	work(i,k,j) = work_ffty(j,i-iouter+1)
730	ENDDO
731	ENDDO
732
733	ENDDO
734
735	ENDDO
736	!$OMP END PARALLEL
737
738	ENDIF
739	CALL cpu_log( log_point_s(7), 'fft_y', 'pause' )
740
741	!
742	!-- Transpose array
743	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
744	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
745	CALL MPI_ALLTOALL( work(nxl,1,0), sendrecvcount_xy, MPI_REAL, &
746	f_out(1,1,nys_x,1), sendrecvcount_xy, MPI_REAL, &
747	comm1dx, ierr )
748	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
749
750	END SUBROUTINE ffty_tr_yx
751
752
753	SUBROUTINE tr_xy_ffty( f_in, work, f_out )
754
755	!------------------------------------------------------------------------------!
756	! Transposition x --> y with a subsequent backward Fourier transformation for
757	! a 1d-decomposition along x
758	!------------------------------------------------------------------------------!
759
760	USE control_parameters
761	USE cpulog
762	USE indices
763	USE interfaces
764	USE pegrid
765	USE transpose_indices
766
767	IMPLICIT NONE
768
769	INTEGER :: i, iend, iouter, ir, j, k
770	INTEGER, PARAMETER :: stridex = 4
771
772	REAL, DIMENSION(0:ny,stridex) :: work_ffty
773	#if defined( __nec )
774	REAL, DIMENSION(0:ny+1,1:nz,nxl:nxr) :: work_ffty_vec
775	#endif
776	REAL, DIMENSION(nnx,1:nza,nys_x:nyn_xa,pdims(1)) :: f_in
777	REAL, DIMENSION(1:nza,0:nya,nxl:nxra) :: f_out
778	REAL, DIMENSION(nxl:nxra,1:nza,0:nya) :: work
779
780	!
781	!-- Transpose array
782	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
783	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
784	CALL MPI_ALLTOALL( f_in(1,1,nys_x,1), sendrecvcount_xy, MPI_REAL, &
785	work(nxl,1,0), sendrecvcount_xy, MPI_REAL, &
786	comm1dx, ierr )
787	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
788
789	!
790	!-- Resort the data in a way that y becomes the first index and carry out the
791	!-- backward fft along y.
792	CALL cpu_log( log_point_s(7), 'fft_y', 'continue' )
793
794	IF ( host(1:3) == 'nec' ) THEN
795	#if defined( __nec )
796	!
797	!-- Code optimized for vector processors
798	!$OMP PARALLEL PRIVATE ( i, j, k )
799	!$OMP DO
800	DO k = 1, nz
801	DO j = 0, ny
802	DO i = nxl, nxr
803	work_ffty_vec(j,k,i) = work(i,k,j)
804	ENDDO
805	ENDDO
806	ENDDO
807
808	!$OMP DO
809	DO i = nxl, nxr
810
811	CALL fft_y_m( work_ffty_vec(:,:,i), ny+1, 'backward' )
812
813	DO j = 0, ny
814	DO k = 1, nz
815	f_out(k,j,i) = work_ffty_vec(j,k,i)
816	ENDDO
817	ENDDO
818
819	ENDDO
820	!$OMP END PARALLEL
821	#endif
822
823	ELSE
824
825	!
826	!-- Cache optimized code.
827	!-- The i-(x-)direction is split into a strided outer loop and an inner
828	!-- loop for better cache performance
829	!$OMP PARALLEL PRIVATE ( i, iend, iouter, ir, j, k, work_ffty )
830	!$OMP DO
831	DO iouter = nxl, nxr, stridex
832
833	iend = MIN( iouter+stridex-1, nxr ) ! Upper bound for inner i loop
834
835	DO k = 1, nz
836	!
837	!-- Resort
838	DO j = 0, ny
839	DO i = iouter, iend
840	work_ffty(j,i-iouter+1) = work(i,k,j)
841	ENDDO
842	ENDDO
843
844	DO i = iouter, iend
845
846	!
847	!-- FFT along y
848	ir = i-iouter+1 ! counter within a stride
849	CALL fft_y( work_ffty(:,ir), 'backward' )
850
851	DO j = 0, ny
852	f_out(k,j,i) = work_ffty(j,ir)
853	ENDDO
854	ENDDO
855
856	ENDDO
857
858	ENDDO
859	!$OMP END PARALLEL
860
861	ENDIF
862
863	CALL cpu_log( log_point_s(7), 'fft_y', 'stop' )
864
865	END SUBROUTINE tr_xy_ffty
866
867
868	SUBROUTINE fftx_tri_fftx( ar )
869
870	!------------------------------------------------------------------------------!
871	! FFT along x, solution of the tridiagonal system and backward FFT for
872	! a 1d-decomposition along x
873	!
874	! WARNING: this subroutine may still not work for hybrid parallelization
875	! with OpenMP (for possible necessary changes see the original
876	! routine poisfft_hybrid, developed by Klaus Ketelsen, May 2002)
877	!------------------------------------------------------------------------------!
878
879	USE control_parameters
880	USE cpulog
881	USE grid_variables
882	USE indices
883	USE interfaces
884	USE pegrid
885	USE transpose_indices
886
887	IMPLICIT NONE
888
889	character(len=3) :: myth_char
890
891	INTEGER :: i, j, k, m, n, omp_get_thread_num, tn
892
893	REAL, DIMENSION(0:nx) :: work_fftx
894	REAL, DIMENSION(0:nx,1:nz) :: work_trix
895	REAL, DIMENSION(nnx,1:nza,nys_x:nyn_xa,pdims(1)) :: ar
896	REAL, DIMENSION(:,:,:,:), ALLOCATABLE :: tri
897
898
899	CALL cpu_log( log_point_s(33), 'fft_x + tridia', 'start' )
900
901	ALLOCATE( tri(5,0:nx,0:nz-1,0:threads_per_task-1) )
902
903	tn = 0 ! Default thread number in case of one thread
904	!$OMP PARALLEL DO PRIVATE ( i, j, k, m, n, tn, work_fftx, work_trix )
905	DO j = nys_x, nyn_x
906
907	!$ tn = omp_get_thread_num()
908
909	IF ( host(1:3) == 'nec' ) THEN
910	!
911	!-- Code optimized for vector processors
912	DO k = 1, nz
913
914	m = 0
915	DO n = 1, pdims(1)
916	DO i = 1, nnx_pe( n-1 ) ! WARN: pcoord(i) should be used!!
917	work_trix(m,k) = ar(i,k,j,n)
918	m = m + 1
919	ENDDO
920	ENDDO
921
922	ENDDO
923
924	CALL fft_x_m( work_trix, 'forward' )
925
926	ELSE
927	!
928	!-- Cache optimized code
929	DO k = 1, nz
930
931	m = 0
932	DO n = 1, pdims(1)
933	DO i = 1, nnx_pe( n-1 ) ! WARN: pcoord(i) should be used!!
934	work_fftx(m) = ar(i,k,j,n)
935	m = m + 1
936	ENDDO
937	ENDDO
938
939	CALL fft_x( work_fftx, 'forward' )
940
941	DO i = 0, nx
942	work_trix(i,k) = work_fftx(i)
943	ENDDO
944
945	ENDDO
946
947	ENDIF
948
949	!
950	!-- Solve the linear equation system
951	CALL tridia_1dd( ddx2, ddy2, nx, ny, j, work_trix, tri(:,:,:,tn) )
952
953	IF ( host(1:3) == 'nec' ) THEN
954	!
955	!-- Code optimized for vector processors
956	CALL fft_x_m( work_trix, 'backward' )
957
958	DO k = 1, nz
959
960	m = 0
961	DO n = 1, pdims(1)
962	DO i = 1, nnx_pe( n-1 ) ! WARN: pcoord(i) should be used!!
963	ar(i,k,j,n) = work_trix(m,k)
964	m = m + 1
965	ENDDO
966	ENDDO
967
968	ENDDO
969
970	ELSE
971	!
972	!-- Cache optimized code
973	DO k = 1, nz
974
975	DO i = 0, nx
976	work_fftx(i) = work_trix(i,k)
977	ENDDO
978
979	CALL fft_x( work_fftx, 'backward' )
980
981	m = 0
982	DO n = 1, pdims(1)
983	DO i = 1, nnx_pe( n-1 ) ! WARN: pcoord(i) should be used!!
984	ar(i,k,j,n) = work_fftx(m)
985	m = m + 1
986	ENDDO
987	ENDDO
988
989	ENDDO
990
991	ENDIF
992
993	ENDDO
994
995	DEALLOCATE( tri )
996
997	CALL cpu_log( log_point_s(33), 'fft_x + tridia', 'stop' )
998
999	END SUBROUTINE fftx_tri_fftx
1000
1001
1002	SUBROUTINE fftx_tr_xy( f_in, work, f_out )
1003
1004	!------------------------------------------------------------------------------!
1005	! Fourier-transformation along x with subsequent transposition x --> y for
1006	! a 1d-decomposition along y
1007	!
1008	! ATTENTION: The NEC-branch of this routine may significantly profit from
1009	! further optimizations. So far, performance is much worse than
1010	! for routine ffty_tr_yx (more than three times slower).
1011	!------------------------------------------------------------------------------!
1012
1013	USE control_parameters
1014	USE cpulog
1015	USE indices
1016	USE interfaces
1017	USE pegrid
1018	USE transpose_indices
1019
1020	IMPLICIT NONE
1021
1022	INTEGER :: i, j, k
1023
1024	REAL, DIMENSION(0:nx,1:nz,nys:nyn) :: work_fftx
1025	REAL, DIMENSION(1:nza,nys:nyna,0:nxa) :: f_in
1026	REAL, DIMENSION(nny,1:nza,nxl_y:nxr_ya,pdims(2)) :: f_out
1027	REAL, DIMENSION(nys:nyna,1:nza,0:nxa) :: work
1028
1029	!
1030	!-- Carry out the FFT along x, where all data are present due to the
1031	!-- 1d-decomposition along y. Resort the data in a way that y becomes
1032	!-- the first index.
1033	CALL cpu_log( log_point_s(4), 'fft_x', 'start' )
1034
1035	IF ( host(1:3) == 'nec' ) THEN
1036	!
1037	!-- Code for vector processors
1038	!$OMP PARALLEL PRIVATE ( i, j, k )
1039	!$OMP DO
1040	DO i = 0, nx
1041
1042	DO j = nys, nyn
1043	DO k = 1, nz
1044	work_fftx(i,k,j) = f_in(k,j,i)
1045	ENDDO
1046	ENDDO
1047
1048	ENDDO
1049
1050	!$OMP DO
1051	DO j = nys, nyn
1052
1053	CALL fft_x_m( work_fftx(:,:,j), 'forward' )
1054
1055	DO k = 1, nz
1056	DO i = 0, nx
1057	work(j,k,i) = work_fftx(i,k,j)
1058	ENDDO
1059	ENDDO
1060
1061	ENDDO
1062	!$OMP END PARALLEL
1063
1064	ELSE
1065
1066	!
1067	!-- Cache optimized code (there might be still a potential for better
1068	!-- optimization).
1069	!$OMP PARALLEL PRIVATE (i,j,k,work_fftx)
1070	!$OMP DO
1071	DO i = 0, nx
1072
1073	DO j = nys, nyn
1074	DO k = 1, nz
1075	work_fftx(i,k,j) = f_in(k,j,i)
1076	ENDDO
1077	ENDDO
1078
1079	ENDDO
1080
1081	!$OMP DO
1082	DO j = nys, nyn
1083	DO k = 1, nz
1084
1085	CALL fft_x( work_fftx(0:nx,k,j), 'forward' )
1086
1087	DO i = 0, nx
1088	work(j,k,i) = work_fftx(i,k,j)
1089	ENDDO
1090	ENDDO
1091
1092	ENDDO
1093	!$OMP END PARALLEL
1094
1095	ENDIF
1096	CALL cpu_log( log_point_s(4), 'fft_x', 'pause' )
1097
1098	!
1099	!-- Transpose array
1100	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
1101	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
1102	CALL MPI_ALLTOALL( work(nys,1,0), sendrecvcount_xy, MPI_REAL, &
1103	f_out(1,1,nxl_y,1), sendrecvcount_xy, MPI_REAL, &
1104	comm1dy, ierr )
1105	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
1106
1107	END SUBROUTINE fftx_tr_xy
1108
1109
1110	SUBROUTINE tr_yx_fftx( f_in, work, f_out )
1111
1112	!------------------------------------------------------------------------------!
1113	! Transposition y --> x with a subsequent backward Fourier transformation for
1114	! a 1d-decomposition along x
1115	!------------------------------------------------------------------------------!
1116
1117	USE control_parameters
1118	USE cpulog
1119	USE indices
1120	USE interfaces
1121	USE pegrid
1122	USE transpose_indices
1123
1124	IMPLICIT NONE
1125
1126	INTEGER :: i, j, k
1127
1128	REAL, DIMENSION(0:nx,1:nz,nys:nyn) :: work_fftx
1129	REAL, DIMENSION(nny,1:nza,nxl_y:nxr_ya,pdims(2)) :: f_in
1130	REAL, DIMENSION(1:nza,nys:nyna,0:nxa) :: f_out
1131	REAL, DIMENSION(nys:nyna,1:nza,0:nxa) :: work
1132
1133	!
1134	!-- Transpose array
1135	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
1136	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
1137	CALL MPI_ALLTOALL( f_in(1,1,nxl_y,1), sendrecvcount_xy, MPI_REAL, &
1138	work(nys,1,0), sendrecvcount_xy, MPI_REAL, &
1139	comm1dy, ierr )
1140	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
1141
1142	!
1143	!-- Carry out the FFT along x, where all data are present due to the
1144	!-- 1d-decomposition along y. Resort the data in a way that y becomes
1145	!-- the first index.
1146	CALL cpu_log( log_point_s(4), 'fft_x', 'continue' )
1147
1148	IF ( host(1:3) == 'nec' ) THEN
1149	!
1150	!-- Code optimized for vector processors
1151	!$OMP PARALLEL PRIVATE ( i, j, k )
1152	!$OMP DO
1153	DO j = nys, nyn
1154
1155	DO k = 1, nz
1156	DO i = 0, nx
1157	work_fftx(i,k,j) = work(j,k,i)
1158	ENDDO
1159	ENDDO
1160
1161	CALL fft_x_m( work_fftx(:,:,j), 'backward' )
1162
1163	ENDDO
1164
1165	!$OMP DO
1166	DO i = 0, nx
1167	DO j = nys, nyn
1168	DO k = 1, nz
1169	f_out(k,j,i) = work_fftx(i,k,j)
1170	ENDDO
1171	ENDDO
1172	ENDDO
1173	!$OMP END PARALLEL
1174
1175	ELSE
1176
1177	!
1178	!-- Cache optimized code (there might be still a potential for better
1179	!-- optimization).
1180	!$OMP PARALLEL PRIVATE (i,j,k,work_fftx)
1181	!$OMP DO
1182	DO j = nys, nyn
1183	DO k = 1, nz
1184
1185	DO i = 0, nx
1186	work_fftx(i,k,j) = work(j,k,i)
1187	ENDDO
1188
1189	CALL fft_x( work_fftx(0:nx,k,j), 'backward' )
1190
1191	ENDDO
1192	ENDDO
1193
1194	!$OMP DO
1195	DO i = 0, nx
1196	DO j = nys, nyn
1197	DO k = 1, nz
1198	f_out(k,j,i) = work_fftx(i,k,j)
1199	ENDDO
1200	ENDDO
1201	ENDDO
1202	!$OMP END PARALLEL
1203
1204	ENDIF
1205	CALL cpu_log( log_point_s(4), 'fft_x', 'stop' )
1206
1207	END SUBROUTINE tr_yx_fftx
1208
1209
1210	SUBROUTINE ffty_tri_ffty( ar )
1211
1212	!------------------------------------------------------------------------------!
1213	! FFT along y, solution of the tridiagonal system and backward FFT for
1214	! a 1d-decomposition along y
1215	!
1216	! WARNING: this subroutine may still not work for hybrid parallelization
1217	! with OpenMP (for possible necessary changes see the original
1218	! routine poisfft_hybrid, developed by Klaus Ketelsen, May 2002)
1219	!------------------------------------------------------------------------------!
1220
1221	USE control_parameters
1222	USE cpulog
1223	USE grid_variables
1224	USE indices
1225	USE interfaces
1226	USE pegrid
1227	USE transpose_indices
1228
1229	IMPLICIT NONE
1230
1231	INTEGER :: i, j, k, m, n, omp_get_thread_num, tn
1232
1233	REAL, DIMENSION(0:ny) :: work_ffty
1234	REAL, DIMENSION(0:ny,1:nz) :: work_triy
1235	REAL, DIMENSION(nny,1:nza,nxl_y:nxr_ya,pdims(2)) :: ar
1236	REAL, DIMENSION(:,:,:,:), ALLOCATABLE :: tri
1237
1238
1239	CALL cpu_log( log_point_s(39), 'fft_y + tridia', 'start' )
1240
1241	ALLOCATE( tri(5,0:ny,0:nz-1,0:threads_per_task-1) )
1242
1243	tn = 0 ! Default thread number in case of one thread
1244	!$OMP PARALLEL PRIVATE ( i, j, k, m, n, tn, work_ffty, work_triy )
1245	!$OMP DO
1246	DO i = nxl_y, nxr_y
1247
1248	!$ tn = omp_get_thread_num()
1249
1250	IF ( host(1:3) == 'nec' ) THEN
1251	!
1252	!-- Code optimized for vector processors
1253	DO k = 1, nz
1254
1255	m = 0
1256	DO n = 1, pdims(2)
1257	DO j = 1, nny_pe( n-1 ) ! WARN: pcoord(j) should be used!!
1258	work_triy(m,k) = ar(j,k,i,n)
1259	m = m + 1
1260	ENDDO
1261	ENDDO
1262
1263	ENDDO
1264
1265	CALL fft_y_m( work_triy, ny, 'forward' )
1266
1267	ELSE
1268	!
1269	!-- Cache optimized code
1270	DO k = 1, nz
1271
1272	m = 0
1273	DO n = 1, pdims(2)
1274	DO j = 1, nny_pe( n-1 ) ! WARN: pcoord(j) should be used!!
1275	work_ffty(m) = ar(j,k,i,n)
1276	m = m + 1
1277	ENDDO
1278	ENDDO
1279
1280	CALL fft_y( work_ffty, 'forward' )
1281
1282	DO j = 0, ny
1283	work_triy(j,k) = work_ffty(j)
1284	ENDDO
1285
1286	ENDDO
1287
1288	ENDIF
1289
1290	!
1291	!-- Solve the linear equation system
1292	CALL tridia_1dd( ddy2, ddx2, ny, nx, i, work_triy, tri(:,:,:,tn) )
1293
1294	IF ( host(1:3) == 'nec' ) THEN
1295	!
1296	!-- Code optimized for vector processors
1297	CALL fft_y_m( work_triy, ny, 'backward' )
1298
1299	DO k = 1, nz
1300
1301	m = 0
1302	DO n = 1, pdims(2)
1303	DO j = 1, nny_pe( n-1 ) ! WARN: pcoord(j) should be used!!
1304	ar(j,k,i,n) = work_triy(m,k)
1305	m = m + 1
1306	ENDDO
1307	ENDDO
1308
1309	ENDDO
1310
1311	ELSE
1312	!
1313	!-- Cache optimized code
1314	DO k = 1, nz
1315
1316	DO j = 0, ny
1317	work_ffty(j) = work_triy(j,k)
1318	ENDDO
1319
1320	CALL fft_y( work_ffty, 'backward' )
1321
1322	m = 0
1323	DO n = 1, pdims(2)
1324	DO j = 1, nny_pe( n-1 ) ! WARN: pcoord(j) should be used!!
1325	ar(j,k,i,n) = work_ffty(m)
1326	m = m + 1
1327	ENDDO
1328	ENDDO
1329
1330	ENDDO
1331
1332	ENDIF
1333
1334	ENDDO
1335	!$OMP END PARALLEL
1336
1337	DEALLOCATE( tri )
1338
1339	CALL cpu_log( log_point_s(39), 'fft_y + tridia', 'stop' )
1340
1341	END SUBROUTINE ffty_tri_ffty
1342
1343
1344	SUBROUTINE tridia_1dd( ddx2, ddy2, nx, ny, j, ar, tri )
1345
1346	!------------------------------------------------------------------------------!
1347	! Solves the linear system of equations for a 1d-decomposition along x (see
1348	! tridia)
1349	!
1350	! Attention: when using the intel compiler, array tri must be passed as an
1351	! argument to the contained subroutines. Otherwise addres faults
1352	! will occur.
1353	! On NEC, tri should not be passed (except for routine substi_1dd)
1354	! because this causes very bad performance.
1355	!------------------------------------------------------------------------------!
1356
1357	USE arrays_3d
1358	USE control_parameters
1359
1360	USE pegrid
1361
1362	IMPLICIT NONE
1363
1364	INTEGER :: i, j, k, nnyh, nx, ny, omp_get_thread_num, tn
1365
1366	REAL :: ddx2, ddy2
1367
1368	REAL, DIMENSION(0:nx,1:nz) :: ar
1369	REAL, DIMENSION(0:nx,0:nz-1) :: ar1
1370	REAL, DIMENSION(5,0:nx,0:nz-1) :: tri
1371
1372
1373	nnyh = ( ny + 1 ) / 2
1374
1375	!
1376	!-- Define constant elements of the tridiagonal matrix.
1377	!-- The compiler on SX6 does loop exchange. If 0:nx is a high power of 2,
1378	!-- the exchanged loops create bank conflicts. The following directive
1379	!-- prohibits loop exchange and the loops perform much better.
1380	! tn = omp_get_thread_num()
1381	! WRITE( 120+tn, * ) '+++ id=',myid,' nx=',nx,' thread=', omp_get_thread_num()
1382	! CALL local_flush( 120+tn )
1383	!CDIR NOLOOPCHG
1384	DO k = 0, nz-1
1385	DO i = 0,nx
1386	tri(2,i,k) = ddzu_pres(k+1) * ddzw(k+1)
1387	tri(3,i,k) = ddzu_pres(k+2) * ddzw(k+1)
1388	ENDDO
1389	ENDDO
1390	! WRITE( 120+tn, * ) '+++ id=',myid,' end of first tridia loop thread=', omp_get_thread_num()
1391	! CALL local_flush( 120+tn )
1392
1393	IF ( j <= nnyh ) THEN
1394	#if defined( __lc )
1395	CALL maketri_1dd( j, tri )
1396	#else
1397	CALL maketri_1dd( j )
1398	#endif
1399	ELSE
1400	#if defined( __lc )
1401	CALL maketri_1dd( ny+1-j, tri )
1402	#else
1403	CALL maketri_1dd( ny+1-j )
1404	#endif
1405	ENDIF
1406	#if defined( __lc )
1407	CALL split_1dd( tri )
1408	#else
1409	CALL split_1dd
1410	#endif
1411	CALL substi_1dd( ar, tri )
1412
1413	CONTAINS
1414
1415	#if defined( __lc )
1416	SUBROUTINE maketri_1dd( j, tri )
1417	#else
1418	SUBROUTINE maketri_1dd( j )
1419	#endif
1420
1421	!------------------------------------------------------------------------------!
1422	! computes the i- and j-dependent component of the matrix
1423	!------------------------------------------------------------------------------!
1424
1425	USE constants
1426
1427	IMPLICIT NONE
1428
1429	INTEGER :: i, j, k, nnxh
1430	REAL :: a, c
1431
1432	REAL, DIMENSION(0:nx) :: l
1433
1434	#if defined( __lc )
1435	REAL, DIMENSION(5,0:nx,0:nz-1) :: tri
1436	#endif
1437
1438
1439	nnxh = ( nx + 1 ) / 2
1440	!
1441	!-- Provide the tridiagonal matrix for solution of the Poisson equation in
1442	!-- Fourier space. The coefficients are computed following the method of
1443	!-- Schmidt et al. (DFVLR-Mitteilung 84-15), which departs from Stephan
1444	!-- Siano's original version by discretizing the Poisson equation,
1445	!-- before it is Fourier-transformed
1446	DO i = 0, nx
1447	IF ( i >= 0 .AND. i <= nnxh ) THEN
1448	l(i) = 2.0 * ( 1.0 - COS( ( 2.0 * pi * i ) / &
1449	FLOAT( nx+1 ) ) ) * ddx2 + &
1450	2.0 * ( 1.0 - COS( ( 2.0 * pi * j ) / &
1451	FLOAT( ny+1 ) ) ) * ddy2
1452	ELSE
1453	l(i) = 2.0 * ( 1.0 - COS( ( 2.0 * pi * ( nx+1-i ) ) / &
1454	FLOAT( nx+1 ) ) ) * ddx2 + &
1455	2.0 * ( 1.0 - COS( ( 2.0 * pi * j ) / &
1456	FLOAT( ny+1 ) ) ) * ddy2
1457	ENDIF
1458	ENDDO
1459
1460	DO k = 0, nz-1
1461	DO i = 0, nx
1462	a = -1.0 * ddzu_pres(k+2) * ddzw(k+1)
1463	c = -1.0 * ddzu_pres(k+1) * ddzw(k+1)
1464	tri(1,i,k) = a + c - l(i)
1465	ENDDO
1466	ENDDO
1467	IF ( ibc_p_b == 1 .OR. ibc_p_b == 2 ) THEN
1468	DO i = 0, nx
1469	tri(1,i,0) = tri(1,i,0) + tri(2,i,0)
1470	ENDDO
1471	ENDIF
1472	IF ( ibc_p_t == 1 ) THEN
1473	DO i = 0, nx
1474	tri(1,i,nz-1) = tri(1,i,nz-1) + tri(3,i,nz-1)
1475	ENDDO
1476	ENDIF
1477
1478	END SUBROUTINE maketri_1dd
1479
1480
1481	#if defined( __lc )
1482	SUBROUTINE split_1dd( tri )
1483	#else
1484	SUBROUTINE split_1dd
1485	#endif
1486
1487	!------------------------------------------------------------------------------!
1488	! Splitting of the tridiagonal matrix (Thomas algorithm)
1489	!------------------------------------------------------------------------------!
1490
1491	IMPLICIT NONE
1492
1493	INTEGER :: i, k
1494
1495	#if defined( __lc )
1496	REAL, DIMENSION(5,0:nx,0:nz-1) :: tri
1497	#endif
1498
1499
1500	!
1501	!-- Splitting
1502	DO i = 0, nx
1503	tri(4,i,0) = tri(1,i,0)
1504	ENDDO
1505	DO k = 1, nz-1
1506	DO i = 0, nx
1507	tri(5,i,k) = tri(2,i,k) / tri(4,i,k-1)
1508	tri(4,i,k) = tri(1,i,k) - tri(3,i,k-1) * tri(5,i,k)
1509	ENDDO
1510	ENDDO
1511
1512	END SUBROUTINE split_1dd
1513
1514
1515	SUBROUTINE substi_1dd( ar, tri )
1516
1517	!------------------------------------------------------------------------------!
1518	! Substitution (Forward and Backward) (Thomas algorithm)
1519	!------------------------------------------------------------------------------!
1520
1521	IMPLICIT NONE
1522
1523	INTEGER :: i, k
1524
1525	REAL, DIMENSION(0:nx,nz) :: ar
1526	REAL, DIMENSION(0:nx,0:nz-1) :: ar1
1527	REAL, DIMENSION(5,0:nx,0:nz-1) :: tri
1528
1529	!
1530	!-- Forward substitution
1531	DO i = 0, nx
1532	ar1(i,0) = ar(i,1)
1533	ENDDO
1534	DO k = 1, nz-1
1535	DO i = 0, nx
1536	ar1(i,k) = ar(i,k+1) - tri(5,i,k) * ar1(i,k-1)
1537	ENDDO
1538	ENDDO
1539
1540	!
1541	!-- Backward substitution
1542	DO i = 0, nx
1543	ar(i,nz) = ar1(i,nz-1) / tri(4,i,nz-1)
1544	ENDDO
1545	DO k = nz-2, 0, -1
1546	DO i = 0, nx
1547	ar(i,k+1) = ( ar1(i,k) - tri(3,i,k) * ar(i,k+2) ) &
1548	/ tri(4,i,k)
1549	ENDDO
1550	ENDDO
1551
1552	!
1553	!-- Indices i=0, j=0 correspond to horizontally averaged pressure.
1554	!-- The respective values of ar should be zero at all k-levels if
1555	!-- acceleration of horizontally averaged vertical velocity is zero.
1556	IF ( ibc_p_b == 1 .AND. ibc_p_t == 1 ) THEN
1557	IF ( j == 0 ) THEN
1558	DO k = 1, nz
1559	ar(0,k) = 0.0
1560	ENDDO
1561	ENDIF
1562	ENDIF
1563
1564	END SUBROUTINE substi_1dd
1565
1566	END SUBROUTINE tridia_1dd
1567
1568	#endif
1569
1570	END MODULE poisfft_mod

Note: See TracBrowser for help on using the repository browser.

Context Navigation

source: palm/trunk/SOURCE/poisfft.f90 @ 683

Download in other formats: