Home

Context Navigation

source: palm/trunk/SOURCE/poisfft.f90 @ 1215

Last change on this file since 1215 was 1213, checked in by raasch, 11 years ago
last commit documented
Property svn:keywords set to `Id`
File size: 31.9 KB

Line
1	MODULE poisfft_mod
2
3	!--------------------------------------------------------------------------------!
4	! This file is part of PALM.
5	!
6	! PALM is free software: you can redistribute it and/or modify it under the terms
7	! of the GNU General Public License as published by the Free Software Foundation,
8	! either version 3 of the License, or (at your option) any later version.
9	!
10	! PALM is distributed in the hope that it will be useful, but WITHOUT ANY
11	! WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
12	! A PARTICULAR PURPOSE. See the GNU General Public License for more details.
13	!
14	! You should have received a copy of the GNU General Public License along with
15	! PALM. If not, see <http://www.gnu.org/licenses/>.
16	!
17	! Copyright 1997-2012 Leibniz University Hannover
18	!--------------------------------------------------------------------------------!
19	!
20	! Current revisions:
21	! -----------------
22	!
23	!
24	! Former revisions:
25	! -----------------
26	! $Id: poisfft.f90 1213 2013-08-15 09:03:50Z kanani $
27	!
28	! 1212 2013-08-15 08:46:27Z raasch
29	! tridia routines moved to seperate module tridia_solver
30	!
31	! 1208 2013-08-13 06:41:49Z raasch
32	! acc-update clauses added for "ar" so that ffts other than cufft can also be
33	! used (although they are not ported and will give a poor performance)
34	!
35	! 1111 2013-03-08 23:54:10Z raasch
36	! further openACC porting of non-parallel (MPI) branch:
37	! tridiagonal routines split into extermal subroutines (instead using CONTAINS),
38	! no distinction between parallel/non-parallel in poisfft and tridia any more,
39	! tridia routines moved to end of file because of probable bug in PGI compiler 12.5
40	! (otherwise "invalid device function" is indicated during runtime),
41	! optimization of tridia routines: constant elements and coefficients of tri are
42	! stored in seperate arrays ddzuw and tric, last dimension of tri reduced from 5
43	! to 2,
44	! poisfft_init is now called internally from poisfft, maketri is called from
45	! poisfft_init,
46	! ibc_p_b = 2 removed
47	!
48	! 1106 2013-03-04 05:31:38Z raasch
49	! routines fftx, ffty, fftxp, fftyp removed, calls replaced by fft_x, fft_y,
50	! in the 1D-decomposition routines fft_x, ffty are replaced by fft_x_1d,
51	! fft_y_1d
52	!
53	! 1103 2013-02-20 02:15:53Z raasch
54	! tri, ar, and ar1 arguments in tridia-routines (2d) are removed because they
55	! sometimes cause segmentation faults with intel 12.1 compiler
56	!
57	! 1092 2013-02-02 11:24:22Z raasch
58	! unused variables removed
59	!
60	! 1036 2012-10-22 13:43:42Z raasch
61	! code put under GPL (PALM 3.9)
62	!
63	! 2012-09-21 07:03:55Z raasch
64	! FLOAT type conversion replaced by REAL
65	!
66	! 1003 2012-09-14 14:35:53Z raasch
67	! indices nxa, nya, etc. replaced by nx, ny, etc.
68	!
69	! 940 2012-07-09 14:31:00Z raasch
70	! special handling of tri-array as an argument in tridia_1dd routines switched
71	! off because it caused segmentation faults with intel 12.1 compiler
72	!
73	! 877 2012-04-03 11:21:44Z suehring
74	! Bugfix: Avoid divisions by zero in case of using a 'neumann' bc for the
75	! pressure at the top of the model domain.
76	!
77	! 809 2012-01-30 13:32:58Z maronga
78	! Bugfix: replaced .AND. and .NOT. with && and ! in the preprocessor directives
79	!
80	! 807 2012-01-25 11:53:51Z maronga
81	! New cpp directive "__check" implemented which is used by check_namelist_files
82	! (most of the code is unneeded by check_namelist_files).
83	!
84	! 763 2011-10-06 09:32:09Z suehring
85	! Comment added concerning the last change.
86	!
87	! 761 2011-10-05 17:58:52Z suehring
88	! Bugfix: Avoid divisions by zero in case of using a 'neumann' bc for the
89	! pressure at the top of the model domain.
90	!
91	! 696 2011-03-18 07:03:49Z raasch
92	! work_fftx removed from PRIVATE clauses in fftx_tr_xy and tr_yx_fftx
93	!
94	! 683 2011-02-09 14:25:15Z raasch
95	! openMP parallelization for 2d-domain-decomposition
96	!
97	! 667 2010-12-23 12:06:00Z suehring/gryschka
98	! ddzu replaced by ddzu_pres due to changes in zu(0)
99	!
100	! 622 2010-12-10 08:08:13Z raasch
101	! optional barriers included in order to speed up collective operations
102	!
103	! 377 2009-09-04 11:09:00Z raasch
104	! __lcmuk changed to __lc to avoid problems with Intel compiler on sgi-ice
105	!
106	! 164 2008-05-15 08:46:15Z raasch
107	! Arguments removed from transpose routines
108	!
109	! 128 2007-10-26 13:11:14Z raasch
110	! Bugfix: wavenumber calculation for even nx in routines maketri
111	!
112	! 85 2007-05-11 09:35:14Z raasch
113	! Bugfix: work_fft*_vec removed from some PRIVATE-declarations
114	!
115	! 76 2007-03-29 00:58:32Z raasch
116	! Tridiagonal coefficients adjusted for Neumann boundary conditions both at
117	! the bottom and the top.
118	!
119	! RCS Log replace by Id keyword, revision history cleaned up
120	!
121	! Revision 1.24 2006/08/04 15:00:24 raasch
122	! Default setting of the thread number tn in case of not using OpenMP
123	!
124	! Revision 1.23 2006/02/23 12:48:38 raasch
125	! Additional compiler directive in routine tridia_1dd for preventing loop
126	! exchange on NEC-SX6
127	!
128	! Revision 1.20 2004/04/30 12:38:09 raasch
129	! Parts of former poisfft_hybrid moved to this subroutine,
130	! former subroutine changed to a module, renaming of FFT-subroutines and
131	! -module, FFTs completely substituted by calls of fft_x and fft_y,
132	! NAG fft used in the non-parallel case completely removed, l in maketri
133	! is now a 1d-array, variables passed by modules instead of using parameter
134	! lists, enlarged transposition arrays introduced
135	!
136	! Revision 1.1 1997/07/24 11:24:14 raasch
137	! Initial revision
138	!
139	!
140	! Description:
141	! ------------
142	! See below.
143	!------------------------------------------------------------------------------!
144
145	!--------------------------------------------------------------------------!
146	! poisfft !
147	! !
148	! Original version: Stephan Siano (pois3d) !
149	! !
150	! Institute of Meteorology and Climatology, University of Hannover !
151	! Germany !
152	! !
153	! Version as of July 23,1996 !
154	! !
155	! !
156	! Version for parallel computers: Siegfried Raasch !
157	! !
158	! Version as of July 03,1997 !
159	! !
160	! Solves the Poisson equation with a 2D spectral method !
161	! d^2 p / dx^2 + d^2 p / dy^2 + d^2 p / dz^2 = s !
162	! !
163	! Input: !
164	! real ar contains in the (nnx,nny,nnz) elements, !
165	! starting from the element (1,nys,nxl), the !
166	! values for s !
167	! real work Temporary array !
168	! !
169	! Output: !
170	! real ar contains the solution for p !
171	!--------------------------------------------------------------------------!
172
173	USE fft_xy
174	USE indices
175	USE transpose_indices
176	USE tridia_solver
177
178	IMPLICIT NONE
179
180	LOGICAL, SAVE :: poisfft_initialized = .FALSE.
181
182	PRIVATE
183
184	#if ! defined ( __check )
185	PUBLIC poisfft, poisfft_init
186
187	INTERFACE poisfft
188	MODULE PROCEDURE poisfft
189	END INTERFACE poisfft
190
191	INTERFACE poisfft_init
192	MODULE PROCEDURE poisfft_init
193	END INTERFACE poisfft_init
194	#else
195	PUBLIC poisfft_init
196
197	INTERFACE poisfft_init
198	MODULE PROCEDURE poisfft_init
199	END INTERFACE poisfft_init
200	#endif
201
202	CONTAINS
203
204	SUBROUTINE poisfft_init
205
206	USE arrays_3d, ONLY: ddzu_pres, ddzw
207
208	IMPLICIT NONE
209
210	INTEGER :: k
211
212
213	CALL fft_init
214
215	CALL tridia_init
216
217	poisfft_initialized = .TRUE.
218
219	END SUBROUTINE poisfft_init
220
221
222	#if ! defined ( __check )
223	SUBROUTINE poisfft( ar, work )
224
225	USE control_parameters, ONLY : fft_method
226	USE cpulog
227	USE interfaces
228	USE pegrid
229
230	IMPLICIT NONE
231
232	REAL, DIMENSION(1:nz,nys:nyn,nxl:nxr) :: ar, work
233
234
235	CALL cpu_log( log_point_s(3), 'poisfft', 'start' )
236
237	IF ( .NOT. poisfft_initialized ) CALL poisfft_init
238
239	!
240	!-- Two-dimensional Fourier Transformation in x- and y-direction.
241	IF ( pdims(2) == 1 .AND. pdims(1) > 1 ) THEN
242
243	!
244	!-- 1d-domain-decomposition along x:
245	!-- FFT along y and transposition y --> x
246	CALL ffty_tr_yx( ar, work, ar )
247
248	!
249	!-- FFT along x, solving the tridiagonal system and backward FFT
250	CALL fftx_tri_fftx( ar )
251
252	!
253	!-- Transposition x --> y and backward FFT along y
254	CALL tr_xy_ffty( ar, work, ar )
255
256	ELSEIF ( pdims(1) == 1 .AND. pdims(2) > 1 ) THEN
257
258	!
259	!-- 1d-domain-decomposition along y:
260	!-- FFT along x and transposition x --> y
261	CALL fftx_tr_xy( ar, work, ar )
262
263	!
264	!-- FFT along y, solving the tridiagonal system and backward FFT
265	CALL ffty_tri_ffty( ar )
266
267	!
268	!-- Transposition y --> x and backward FFT along x
269	CALL tr_yx_fftx( ar, work, ar )
270
271	ELSE
272
273	!
274	!-- 2d-domain-decomposition or no decomposition (1 PE run)
275	!-- Transposition z --> x
276	CALL cpu_log( log_point_s(5), 'transpo forward', 'start' )
277	CALL transpose_zx( ar, work, ar )
278	CALL cpu_log( log_point_s(5), 'transpo forward', 'pause' )
279
280	CALL cpu_log( log_point_s(4), 'fft_x', 'start' )
281	IF ( fft_method /= 'system-specific' ) THEN
282	!$acc update host( ar )
283	ENDIF
284	CALL fft_x( ar, 'forward' )
285	IF ( fft_method /= 'system-specific' ) THEN
286	!$acc update device( ar )
287	ENDIF
288	CALL cpu_log( log_point_s(4), 'fft_x', 'pause' )
289
290	!
291	!-- Transposition x --> y
292	CALL cpu_log( log_point_s(5), 'transpo forward', 'continue' )
293	CALL transpose_xy( ar, work, ar )
294	CALL cpu_log( log_point_s(5), 'transpo forward', 'pause' )
295
296	CALL cpu_log( log_point_s(7), 'fft_y', 'start' )
297	IF ( fft_method /= 'system-specific' ) THEN
298	!$acc update host( ar )
299	ENDIF
300	CALL fft_y( ar, 'forward' )
301	IF ( fft_method /= 'system-specific' ) THEN
302	!$acc update device( ar )
303	ENDIF
304	CALL cpu_log( log_point_s(7), 'fft_y', 'pause' )
305
306	!
307	!-- Transposition y --> z
308	CALL cpu_log( log_point_s(5), 'transpo forward', 'continue' )
309	CALL transpose_yz( ar, work, ar )
310	CALL cpu_log( log_point_s(5), 'transpo forward', 'stop' )
311
312	!
313	!-- Solve the tridiagonal equation system along z
314	CALL cpu_log( log_point_s(6), 'tridia', 'start' )
315	CALL tridia_substi( ar )
316	CALL cpu_log( log_point_s(6), 'tridia', 'stop' )
317
318	!
319	!-- Inverse Fourier Transformation
320	!-- Transposition z --> y
321	CALL cpu_log( log_point_s(8), 'transpo invers', 'start' )
322	CALL transpose_zy( ar, work, ar )
323	CALL cpu_log( log_point_s(8), 'transpo invers', 'pause' )
324
325	CALL cpu_log( log_point_s(7), 'fft_y', 'continue' )
326	IF ( fft_method /= 'system-specific' ) THEN
327	!$acc update host( ar )
328	ENDIF
329	CALL fft_y( ar, 'backward' )
330	IF ( fft_method /= 'system-specific' ) THEN
331	!$acc update device( ar )
332	ENDIF
333	CALL cpu_log( log_point_s(7), 'fft_y', 'stop' )
334
335	!
336	!-- Transposition y --> x
337	CALL cpu_log( log_point_s(8), 'transpo invers', 'continue' )
338	CALL transpose_yx( ar, work, ar )
339	CALL cpu_log( log_point_s(8), 'transpo invers', 'pause' )
340
341	CALL cpu_log( log_point_s(4), 'fft_x', 'continue' )
342	IF ( fft_method /= 'system-specific' ) THEN
343	!$acc update host( ar )
344	ENDIF
345	CALL fft_x( ar, 'backward' )
346	IF ( fft_method /= 'system-specific' ) THEN
347	!$acc update device( ar )
348	ENDIF
349	CALL cpu_log( log_point_s(4), 'fft_x', 'stop' )
350
351	!
352	!-- Transposition x --> z
353	CALL cpu_log( log_point_s(8), 'transpo invers', 'continue' )
354	CALL transpose_xz( ar, work, ar )
355	CALL cpu_log( log_point_s(8), 'transpo invers', 'stop' )
356
357	ENDIF
358
359	CALL cpu_log( log_point_s(3), 'poisfft', 'stop' )
360
361	END SUBROUTINE poisfft
362
363
364
365	SUBROUTINE ffty_tr_yx( f_in, work, f_out )
366
367	!------------------------------------------------------------------------------!
368	! Fourier-transformation along y with subsequent transposition y --> x for
369	! a 1d-decomposition along x
370	!
371	! ATTENTION: The performance of this routine is much faster on the NEC-SX6,
372	! if the first index of work_ffty_vec is odd. Otherwise
373	! memory bank conflicts may occur (especially if the index is a
374	! multiple of 128). That's why work_ffty_vec is dimensioned as
375	! 0:ny+1.
376	! Of course, this will not work if users are using an odd number
377	! of gridpoints along y.
378	!------------------------------------------------------------------------------!
379
380	USE control_parameters
381	USE cpulog
382	USE indices
383	USE interfaces
384	USE pegrid
385	USE transpose_indices
386
387	IMPLICIT NONE
388
389	INTEGER :: i, iend, iouter, ir, j, k
390	INTEGER, PARAMETER :: stridex = 4
391
392	REAL, DIMENSION(0:ny,stridex) :: work_ffty
393	#if defined( __nec )
394	REAL, DIMENSION(0:ny+1,1:nz,nxl:nxr) :: work_ffty_vec
395	#endif
396	REAL, DIMENSION(1:nz,0:ny,nxl:nxr) :: f_in
397	REAL, DIMENSION(nnx,1:nz,nys_x:nyn_x,pdims(1)) :: f_out
398	REAL, DIMENSION(nxl:nxr,1:nz,0:ny) :: work
399
400	!
401	!-- Carry out the FFT along y, where all data are present due to the
402	!-- 1d-decomposition along x. Resort the data in a way that x becomes
403	!-- the first index.
404	CALL cpu_log( log_point_s(7), 'fft_y_1d', 'start' )
405
406	IF ( host(1:3) == 'nec' ) THEN
407	#if defined( __nec )
408	!
409	!-- Code optimized for vector processors
410	!$OMP PARALLEL PRIVATE ( i, j, k )
411	!$OMP DO
412	DO i = nxl, nxr
413
414	DO j = 0, ny
415	DO k = 1, nz
416	work_ffty_vec(j,k,i) = f_in(k,j,i)
417	ENDDO
418	ENDDO
419
420	CALL fft_y_m( work_ffty_vec(:,:,i), ny+1, 'forward' )
421
422	ENDDO
423
424	!$OMP DO
425	DO k = 1, nz
426	DO j = 0, ny
427	DO i = nxl, nxr
428	work(i,k,j) = work_ffty_vec(j,k,i)
429	ENDDO
430	ENDDO
431	ENDDO
432	!$OMP END PARALLEL
433	#endif
434
435	ELSE
436
437	!
438	!-- Cache optimized code.
439	!-- The i-(x-)direction is split into a strided outer loop and an inner
440	!-- loop for better cache performance
441	!$OMP PARALLEL PRIVATE (i,iend,iouter,ir,j,k,work_ffty)
442	!$OMP DO
443	DO iouter = nxl, nxr, stridex
444
445	iend = MIN( iouter+stridex-1, nxr ) ! Upper bound for inner i loop
446
447	DO k = 1, nz
448
449	DO i = iouter, iend
450
451	ir = i-iouter+1 ! counter within a stride
452	DO j = 0, ny
453	work_ffty(j,ir) = f_in(k,j,i)
454	ENDDO
455	!
456	!-- FFT along y
457	CALL fft_y_1d( work_ffty(:,ir), 'forward' )
458
459	ENDDO
460
461	!
462	!-- Resort
463	DO j = 0, ny
464	DO i = iouter, iend
465	work(i,k,j) = work_ffty(j,i-iouter+1)
466	ENDDO
467	ENDDO
468
469	ENDDO
470
471	ENDDO
472	!$OMP END PARALLEL
473
474	ENDIF
475	CALL cpu_log( log_point_s(7), 'fft_y_1d', 'pause' )
476
477	!
478	!-- Transpose array
479	#if defined( __parallel )
480	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
481	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
482	CALL MPI_ALLTOALL( work(nxl,1,0), sendrecvcount_xy, MPI_REAL, &
483	f_out(1,1,nys_x,1), sendrecvcount_xy, MPI_REAL, &
484	comm1dx, ierr )
485	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
486	#endif
487
488	END SUBROUTINE ffty_tr_yx
489
490
491	SUBROUTINE tr_xy_ffty( f_in, work, f_out )
492
493	!------------------------------------------------------------------------------!
494	! Transposition x --> y with a subsequent backward Fourier transformation for
495	! a 1d-decomposition along x
496	!------------------------------------------------------------------------------!
497
498	USE control_parameters
499	USE cpulog
500	USE indices
501	USE interfaces
502	USE pegrid
503	USE transpose_indices
504
505	IMPLICIT NONE
506
507	INTEGER :: i, iend, iouter, ir, j, k
508	INTEGER, PARAMETER :: stridex = 4
509
510	REAL, DIMENSION(0:ny,stridex) :: work_ffty
511	#if defined( __nec )
512	REAL, DIMENSION(0:ny+1,1:nz,nxl:nxr) :: work_ffty_vec
513	#endif
514	REAL, DIMENSION(nnx,1:nz,nys_x:nyn_x,pdims(1)) :: f_in
515	REAL, DIMENSION(1:nz,0:ny,nxl:nxr) :: f_out
516	REAL, DIMENSION(nxl:nxr,1:nz,0:ny) :: work
517
518	!
519	!-- Transpose array
520	#if defined( __parallel )
521	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
522	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
523	CALL MPI_ALLTOALL( f_in(1,1,nys_x,1), sendrecvcount_xy, MPI_REAL, &
524	work(nxl,1,0), sendrecvcount_xy, MPI_REAL, &
525	comm1dx, ierr )
526	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
527	#endif
528
529	!
530	!-- Resort the data in a way that y becomes the first index and carry out the
531	!-- backward fft along y.
532	CALL cpu_log( log_point_s(7), 'fft_y_1d', 'continue' )
533
534	IF ( host(1:3) == 'nec' ) THEN
535	#if defined( __nec )
536	!
537	!-- Code optimized for vector processors
538	!$OMP PARALLEL PRIVATE ( i, j, k )
539	!$OMP DO
540	DO k = 1, nz
541	DO j = 0, ny
542	DO i = nxl, nxr
543	work_ffty_vec(j,k,i) = work(i,k,j)
544	ENDDO
545	ENDDO
546	ENDDO
547
548	!$OMP DO
549	DO i = nxl, nxr
550
551	CALL fft_y_m( work_ffty_vec(:,:,i), ny+1, 'backward' )
552
553	DO j = 0, ny
554	DO k = 1, nz
555	f_out(k,j,i) = work_ffty_vec(j,k,i)
556	ENDDO
557	ENDDO
558
559	ENDDO
560	!$OMP END PARALLEL
561	#endif
562
563	ELSE
564
565	!
566	!-- Cache optimized code.
567	!-- The i-(x-)direction is split into a strided outer loop and an inner
568	!-- loop for better cache performance
569	!$OMP PARALLEL PRIVATE ( i, iend, iouter, ir, j, k, work_ffty )
570	!$OMP DO
571	DO iouter = nxl, nxr, stridex
572
573	iend = MIN( iouter+stridex-1, nxr ) ! Upper bound for inner i loop
574
575	DO k = 1, nz
576	!
577	!-- Resort
578	DO j = 0, ny
579	DO i = iouter, iend
580	work_ffty(j,i-iouter+1) = work(i,k,j)
581	ENDDO
582	ENDDO
583
584	DO i = iouter, iend
585
586	!
587	!-- FFT along y
588	ir = i-iouter+1 ! counter within a stride
589	CALL fft_y_1d( work_ffty(:,ir), 'backward' )
590
591	DO j = 0, ny
592	f_out(k,j,i) = work_ffty(j,ir)
593	ENDDO
594	ENDDO
595
596	ENDDO
597
598	ENDDO
599	!$OMP END PARALLEL
600
601	ENDIF
602
603	CALL cpu_log( log_point_s(7), 'fft_y_1d', 'stop' )
604
605	END SUBROUTINE tr_xy_ffty
606
607
608	SUBROUTINE fftx_tri_fftx( ar )
609
610	!------------------------------------------------------------------------------!
611	! FFT along x, solution of the tridiagonal system and backward FFT for
612	! a 1d-decomposition along x
613	!
614	! WARNING: this subroutine may still not work for hybrid parallelization
615	! with OpenMP (for possible necessary changes see the original
616	! routine poisfft_hybrid, developed by Klaus Ketelsen, May 2002)
617	!------------------------------------------------------------------------------!
618
619	USE control_parameters
620	USE cpulog
621	USE grid_variables
622	USE indices
623	USE interfaces
624	USE pegrid
625	USE transpose_indices
626
627	IMPLICIT NONE
628
629	INTEGER :: i, j, k, m, n, omp_get_thread_num, tn
630
631	REAL, DIMENSION(0:nx) :: work_fftx
632	REAL, DIMENSION(0:nx,1:nz) :: work_trix
633	REAL, DIMENSION(nnx,1:nz,nys_x:nyn_x,pdims(1)) :: ar
634	REAL, DIMENSION(:,:,:,:), ALLOCATABLE :: tri
635
636
637	CALL cpu_log( log_point_s(33), 'fft_x_1d + tridia', 'start' )
638
639	ALLOCATE( tri(5,0:nx,0:nz-1,0:threads_per_task-1) )
640
641	tn = 0 ! Default thread number in case of one thread
642	!$OMP PARALLEL DO PRIVATE ( i, j, k, m, n, tn, work_fftx, work_trix )
643	DO j = nys_x, nyn_x
644
645	!$ tn = omp_get_thread_num()
646
647	IF ( host(1:3) == 'nec' ) THEN
648	!
649	!-- Code optimized for vector processors
650	DO k = 1, nz
651
652	m = 0
653	DO n = 1, pdims(1)
654	DO i = 1, nnx
655	work_trix(m,k) = ar(i,k,j,n)
656	m = m + 1
657	ENDDO
658	ENDDO
659
660	ENDDO
661
662	CALL fft_x_m( work_trix, 'forward' )
663
664	ELSE
665	!
666	!-- Cache optimized code
667	DO k = 1, nz
668
669	m = 0
670	DO n = 1, pdims(1)
671	DO i = 1, nnx
672	work_fftx(m) = ar(i,k,j,n)
673	m = m + 1
674	ENDDO
675	ENDDO
676
677	CALL fft_x_1d( work_fftx, 'forward' )
678
679	DO i = 0, nx
680	work_trix(i,k) = work_fftx(i)
681	ENDDO
682
683	ENDDO
684
685	ENDIF
686
687	!
688	!-- Solve the linear equation system
689	CALL tridia_1dd( ddx2, ddy2, nx, ny, j, work_trix, tri(:,:,:,tn) )
690
691	IF ( host(1:3) == 'nec' ) THEN
692	!
693	!-- Code optimized for vector processors
694	CALL fft_x_m( work_trix, 'backward' )
695
696	DO k = 1, nz
697
698	m = 0
699	DO n = 1, pdims(1)
700	DO i = 1, nnx
701	ar(i,k,j,n) = work_trix(m,k)
702	m = m + 1
703	ENDDO
704	ENDDO
705
706	ENDDO
707
708	ELSE
709	!
710	!-- Cache optimized code
711	DO k = 1, nz
712
713	DO i = 0, nx
714	work_fftx(i) = work_trix(i,k)
715	ENDDO
716
717	CALL fft_x_1d( work_fftx, 'backward' )
718
719	m = 0
720	DO n = 1, pdims(1)
721	DO i = 1, nnx
722	ar(i,k,j,n) = work_fftx(m)
723	m = m + 1
724	ENDDO
725	ENDDO
726
727	ENDDO
728
729	ENDIF
730
731	ENDDO
732
733	DEALLOCATE( tri )
734
735	CALL cpu_log( log_point_s(33), 'fft_x_1d + tridia', 'stop' )
736
737	END SUBROUTINE fftx_tri_fftx
738
739
740	SUBROUTINE fftx_tr_xy( f_in, work, f_out )
741
742	!------------------------------------------------------------------------------!
743	! Fourier-transformation along x with subsequent transposition x --> y for
744	! a 1d-decomposition along y
745	!
746	! ATTENTION: The NEC-branch of this routine may significantly profit from
747	! further optimizations. So far, performance is much worse than
748	! for routine ffty_tr_yx (more than three times slower).
749	!------------------------------------------------------------------------------!
750
751	USE control_parameters
752	USE cpulog
753	USE indices
754	USE interfaces
755	USE pegrid
756	USE transpose_indices
757
758	IMPLICIT NONE
759
760	INTEGER :: i, j, k
761
762	REAL, DIMENSION(0:nx,1:nz,nys:nyn) :: work_fftx
763	REAL, DIMENSION(1:nz,nys:nyn,0:nx) :: f_in
764	REAL, DIMENSION(nny,1:nz,nxl_y:nxr_y,pdims(2)) :: f_out
765	REAL, DIMENSION(nys:nyn,1:nz,0:nx) :: work
766
767	!
768	!-- Carry out the FFT along x, where all data are present due to the
769	!-- 1d-decomposition along y. Resort the data in a way that y becomes
770	!-- the first index.
771	CALL cpu_log( log_point_s(4), 'fft_x_1d', 'start' )
772
773	IF ( host(1:3) == 'nec' ) THEN
774	!
775	!-- Code for vector processors
776	!$OMP PARALLEL PRIVATE ( i, j, k )
777	!$OMP DO
778	DO i = 0, nx
779
780	DO j = nys, nyn
781	DO k = 1, nz
782	work_fftx(i,k,j) = f_in(k,j,i)
783	ENDDO
784	ENDDO
785
786	ENDDO
787
788	!$OMP DO
789	DO j = nys, nyn
790
791	CALL fft_x_m( work_fftx(:,:,j), 'forward' )
792
793	DO k = 1, nz
794	DO i = 0, nx
795	work(j,k,i) = work_fftx(i,k,j)
796	ENDDO
797	ENDDO
798
799	ENDDO
800	!$OMP END PARALLEL
801
802	ELSE
803
804	!
805	!-- Cache optimized code (there might be still a potential for better
806	!-- optimization).
807	!$OMP PARALLEL PRIVATE (i,j,k)
808	!$OMP DO
809	DO i = 0, nx
810
811	DO j = nys, nyn
812	DO k = 1, nz
813	work_fftx(i,k,j) = f_in(k,j,i)
814	ENDDO
815	ENDDO
816
817	ENDDO
818
819	!$OMP DO
820	DO j = nys, nyn
821	DO k = 1, nz
822
823	CALL fft_x_1d( work_fftx(0:nx,k,j), 'forward' )
824
825	DO i = 0, nx
826	work(j,k,i) = work_fftx(i,k,j)
827	ENDDO
828	ENDDO
829
830	ENDDO
831	!$OMP END PARALLEL
832
833	ENDIF
834	CALL cpu_log( log_point_s(4), 'fft_x_1d', 'pause' )
835
836	!
837	!-- Transpose array
838	#if defined( __parallel )
839	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
840	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
841	CALL MPI_ALLTOALL( work(nys,1,0), sendrecvcount_xy, MPI_REAL, &
842	f_out(1,1,nxl_y,1), sendrecvcount_xy, MPI_REAL, &
843	comm1dy, ierr )
844	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
845	#endif
846
847	END SUBROUTINE fftx_tr_xy
848
849
850	SUBROUTINE tr_yx_fftx( f_in, work, f_out )
851
852	!------------------------------------------------------------------------------!
853	! Transposition y --> x with a subsequent backward Fourier transformation for
854	! a 1d-decomposition along x
855	!------------------------------------------------------------------------------!
856
857	USE control_parameters
858	USE cpulog
859	USE indices
860	USE interfaces
861	USE pegrid
862	USE transpose_indices
863
864	IMPLICIT NONE
865
866	INTEGER :: i, j, k
867
868	REAL, DIMENSION(0:nx,1:nz,nys:nyn) :: work_fftx
869	REAL, DIMENSION(nny,1:nz,nxl_y:nxr_y,pdims(2)) :: f_in
870	REAL, DIMENSION(1:nz,nys:nyn,0:nx) :: f_out
871	REAL, DIMENSION(nys:nyn,1:nz,0:nx) :: work
872
873	!
874	!-- Transpose array
875	#if defined( __parallel )
876	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
877	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
878	CALL MPI_ALLTOALL( f_in(1,1,nxl_y,1), sendrecvcount_xy, MPI_REAL, &
879	work(nys,1,0), sendrecvcount_xy, MPI_REAL, &
880	comm1dy, ierr )
881	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
882	#endif
883
884	!
885	!-- Carry out the FFT along x, where all data are present due to the
886	!-- 1d-decomposition along y. Resort the data in a way that y becomes
887	!-- the first index.
888	CALL cpu_log( log_point_s(4), 'fft_x_1d', 'continue' )
889
890	IF ( host(1:3) == 'nec' ) THEN
891	!
892	!-- Code optimized for vector processors
893	!$OMP PARALLEL PRIVATE ( i, j, k )
894	!$OMP DO
895	DO j = nys, nyn
896
897	DO k = 1, nz
898	DO i = 0, nx
899	work_fftx(i,k,j) = work(j,k,i)
900	ENDDO
901	ENDDO
902
903	CALL fft_x_m( work_fftx(:,:,j), 'backward' )
904
905	ENDDO
906
907	!$OMP DO
908	DO i = 0, nx
909	DO j = nys, nyn
910	DO k = 1, nz
911	f_out(k,j,i) = work_fftx(i,k,j)
912	ENDDO
913	ENDDO
914	ENDDO
915	!$OMP END PARALLEL
916
917	ELSE
918
919	!
920	!-- Cache optimized code (there might be still a potential for better
921	!-- optimization).
922	!$OMP PARALLEL PRIVATE (i,j,k)
923	!$OMP DO
924	DO j = nys, nyn
925	DO k = 1, nz
926
927	DO i = 0, nx
928	work_fftx(i,k,j) = work(j,k,i)
929	ENDDO
930
931	CALL fft_x_1d( work_fftx(0:nx,k,j), 'backward' )
932
933	ENDDO
934	ENDDO
935
936	!$OMP DO
937	DO i = 0, nx
938	DO j = nys, nyn
939	DO k = 1, nz
940	f_out(k,j,i) = work_fftx(i,k,j)
941	ENDDO
942	ENDDO
943	ENDDO
944	!$OMP END PARALLEL
945
946	ENDIF
947	CALL cpu_log( log_point_s(4), 'fft_x_1d', 'stop' )
948
949	END SUBROUTINE tr_yx_fftx
950
951
952	SUBROUTINE ffty_tri_ffty( ar )
953
954	!------------------------------------------------------------------------------!
955	! FFT along y, solution of the tridiagonal system and backward FFT for
956	! a 1d-decomposition along y
957	!
958	! WARNING: this subroutine may still not work for hybrid parallelization
959	! with OpenMP (for possible necessary changes see the original
960	! routine poisfft_hybrid, developed by Klaus Ketelsen, May 2002)
961	!------------------------------------------------------------------------------!
962
963	USE control_parameters
964	USE cpulog
965	USE grid_variables
966	USE indices
967	USE interfaces
968	USE pegrid
969	USE transpose_indices
970
971	IMPLICIT NONE
972
973	INTEGER :: i, j, k, m, n, omp_get_thread_num, tn
974
975	REAL, DIMENSION(0:ny) :: work_ffty
976	REAL, DIMENSION(0:ny,1:nz) :: work_triy
977	REAL, DIMENSION(nny,1:nz,nxl_y:nxr_y,pdims(2)) :: ar
978	REAL, DIMENSION(:,:,:,:), ALLOCATABLE :: tri
979
980
981	CALL cpu_log( log_point_s(39), 'fft_y_1d + tridia', 'start' )
982
983	ALLOCATE( tri(5,0:ny,0:nz-1,0:threads_per_task-1) )
984
985	tn = 0 ! Default thread number in case of one thread
986	!$OMP PARALLEL DO PRIVATE ( i, j, k, m, n, tn, work_ffty, work_triy )
987	DO i = nxl_y, nxr_y
988
989	!$ tn = omp_get_thread_num()
990
991	IF ( host(1:3) == 'nec' ) THEN
992	!
993	!-- Code optimized for vector processors
994	DO k = 1, nz
995
996	m = 0
997	DO n = 1, pdims(2)
998	DO j = 1, nny
999	work_triy(m,k) = ar(j,k,i,n)
1000	m = m + 1
1001	ENDDO
1002	ENDDO
1003
1004	ENDDO
1005
1006	CALL fft_y_m( work_triy, ny, 'forward' )
1007
1008	ELSE
1009	!
1010	!-- Cache optimized code
1011	DO k = 1, nz
1012
1013	m = 0
1014	DO n = 1, pdims(2)
1015	DO j = 1, nny
1016	work_ffty(m) = ar(j,k,i,n)
1017	m = m + 1
1018	ENDDO
1019	ENDDO
1020
1021	CALL fft_y_1d( work_ffty, 'forward' )
1022
1023	DO j = 0, ny
1024	work_triy(j,k) = work_ffty(j)
1025	ENDDO
1026
1027	ENDDO
1028
1029	ENDIF
1030
1031	!
1032	!-- Solve the linear equation system
1033	CALL tridia_1dd( ddy2, ddx2, ny, nx, i, work_triy, tri(:,:,:,tn) )
1034
1035	IF ( host(1:3) == 'nec' ) THEN
1036	!
1037	!-- Code optimized for vector processors
1038	CALL fft_y_m( work_triy, ny, 'backward' )
1039
1040	DO k = 1, nz
1041
1042	m = 0
1043	DO n = 1, pdims(2)
1044	DO j = 1, nny
1045	ar(j,k,i,n) = work_triy(m,k)
1046	m = m + 1
1047	ENDDO
1048	ENDDO
1049
1050	ENDDO
1051
1052	ELSE
1053	!
1054	!-- Cache optimized code
1055	DO k = 1, nz
1056
1057	DO j = 0, ny
1058	work_ffty(j) = work_triy(j,k)
1059	ENDDO
1060
1061	CALL fft_y_1d( work_ffty, 'backward' )
1062
1063	m = 0
1064	DO n = 1, pdims(2)
1065	DO j = 1, nny
1066	ar(j,k,i,n) = work_ffty(m)
1067	m = m + 1
1068	ENDDO
1069	ENDDO
1070
1071	ENDDO
1072
1073	ENDIF
1074
1075	ENDDO
1076
1077	DEALLOCATE( tri )
1078
1079	CALL cpu_log( log_point_s(39), 'fft_y_1d + tridia', 'stop' )
1080
1081	END SUBROUTINE ffty_tri_ffty
1082
1083	#endif
1084
1085	END MODULE poisfft_mod

Note: See TracBrowser for help on using the repository browser.

Download in other formats:

| Impressum | ©Leibniz Universität Hannover |