Home

Context Navigation

source: palm/trunk/SOURCE/poisfft.f90 @ 1212

Last change on this file since 1212 was 1212, checked in by raasch, 11 years ago
tridia-solver moved to seperate module; the tridiagonal matrix coefficients of array tri are calculated only once at the beginning
Property svn:keywords set to `Id`
File size: 31.9 KB

Line
1	MODULE poisfft_mod
2
3	!--------------------------------------------------------------------------------!
4	! This file is part of PALM.
5	!
6	! PALM is free software: you can redistribute it and/or modify it under the terms
7	! of the GNU General Public License as published by the Free Software Foundation,
8	! either version 3 of the License, or (at your option) any later version.
9	!
10	! PALM is distributed in the hope that it will be useful, but WITHOUT ANY
11	! WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
12	! A PARTICULAR PURPOSE. See the GNU General Public License for more details.
13	!
14	! You should have received a copy of the GNU General Public License along with
15	! PALM. If not, see <http://www.gnu.org/licenses/>.
16	!
17	! Copyright 1997-2012 Leibniz University Hannover
18	!--------------------------------------------------------------------------------!
19	!
20	! Current revisions:
21	! -----------------
22	! tridia routines moved to seperate module tridia_solver
23	!
24	! Former revisions:
25	! -----------------
26	! $Id: poisfft.f90 1212 2013-08-15 08:46:27Z raasch $
27	!
28	! 1208 2013-08-13 06:41:49Z raasch
29	! acc-update clauses added for "ar" so that ffts other than cufft can also be
30	! used (although they are not ported and will give a poor performance)
31	!
32	! 1111 2013-03-08 23:54:10Z raasch
33	! further openACC porting of non-parallel (MPI) branch:
34	! tridiagonal routines split into extermal subroutines (instead using CONTAINS),
35	! no distinction between parallel/non-parallel in poisfft and tridia any more,
36	! tridia routines moved to end of file because of probable bug in PGI compiler 12.5
37	! (otherwise "invalid device function" is indicated during runtime),
38	! optimization of tridia routines: constant elements and coefficients of tri are
39	! stored in seperate arrays ddzuw and tric, last dimension of tri reduced from 5
40	! to 2,
41	! poisfft_init is now called internally from poisfft, maketri is called from
42	! poisfft_init,
43	! ibc_p_b = 2 removed
44	!
45	! 1106 2013-03-04 05:31:38Z raasch
46	! routines fftx, ffty, fftxp, fftyp removed, calls replaced by fft_x, fft_y,
47	! in the 1D-decomposition routines fft_x, ffty are replaced by fft_x_1d,
48	! fft_y_1d
49	!
50	! 1103 2013-02-20 02:15:53Z raasch
51	! tri, ar, and ar1 arguments in tridia-routines (2d) are removed because they
52	! sometimes cause segmentation faults with intel 12.1 compiler
53	!
54	! 1092 2013-02-02 11:24:22Z raasch
55	! unused variables removed
56	!
57	! 1036 2012-10-22 13:43:42Z raasch
58	! code put under GPL (PALM 3.9)
59	!
60	! 2012-09-21 07:03:55Z raasch
61	! FLOAT type conversion replaced by REAL
62	!
63	! 1003 2012-09-14 14:35:53Z raasch
64	! indices nxa, nya, etc. replaced by nx, ny, etc.
65	!
66	! 940 2012-07-09 14:31:00Z raasch
67	! special handling of tri-array as an argument in tridia_1dd routines switched
68	! off because it caused segmentation faults with intel 12.1 compiler
69	!
70	! 877 2012-04-03 11:21:44Z suehring
71	! Bugfix: Avoid divisions by zero in case of using a 'neumann' bc for the
72	! pressure at the top of the model domain.
73	!
74	! 809 2012-01-30 13:32:58Z maronga
75	! Bugfix: replaced .AND. and .NOT. with && and ! in the preprocessor directives
76	!
77	! 807 2012-01-25 11:53:51Z maronga
78	! New cpp directive "__check" implemented which is used by check_namelist_files
79	! (most of the code is unneeded by check_namelist_files).
80	!
81	! 763 2011-10-06 09:32:09Z suehring
82	! Comment added concerning the last change.
83	!
84	! 761 2011-10-05 17:58:52Z suehring
85	! Bugfix: Avoid divisions by zero in case of using a 'neumann' bc for the
86	! pressure at the top of the model domain.
87	!
88	! 696 2011-03-18 07:03:49Z raasch
89	! work_fftx removed from PRIVATE clauses in fftx_tr_xy and tr_yx_fftx
90	!
91	! 683 2011-02-09 14:25:15Z raasch
92	! openMP parallelization for 2d-domain-decomposition
93	!
94	! 667 2010-12-23 12:06:00Z suehring/gryschka
95	! ddzu replaced by ddzu_pres due to changes in zu(0)
96	!
97	! 622 2010-12-10 08:08:13Z raasch
98	! optional barriers included in order to speed up collective operations
99	!
100	! 377 2009-09-04 11:09:00Z raasch
101	! __lcmuk changed to __lc to avoid problems with Intel compiler on sgi-ice
102	!
103	! 164 2008-05-15 08:46:15Z raasch
104	! Arguments removed from transpose routines
105	!
106	! 128 2007-10-26 13:11:14Z raasch
107	! Bugfix: wavenumber calculation for even nx in routines maketri
108	!
109	! 85 2007-05-11 09:35:14Z raasch
110	! Bugfix: work_fft*_vec removed from some PRIVATE-declarations
111	!
112	! 76 2007-03-29 00:58:32Z raasch
113	! Tridiagonal coefficients adjusted for Neumann boundary conditions both at
114	! the bottom and the top.
115	!
116	! RCS Log replace by Id keyword, revision history cleaned up
117	!
118	! Revision 1.24 2006/08/04 15:00:24 raasch
119	! Default setting of the thread number tn in case of not using OpenMP
120	!
121	! Revision 1.23 2006/02/23 12:48:38 raasch
122	! Additional compiler directive in routine tridia_1dd for preventing loop
123	! exchange on NEC-SX6
124	!
125	! Revision 1.20 2004/04/30 12:38:09 raasch
126	! Parts of former poisfft_hybrid moved to this subroutine,
127	! former subroutine changed to a module, renaming of FFT-subroutines and
128	! -module, FFTs completely substituted by calls of fft_x and fft_y,
129	! NAG fft used in the non-parallel case completely removed, l in maketri
130	! is now a 1d-array, variables passed by modules instead of using parameter
131	! lists, enlarged transposition arrays introduced
132	!
133	! Revision 1.1 1997/07/24 11:24:14 raasch
134	! Initial revision
135	!
136	!
137	! Description:
138	! ------------
139	! See below.
140	!------------------------------------------------------------------------------!
141
142	!--------------------------------------------------------------------------!
143	! poisfft !
144	! !
145	! Original version: Stephan Siano (pois3d) !
146	! !
147	! Institute of Meteorology and Climatology, University of Hannover !
148	! Germany !
149	! !
150	! Version as of July 23,1996 !
151	! !
152	! !
153	! Version for parallel computers: Siegfried Raasch !
154	! !
155	! Version as of July 03,1997 !
156	! !
157	! Solves the Poisson equation with a 2D spectral method !
158	! d^2 p / dx^2 + d^2 p / dy^2 + d^2 p / dz^2 = s !
159	! !
160	! Input: !
161	! real ar contains in the (nnx,nny,nnz) elements, !
162	! starting from the element (1,nys,nxl), the !
163	! values for s !
164	! real work Temporary array !
165	! !
166	! Output: !
167	! real ar contains the solution for p !
168	!--------------------------------------------------------------------------!
169
170	USE fft_xy
171	USE indices
172	USE transpose_indices
173	USE tridia_solver
174
175	IMPLICIT NONE
176
177	LOGICAL, SAVE :: poisfft_initialized = .FALSE.
178
179	PRIVATE
180
181	#if ! defined ( __check )
182	PUBLIC poisfft, poisfft_init
183
184	INTERFACE poisfft
185	MODULE PROCEDURE poisfft
186	END INTERFACE poisfft
187
188	INTERFACE poisfft_init
189	MODULE PROCEDURE poisfft_init
190	END INTERFACE poisfft_init
191	#else
192	PUBLIC poisfft_init
193
194	INTERFACE poisfft_init
195	MODULE PROCEDURE poisfft_init
196	END INTERFACE poisfft_init
197	#endif
198
199	CONTAINS
200
201	SUBROUTINE poisfft_init
202
203	USE arrays_3d, ONLY: ddzu_pres, ddzw
204
205	IMPLICIT NONE
206
207	INTEGER :: k
208
209
210	CALL fft_init
211
212	CALL tridia_init
213
214	poisfft_initialized = .TRUE.
215
216	END SUBROUTINE poisfft_init
217
218
219	#if ! defined ( __check )
220	SUBROUTINE poisfft( ar, work )
221
222	USE control_parameters, ONLY : fft_method
223	USE cpulog
224	USE interfaces
225	USE pegrid
226
227	IMPLICIT NONE
228
229	REAL, DIMENSION(1:nz,nys:nyn,nxl:nxr) :: ar, work
230
231
232	CALL cpu_log( log_point_s(3), 'poisfft', 'start' )
233
234	IF ( .NOT. poisfft_initialized ) CALL poisfft_init
235
236	!
237	!-- Two-dimensional Fourier Transformation in x- and y-direction.
238	IF ( pdims(2) == 1 .AND. pdims(1) > 1 ) THEN
239
240	!
241	!-- 1d-domain-decomposition along x:
242	!-- FFT along y and transposition y --> x
243	CALL ffty_tr_yx( ar, work, ar )
244
245	!
246	!-- FFT along x, solving the tridiagonal system and backward FFT
247	CALL fftx_tri_fftx( ar )
248
249	!
250	!-- Transposition x --> y and backward FFT along y
251	CALL tr_xy_ffty( ar, work, ar )
252
253	ELSEIF ( pdims(1) == 1 .AND. pdims(2) > 1 ) THEN
254
255	!
256	!-- 1d-domain-decomposition along y:
257	!-- FFT along x and transposition x --> y
258	CALL fftx_tr_xy( ar, work, ar )
259
260	!
261	!-- FFT along y, solving the tridiagonal system and backward FFT
262	CALL ffty_tri_ffty( ar )
263
264	!
265	!-- Transposition y --> x and backward FFT along x
266	CALL tr_yx_fftx( ar, work, ar )
267
268	ELSE
269
270	!
271	!-- 2d-domain-decomposition or no decomposition (1 PE run)
272	!-- Transposition z --> x
273	CALL cpu_log( log_point_s(5), 'transpo forward', 'start' )
274	CALL transpose_zx( ar, work, ar )
275	CALL cpu_log( log_point_s(5), 'transpo forward', 'pause' )
276
277	CALL cpu_log( log_point_s(4), 'fft_x', 'start' )
278	IF ( fft_method /= 'system-specific' ) THEN
279	!$acc update host( ar )
280	ENDIF
281	CALL fft_x( ar, 'forward' )
282	IF ( fft_method /= 'system-specific' ) THEN
283	!$acc update device( ar )
284	ENDIF
285	CALL cpu_log( log_point_s(4), 'fft_x', 'pause' )
286
287	!
288	!-- Transposition x --> y
289	CALL cpu_log( log_point_s(5), 'transpo forward', 'continue' )
290	CALL transpose_xy( ar, work, ar )
291	CALL cpu_log( log_point_s(5), 'transpo forward', 'pause' )
292
293	CALL cpu_log( log_point_s(7), 'fft_y', 'start' )
294	IF ( fft_method /= 'system-specific' ) THEN
295	!$acc update host( ar )
296	ENDIF
297	CALL fft_y( ar, 'forward' )
298	IF ( fft_method /= 'system-specific' ) THEN
299	!$acc update device( ar )
300	ENDIF
301	CALL cpu_log( log_point_s(7), 'fft_y', 'pause' )
302
303	!
304	!-- Transposition y --> z
305	CALL cpu_log( log_point_s(5), 'transpo forward', 'continue' )
306	CALL transpose_yz( ar, work, ar )
307	CALL cpu_log( log_point_s(5), 'transpo forward', 'stop' )
308
309	!
310	!-- Solve the tridiagonal equation system along z
311	CALL cpu_log( log_point_s(6), 'tridia', 'start' )
312	CALL tridia_substi( ar )
313	CALL cpu_log( log_point_s(6), 'tridia', 'stop' )
314
315	!
316	!-- Inverse Fourier Transformation
317	!-- Transposition z --> y
318	CALL cpu_log( log_point_s(8), 'transpo invers', 'start' )
319	CALL transpose_zy( ar, work, ar )
320	CALL cpu_log( log_point_s(8), 'transpo invers', 'pause' )
321
322	CALL cpu_log( log_point_s(7), 'fft_y', 'continue' )
323	IF ( fft_method /= 'system-specific' ) THEN
324	!$acc update host( ar )
325	ENDIF
326	CALL fft_y( ar, 'backward' )
327	IF ( fft_method /= 'system-specific' ) THEN
328	!$acc update device( ar )
329	ENDIF
330	CALL cpu_log( log_point_s(7), 'fft_y', 'stop' )
331
332	!
333	!-- Transposition y --> x
334	CALL cpu_log( log_point_s(8), 'transpo invers', 'continue' )
335	CALL transpose_yx( ar, work, ar )
336	CALL cpu_log( log_point_s(8), 'transpo invers', 'pause' )
337
338	CALL cpu_log( log_point_s(4), 'fft_x', 'continue' )
339	IF ( fft_method /= 'system-specific' ) THEN
340	!$acc update host( ar )
341	ENDIF
342	CALL fft_x( ar, 'backward' )
343	IF ( fft_method /= 'system-specific' ) THEN
344	!$acc update device( ar )
345	ENDIF
346	CALL cpu_log( log_point_s(4), 'fft_x', 'stop' )
347
348	!
349	!-- Transposition x --> z
350	CALL cpu_log( log_point_s(8), 'transpo invers', 'continue' )
351	CALL transpose_xz( ar, work, ar )
352	CALL cpu_log( log_point_s(8), 'transpo invers', 'stop' )
353
354	ENDIF
355
356	CALL cpu_log( log_point_s(3), 'poisfft', 'stop' )
357
358	END SUBROUTINE poisfft
359
360
361
362	SUBROUTINE ffty_tr_yx( f_in, work, f_out )
363
364	!------------------------------------------------------------------------------!
365	! Fourier-transformation along y with subsequent transposition y --> x for
366	! a 1d-decomposition along x
367	!
368	! ATTENTION: The performance of this routine is much faster on the NEC-SX6,
369	! if the first index of work_ffty_vec is odd. Otherwise
370	! memory bank conflicts may occur (especially if the index is a
371	! multiple of 128). That's why work_ffty_vec is dimensioned as
372	! 0:ny+1.
373	! Of course, this will not work if users are using an odd number
374	! of gridpoints along y.
375	!------------------------------------------------------------------------------!
376
377	USE control_parameters
378	USE cpulog
379	USE indices
380	USE interfaces
381	USE pegrid
382	USE transpose_indices
383
384	IMPLICIT NONE
385
386	INTEGER :: i, iend, iouter, ir, j, k
387	INTEGER, PARAMETER :: stridex = 4
388
389	REAL, DIMENSION(0:ny,stridex) :: work_ffty
390	#if defined( __nec )
391	REAL, DIMENSION(0:ny+1,1:nz,nxl:nxr) :: work_ffty_vec
392	#endif
393	REAL, DIMENSION(1:nz,0:ny,nxl:nxr) :: f_in
394	REAL, DIMENSION(nnx,1:nz,nys_x:nyn_x,pdims(1)) :: f_out
395	REAL, DIMENSION(nxl:nxr,1:nz,0:ny) :: work
396
397	!
398	!-- Carry out the FFT along y, where all data are present due to the
399	!-- 1d-decomposition along x. Resort the data in a way that x becomes
400	!-- the first index.
401	CALL cpu_log( log_point_s(7), 'fft_y_1d', 'start' )
402
403	IF ( host(1:3) == 'nec' ) THEN
404	#if defined( __nec )
405	!
406	!-- Code optimized for vector processors
407	!$OMP PARALLEL PRIVATE ( i, j, k )
408	!$OMP DO
409	DO i = nxl, nxr
410
411	DO j = 0, ny
412	DO k = 1, nz
413	work_ffty_vec(j,k,i) = f_in(k,j,i)
414	ENDDO
415	ENDDO
416
417	CALL fft_y_m( work_ffty_vec(:,:,i), ny+1, 'forward' )
418
419	ENDDO
420
421	!$OMP DO
422	DO k = 1, nz
423	DO j = 0, ny
424	DO i = nxl, nxr
425	work(i,k,j) = work_ffty_vec(j,k,i)
426	ENDDO
427	ENDDO
428	ENDDO
429	!$OMP END PARALLEL
430	#endif
431
432	ELSE
433
434	!
435	!-- Cache optimized code.
436	!-- The i-(x-)direction is split into a strided outer loop and an inner
437	!-- loop for better cache performance
438	!$OMP PARALLEL PRIVATE (i,iend,iouter,ir,j,k,work_ffty)
439	!$OMP DO
440	DO iouter = nxl, nxr, stridex
441
442	iend = MIN( iouter+stridex-1, nxr ) ! Upper bound for inner i loop
443
444	DO k = 1, nz
445
446	DO i = iouter, iend
447
448	ir = i-iouter+1 ! counter within a stride
449	DO j = 0, ny
450	work_ffty(j,ir) = f_in(k,j,i)
451	ENDDO
452	!
453	!-- FFT along y
454	CALL fft_y_1d( work_ffty(:,ir), 'forward' )
455
456	ENDDO
457
458	!
459	!-- Resort
460	DO j = 0, ny
461	DO i = iouter, iend
462	work(i,k,j) = work_ffty(j,i-iouter+1)
463	ENDDO
464	ENDDO
465
466	ENDDO
467
468	ENDDO
469	!$OMP END PARALLEL
470
471	ENDIF
472	CALL cpu_log( log_point_s(7), 'fft_y_1d', 'pause' )
473
474	!
475	!-- Transpose array
476	#if defined( __parallel )
477	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
478	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
479	CALL MPI_ALLTOALL( work(nxl,1,0), sendrecvcount_xy, MPI_REAL, &
480	f_out(1,1,nys_x,1), sendrecvcount_xy, MPI_REAL, &
481	comm1dx, ierr )
482	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
483	#endif
484
485	END SUBROUTINE ffty_tr_yx
486
487
488	SUBROUTINE tr_xy_ffty( f_in, work, f_out )
489
490	!------------------------------------------------------------------------------!
491	! Transposition x --> y with a subsequent backward Fourier transformation for
492	! a 1d-decomposition along x
493	!------------------------------------------------------------------------------!
494
495	USE control_parameters
496	USE cpulog
497	USE indices
498	USE interfaces
499	USE pegrid
500	USE transpose_indices
501
502	IMPLICIT NONE
503
504	INTEGER :: i, iend, iouter, ir, j, k
505	INTEGER, PARAMETER :: stridex = 4
506
507	REAL, DIMENSION(0:ny,stridex) :: work_ffty
508	#if defined( __nec )
509	REAL, DIMENSION(0:ny+1,1:nz,nxl:nxr) :: work_ffty_vec
510	#endif
511	REAL, DIMENSION(nnx,1:nz,nys_x:nyn_x,pdims(1)) :: f_in
512	REAL, DIMENSION(1:nz,0:ny,nxl:nxr) :: f_out
513	REAL, DIMENSION(nxl:nxr,1:nz,0:ny) :: work
514
515	!
516	!-- Transpose array
517	#if defined( __parallel )
518	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
519	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
520	CALL MPI_ALLTOALL( f_in(1,1,nys_x,1), sendrecvcount_xy, MPI_REAL, &
521	work(nxl,1,0), sendrecvcount_xy, MPI_REAL, &
522	comm1dx, ierr )
523	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
524	#endif
525
526	!
527	!-- Resort the data in a way that y becomes the first index and carry out the
528	!-- backward fft along y.
529	CALL cpu_log( log_point_s(7), 'fft_y_1d', 'continue' )
530
531	IF ( host(1:3) == 'nec' ) THEN
532	#if defined( __nec )
533	!
534	!-- Code optimized for vector processors
535	!$OMP PARALLEL PRIVATE ( i, j, k )
536	!$OMP DO
537	DO k = 1, nz
538	DO j = 0, ny
539	DO i = nxl, nxr
540	work_ffty_vec(j,k,i) = work(i,k,j)
541	ENDDO
542	ENDDO
543	ENDDO
544
545	!$OMP DO
546	DO i = nxl, nxr
547
548	CALL fft_y_m( work_ffty_vec(:,:,i), ny+1, 'backward' )
549
550	DO j = 0, ny
551	DO k = 1, nz
552	f_out(k,j,i) = work_ffty_vec(j,k,i)
553	ENDDO
554	ENDDO
555
556	ENDDO
557	!$OMP END PARALLEL
558	#endif
559
560	ELSE
561
562	!
563	!-- Cache optimized code.
564	!-- The i-(x-)direction is split into a strided outer loop and an inner
565	!-- loop for better cache performance
566	!$OMP PARALLEL PRIVATE ( i, iend, iouter, ir, j, k, work_ffty )
567	!$OMP DO
568	DO iouter = nxl, nxr, stridex
569
570	iend = MIN( iouter+stridex-1, nxr ) ! Upper bound for inner i loop
571
572	DO k = 1, nz
573	!
574	!-- Resort
575	DO j = 0, ny
576	DO i = iouter, iend
577	work_ffty(j,i-iouter+1) = work(i,k,j)
578	ENDDO
579	ENDDO
580
581	DO i = iouter, iend
582
583	!
584	!-- FFT along y
585	ir = i-iouter+1 ! counter within a stride
586	CALL fft_y_1d( work_ffty(:,ir), 'backward' )
587
588	DO j = 0, ny
589	f_out(k,j,i) = work_ffty(j,ir)
590	ENDDO
591	ENDDO
592
593	ENDDO
594
595	ENDDO
596	!$OMP END PARALLEL
597
598	ENDIF
599
600	CALL cpu_log( log_point_s(7), 'fft_y_1d', 'stop' )
601
602	END SUBROUTINE tr_xy_ffty
603
604
605	SUBROUTINE fftx_tri_fftx( ar )
606
607	!------------------------------------------------------------------------------!
608	! FFT along x, solution of the tridiagonal system and backward FFT for
609	! a 1d-decomposition along x
610	!
611	! WARNING: this subroutine may still not work for hybrid parallelization
612	! with OpenMP (for possible necessary changes see the original
613	! routine poisfft_hybrid, developed by Klaus Ketelsen, May 2002)
614	!------------------------------------------------------------------------------!
615
616	USE control_parameters
617	USE cpulog
618	USE grid_variables
619	USE indices
620	USE interfaces
621	USE pegrid
622	USE transpose_indices
623
624	IMPLICIT NONE
625
626	INTEGER :: i, j, k, m, n, omp_get_thread_num, tn
627
628	REAL, DIMENSION(0:nx) :: work_fftx
629	REAL, DIMENSION(0:nx,1:nz) :: work_trix
630	REAL, DIMENSION(nnx,1:nz,nys_x:nyn_x,pdims(1)) :: ar
631	REAL, DIMENSION(:,:,:,:), ALLOCATABLE :: tri
632
633
634	CALL cpu_log( log_point_s(33), 'fft_x_1d + tridia', 'start' )
635
636	ALLOCATE( tri(5,0:nx,0:nz-1,0:threads_per_task-1) )
637
638	tn = 0 ! Default thread number in case of one thread
639	!$OMP PARALLEL DO PRIVATE ( i, j, k, m, n, tn, work_fftx, work_trix )
640	DO j = nys_x, nyn_x
641
642	!$ tn = omp_get_thread_num()
643
644	IF ( host(1:3) == 'nec' ) THEN
645	!
646	!-- Code optimized for vector processors
647	DO k = 1, nz
648
649	m = 0
650	DO n = 1, pdims(1)
651	DO i = 1, nnx
652	work_trix(m,k) = ar(i,k,j,n)
653	m = m + 1
654	ENDDO
655	ENDDO
656
657	ENDDO
658
659	CALL fft_x_m( work_trix, 'forward' )
660
661	ELSE
662	!
663	!-- Cache optimized code
664	DO k = 1, nz
665
666	m = 0
667	DO n = 1, pdims(1)
668	DO i = 1, nnx
669	work_fftx(m) = ar(i,k,j,n)
670	m = m + 1
671	ENDDO
672	ENDDO
673
674	CALL fft_x_1d( work_fftx, 'forward' )
675
676	DO i = 0, nx
677	work_trix(i,k) = work_fftx(i)
678	ENDDO
679
680	ENDDO
681
682	ENDIF
683
684	!
685	!-- Solve the linear equation system
686	CALL tridia_1dd( ddx2, ddy2, nx, ny, j, work_trix, tri(:,:,:,tn) )
687
688	IF ( host(1:3) == 'nec' ) THEN
689	!
690	!-- Code optimized for vector processors
691	CALL fft_x_m( work_trix, 'backward' )
692
693	DO k = 1, nz
694
695	m = 0
696	DO n = 1, pdims(1)
697	DO i = 1, nnx
698	ar(i,k,j,n) = work_trix(m,k)
699	m = m + 1
700	ENDDO
701	ENDDO
702
703	ENDDO
704
705	ELSE
706	!
707	!-- Cache optimized code
708	DO k = 1, nz
709
710	DO i = 0, nx
711	work_fftx(i) = work_trix(i,k)
712	ENDDO
713
714	CALL fft_x_1d( work_fftx, 'backward' )
715
716	m = 0
717	DO n = 1, pdims(1)
718	DO i = 1, nnx
719	ar(i,k,j,n) = work_fftx(m)
720	m = m + 1
721	ENDDO
722	ENDDO
723
724	ENDDO
725
726	ENDIF
727
728	ENDDO
729
730	DEALLOCATE( tri )
731
732	CALL cpu_log( log_point_s(33), 'fft_x_1d + tridia', 'stop' )
733
734	END SUBROUTINE fftx_tri_fftx
735
736
737	SUBROUTINE fftx_tr_xy( f_in, work, f_out )
738
739	!------------------------------------------------------------------------------!
740	! Fourier-transformation along x with subsequent transposition x --> y for
741	! a 1d-decomposition along y
742	!
743	! ATTENTION: The NEC-branch of this routine may significantly profit from
744	! further optimizations. So far, performance is much worse than
745	! for routine ffty_tr_yx (more than three times slower).
746	!------------------------------------------------------------------------------!
747
748	USE control_parameters
749	USE cpulog
750	USE indices
751	USE interfaces
752	USE pegrid
753	USE transpose_indices
754
755	IMPLICIT NONE
756
757	INTEGER :: i, j, k
758
759	REAL, DIMENSION(0:nx,1:nz,nys:nyn) :: work_fftx
760	REAL, DIMENSION(1:nz,nys:nyn,0:nx) :: f_in
761	REAL, DIMENSION(nny,1:nz,nxl_y:nxr_y,pdims(2)) :: f_out
762	REAL, DIMENSION(nys:nyn,1:nz,0:nx) :: work
763
764	!
765	!-- Carry out the FFT along x, where all data are present due to the
766	!-- 1d-decomposition along y. Resort the data in a way that y becomes
767	!-- the first index.
768	CALL cpu_log( log_point_s(4), 'fft_x_1d', 'start' )
769
770	IF ( host(1:3) == 'nec' ) THEN
771	!
772	!-- Code for vector processors
773	!$OMP PARALLEL PRIVATE ( i, j, k )
774	!$OMP DO
775	DO i = 0, nx
776
777	DO j = nys, nyn
778	DO k = 1, nz
779	work_fftx(i,k,j) = f_in(k,j,i)
780	ENDDO
781	ENDDO
782
783	ENDDO
784
785	!$OMP DO
786	DO j = nys, nyn
787
788	CALL fft_x_m( work_fftx(:,:,j), 'forward' )
789
790	DO k = 1, nz
791	DO i = 0, nx
792	work(j,k,i) = work_fftx(i,k,j)
793	ENDDO
794	ENDDO
795
796	ENDDO
797	!$OMP END PARALLEL
798
799	ELSE
800
801	!
802	!-- Cache optimized code (there might be still a potential for better
803	!-- optimization).
804	!$OMP PARALLEL PRIVATE (i,j,k)
805	!$OMP DO
806	DO i = 0, nx
807
808	DO j = nys, nyn
809	DO k = 1, nz
810	work_fftx(i,k,j) = f_in(k,j,i)
811	ENDDO
812	ENDDO
813
814	ENDDO
815
816	!$OMP DO
817	DO j = nys, nyn
818	DO k = 1, nz
819
820	CALL fft_x_1d( work_fftx(0:nx,k,j), 'forward' )
821
822	DO i = 0, nx
823	work(j,k,i) = work_fftx(i,k,j)
824	ENDDO
825	ENDDO
826
827	ENDDO
828	!$OMP END PARALLEL
829
830	ENDIF
831	CALL cpu_log( log_point_s(4), 'fft_x_1d', 'pause' )
832
833	!
834	!-- Transpose array
835	#if defined( __parallel )
836	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
837	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
838	CALL MPI_ALLTOALL( work(nys,1,0), sendrecvcount_xy, MPI_REAL, &
839	f_out(1,1,nxl_y,1), sendrecvcount_xy, MPI_REAL, &
840	comm1dy, ierr )
841	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
842	#endif
843
844	END SUBROUTINE fftx_tr_xy
845
846
847	SUBROUTINE tr_yx_fftx( f_in, work, f_out )
848
849	!------------------------------------------------------------------------------!
850	! Transposition y --> x with a subsequent backward Fourier transformation for
851	! a 1d-decomposition along x
852	!------------------------------------------------------------------------------!
853
854	USE control_parameters
855	USE cpulog
856	USE indices
857	USE interfaces
858	USE pegrid
859	USE transpose_indices
860
861	IMPLICIT NONE
862
863	INTEGER :: i, j, k
864
865	REAL, DIMENSION(0:nx,1:nz,nys:nyn) :: work_fftx
866	REAL, DIMENSION(nny,1:nz,nxl_y:nxr_y,pdims(2)) :: f_in
867	REAL, DIMENSION(1:nz,nys:nyn,0:nx) :: f_out
868	REAL, DIMENSION(nys:nyn,1:nz,0:nx) :: work
869
870	!
871	!-- Transpose array
872	#if defined( __parallel )
873	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
874	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
875	CALL MPI_ALLTOALL( f_in(1,1,nxl_y,1), sendrecvcount_xy, MPI_REAL, &
876	work(nys,1,0), sendrecvcount_xy, MPI_REAL, &
877	comm1dy, ierr )
878	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
879	#endif
880
881	!
882	!-- Carry out the FFT along x, where all data are present due to the
883	!-- 1d-decomposition along y. Resort the data in a way that y becomes
884	!-- the first index.
885	CALL cpu_log( log_point_s(4), 'fft_x_1d', 'continue' )
886
887	IF ( host(1:3) == 'nec' ) THEN
888	!
889	!-- Code optimized for vector processors
890	!$OMP PARALLEL PRIVATE ( i, j, k )
891	!$OMP DO
892	DO j = nys, nyn
893
894	DO k = 1, nz
895	DO i = 0, nx
896	work_fftx(i,k,j) = work(j,k,i)
897	ENDDO
898	ENDDO
899
900	CALL fft_x_m( work_fftx(:,:,j), 'backward' )
901
902	ENDDO
903
904	!$OMP DO
905	DO i = 0, nx
906	DO j = nys, nyn
907	DO k = 1, nz
908	f_out(k,j,i) = work_fftx(i,k,j)
909	ENDDO
910	ENDDO
911	ENDDO
912	!$OMP END PARALLEL
913
914	ELSE
915
916	!
917	!-- Cache optimized code (there might be still a potential for better
918	!-- optimization).
919	!$OMP PARALLEL PRIVATE (i,j,k)
920	!$OMP DO
921	DO j = nys, nyn
922	DO k = 1, nz
923
924	DO i = 0, nx
925	work_fftx(i,k,j) = work(j,k,i)
926	ENDDO
927
928	CALL fft_x_1d( work_fftx(0:nx,k,j), 'backward' )
929
930	ENDDO
931	ENDDO
932
933	!$OMP DO
934	DO i = 0, nx
935	DO j = nys, nyn
936	DO k = 1, nz
937	f_out(k,j,i) = work_fftx(i,k,j)
938	ENDDO
939	ENDDO
940	ENDDO
941	!$OMP END PARALLEL
942
943	ENDIF
944	CALL cpu_log( log_point_s(4), 'fft_x_1d', 'stop' )
945
946	END SUBROUTINE tr_yx_fftx
947
948
949	SUBROUTINE ffty_tri_ffty( ar )
950
951	!------------------------------------------------------------------------------!
952	! FFT along y, solution of the tridiagonal system and backward FFT for
953	! a 1d-decomposition along y
954	!
955	! WARNING: this subroutine may still not work for hybrid parallelization
956	! with OpenMP (for possible necessary changes see the original
957	! routine poisfft_hybrid, developed by Klaus Ketelsen, May 2002)
958	!------------------------------------------------------------------------------!
959
960	USE control_parameters
961	USE cpulog
962	USE grid_variables
963	USE indices
964	USE interfaces
965	USE pegrid
966	USE transpose_indices
967
968	IMPLICIT NONE
969
970	INTEGER :: i, j, k, m, n, omp_get_thread_num, tn
971
972	REAL, DIMENSION(0:ny) :: work_ffty
973	REAL, DIMENSION(0:ny,1:nz) :: work_triy
974	REAL, DIMENSION(nny,1:nz,nxl_y:nxr_y,pdims(2)) :: ar
975	REAL, DIMENSION(:,:,:,:), ALLOCATABLE :: tri
976
977
978	CALL cpu_log( log_point_s(39), 'fft_y_1d + tridia', 'start' )
979
980	ALLOCATE( tri(5,0:ny,0:nz-1,0:threads_per_task-1) )
981
982	tn = 0 ! Default thread number in case of one thread
983	!$OMP PARALLEL DO PRIVATE ( i, j, k, m, n, tn, work_ffty, work_triy )
984	DO i = nxl_y, nxr_y
985
986	!$ tn = omp_get_thread_num()
987
988	IF ( host(1:3) == 'nec' ) THEN
989	!
990	!-- Code optimized for vector processors
991	DO k = 1, nz
992
993	m = 0
994	DO n = 1, pdims(2)
995	DO j = 1, nny
996	work_triy(m,k) = ar(j,k,i,n)
997	m = m + 1
998	ENDDO
999	ENDDO
1000
1001	ENDDO
1002
1003	CALL fft_y_m( work_triy, ny, 'forward' )
1004
1005	ELSE
1006	!
1007	!-- Cache optimized code
1008	DO k = 1, nz
1009
1010	m = 0
1011	DO n = 1, pdims(2)
1012	DO j = 1, nny
1013	work_ffty(m) = ar(j,k,i,n)
1014	m = m + 1
1015	ENDDO
1016	ENDDO
1017
1018	CALL fft_y_1d( work_ffty, 'forward' )
1019
1020	DO j = 0, ny
1021	work_triy(j,k) = work_ffty(j)
1022	ENDDO
1023
1024	ENDDO
1025
1026	ENDIF
1027
1028	!
1029	!-- Solve the linear equation system
1030	CALL tridia_1dd( ddy2, ddx2, ny, nx, i, work_triy, tri(:,:,:,tn) )
1031
1032	IF ( host(1:3) == 'nec' ) THEN
1033	!
1034	!-- Code optimized for vector processors
1035	CALL fft_y_m( work_triy, ny, 'backward' )
1036
1037	DO k = 1, nz
1038
1039	m = 0
1040	DO n = 1, pdims(2)
1041	DO j = 1, nny
1042	ar(j,k,i,n) = work_triy(m,k)
1043	m = m + 1
1044	ENDDO
1045	ENDDO
1046
1047	ENDDO
1048
1049	ELSE
1050	!
1051	!-- Cache optimized code
1052	DO k = 1, nz
1053
1054	DO j = 0, ny
1055	work_ffty(j) = work_triy(j,k)
1056	ENDDO
1057
1058	CALL fft_y_1d( work_ffty, 'backward' )
1059
1060	m = 0
1061	DO n = 1, pdims(2)
1062	DO j = 1, nny
1063	ar(j,k,i,n) = work_ffty(m)
1064	m = m + 1
1065	ENDDO
1066	ENDDO
1067
1068	ENDDO
1069
1070	ENDIF
1071
1072	ENDDO
1073
1074	DEALLOCATE( tri )
1075
1076	CALL cpu_log( log_point_s(39), 'fft_y_1d + tridia', 'stop' )
1077
1078	END SUBROUTINE ffty_tri_ffty
1079
1080	#endif
1081
1082	END MODULE poisfft_mod

Note: See TracBrowser for help on using the repository browser.

Download in other formats:

| Impressum | ©Leibniz Universität Hannover |