Home

Context Navigation

source: palm/trunk/SOURCE/poisfft.f90 @ 1216

Last change on this file since 1216 was 1216, checked in by raasch, 11 years ago
overlapping execution of fft and transpositions (MPI_ALLTOALL), but real overlapping is not activated so far, fftw implemented for 1D-decomposition resorting of arrays moved to separate routines resort_for_... bugfix in mbuild concerning Makefile_check
Property svn:keywords set to `Id`
File size: 45.8 KB

Line
1	MODULE poisfft_mod
2
3	!--------------------------------------------------------------------------------!
4	! This file is part of PALM.
5	!
6	! PALM is free software: you can redistribute it and/or modify it under the terms
7	! of the GNU General Public License as published by the Free Software Foundation,
8	! either version 3 of the License, or (at your option) any later version.
9	!
10	! PALM is distributed in the hope that it will be useful, but WITHOUT ANY
11	! WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
12	! A PARTICULAR PURPOSE. See the GNU General Public License for more details.
13	!
14	! You should have received a copy of the GNU General Public License along with
15	! PALM. If not, see <http://www.gnu.org/licenses/>.
16	!
17	! Copyright 1997-2012 Leibniz University Hannover
18	!--------------------------------------------------------------------------------!
19	!
20	! Current revisions:
21	! -----------------
22	! resorting of arrays moved to separate routines resort_for_...,
23	! one argument, used as temporary work array, removed from all transpose
24	! routines
25	! overlapping fft / transposition implemented
26	!
27	! Former revisions:
28	! -----------------
29	! $Id: poisfft.f90 1216 2013-08-26 09:31:42Z raasch $
30	!
31	! 1212 2013-08-15 08:46:27Z raasch
32	! tridia routines moved to seperate module tridia_solver
33	!
34	! 1208 2013-08-13 06:41:49Z raasch
35	! acc-update clauses added for "ar" so that ffts other than cufft can also be
36	! used (although they are not ported and will give a poor performance)
37	!
38	! 1111 2013-03-08 23:54:10Z raasch
39	! further openACC porting of non-parallel (MPI) branch:
40	! tridiagonal routines split into extermal subroutines (instead using CONTAINS),
41	! no distinction between parallel/non-parallel in poisfft and tridia any more,
42	! tridia routines moved to end of file because of probable bug in PGI compiler 12.5
43	! (otherwise "invalid device function" is indicated during runtime),
44	! optimization of tridia routines: constant elements and coefficients of tri are
45	! stored in seperate arrays ddzuw and tric, last dimension of tri reduced from 5
46	! to 2,
47	! poisfft_init is now called internally from poisfft, maketri is called from
48	! poisfft_init,
49	! ibc_p_b = 2 removed
50	!
51	! 1106 2013-03-04 05:31:38Z raasch
52	! routines fftx, ffty, fftxp, fftyp removed, calls replaced by fft_x, fft_y,
53	! in the 1D-decomposition routines fft_x, ffty are replaced by fft_x_1d,
54	! fft_y_1d
55	!
56	! 1103 2013-02-20 02:15:53Z raasch
57	! tri, ar, and ar1 arguments in tridia-routines (2d) are removed because they
58	! sometimes cause segmentation faults with intel 12.1 compiler
59	!
60	! 1092 2013-02-02 11:24:22Z raasch
61	! unused variables removed
62	!
63	! 1036 2012-10-22 13:43:42Z raasch
64	! code put under GPL (PALM 3.9)
65	!
66	! 2012-09-21 07:03:55Z raasch
67	! FLOAT type conversion replaced by REAL
68	!
69	! 1003 2012-09-14 14:35:53Z raasch
70	! indices nxa, nya, etc. replaced by nx, ny, etc.
71	!
72	! 940 2012-07-09 14:31:00Z raasch
73	! special handling of tri-array as an argument in tridia_1dd routines switched
74	! off because it caused segmentation faults with intel 12.1 compiler
75	!
76	! 877 2012-04-03 11:21:44Z suehring
77	! Bugfix: Avoid divisions by zero in case of using a 'neumann' bc for the
78	! pressure at the top of the model domain.
79	!
80	! 809 2012-01-30 13:32:58Z maronga
81	! Bugfix: replaced .AND. and .NOT. with && and ! in the preprocessor directives
82	!
83	! 807 2012-01-25 11:53:51Z maronga
84	! New cpp directive "__check" implemented which is used by check_namelist_files
85	! (most of the code is unneeded by check_namelist_files).
86	!
87	! 763 2011-10-06 09:32:09Z suehring
88	! Comment added concerning the last change.
89	!
90	! 761 2011-10-05 17:58:52Z suehring
91	! Bugfix: Avoid divisions by zero in case of using a 'neumann' bc for the
92	! pressure at the top of the model domain.
93	!
94	! 696 2011-03-18 07:03:49Z raasch
95	! work_fftx removed from PRIVATE clauses in fftx_tr_xy and tr_yx_fftx
96	!
97	! 683 2011-02-09 14:25:15Z raasch
98	! openMP parallelization for 2d-domain-decomposition
99	!
100	! 667 2010-12-23 12:06:00Z suehring/gryschka
101	! ddzu replaced by ddzu_pres due to changes in zu(0)
102	!
103	! 622 2010-12-10 08:08:13Z raasch
104	! optional barriers included in order to speed up collective operations
105	!
106	! 377 2009-09-04 11:09:00Z raasch
107	! __lcmuk changed to __lc to avoid problems with Intel compiler on sgi-ice
108	!
109	! 164 2008-05-15 08:46:15Z raasch
110	! Arguments removed from transpose routines
111	!
112	! 128 2007-10-26 13:11:14Z raasch
113	! Bugfix: wavenumber calculation for even nx in routines maketri
114	!
115	! 85 2007-05-11 09:35:14Z raasch
116	! Bugfix: work_fft*_vec removed from some PRIVATE-declarations
117	!
118	! 76 2007-03-29 00:58:32Z raasch
119	! Tridiagonal coefficients adjusted for Neumann boundary conditions both at
120	! the bottom and the top.
121	!
122	! RCS Log replace by Id keyword, revision history cleaned up
123	!
124	! Revision 1.24 2006/08/04 15:00:24 raasch
125	! Default setting of the thread number tn in case of not using OpenMP
126	!
127	! Revision 1.23 2006/02/23 12:48:38 raasch
128	! Additional compiler directive in routine tridia_1dd for preventing loop
129	! exchange on NEC-SX6
130	!
131	! Revision 1.20 2004/04/30 12:38:09 raasch
132	! Parts of former poisfft_hybrid moved to this subroutine,
133	! former subroutine changed to a module, renaming of FFT-subroutines and
134	! -module, FFTs completely substituted by calls of fft_x and fft_y,
135	! NAG fft used in the non-parallel case completely removed, l in maketri
136	! is now a 1d-array, variables passed by modules instead of using parameter
137	! lists, enlarged transposition arrays introduced
138	!
139	! Revision 1.1 1997/07/24 11:24:14 raasch
140	! Initial revision
141	!
142	!
143	! Description:
144	! ------------
145	! See below.
146	!------------------------------------------------------------------------------!
147
148	!--------------------------------------------------------------------------!
149	! poisfft !
150	! !
151	! Original version: Stephan Siano (pois3d) !
152	! !
153	! Institute of Meteorology and Climatology, University of Hannover !
154	! Germany !
155	! !
156	! Version as of July 23,1996 !
157	! !
158	! !
159	! Version for parallel computers: Siegfried Raasch !
160	! !
161	! Version as of July 03,1997 !
162	! !
163	! Solves the Poisson equation with a 2D spectral method !
164	! d^2 p / dx^2 + d^2 p / dy^2 + d^2 p / dz^2 = s !
165	! !
166	! Input: !
167	! real ar contains in the (nnx,nny,nnz) elements, !
168	! starting from the element (1,nys,nxl), the !
169	! values for s !
170	! real work Temporary array !
171	! !
172	! Output: !
173	! real ar contains the solution for p !
174	!--------------------------------------------------------------------------!
175
176	USE fft_xy
177	USE indices
178	USE transpose_indices
179	USE tridia_solver
180
181	IMPLICIT NONE
182
183	LOGICAL, SAVE :: poisfft_initialized = .FALSE.
184
185	PRIVATE
186
187	#if ! defined ( __check )
188	PUBLIC poisfft, poisfft_init
189
190	INTERFACE poisfft
191	MODULE PROCEDURE poisfft
192	END INTERFACE poisfft
193
194	INTERFACE poisfft_init
195	MODULE PROCEDURE poisfft_init
196	END INTERFACE poisfft_init
197	#else
198	PUBLIC poisfft_init
199
200	INTERFACE poisfft_init
201	MODULE PROCEDURE poisfft_init
202	END INTERFACE poisfft_init
203	#endif
204
205	CONTAINS
206
207	SUBROUTINE poisfft_init
208
209	USE arrays_3d, ONLY: ddzu_pres, ddzw
210
211	IMPLICIT NONE
212
213	INTEGER :: k
214
215
216	CALL fft_init
217
218	CALL tridia_init
219
220	poisfft_initialized = .TRUE.
221
222	END SUBROUTINE poisfft_init
223
224
225	#if ! defined ( __check )
226	SUBROUTINE poisfft( ar, ar_inv_test )
227
228	USE control_parameters, ONLY : fft_method, transpose_compute_overlap
229	USE cpulog
230	USE interfaces
231	USE pegrid
232
233	IMPLICIT NONE
234
235	INTEGER :: ind_even, ind_odd, ind_third, ii, iind, inew, jj, jind, &
236	jnew, ki, kk, knew, n, nblk, nnx_y, nny_z, nnz_t, nnz_x, &
237	nxl_y_bound, nxr_y_bound
238	INTEGER, DIMENSION(4) :: isave
239
240	REAL, DIMENSION(1:nz,nys:nyn,nxl:nxr) :: ar
241	REAL, DIMENSION(nys:nyn,nxl:nxr,1:nz) :: ar_inv_test ! work array tend from pres
242	!$acc declare create( ar_inv )
243	REAL, DIMENSION(nys:nyn,nxl:nxr,1:nz) :: ar_inv
244
245	REAL, DIMENSION(:,:,:), ALLOCATABLE :: f_in, f_inv, f_out_y, f_out_z
246	REAL, DIMENSION(:,:,:,:), ALLOCATABLE :: ar1
247
248
249	CALL cpu_log( log_point_s(3), 'poisfft', 'start' )
250
251	IF ( .NOT. poisfft_initialized ) CALL poisfft_init
252
253	!
254	!-- Two-dimensional Fourier Transformation in x- and y-direction.
255	IF ( pdims(2) == 1 .AND. pdims(1) > 1 ) THEN
256
257	!
258	!-- 1d-domain-decomposition along x:
259	!-- FFT along y and transposition y --> x
260	CALL ffty_tr_yx( ar, ar )
261
262	!
263	!-- FFT along x, solving the tridiagonal system and backward FFT
264	CALL fftx_tri_fftx( ar )
265
266	!
267	!-- Transposition x --> y and backward FFT along y
268	CALL tr_xy_ffty( ar, ar )
269
270	ELSEIF ( pdims(1) == 1 .AND. pdims(2) > 1 ) THEN
271
272	!
273	!-- 1d-domain-decomposition along y:
274	!-- FFT along x and transposition x --> y
275	CALL fftx_tr_xy( ar, ar )
276
277	!
278	!-- FFT along y, solving the tridiagonal system and backward FFT
279	CALL ffty_tri_ffty( ar )
280
281	!
282	!-- Transposition y --> x and backward FFT along x
283	CALL tr_yx_fftx( ar, ar )
284
285	ELSEIF ( .NOT. transpose_compute_overlap ) THEN
286
287	!
288	!-- 2d-domain-decomposition or no decomposition (1 PE run)
289	!-- Transposition z --> x
290	CALL cpu_log( log_point_s(5), 'transpo forward', 'start' )
291	CALL resort_for_zx( ar, ar_inv )
292	CALL transpose_zx( ar_inv, ar )
293	CALL cpu_log( log_point_s(5), 'transpo forward', 'pause' )
294
295	CALL cpu_log( log_point_s(4), 'fft_x', 'start' )
296	IF ( fft_method /= 'system-specific' ) THEN
297	!$acc update host( ar )
298	ENDIF
299	CALL fft_x( ar, 'forward' )
300	IF ( fft_method /= 'system-specific' ) THEN
301	!$acc update device( ar )
302	ENDIF
303	CALL cpu_log( log_point_s(4), 'fft_x', 'pause' )
304
305	!
306	!-- Transposition x --> y
307	CALL cpu_log( log_point_s(5), 'transpo forward', 'continue' )
308	CALL resort_for_xy( ar, ar_inv )
309	CALL transpose_xy( ar_inv, ar )
310	CALL cpu_log( log_point_s(5), 'transpo forward', 'pause' )
311
312	CALL cpu_log( log_point_s(7), 'fft_y', 'start' )
313	IF ( fft_method /= 'system-specific' ) THEN
314	!$acc update host( ar )
315	ENDIF
316	CALL fft_y( ar, 'forward', ar_tr = ar, &
317	nxl_y_bound = nxl_y, nxr_y_bound = nxr_y, &
318	nxl_y_l = nxl_y, nxr_y_l = nxr_y )
319	IF ( fft_method /= 'system-specific' ) THEN
320	!$acc update device( ar )
321	ENDIF
322	CALL cpu_log( log_point_s(7), 'fft_y', 'pause' )
323
324	!
325	!-- Transposition y --> z
326	CALL cpu_log( log_point_s(5), 'transpo forward', 'continue' )
327	CALL resort_for_yz( ar, ar_inv )
328	CALL transpose_yz( ar_inv, ar )
329	CALL cpu_log( log_point_s(5), 'transpo forward', 'stop' )
330
331	!
332	!-- Solve the tridiagonal equation system along z
333	CALL cpu_log( log_point_s(6), 'tridia', 'start' )
334	CALL tridia_substi( ar )
335	CALL cpu_log( log_point_s(6), 'tridia', 'stop' )
336
337	!
338	!-- Inverse Fourier Transformation
339	!-- Transposition z --> y
340	CALL cpu_log( log_point_s(8), 'transpo invers', 'start' )
341	CALL transpose_zy( ar, ar_inv )
342	CALL resort_for_zy( ar_inv, ar )
343	CALL cpu_log( log_point_s(8), 'transpo invers', 'pause' )
344
345	CALL cpu_log( log_point_s(7), 'fft_y', 'continue' )
346	IF ( fft_method /= 'system-specific' ) THEN
347	!$acc update host( ar )
348	ENDIF
349	CALL fft_y( ar, 'backward', ar_tr = ar, &
350	nxl_y_bound = nxl_y, nxr_y_bound = nxr_y, &
351	nxl_y_l = nxl_y, nxr_y_l = nxr_y )
352	IF ( fft_method /= 'system-specific' ) THEN
353	!$acc update device( ar )
354	ENDIF
355	CALL cpu_log( log_point_s(7), 'fft_y', 'stop' )
356
357	!
358	!-- Transposition y --> x
359	CALL cpu_log( log_point_s(8), 'transpo invers', 'continue' )
360	CALL transpose_yx( ar, ar_inv )
361	CALL resort_for_yx( ar_inv, ar )
362	CALL cpu_log( log_point_s(8), 'transpo invers', 'pause' )
363
364	CALL cpu_log( log_point_s(4), 'fft_x', 'continue' )
365	IF ( fft_method /= 'system-specific' ) THEN
366	!$acc update host( ar )
367	ENDIF
368	CALL fft_x( ar, 'backward' )
369	IF ( fft_method /= 'system-specific' ) THEN
370	!$acc update device( ar )
371	ENDIF
372	CALL cpu_log( log_point_s(4), 'fft_x', 'stop' )
373
374	!
375	!-- Transposition x --> z
376	CALL cpu_log( log_point_s(8), 'transpo invers', 'continue' )
377	CALL transpose_xz( ar, ar_inv )
378	CALL resort_for_xz( ar_inv, ar )
379	CALL cpu_log( log_point_s(8), 'transpo invers', 'stop' )
380
381	ELSE
382
383	!
384	!-- 2d-domain-decomposition or no decomposition (1 PE run) with
385	!-- overlapping transposition / fft
386	ALLOCATE( f_out_y(0:ny,nxl_y:nxr_y,nzb_y:nzt_y), &
387	f_out_z(0:nx,nys_x:nyn_x,nzb_x:nzt_x) )
388	!
389	!-- Transposition z --> x + subsequent fft along x
390	ALLOCATE( f_inv(nys:nyn,nxl:nxr,1:nz) )
391	CALL resort_for_zx( ar, f_inv )
392	!
393	!-- Save original indices and gridpoint counter
394	isave(1) = nz
395	isave(2) = nzb_x
396	isave(3) = nzt_x
397	isave(4) = sendrecvcount_zx
398	!
399	!-- Set new indices for transformation
400	nblk = nz / pdims(1)
401	nz = pdims(1)
402	nnz_x = 1
403	nzb_x = 1 + myidx * nnz_x
404	nzt_x = ( myidx + 1 ) * nnz_x
405	sendrecvcount_zx = nnx * nny * nnz_x
406
407	ALLOCATE( ar1(0:nx,nys_x:nyn_x,nzb_x:nzt_x,2) )
408	ALLOCATE( f_in(nys:nyn,nxl:nxr,1:nz) )
409
410	DO kk = 1, nblk+1
411	ind_odd = MOD( kk, 2 ) + 1
412	ind_even = MOD( kk+1, 2 ) + 1
413	!$OMP sections private(ki,knew,n)
414	!$OMP section
415	IF ( kk <= nblk ) THEN
416
417	IF ( kk == 1 ) THEN
418	CALL cpu_log( log_point_s(5), 'transpo forward', 'start' )
419	ELSE
420	CALL cpu_log( log_point_s(5), 'transpo forward', 'continue' )
421	ENDIF
422
423	DO knew = 1, nz
424	ki = kk + nblk * ( knew - 1 )
425	f_in(:,:,knew) = f_inv(:,:,ki)
426	ENDDO
427
428	CALL transpose_zx( f_in, ar1(:,:,:,ind_odd))
429	CALL cpu_log( log_point_s(5), 'transpo forward', 'pause' )
430
431	ENDIF
432
433	!$OMP section
434	IF ( kk >= 2 ) THEN
435
436	IF ( kk == 2 ) THEN
437	CALL cpu_log( log_point_s(4), 'fft_x', 'start' )
438	ELSE
439	CALL cpu_log( log_point_s(4), 'fft_x', 'continue' )
440	ENDIF
441
442	n = isave(2) + kk - 2
443	CALL fft_x( ar1(:,:,:,ind_even), 'forward', ar_2d = f_out_z(:,:,n))
444	CALL cpu_log( log_point_s(4), 'fft_x', 'pause' )
445
446	ENDIF
447	!$OMP end sections
448
449	ENDDO
450	!
451	!-- Restore original indices/counters
452	nz = isave(1)
453	nzb_x = isave(2)
454	nzt_x = isave(3)
455	sendrecvcount_zx = isave(4)
456
457	DEALLOCATE( ar1, f_in, f_inv )
458
459	!
460	!-- Transposition x --> y + subsequent fft along y
461	ALLOCATE( f_inv(nys_x:nyn_x,nzb_x:nzt_x,0:nx) )
462	CALL resort_for_xy( f_out_z, f_inv )
463	!
464	!-- Save original indices and gridpoint counter
465	isave(1) = nx
466	isave(2) = nxl_y
467	isave(3) = nxr_y
468	isave(4) = sendrecvcount_xy
469	!
470	!-- Set new indices for transformation
471	nblk = ( ( nx+1 ) / pdims(2) ) - 1
472	nx = pdims(2)
473	nnx_y = 1
474	nxl_y = myidy * nnx_y
475	nxr_y = ( myidy + 1 ) * nnx_y - 1
476	sendrecvcount_xy = nnx_y * ( nyn_x-nys_x+1 ) * ( nzt_x-nzb_x+1 )
477
478	ALLOCATE( ar1(0:ny,nxl_y:nxr_y,nzb_y:nzt_y,2) )
479	ALLOCATE( f_in(nys_x:nyn_x,nzb_x:nzt_x,0:nx) )
480
481	DO ii = 0, nblk+1
482	ind_odd = MOD( ii+1, 2 ) + 1
483	ind_even = MOD( ii+2, 2 ) + 1
484	!$OMP sections private(ki,knew,n)
485	!$OMP section
486	IF ( ii <= nblk ) THEN
487
488	CALL cpu_log( log_point_s(5), 'transpo forward', 'continue' )
489
490	DO inew = 0, nx-1
491	iind = ii + ( nblk + 1 ) * inew
492	f_in(:,:,inew) = f_inv(:,:,iind)
493	ENDDO
494
495	CALL transpose_xy( f_in, ar1(:,:,:,ind_odd) )
496
497	CALL cpu_log( log_point_s(5), 'transpo forward', 'pause' )
498
499	ENDIF
500
501	!$OMP section
502	IF ( ii >= 1 ) THEN
503
504	IF ( ii == 1 ) THEN
505	CALL cpu_log( log_point_s(7), 'fft_y', 'start' )
506	ELSE
507	CALL cpu_log( log_point_s(7), 'fft_y', 'continue' )
508	ENDIF
509
510	nxl_y_bound = isave(2)
511	nxr_y_bound = isave(3)
512	n = isave(2) + ii - 1
513	! CALL fft_y( ar1(:,:,:,ind_even), 'forward', ar_3d = f_out_y, &
514	! ni = n )
515	CALL fft_y( ar1(:,:,:,ind_even), 'forward', ar_tr = f_out_y, &
516	nxl_y_bound = nxl_y_bound, nxr_y_bound = nxr_y_bound, &
517	nxl_y_l = n, nxr_y_l = n )
518
519	CALL cpu_log( log_point_s(7), 'fft_y', 'pause' )
520
521	ENDIF
522	!$OMP end sections
523
524	ENDDO
525	!
526	!-- Restore original indices/counters
527	nx = isave(1)
528	nxl_y = isave(2)
529	nxr_y = isave(3)
530	sendrecvcount_xy = isave(4)
531
532	DEALLOCATE( ar1, f_in, f_inv )
533
534	!
535	!-- Transposition y --> z + subsequent tridia + resort for z --> y
536	ALLOCATE( f_inv(nxl_y:nxr_y,nzb_y:nzt_y,0:ny) )
537	CALL resort_for_yz( f_out_y, f_inv )
538	!
539	!-- Save original indices and gridpoint counter
540	isave(1) = ny
541	isave(2) = nys_z
542	isave(3) = nyn_z
543	isave(4) = sendrecvcount_yz
544	!
545	!-- Set new indices for transformation
546	nblk = ( ( ny+1 ) / pdims(1) ) - 1
547	ny = pdims(1)
548	nny_z = 1
549	nys_z = myidx * nny_z
550	nyn_z = ( myidx + 1 ) * nny_z - 1
551	sendrecvcount_yz = ( nxr_y-nxl_y+1 ) * nny_z * ( nzt_y-nzb_y+1 )
552
553	ALLOCATE( ar1(nxl_z:nxr_z,nys_z:nyn_z,1:nz,3) )
554	ALLOCATE( f_in(nxl_y:nxr_y,nzb_y:nzt_y,0:ny) )
555
556	DO jj = 0, nblk+2
557	ind_odd = MOD( jj+3, 3 ) + 1
558	ind_even = MOD( jj+2, 3 ) + 1
559	ind_third = MOD( jj+1, 3 ) + 1
560	!$OMP sections private(ki,knew,n)
561	!$OMP section
562	IF ( jj <= nblk ) THEN
563	!
564	!-- Forward Fourier Transformation
565	!-- Transposition y --> z
566	CALL cpu_log( log_point_s(5), 'transpo forward', 'continue' )
567
568	DO jnew = 0, ny-1
569	jind = jj + ( nblk + 1 ) * jnew
570	f_in(:,:,jnew) =f_inv(:,:,jind)
571	ENDDO
572
573	CALL transpose_yz( f_in, ar1(:,:,:,ind_odd) )
574
575	IF ( jj == nblk ) THEN
576	CALL cpu_log( log_point_s(5), 'transpo forward', 'stop' )
577	ELSE
578	CALL cpu_log( log_point_s(5), 'transpo forward', 'pause' )
579	ENDIF
580
581	ENDIF
582
583	IF ( jj >= 2 ) THEN
584	!
585	!-- Inverse Fourier Transformation
586	!-- Transposition z --> y
587	!-- Only one thread should call MPI routines, therefore forward and
588	!-- backward tranpose are in the same section
589	IF ( jj == 2 ) THEN
590	CALL cpu_log( log_point_s(8), 'transpo invers', 'start' )
591	ELSE
592	CALL cpu_log( log_point_s(8), 'transpo invers', 'continue' )
593	ENDIF
594
595	CALL transpose_zy( ar1(:,:,:,ind_third), f_in )
596
597	DO jnew = 0, ny-1
598	jind = jj-2 + ( nblk + 1 ) * jnew
599	f_inv(:,:,jind) = f_in(:,:,jnew)
600	ENDDO
601
602	CALL cpu_log( log_point_s(8), 'transpo invers', 'pause' )
603
604	ENDIF
605
606	!$OMP section
607	IF ( jj >= 1 .AND. jj <= nblk+1 ) THEN
608	!
609	!-- Solve the tridiagonal equation system along z
610	CALL cpu_log( log_point_s(6), 'tridia', 'start' )
611
612	n = isave(2) + jj - 1
613	CALL tridia_substi_overlap( ar1(:,:,:,ind_even), n )
614
615	CALL cpu_log( log_point_s(6), 'tridia', 'stop' )
616	ENDIF
617	!$OMP end sections
618
619	ENDDO
620	!
621	!-- Restore original indices/counters
622	ny = isave(1)
623	nys_z = isave(2)
624	nyn_z = isave(3)
625	sendrecvcount_yz = isave(4)
626
627	CALL resort_for_zy( f_inv, f_out_y )
628
629	DEALLOCATE( ar1, f_in, f_inv )
630
631	!
632	!-- fft along y backward + subsequent transposition y --> x
633	ALLOCATE( f_inv(nys_x:nyn_x,nzb_x:nzt_x,0:nx) )
634	!
635	!-- Save original indices and gridpoint counter
636	isave(1) = nx
637	isave(2) = nxl_y
638	isave(3) = nxr_y
639	isave(4) = sendrecvcount_xy
640	!
641	!-- Set new indices for transformation
642	nblk = (( nx+1 ) / pdims(2) ) - 1
643	nx = pdims(2)
644	nnx_y = 1
645	nxl_y = myidy * nnx_y
646	nxr_y = ( myidy + 1 ) * nnx_y - 1
647	sendrecvcount_xy = nnx_y * ( nyn_x-nys_x+1 ) * ( nzt_x-nzb_x+1 )
648
649	ALLOCATE( ar1(0:ny,nxl_y:nxr_y,nzb_y:nzt_y,2) )
650	ALLOCATE( f_in(nys_x:nyn_x,nzb_x:nzt_x,0:nx) )
651
652	DO ii = 0, nblk+1
653	ind_odd = MOD( ii+1, 2 ) + 1
654	ind_even = MOD( ii+2, 2 ) + 1
655	!$OMP sections private(ki,knew,n)
656	!$OMP section
657	IF ( ii <= nblk ) THEN
658
659	CALL cpu_log( log_point_s(7), 'fft_y', 'continue' )
660
661	n = isave(2) + ii
662	nxl_y_bound = isave(2)
663	nxr_y_bound = isave(3)
664
665	! CALL fft_y( ar1(:,:,:,ind_even), 'backward', ar_3d = f_out_y, &
666	! ni = n )
667	CALL fft_y( ar1(:,:,:,ind_even), 'backward', ar_tr = f_out_y, &
668	nxl_y_bound = nxl_y_bound, nxr_y_bound = nxr_y_bound, &
669	nxl_y_l = n, nxr_y_l = n )
670
671	IF ( ii == nblk ) THEN
672	CALL cpu_log( log_point_s(7), 'fft_y', 'stop' )
673	ELSE
674	CALL cpu_log( log_point_s(7), 'fft_y', 'pause' )
675	ENDIF
676
677	ENDIF
678
679	!$OMP section
680	IF ( ii >= 1 ) THEN
681
682	CALL cpu_log( log_point_s(8), 'transpo invers', 'continue' )
683
684	CALL transpose_yx( ar1(:,:,:,ind_odd), f_in )
685
686	DO inew = 0, nx-1
687	iind = ii-1 + (nblk+1) * inew
688	f_inv(:,:,iind) = f_in(:,:,inew)
689	ENDDO
690
691	CALL cpu_log( log_point_s(8), 'transpo invers', 'pause' )
692
693	ENDIF
694	!$OMP end sections
695
696	ENDDO
697	!
698	!-- Restore original indices/counters
699	nx = isave(1)
700	nxl_y = isave(2)
701	nxr_y = isave(3)
702	sendrecvcount_xy = isave(4)
703
704	CALL resort_for_yx( f_inv, f_out_z )
705
706	DEALLOCATE( ar1, f_in, f_inv )
707
708	!
709	!-- fft along x backward + subsequent final transposition x --> z
710	ALLOCATE( f_inv(nys:nyn,nxl:nxr,1:nz) )
711	!
712	!-- Save original indices and gridpoint counter
713	isave(1) = nz
714	isave(2) = nzb_x
715	isave(3) = nzt_x
716	isave(4) = sendrecvcount_zx
717	!
718	!-- Set new indices for transformation
719	nblk = nz / pdims(1)
720	nz = pdims(1)
721	nnz_x = 1
722	nzb_x = 1 + myidx * nnz_x
723	nzt_x = ( myidx + 1 ) * nnz_x
724	sendrecvcount_zx = nnx * nny * nnz_x
725
726	ALLOCATE( ar1(0:nx,nys_x:nyn_x,nzb_x:nzt_x,2) )
727	ALLOCATE( f_in(nys:nyn,nxl:nxr,1:nz) )
728
729	DO kk = 1, nblk+1
730	ind_odd = MOD( kk, 2 ) + 1
731	ind_even = MOD( kk+1, 2 ) + 1
732	!$OMP sections private(ki,knew,n)
733	!$OMP section
734	IF ( kk <= nblk ) THEN
735
736	CALL cpu_log( log_point_s(4), 'fft_x', 'continue' )
737
738	n = isave(2) + kk - 1
739	CALL fft_x( ar1(:,:,:,ind_even), 'backward', f_out_z(:,:,n))
740
741	IF ( kk == nblk ) THEN
742	CALL cpu_log( log_point_s(4), 'fft_x', 'stop' )
743	ELSE
744	CALL cpu_log( log_point_s(4), 'fft_x', 'pause' )
745	ENDIF
746
747	ENDIF
748
749	!$OMP section
750	IF ( kk >= 2 ) THEN
751
752	CALL cpu_log( log_point_s(8), 'transpo invers', 'continue' )
753
754	CALL transpose_xz( ar1(:,:,:,ind_odd), f_in )
755
756	DO knew = 1, nz
757	ki = kk-1 + nblk * (knew-1)
758	f_inv(:,:,ki) = f_in(:,:,knew)
759	ENDDO
760
761	IF ( kk == nblk+1 ) THEN
762	CALL cpu_log( log_point_s(8), 'transpo invers', 'stop' )
763	ELSE
764	CALL cpu_log( log_point_s(8), 'transpo invers', 'pause' )
765	ENDIF
766
767	ENDIF
768	!$OMP end sections
769
770	ENDDO
771	!
772	!-- Restore original indices/counters
773	nz = isave(1)
774	nzb_x = isave(2)
775	nzt_x = isave(3)
776	sendrecvcount_zx = isave(4)
777
778	CALL resort_for_xz( f_inv, ar )
779
780	DEALLOCATE( ar1, f_in, f_inv )
781
782	ENDIF
783
784	CALL cpu_log( log_point_s(3), 'poisfft', 'stop' )
785
786	END SUBROUTINE poisfft
787
788
789
790	SUBROUTINE ffty_tr_yx( f_in, f_out )
791
792	!------------------------------------------------------------------------------!
793	! Fourier-transformation along y with subsequent transposition y --> x for
794	! a 1d-decomposition along x
795	!
796	! ATTENTION: The performance of this routine is much faster on the NEC-SX6,
797	! if the first index of work_ffty_vec is odd. Otherwise
798	! memory bank conflicts may occur (especially if the index is a
799	! multiple of 128). That's why work_ffty_vec is dimensioned as
800	! 0:ny+1.
801	! Of course, this will not work if users are using an odd number
802	! of gridpoints along y.
803	!------------------------------------------------------------------------------!
804
805	USE control_parameters
806	USE cpulog
807	USE indices
808	USE interfaces
809	USE pegrid
810	USE transpose_indices
811
812	IMPLICIT NONE
813
814	INTEGER :: i, iend, iouter, ir, j, k
815	INTEGER, PARAMETER :: stridex = 4
816
817	REAL, DIMENSION(0:ny,stridex) :: work_ffty
818	#if defined( __nec )
819	REAL, DIMENSION(0:ny+1,1:nz,nxl:nxr) :: work_ffty_vec
820	#endif
821	REAL, DIMENSION(1:nz,0:ny,nxl:nxr) :: f_in
822	REAL, DIMENSION(nnx,1:nz,nys_x:nyn_x,pdims(1)) :: f_out
823	REAL, DIMENSION(nxl:nxr,1:nz,0:ny) :: work
824
825	!
826	!-- Carry out the FFT along y, where all data are present due to the
827	!-- 1d-decomposition along x. Resort the data in a way that x becomes
828	!-- the first index.
829	CALL cpu_log( log_point_s(7), 'fft_y_1d', 'start' )
830
831	IF ( host(1:3) == 'nec' ) THEN
832	#if defined( __nec )
833	!
834	!-- Code optimized for vector processors
835	!$OMP PARALLEL PRIVATE ( i, j, k )
836	!$OMP DO
837	DO i = nxl, nxr
838
839	DO j = 0, ny
840	DO k = 1, nz
841	work_ffty_vec(j,k,i) = f_in(k,j,i)
842	ENDDO
843	ENDDO
844
845	CALL fft_y_m( work_ffty_vec(:,:,i), ny+1, 'forward' )
846
847	ENDDO
848
849	!$OMP DO
850	DO k = 1, nz
851	DO j = 0, ny
852	DO i = nxl, nxr
853	work(i,k,j) = work_ffty_vec(j,k,i)
854	ENDDO
855	ENDDO
856	ENDDO
857	!$OMP END PARALLEL
858	#endif
859
860	ELSE
861
862	!
863	!-- Cache optimized code.
864	!-- The i-(x-)direction is split into a strided outer loop and an inner
865	!-- loop for better cache performance
866	!$OMP PARALLEL PRIVATE (i,iend,iouter,ir,j,k,work_ffty)
867	!$OMP DO
868	DO iouter = nxl, nxr, stridex
869
870	iend = MIN( iouter+stridex-1, nxr ) ! Upper bound for inner i loop
871
872	DO k = 1, nz
873
874	DO i = iouter, iend
875
876	ir = i-iouter+1 ! counter within a stride
877	DO j = 0, ny
878	work_ffty(j,ir) = f_in(k,j,i)
879	ENDDO
880	!
881	!-- FFT along y
882	CALL fft_y_1d( work_ffty(:,ir), 'forward' )
883
884	ENDDO
885
886	!
887	!-- Resort
888	DO j = 0, ny
889	DO i = iouter, iend
890	work(i,k,j) = work_ffty(j,i-iouter+1)
891	ENDDO
892	ENDDO
893
894	ENDDO
895
896	ENDDO
897	!$OMP END PARALLEL
898
899	ENDIF
900	CALL cpu_log( log_point_s(7), 'fft_y_1d', 'pause' )
901
902	!
903	!-- Transpose array
904	#if defined( __parallel )
905	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
906	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
907	CALL MPI_ALLTOALL( work(nxl,1,0), sendrecvcount_xy, MPI_REAL, &
908	f_out(1,1,nys_x,1), sendrecvcount_xy, MPI_REAL, &
909	comm1dx, ierr )
910	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
911	#endif
912
913	END SUBROUTINE ffty_tr_yx
914
915
916	SUBROUTINE tr_xy_ffty( f_in, f_out )
917
918	!------------------------------------------------------------------------------!
919	! Transposition x --> y with a subsequent backward Fourier transformation for
920	! a 1d-decomposition along x
921	!------------------------------------------------------------------------------!
922
923	USE control_parameters
924	USE cpulog
925	USE indices
926	USE interfaces
927	USE pegrid
928	USE transpose_indices
929
930	IMPLICIT NONE
931
932	INTEGER :: i, iend, iouter, ir, j, k
933	INTEGER, PARAMETER :: stridex = 4
934
935	REAL, DIMENSION(0:ny,stridex) :: work_ffty
936	#if defined( __nec )
937	REAL, DIMENSION(0:ny+1,1:nz,nxl:nxr) :: work_ffty_vec
938	#endif
939	REAL, DIMENSION(nnx,1:nz,nys_x:nyn_x,pdims(1)) :: f_in
940	REAL, DIMENSION(1:nz,0:ny,nxl:nxr) :: f_out
941	REAL, DIMENSION(nxl:nxr,1:nz,0:ny) :: work
942
943	!
944	!-- Transpose array
945	#if defined( __parallel )
946	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
947	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
948	CALL MPI_ALLTOALL( f_in(1,1,nys_x,1), sendrecvcount_xy, MPI_REAL, &
949	work(nxl,1,0), sendrecvcount_xy, MPI_REAL, &
950	comm1dx, ierr )
951	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
952	#endif
953
954	!
955	!-- Resort the data in a way that y becomes the first index and carry out the
956	!-- backward fft along y.
957	CALL cpu_log( log_point_s(7), 'fft_y_1d', 'continue' )
958
959	IF ( host(1:3) == 'nec' ) THEN
960	#if defined( __nec )
961	!
962	!-- Code optimized for vector processors
963	!$OMP PARALLEL PRIVATE ( i, j, k )
964	!$OMP DO
965	DO k = 1, nz
966	DO j = 0, ny
967	DO i = nxl, nxr
968	work_ffty_vec(j,k,i) = work(i,k,j)
969	ENDDO
970	ENDDO
971	ENDDO
972
973	!$OMP DO
974	DO i = nxl, nxr
975
976	CALL fft_y_m( work_ffty_vec(:,:,i), ny+1, 'backward' )
977
978	DO j = 0, ny
979	DO k = 1, nz
980	f_out(k,j,i) = work_ffty_vec(j,k,i)
981	ENDDO
982	ENDDO
983
984	ENDDO
985	!$OMP END PARALLEL
986	#endif
987
988	ELSE
989
990	!
991	!-- Cache optimized code.
992	!-- The i-(x-)direction is split into a strided outer loop and an inner
993	!-- loop for better cache performance
994	!$OMP PARALLEL PRIVATE ( i, iend, iouter, ir, j, k, work_ffty )
995	!$OMP DO
996	DO iouter = nxl, nxr, stridex
997
998	iend = MIN( iouter+stridex-1, nxr ) ! Upper bound for inner i loop
999
1000	DO k = 1, nz
1001	!
1002	!-- Resort
1003	DO j = 0, ny
1004	DO i = iouter, iend
1005	work_ffty(j,i-iouter+1) = work(i,k,j)
1006	ENDDO
1007	ENDDO
1008
1009	DO i = iouter, iend
1010
1011	!
1012	!-- FFT along y
1013	ir = i-iouter+1 ! counter within a stride
1014	CALL fft_y_1d( work_ffty(:,ir), 'backward' )
1015
1016	DO j = 0, ny
1017	f_out(k,j,i) = work_ffty(j,ir)
1018	ENDDO
1019	ENDDO
1020
1021	ENDDO
1022
1023	ENDDO
1024	!$OMP END PARALLEL
1025
1026	ENDIF
1027
1028	CALL cpu_log( log_point_s(7), 'fft_y_1d', 'stop' )
1029
1030	END SUBROUTINE tr_xy_ffty
1031
1032
1033	SUBROUTINE fftx_tri_fftx( ar )
1034
1035	!------------------------------------------------------------------------------!
1036	! FFT along x, solution of the tridiagonal system and backward FFT for
1037	! a 1d-decomposition along x
1038	!
1039	! WARNING: this subroutine may still not work for hybrid parallelization
1040	! with OpenMP (for possible necessary changes see the original
1041	! routine poisfft_hybrid, developed by Klaus Ketelsen, May 2002)
1042	!------------------------------------------------------------------------------!
1043
1044	USE control_parameters
1045	USE cpulog
1046	USE grid_variables
1047	USE indices
1048	USE interfaces
1049	USE pegrid
1050	USE transpose_indices
1051
1052	IMPLICIT NONE
1053
1054	INTEGER :: i, j, k, m, n, omp_get_thread_num, tn
1055
1056	REAL, DIMENSION(0:nx) :: work_fftx
1057	REAL, DIMENSION(0:nx,1:nz) :: work_trix
1058	REAL, DIMENSION(nnx,1:nz,nys_x:nyn_x,pdims(1)) :: ar
1059	REAL, DIMENSION(:,:,:,:), ALLOCATABLE :: tri
1060
1061
1062	CALL cpu_log( log_point_s(33), 'fft_x_1d + tridia', 'start' )
1063
1064	ALLOCATE( tri(5,0:nx,0:nz-1,0:threads_per_task-1) )
1065
1066	tn = 0 ! Default thread number in case of one thread
1067	!$OMP PARALLEL DO PRIVATE ( i, j, k, m, n, tn, work_fftx, work_trix )
1068	DO j = nys_x, nyn_x
1069
1070	!$ tn = omp_get_thread_num()
1071
1072	IF ( host(1:3) == 'nec' ) THEN
1073	!
1074	!-- Code optimized for vector processors
1075	DO k = 1, nz
1076
1077	m = 0
1078	DO n = 1, pdims(1)
1079	DO i = 1, nnx
1080	work_trix(m,k) = ar(i,k,j,n)
1081	m = m + 1
1082	ENDDO
1083	ENDDO
1084
1085	ENDDO
1086
1087	CALL fft_x_m( work_trix, 'forward' )
1088
1089	ELSE
1090	!
1091	!-- Cache optimized code
1092	DO k = 1, nz
1093
1094	m = 0
1095	DO n = 1, pdims(1)
1096	DO i = 1, nnx
1097	work_fftx(m) = ar(i,k,j,n)
1098	m = m + 1
1099	ENDDO
1100	ENDDO
1101
1102	CALL fft_x_1d( work_fftx, 'forward' )
1103
1104	DO i = 0, nx
1105	work_trix(i,k) = work_fftx(i)
1106	ENDDO
1107
1108	ENDDO
1109
1110	ENDIF
1111
1112	!
1113	!-- Solve the linear equation system
1114	CALL tridia_1dd( ddx2, ddy2, nx, ny, j, work_trix, tri(:,:,:,tn) )
1115
1116	IF ( host(1:3) == 'nec' ) THEN
1117	!
1118	!-- Code optimized for vector processors
1119	CALL fft_x_m( work_trix, 'backward' )
1120
1121	DO k = 1, nz
1122
1123	m = 0
1124	DO n = 1, pdims(1)
1125	DO i = 1, nnx
1126	ar(i,k,j,n) = work_trix(m,k)
1127	m = m + 1
1128	ENDDO
1129	ENDDO
1130
1131	ENDDO
1132
1133	ELSE
1134	!
1135	!-- Cache optimized code
1136	DO k = 1, nz
1137
1138	DO i = 0, nx
1139	work_fftx(i) = work_trix(i,k)
1140	ENDDO
1141
1142	CALL fft_x_1d( work_fftx, 'backward' )
1143
1144	m = 0
1145	DO n = 1, pdims(1)
1146	DO i = 1, nnx
1147	ar(i,k,j,n) = work_fftx(m)
1148	m = m + 1
1149	ENDDO
1150	ENDDO
1151
1152	ENDDO
1153
1154	ENDIF
1155
1156	ENDDO
1157
1158	DEALLOCATE( tri )
1159
1160	CALL cpu_log( log_point_s(33), 'fft_x_1d + tridia', 'stop' )
1161
1162	END SUBROUTINE fftx_tri_fftx
1163
1164
1165	SUBROUTINE fftx_tr_xy( f_in, f_out )
1166
1167	!------------------------------------------------------------------------------!
1168	! Fourier-transformation along x with subsequent transposition x --> y for
1169	! a 1d-decomposition along y
1170	!
1171	! ATTENTION: The NEC-branch of this routine may significantly profit from
1172	! further optimizations. So far, performance is much worse than
1173	! for routine ffty_tr_yx (more than three times slower).
1174	!------------------------------------------------------------------------------!
1175
1176	USE control_parameters
1177	USE cpulog
1178	USE indices
1179	USE interfaces
1180	USE pegrid
1181	USE transpose_indices
1182
1183	IMPLICIT NONE
1184
1185	INTEGER :: i, j, k
1186
1187	REAL, DIMENSION(0:nx,1:nz,nys:nyn) :: work_fftx
1188	REAL, DIMENSION(1:nz,nys:nyn,0:nx) :: f_in
1189	REAL, DIMENSION(nny,1:nz,nxl_y:nxr_y,pdims(2)) :: f_out
1190	REAL, DIMENSION(nys:nyn,1:nz,0:nx) :: work
1191
1192	!
1193	!-- Carry out the FFT along x, where all data are present due to the
1194	!-- 1d-decomposition along y. Resort the data in a way that y becomes
1195	!-- the first index.
1196	CALL cpu_log( log_point_s(4), 'fft_x_1d', 'start' )
1197
1198	IF ( host(1:3) == 'nec' ) THEN
1199	!
1200	!-- Code for vector processors
1201	!$OMP PARALLEL PRIVATE ( i, j, k )
1202	!$OMP DO
1203	DO i = 0, nx
1204
1205	DO j = nys, nyn
1206	DO k = 1, nz
1207	work_fftx(i,k,j) = f_in(k,j,i)
1208	ENDDO
1209	ENDDO
1210
1211	ENDDO
1212
1213	!$OMP DO
1214	DO j = nys, nyn
1215
1216	CALL fft_x_m( work_fftx(:,:,j), 'forward' )
1217
1218	DO k = 1, nz
1219	DO i = 0, nx
1220	work(j,k,i) = work_fftx(i,k,j)
1221	ENDDO
1222	ENDDO
1223
1224	ENDDO
1225	!$OMP END PARALLEL
1226
1227	ELSE
1228
1229	!
1230	!-- Cache optimized code (there might be still a potential for better
1231	!-- optimization).
1232	!$OMP PARALLEL PRIVATE (i,j,k)
1233	!$OMP DO
1234	DO i = 0, nx
1235
1236	DO j = nys, nyn
1237	DO k = 1, nz
1238	work_fftx(i,k,j) = f_in(k,j,i)
1239	ENDDO
1240	ENDDO
1241
1242	ENDDO
1243
1244	!$OMP DO
1245	DO j = nys, nyn
1246	DO k = 1, nz
1247
1248	CALL fft_x_1d( work_fftx(0:nx,k,j), 'forward' )
1249
1250	DO i = 0, nx
1251	work(j,k,i) = work_fftx(i,k,j)
1252	ENDDO
1253	ENDDO
1254
1255	ENDDO
1256	!$OMP END PARALLEL
1257
1258	ENDIF
1259	CALL cpu_log( log_point_s(4), 'fft_x_1d', 'pause' )
1260
1261	!
1262	!-- Transpose array
1263	#if defined( __parallel )
1264	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
1265	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
1266	CALL MPI_ALLTOALL( work(nys,1,0), sendrecvcount_xy, MPI_REAL, &
1267	f_out(1,1,nxl_y,1), sendrecvcount_xy, MPI_REAL, &
1268	comm1dy, ierr )
1269	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
1270	#endif
1271
1272	END SUBROUTINE fftx_tr_xy
1273
1274
1275	SUBROUTINE tr_yx_fftx( f_in, f_out )
1276
1277	!------------------------------------------------------------------------------!
1278	! Transposition y --> x with a subsequent backward Fourier transformation for
1279	! a 1d-decomposition along x
1280	!------------------------------------------------------------------------------!
1281
1282	USE control_parameters
1283	USE cpulog
1284	USE indices
1285	USE interfaces
1286	USE pegrid
1287	USE transpose_indices
1288
1289	IMPLICIT NONE
1290
1291	INTEGER :: i, j, k
1292
1293	REAL, DIMENSION(0:nx,1:nz,nys:nyn) :: work_fftx
1294	REAL, DIMENSION(nny,1:nz,nxl_y:nxr_y,pdims(2)) :: f_in
1295	REAL, DIMENSION(1:nz,nys:nyn,0:nx) :: f_out
1296	REAL, DIMENSION(nys:nyn,1:nz,0:nx) :: work
1297
1298	!
1299	!-- Transpose array
1300	#if defined( __parallel )
1301	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
1302	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
1303	CALL MPI_ALLTOALL( f_in(1,1,nxl_y,1), sendrecvcount_xy, MPI_REAL, &
1304	work(nys,1,0), sendrecvcount_xy, MPI_REAL, &
1305	comm1dy, ierr )
1306	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
1307	#endif
1308
1309	!
1310	!-- Carry out the FFT along x, where all data are present due to the
1311	!-- 1d-decomposition along y. Resort the data in a way that y becomes
1312	!-- the first index.
1313	CALL cpu_log( log_point_s(4), 'fft_x_1d', 'continue' )
1314
1315	IF ( host(1:3) == 'nec' ) THEN
1316	!
1317	!-- Code optimized for vector processors
1318	!$OMP PARALLEL PRIVATE ( i, j, k )
1319	!$OMP DO
1320	DO j = nys, nyn
1321
1322	DO k = 1, nz
1323	DO i = 0, nx
1324	work_fftx(i,k,j) = work(j,k,i)
1325	ENDDO
1326	ENDDO
1327
1328	CALL fft_x_m( work_fftx(:,:,j), 'backward' )
1329
1330	ENDDO
1331
1332	!$OMP DO
1333	DO i = 0, nx
1334	DO j = nys, nyn
1335	DO k = 1, nz
1336	f_out(k,j,i) = work_fftx(i,k,j)
1337	ENDDO
1338	ENDDO
1339	ENDDO
1340	!$OMP END PARALLEL
1341
1342	ELSE
1343
1344	!
1345	!-- Cache optimized code (there might be still a potential for better
1346	!-- optimization).
1347	!$OMP PARALLEL PRIVATE (i,j,k)
1348	!$OMP DO
1349	DO j = nys, nyn
1350	DO k = 1, nz
1351
1352	DO i = 0, nx
1353	work_fftx(i,k,j) = work(j,k,i)
1354	ENDDO
1355
1356	CALL fft_x_1d( work_fftx(0:nx,k,j), 'backward' )
1357
1358	ENDDO
1359	ENDDO
1360
1361	!$OMP DO
1362	DO i = 0, nx
1363	DO j = nys, nyn
1364	DO k = 1, nz
1365	f_out(k,j,i) = work_fftx(i,k,j)
1366	ENDDO
1367	ENDDO
1368	ENDDO
1369	!$OMP END PARALLEL
1370
1371	ENDIF
1372	CALL cpu_log( log_point_s(4), 'fft_x_1d', 'stop' )
1373
1374	END SUBROUTINE tr_yx_fftx
1375
1376
1377	SUBROUTINE ffty_tri_ffty( ar )
1378
1379	!------------------------------------------------------------------------------!
1380	! FFT along y, solution of the tridiagonal system and backward FFT for
1381	! a 1d-decomposition along y
1382	!
1383	! WARNING: this subroutine may still not work for hybrid parallelization
1384	! with OpenMP (for possible necessary changes see the original
1385	! routine poisfft_hybrid, developed by Klaus Ketelsen, May 2002)
1386	!------------------------------------------------------------------------------!
1387
1388	USE control_parameters
1389	USE cpulog
1390	USE grid_variables
1391	USE indices
1392	USE interfaces
1393	USE pegrid
1394	USE transpose_indices
1395
1396	IMPLICIT NONE
1397
1398	INTEGER :: i, j, k, m, n, omp_get_thread_num, tn
1399
1400	REAL, DIMENSION(0:ny) :: work_ffty
1401	REAL, DIMENSION(0:ny,1:nz) :: work_triy
1402	REAL, DIMENSION(nny,1:nz,nxl_y:nxr_y,pdims(2)) :: ar
1403	REAL, DIMENSION(:,:,:,:), ALLOCATABLE :: tri
1404
1405
1406	CALL cpu_log( log_point_s(39), 'fft_y_1d + tridia', 'start' )
1407
1408	ALLOCATE( tri(5,0:ny,0:nz-1,0:threads_per_task-1) )
1409
1410	tn = 0 ! Default thread number in case of one thread
1411	!$OMP PARALLEL DO PRIVATE ( i, j, k, m, n, tn, work_ffty, work_triy )
1412	DO i = nxl_y, nxr_y
1413
1414	!$ tn = omp_get_thread_num()
1415
1416	IF ( host(1:3) == 'nec' ) THEN
1417	!
1418	!-- Code optimized for vector processors
1419	DO k = 1, nz
1420
1421	m = 0
1422	DO n = 1, pdims(2)
1423	DO j = 1, nny
1424	work_triy(m,k) = ar(j,k,i,n)
1425	m = m + 1
1426	ENDDO
1427	ENDDO
1428
1429	ENDDO
1430
1431	CALL fft_y_m( work_triy, ny, 'forward' )
1432
1433	ELSE
1434	!
1435	!-- Cache optimized code
1436	DO k = 1, nz
1437
1438	m = 0
1439	DO n = 1, pdims(2)
1440	DO j = 1, nny
1441	work_ffty(m) = ar(j,k,i,n)
1442	m = m + 1
1443	ENDDO
1444	ENDDO
1445
1446	CALL fft_y_1d( work_ffty, 'forward' )
1447
1448	DO j = 0, ny
1449	work_triy(j,k) = work_ffty(j)
1450	ENDDO
1451
1452	ENDDO
1453
1454	ENDIF
1455
1456	!
1457	!-- Solve the linear equation system
1458	CALL tridia_1dd( ddy2, ddx2, ny, nx, i, work_triy, tri(:,:,:,tn) )
1459
1460	IF ( host(1:3) == 'nec' ) THEN
1461	!
1462	!-- Code optimized for vector processors
1463	CALL fft_y_m( work_triy, ny, 'backward' )
1464
1465	DO k = 1, nz
1466
1467	m = 0
1468	DO n = 1, pdims(2)
1469	DO j = 1, nny
1470	ar(j,k,i,n) = work_triy(m,k)
1471	m = m + 1
1472	ENDDO
1473	ENDDO
1474
1475	ENDDO
1476
1477	ELSE
1478	!
1479	!-- Cache optimized code
1480	DO k = 1, nz
1481
1482	DO j = 0, ny
1483	work_ffty(j) = work_triy(j,k)
1484	ENDDO
1485
1486	CALL fft_y_1d( work_ffty, 'backward' )
1487
1488	m = 0
1489	DO n = 1, pdims(2)
1490	DO j = 1, nny
1491	ar(j,k,i,n) = work_ffty(m)
1492	m = m + 1
1493	ENDDO
1494	ENDDO
1495
1496	ENDDO
1497
1498	ENDIF
1499
1500	ENDDO
1501
1502	DEALLOCATE( tri )
1503
1504	CALL cpu_log( log_point_s(39), 'fft_y_1d + tridia', 'stop' )
1505
1506	END SUBROUTINE ffty_tri_ffty
1507
1508	#endif
1509
1510	END MODULE poisfft_mod

Note: See TracBrowser for help on using the repository browser.

Download in other formats:

| Impressum | ©Leibniz Universität Hannover |