Home

Context Navigation

source: palm/trunk/SOURCE/poisfft.f90 @ 1248

Last change on this file since 1248 was 1217, checked in by raasch, 11 years ago
last commit documented
Property svn:keywords set to `Id`
File size: 45.8 KB

Line
1	MODULE poisfft_mod
2
3	!--------------------------------------------------------------------------------!
4	! This file is part of PALM.
5	!
6	! PALM is free software: you can redistribute it and/or modify it under the terms
7	! of the GNU General Public License as published by the Free Software Foundation,
8	! either version 3 of the License, or (at your option) any later version.
9	!
10	! PALM is distributed in the hope that it will be useful, but WITHOUT ANY
11	! WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
12	! A PARTICULAR PURPOSE. See the GNU General Public License for more details.
13	!
14	! You should have received a copy of the GNU General Public License along with
15	! PALM. If not, see <http://www.gnu.org/licenses/>.
16	!
17	! Copyright 1997-2012 Leibniz University Hannover
18	!--------------------------------------------------------------------------------!
19	!
20	! Current revisions:
21	! -----------------
22	!
23	!
24	! Former revisions:
25	! -----------------
26	! $Id: poisfft.f90 1217 2013-08-26 11:30:05Z heinze $
27	!
28	! 1216 2013-08-26 09:31:42Z raasch
29	! resorting of arrays moved to separate routines resort_for_...,
30	! one argument, used as temporary work array, removed from all transpose
31	! routines
32	! overlapping fft / transposition implemented
33	!
34	! 1212 2013-08-15 08:46:27Z raasch
35	! tridia routines moved to seperate module tridia_solver
36	!
37	! 1208 2013-08-13 06:41:49Z raasch
38	! acc-update clauses added for "ar" so that ffts other than cufft can also be
39	! used (although they are not ported and will give a poor performance)
40	!
41	! 1111 2013-03-08 23:54:10Z raasch
42	! further openACC porting of non-parallel (MPI) branch:
43	! tridiagonal routines split into extermal subroutines (instead using CONTAINS),
44	! no distinction between parallel/non-parallel in poisfft and tridia any more,
45	! tridia routines moved to end of file because of probable bug in PGI compiler 12.5
46	! (otherwise "invalid device function" is indicated during runtime),
47	! optimization of tridia routines: constant elements and coefficients of tri are
48	! stored in seperate arrays ddzuw and tric, last dimension of tri reduced from 5
49	! to 2,
50	! poisfft_init is now called internally from poisfft, maketri is called from
51	! poisfft_init,
52	! ibc_p_b = 2 removed
53	!
54	! 1106 2013-03-04 05:31:38Z raasch
55	! routines fftx, ffty, fftxp, fftyp removed, calls replaced by fft_x, fft_y,
56	! in the 1D-decomposition routines fft_x, ffty are replaced by fft_x_1d,
57	! fft_y_1d
58	!
59	! 1103 2013-02-20 02:15:53Z raasch
60	! tri, ar, and ar1 arguments in tridia-routines (2d) are removed because they
61	! sometimes cause segmentation faults with intel 12.1 compiler
62	!
63	! 1092 2013-02-02 11:24:22Z raasch
64	! unused variables removed
65	!
66	! 1036 2012-10-22 13:43:42Z raasch
67	! code put under GPL (PALM 3.9)
68	!
69	! 2012-09-21 07:03:55Z raasch
70	! FLOAT type conversion replaced by REAL
71	!
72	! 1003 2012-09-14 14:35:53Z raasch
73	! indices nxa, nya, etc. replaced by nx, ny, etc.
74	!
75	! 940 2012-07-09 14:31:00Z raasch
76	! special handling of tri-array as an argument in tridia_1dd routines switched
77	! off because it caused segmentation faults with intel 12.1 compiler
78	!
79	! 877 2012-04-03 11:21:44Z suehring
80	! Bugfix: Avoid divisions by zero in case of using a 'neumann' bc for the
81	! pressure at the top of the model domain.
82	!
83	! 809 2012-01-30 13:32:58Z maronga
84	! Bugfix: replaced .AND. and .NOT. with && and ! in the preprocessor directives
85	!
86	! 807 2012-01-25 11:53:51Z maronga
87	! New cpp directive "__check" implemented which is used by check_namelist_files
88	! (most of the code is unneeded by check_namelist_files).
89	!
90	! 763 2011-10-06 09:32:09Z suehring
91	! Comment added concerning the last change.
92	!
93	! 761 2011-10-05 17:58:52Z suehring
94	! Bugfix: Avoid divisions by zero in case of using a 'neumann' bc for the
95	! pressure at the top of the model domain.
96	!
97	! 696 2011-03-18 07:03:49Z raasch
98	! work_fftx removed from PRIVATE clauses in fftx_tr_xy and tr_yx_fftx
99	!
100	! 683 2011-02-09 14:25:15Z raasch
101	! openMP parallelization for 2d-domain-decomposition
102	!
103	! 667 2010-12-23 12:06:00Z suehring/gryschka
104	! ddzu replaced by ddzu_pres due to changes in zu(0)
105	!
106	! 622 2010-12-10 08:08:13Z raasch
107	! optional barriers included in order to speed up collective operations
108	!
109	! 377 2009-09-04 11:09:00Z raasch
110	! __lcmuk changed to __lc to avoid problems with Intel compiler on sgi-ice
111	!
112	! 164 2008-05-15 08:46:15Z raasch
113	! Arguments removed from transpose routines
114	!
115	! 128 2007-10-26 13:11:14Z raasch
116	! Bugfix: wavenumber calculation for even nx in routines maketri
117	!
118	! 85 2007-05-11 09:35:14Z raasch
119	! Bugfix: work_fft*_vec removed from some PRIVATE-declarations
120	!
121	! 76 2007-03-29 00:58:32Z raasch
122	! Tridiagonal coefficients adjusted for Neumann boundary conditions both at
123	! the bottom and the top.
124	!
125	! RCS Log replace by Id keyword, revision history cleaned up
126	!
127	! Revision 1.24 2006/08/04 15:00:24 raasch
128	! Default setting of the thread number tn in case of not using OpenMP
129	!
130	! Revision 1.23 2006/02/23 12:48:38 raasch
131	! Additional compiler directive in routine tridia_1dd for preventing loop
132	! exchange on NEC-SX6
133	!
134	! Revision 1.20 2004/04/30 12:38:09 raasch
135	! Parts of former poisfft_hybrid moved to this subroutine,
136	! former subroutine changed to a module, renaming of FFT-subroutines and
137	! -module, FFTs completely substituted by calls of fft_x and fft_y,
138	! NAG fft used in the non-parallel case completely removed, l in maketri
139	! is now a 1d-array, variables passed by modules instead of using parameter
140	! lists, enlarged transposition arrays introduced
141	!
142	! Revision 1.1 1997/07/24 11:24:14 raasch
143	! Initial revision
144	!
145	!
146	! Description:
147	! ------------
148	! See below.
149	!------------------------------------------------------------------------------!
150
151	!--------------------------------------------------------------------------!
152	! poisfft !
153	! !
154	! Original version: Stephan Siano (pois3d) !
155	! !
156	! Institute of Meteorology and Climatology, University of Hannover !
157	! Germany !
158	! !
159	! Version as of July 23,1996 !
160	! !
161	! !
162	! Version for parallel computers: Siegfried Raasch !
163	! !
164	! Version as of July 03,1997 !
165	! !
166	! Solves the Poisson equation with a 2D spectral method !
167	! d^2 p / dx^2 + d^2 p / dy^2 + d^2 p / dz^2 = s !
168	! !
169	! Input: !
170	! real ar contains in the (nnx,nny,nnz) elements, !
171	! starting from the element (1,nys,nxl), the !
172	! values for s !
173	! real work Temporary array !
174	! !
175	! Output: !
176	! real ar contains the solution for p !
177	!--------------------------------------------------------------------------!
178
179	USE fft_xy
180	USE indices
181	USE transpose_indices
182	USE tridia_solver
183
184	IMPLICIT NONE
185
186	LOGICAL, SAVE :: poisfft_initialized = .FALSE.
187
188	PRIVATE
189
190	#if ! defined ( __check )
191	PUBLIC poisfft, poisfft_init
192
193	INTERFACE poisfft
194	MODULE PROCEDURE poisfft
195	END INTERFACE poisfft
196
197	INTERFACE poisfft_init
198	MODULE PROCEDURE poisfft_init
199	END INTERFACE poisfft_init
200	#else
201	PUBLIC poisfft_init
202
203	INTERFACE poisfft_init
204	MODULE PROCEDURE poisfft_init
205	END INTERFACE poisfft_init
206	#endif
207
208	CONTAINS
209
210	SUBROUTINE poisfft_init
211
212	USE arrays_3d, ONLY: ddzu_pres, ddzw
213
214	IMPLICIT NONE
215
216	INTEGER :: k
217
218
219	CALL fft_init
220
221	CALL tridia_init
222
223	poisfft_initialized = .TRUE.
224
225	END SUBROUTINE poisfft_init
226
227
228	#if ! defined ( __check )
229	SUBROUTINE poisfft( ar, ar_inv_test )
230
231	USE control_parameters, ONLY : fft_method, transpose_compute_overlap
232	USE cpulog
233	USE interfaces
234	USE pegrid
235
236	IMPLICIT NONE
237
238	INTEGER :: ind_even, ind_odd, ind_third, ii, iind, inew, jj, jind, &
239	jnew, ki, kk, knew, n, nblk, nnx_y, nny_z, nnz_t, nnz_x, &
240	nxl_y_bound, nxr_y_bound
241	INTEGER, DIMENSION(4) :: isave
242
243	REAL, DIMENSION(1:nz,nys:nyn,nxl:nxr) :: ar
244	REAL, DIMENSION(nys:nyn,nxl:nxr,1:nz) :: ar_inv_test ! work array tend from pres
245	!$acc declare create( ar_inv )
246	REAL, DIMENSION(nys:nyn,nxl:nxr,1:nz) :: ar_inv
247
248	REAL, DIMENSION(:,:,:), ALLOCATABLE :: f_in, f_inv, f_out_y, f_out_z
249	REAL, DIMENSION(:,:,:,:), ALLOCATABLE :: ar1
250
251
252	CALL cpu_log( log_point_s(3), 'poisfft', 'start' )
253
254	IF ( .NOT. poisfft_initialized ) CALL poisfft_init
255
256	!
257	!-- Two-dimensional Fourier Transformation in x- and y-direction.
258	IF ( pdims(2) == 1 .AND. pdims(1) > 1 ) THEN
259
260	!
261	!-- 1d-domain-decomposition along x:
262	!-- FFT along y and transposition y --> x
263	CALL ffty_tr_yx( ar, ar )
264
265	!
266	!-- FFT along x, solving the tridiagonal system and backward FFT
267	CALL fftx_tri_fftx( ar )
268
269	!
270	!-- Transposition x --> y and backward FFT along y
271	CALL tr_xy_ffty( ar, ar )
272
273	ELSEIF ( pdims(1) == 1 .AND. pdims(2) > 1 ) THEN
274
275	!
276	!-- 1d-domain-decomposition along y:
277	!-- FFT along x and transposition x --> y
278	CALL fftx_tr_xy( ar, ar )
279
280	!
281	!-- FFT along y, solving the tridiagonal system and backward FFT
282	CALL ffty_tri_ffty( ar )
283
284	!
285	!-- Transposition y --> x and backward FFT along x
286	CALL tr_yx_fftx( ar, ar )
287
288	ELSEIF ( .NOT. transpose_compute_overlap ) THEN
289
290	!
291	!-- 2d-domain-decomposition or no decomposition (1 PE run)
292	!-- Transposition z --> x
293	CALL cpu_log( log_point_s(5), 'transpo forward', 'start' )
294	CALL resort_for_zx( ar, ar_inv )
295	CALL transpose_zx( ar_inv, ar )
296	CALL cpu_log( log_point_s(5), 'transpo forward', 'pause' )
297
298	CALL cpu_log( log_point_s(4), 'fft_x', 'start' )
299	IF ( fft_method /= 'system-specific' ) THEN
300	!$acc update host( ar )
301	ENDIF
302	CALL fft_x( ar, 'forward' )
303	IF ( fft_method /= 'system-specific' ) THEN
304	!$acc update device( ar )
305	ENDIF
306	CALL cpu_log( log_point_s(4), 'fft_x', 'pause' )
307
308	!
309	!-- Transposition x --> y
310	CALL cpu_log( log_point_s(5), 'transpo forward', 'continue' )
311	CALL resort_for_xy( ar, ar_inv )
312	CALL transpose_xy( ar_inv, ar )
313	CALL cpu_log( log_point_s(5), 'transpo forward', 'pause' )
314
315	CALL cpu_log( log_point_s(7), 'fft_y', 'start' )
316	IF ( fft_method /= 'system-specific' ) THEN
317	!$acc update host( ar )
318	ENDIF
319	CALL fft_y( ar, 'forward', ar_tr = ar, &
320	nxl_y_bound = nxl_y, nxr_y_bound = nxr_y, &
321	nxl_y_l = nxl_y, nxr_y_l = nxr_y )
322	IF ( fft_method /= 'system-specific' ) THEN
323	!$acc update device( ar )
324	ENDIF
325	CALL cpu_log( log_point_s(7), 'fft_y', 'pause' )
326
327	!
328	!-- Transposition y --> z
329	CALL cpu_log( log_point_s(5), 'transpo forward', 'continue' )
330	CALL resort_for_yz( ar, ar_inv )
331	CALL transpose_yz( ar_inv, ar )
332	CALL cpu_log( log_point_s(5), 'transpo forward', 'stop' )
333
334	!
335	!-- Solve the tridiagonal equation system along z
336	CALL cpu_log( log_point_s(6), 'tridia', 'start' )
337	CALL tridia_substi( ar )
338	CALL cpu_log( log_point_s(6), 'tridia', 'stop' )
339
340	!
341	!-- Inverse Fourier Transformation
342	!-- Transposition z --> y
343	CALL cpu_log( log_point_s(8), 'transpo invers', 'start' )
344	CALL transpose_zy( ar, ar_inv )
345	CALL resort_for_zy( ar_inv, ar )
346	CALL cpu_log( log_point_s(8), 'transpo invers', 'pause' )
347
348	CALL cpu_log( log_point_s(7), 'fft_y', 'continue' )
349	IF ( fft_method /= 'system-specific' ) THEN
350	!$acc update host( ar )
351	ENDIF
352	CALL fft_y( ar, 'backward', ar_tr = ar, &
353	nxl_y_bound = nxl_y, nxr_y_bound = nxr_y, &
354	nxl_y_l = nxl_y, nxr_y_l = nxr_y )
355	IF ( fft_method /= 'system-specific' ) THEN
356	!$acc update device( ar )
357	ENDIF
358	CALL cpu_log( log_point_s(7), 'fft_y', 'stop' )
359
360	!
361	!-- Transposition y --> x
362	CALL cpu_log( log_point_s(8), 'transpo invers', 'continue' )
363	CALL transpose_yx( ar, ar_inv )
364	CALL resort_for_yx( ar_inv, ar )
365	CALL cpu_log( log_point_s(8), 'transpo invers', 'pause' )
366
367	CALL cpu_log( log_point_s(4), 'fft_x', 'continue' )
368	IF ( fft_method /= 'system-specific' ) THEN
369	!$acc update host( ar )
370	ENDIF
371	CALL fft_x( ar, 'backward' )
372	IF ( fft_method /= 'system-specific' ) THEN
373	!$acc update device( ar )
374	ENDIF
375	CALL cpu_log( log_point_s(4), 'fft_x', 'stop' )
376
377	!
378	!-- Transposition x --> z
379	CALL cpu_log( log_point_s(8), 'transpo invers', 'continue' )
380	CALL transpose_xz( ar, ar_inv )
381	CALL resort_for_xz( ar_inv, ar )
382	CALL cpu_log( log_point_s(8), 'transpo invers', 'stop' )
383
384	ELSE
385
386	!
387	!-- 2d-domain-decomposition or no decomposition (1 PE run) with
388	!-- overlapping transposition / fft
389	ALLOCATE( f_out_y(0:ny,nxl_y:nxr_y,nzb_y:nzt_y), &
390	f_out_z(0:nx,nys_x:nyn_x,nzb_x:nzt_x) )
391	!
392	!-- Transposition z --> x + subsequent fft along x
393	ALLOCATE( f_inv(nys:nyn,nxl:nxr,1:nz) )
394	CALL resort_for_zx( ar, f_inv )
395	!
396	!-- Save original indices and gridpoint counter
397	isave(1) = nz
398	isave(2) = nzb_x
399	isave(3) = nzt_x
400	isave(4) = sendrecvcount_zx
401	!
402	!-- Set new indices for transformation
403	nblk = nz / pdims(1)
404	nz = pdims(1)
405	nnz_x = 1
406	nzb_x = 1 + myidx * nnz_x
407	nzt_x = ( myidx + 1 ) * nnz_x
408	sendrecvcount_zx = nnx * nny * nnz_x
409
410	ALLOCATE( ar1(0:nx,nys_x:nyn_x,nzb_x:nzt_x,2) )
411	ALLOCATE( f_in(nys:nyn,nxl:nxr,1:nz) )
412
413	DO kk = 1, nblk+1
414	ind_odd = MOD( kk, 2 ) + 1
415	ind_even = MOD( kk+1, 2 ) + 1
416	!$OMP sections private(ki,knew,n)
417	!$OMP section
418	IF ( kk <= nblk ) THEN
419
420	IF ( kk == 1 ) THEN
421	CALL cpu_log( log_point_s(5), 'transpo forward', 'start' )
422	ELSE
423	CALL cpu_log( log_point_s(5), 'transpo forward', 'continue' )
424	ENDIF
425
426	DO knew = 1, nz
427	ki = kk + nblk * ( knew - 1 )
428	f_in(:,:,knew) = f_inv(:,:,ki)
429	ENDDO
430
431	CALL transpose_zx( f_in, ar1(:,:,:,ind_odd))
432	CALL cpu_log( log_point_s(5), 'transpo forward', 'pause' )
433
434	ENDIF
435
436	!$OMP section
437	IF ( kk >= 2 ) THEN
438
439	IF ( kk == 2 ) THEN
440	CALL cpu_log( log_point_s(4), 'fft_x', 'start' )
441	ELSE
442	CALL cpu_log( log_point_s(4), 'fft_x', 'continue' )
443	ENDIF
444
445	n = isave(2) + kk - 2
446	CALL fft_x( ar1(:,:,:,ind_even), 'forward', ar_2d = f_out_z(:,:,n))
447	CALL cpu_log( log_point_s(4), 'fft_x', 'pause' )
448
449	ENDIF
450	!$OMP end sections
451
452	ENDDO
453	!
454	!-- Restore original indices/counters
455	nz = isave(1)
456	nzb_x = isave(2)
457	nzt_x = isave(3)
458	sendrecvcount_zx = isave(4)
459
460	DEALLOCATE( ar1, f_in, f_inv )
461
462	!
463	!-- Transposition x --> y + subsequent fft along y
464	ALLOCATE( f_inv(nys_x:nyn_x,nzb_x:nzt_x,0:nx) )
465	CALL resort_for_xy( f_out_z, f_inv )
466	!
467	!-- Save original indices and gridpoint counter
468	isave(1) = nx
469	isave(2) = nxl_y
470	isave(3) = nxr_y
471	isave(4) = sendrecvcount_xy
472	!
473	!-- Set new indices for transformation
474	nblk = ( ( nx+1 ) / pdims(2) ) - 1
475	nx = pdims(2)
476	nnx_y = 1
477	nxl_y = myidy * nnx_y
478	nxr_y = ( myidy + 1 ) * nnx_y - 1
479	sendrecvcount_xy = nnx_y * ( nyn_x-nys_x+1 ) * ( nzt_x-nzb_x+1 )
480
481	ALLOCATE( ar1(0:ny,nxl_y:nxr_y,nzb_y:nzt_y,2) )
482	ALLOCATE( f_in(nys_x:nyn_x,nzb_x:nzt_x,0:nx) )
483
484	DO ii = 0, nblk+1
485	ind_odd = MOD( ii+1, 2 ) + 1
486	ind_even = MOD( ii+2, 2 ) + 1
487	!$OMP sections private(ki,knew,n)
488	!$OMP section
489	IF ( ii <= nblk ) THEN
490
491	CALL cpu_log( log_point_s(5), 'transpo forward', 'continue' )
492
493	DO inew = 0, nx-1
494	iind = ii + ( nblk + 1 ) * inew
495	f_in(:,:,inew) = f_inv(:,:,iind)
496	ENDDO
497
498	CALL transpose_xy( f_in, ar1(:,:,:,ind_odd) )
499
500	CALL cpu_log( log_point_s(5), 'transpo forward', 'pause' )
501
502	ENDIF
503
504	!$OMP section
505	IF ( ii >= 1 ) THEN
506
507	IF ( ii == 1 ) THEN
508	CALL cpu_log( log_point_s(7), 'fft_y', 'start' )
509	ELSE
510	CALL cpu_log( log_point_s(7), 'fft_y', 'continue' )
511	ENDIF
512
513	nxl_y_bound = isave(2)
514	nxr_y_bound = isave(3)
515	n = isave(2) + ii - 1
516	! CALL fft_y( ar1(:,:,:,ind_even), 'forward', ar_3d = f_out_y, &
517	! ni = n )
518	CALL fft_y( ar1(:,:,:,ind_even), 'forward', ar_tr = f_out_y, &
519	nxl_y_bound = nxl_y_bound, nxr_y_bound = nxr_y_bound, &
520	nxl_y_l = n, nxr_y_l = n )
521
522	CALL cpu_log( log_point_s(7), 'fft_y', 'pause' )
523
524	ENDIF
525	!$OMP end sections
526
527	ENDDO
528	!
529	!-- Restore original indices/counters
530	nx = isave(1)
531	nxl_y = isave(2)
532	nxr_y = isave(3)
533	sendrecvcount_xy = isave(4)
534
535	DEALLOCATE( ar1, f_in, f_inv )
536
537	!
538	!-- Transposition y --> z + subsequent tridia + resort for z --> y
539	ALLOCATE( f_inv(nxl_y:nxr_y,nzb_y:nzt_y,0:ny) )
540	CALL resort_for_yz( f_out_y, f_inv )
541	!
542	!-- Save original indices and gridpoint counter
543	isave(1) = ny
544	isave(2) = nys_z
545	isave(3) = nyn_z
546	isave(4) = sendrecvcount_yz
547	!
548	!-- Set new indices for transformation
549	nblk = ( ( ny+1 ) / pdims(1) ) - 1
550	ny = pdims(1)
551	nny_z = 1
552	nys_z = myidx * nny_z
553	nyn_z = ( myidx + 1 ) * nny_z - 1
554	sendrecvcount_yz = ( nxr_y-nxl_y+1 ) * nny_z * ( nzt_y-nzb_y+1 )
555
556	ALLOCATE( ar1(nxl_z:nxr_z,nys_z:nyn_z,1:nz,3) )
557	ALLOCATE( f_in(nxl_y:nxr_y,nzb_y:nzt_y,0:ny) )
558
559	DO jj = 0, nblk+2
560	ind_odd = MOD( jj+3, 3 ) + 1
561	ind_even = MOD( jj+2, 3 ) + 1
562	ind_third = MOD( jj+1, 3 ) + 1
563	!$OMP sections private(ki,knew,n)
564	!$OMP section
565	IF ( jj <= nblk ) THEN
566	!
567	!-- Forward Fourier Transformation
568	!-- Transposition y --> z
569	CALL cpu_log( log_point_s(5), 'transpo forward', 'continue' )
570
571	DO jnew = 0, ny-1
572	jind = jj + ( nblk + 1 ) * jnew
573	f_in(:,:,jnew) =f_inv(:,:,jind)
574	ENDDO
575
576	CALL transpose_yz( f_in, ar1(:,:,:,ind_odd) )
577
578	IF ( jj == nblk ) THEN
579	CALL cpu_log( log_point_s(5), 'transpo forward', 'stop' )
580	ELSE
581	CALL cpu_log( log_point_s(5), 'transpo forward', 'pause' )
582	ENDIF
583
584	ENDIF
585
586	IF ( jj >= 2 ) THEN
587	!
588	!-- Inverse Fourier Transformation
589	!-- Transposition z --> y
590	!-- Only one thread should call MPI routines, therefore forward and
591	!-- backward tranpose are in the same section
592	IF ( jj == 2 ) THEN
593	CALL cpu_log( log_point_s(8), 'transpo invers', 'start' )
594	ELSE
595	CALL cpu_log( log_point_s(8), 'transpo invers', 'continue' )
596	ENDIF
597
598	CALL transpose_zy( ar1(:,:,:,ind_third), f_in )
599
600	DO jnew = 0, ny-1
601	jind = jj-2 + ( nblk + 1 ) * jnew
602	f_inv(:,:,jind) = f_in(:,:,jnew)
603	ENDDO
604
605	CALL cpu_log( log_point_s(8), 'transpo invers', 'pause' )
606
607	ENDIF
608
609	!$OMP section
610	IF ( jj >= 1 .AND. jj <= nblk+1 ) THEN
611	!
612	!-- Solve the tridiagonal equation system along z
613	CALL cpu_log( log_point_s(6), 'tridia', 'start' )
614
615	n = isave(2) + jj - 1
616	CALL tridia_substi_overlap( ar1(:,:,:,ind_even), n )
617
618	CALL cpu_log( log_point_s(6), 'tridia', 'stop' )
619	ENDIF
620	!$OMP end sections
621
622	ENDDO
623	!
624	!-- Restore original indices/counters
625	ny = isave(1)
626	nys_z = isave(2)
627	nyn_z = isave(3)
628	sendrecvcount_yz = isave(4)
629
630	CALL resort_for_zy( f_inv, f_out_y )
631
632	DEALLOCATE( ar1, f_in, f_inv )
633
634	!
635	!-- fft along y backward + subsequent transposition y --> x
636	ALLOCATE( f_inv(nys_x:nyn_x,nzb_x:nzt_x,0:nx) )
637	!
638	!-- Save original indices and gridpoint counter
639	isave(1) = nx
640	isave(2) = nxl_y
641	isave(3) = nxr_y
642	isave(4) = sendrecvcount_xy
643	!
644	!-- Set new indices for transformation
645	nblk = (( nx+1 ) / pdims(2) ) - 1
646	nx = pdims(2)
647	nnx_y = 1
648	nxl_y = myidy * nnx_y
649	nxr_y = ( myidy + 1 ) * nnx_y - 1
650	sendrecvcount_xy = nnx_y * ( nyn_x-nys_x+1 ) * ( nzt_x-nzb_x+1 )
651
652	ALLOCATE( ar1(0:ny,nxl_y:nxr_y,nzb_y:nzt_y,2) )
653	ALLOCATE( f_in(nys_x:nyn_x,nzb_x:nzt_x,0:nx) )
654
655	DO ii = 0, nblk+1
656	ind_odd = MOD( ii+1, 2 ) + 1
657	ind_even = MOD( ii+2, 2 ) + 1
658	!$OMP sections private(ki,knew,n)
659	!$OMP section
660	IF ( ii <= nblk ) THEN
661
662	CALL cpu_log( log_point_s(7), 'fft_y', 'continue' )
663
664	n = isave(2) + ii
665	nxl_y_bound = isave(2)
666	nxr_y_bound = isave(3)
667
668	! CALL fft_y( ar1(:,:,:,ind_even), 'backward', ar_3d = f_out_y, &
669	! ni = n )
670	CALL fft_y( ar1(:,:,:,ind_even), 'backward', ar_tr = f_out_y, &
671	nxl_y_bound = nxl_y_bound, nxr_y_bound = nxr_y_bound, &
672	nxl_y_l = n, nxr_y_l = n )
673
674	IF ( ii == nblk ) THEN
675	CALL cpu_log( log_point_s(7), 'fft_y', 'stop' )
676	ELSE
677	CALL cpu_log( log_point_s(7), 'fft_y', 'pause' )
678	ENDIF
679
680	ENDIF
681
682	!$OMP section
683	IF ( ii >= 1 ) THEN
684
685	CALL cpu_log( log_point_s(8), 'transpo invers', 'continue' )
686
687	CALL transpose_yx( ar1(:,:,:,ind_odd), f_in )
688
689	DO inew = 0, nx-1
690	iind = ii-1 + (nblk+1) * inew
691	f_inv(:,:,iind) = f_in(:,:,inew)
692	ENDDO
693
694	CALL cpu_log( log_point_s(8), 'transpo invers', 'pause' )
695
696	ENDIF
697	!$OMP end sections
698
699	ENDDO
700	!
701	!-- Restore original indices/counters
702	nx = isave(1)
703	nxl_y = isave(2)
704	nxr_y = isave(3)
705	sendrecvcount_xy = isave(4)
706
707	CALL resort_for_yx( f_inv, f_out_z )
708
709	DEALLOCATE( ar1, f_in, f_inv )
710
711	!
712	!-- fft along x backward + subsequent final transposition x --> z
713	ALLOCATE( f_inv(nys:nyn,nxl:nxr,1:nz) )
714	!
715	!-- Save original indices and gridpoint counter
716	isave(1) = nz
717	isave(2) = nzb_x
718	isave(3) = nzt_x
719	isave(4) = sendrecvcount_zx
720	!
721	!-- Set new indices for transformation
722	nblk = nz / pdims(1)
723	nz = pdims(1)
724	nnz_x = 1
725	nzb_x = 1 + myidx * nnz_x
726	nzt_x = ( myidx + 1 ) * nnz_x
727	sendrecvcount_zx = nnx * nny * nnz_x
728
729	ALLOCATE( ar1(0:nx,nys_x:nyn_x,nzb_x:nzt_x,2) )
730	ALLOCATE( f_in(nys:nyn,nxl:nxr,1:nz) )
731
732	DO kk = 1, nblk+1
733	ind_odd = MOD( kk, 2 ) + 1
734	ind_even = MOD( kk+1, 2 ) + 1
735	!$OMP sections private(ki,knew,n)
736	!$OMP section
737	IF ( kk <= nblk ) THEN
738
739	CALL cpu_log( log_point_s(4), 'fft_x', 'continue' )
740
741	n = isave(2) + kk - 1
742	CALL fft_x( ar1(:,:,:,ind_even), 'backward', f_out_z(:,:,n))
743
744	IF ( kk == nblk ) THEN
745	CALL cpu_log( log_point_s(4), 'fft_x', 'stop' )
746	ELSE
747	CALL cpu_log( log_point_s(4), 'fft_x', 'pause' )
748	ENDIF
749
750	ENDIF
751
752	!$OMP section
753	IF ( kk >= 2 ) THEN
754
755	CALL cpu_log( log_point_s(8), 'transpo invers', 'continue' )
756
757	CALL transpose_xz( ar1(:,:,:,ind_odd), f_in )
758
759	DO knew = 1, nz
760	ki = kk-1 + nblk * (knew-1)
761	f_inv(:,:,ki) = f_in(:,:,knew)
762	ENDDO
763
764	IF ( kk == nblk+1 ) THEN
765	CALL cpu_log( log_point_s(8), 'transpo invers', 'stop' )
766	ELSE
767	CALL cpu_log( log_point_s(8), 'transpo invers', 'pause' )
768	ENDIF
769
770	ENDIF
771	!$OMP end sections
772
773	ENDDO
774	!
775	!-- Restore original indices/counters
776	nz = isave(1)
777	nzb_x = isave(2)
778	nzt_x = isave(3)
779	sendrecvcount_zx = isave(4)
780
781	CALL resort_for_xz( f_inv, ar )
782
783	DEALLOCATE( ar1, f_in, f_inv )
784
785	ENDIF
786
787	CALL cpu_log( log_point_s(3), 'poisfft', 'stop' )
788
789	END SUBROUTINE poisfft
790
791
792
793	SUBROUTINE ffty_tr_yx( f_in, f_out )
794
795	!------------------------------------------------------------------------------!
796	! Fourier-transformation along y with subsequent transposition y --> x for
797	! a 1d-decomposition along x
798	!
799	! ATTENTION: The performance of this routine is much faster on the NEC-SX6,
800	! if the first index of work_ffty_vec is odd. Otherwise
801	! memory bank conflicts may occur (especially if the index is a
802	! multiple of 128). That's why work_ffty_vec is dimensioned as
803	! 0:ny+1.
804	! Of course, this will not work if users are using an odd number
805	! of gridpoints along y.
806	!------------------------------------------------------------------------------!
807
808	USE control_parameters
809	USE cpulog
810	USE indices
811	USE interfaces
812	USE pegrid
813	USE transpose_indices
814
815	IMPLICIT NONE
816
817	INTEGER :: i, iend, iouter, ir, j, k
818	INTEGER, PARAMETER :: stridex = 4
819
820	REAL, DIMENSION(0:ny,stridex) :: work_ffty
821	#if defined( __nec )
822	REAL, DIMENSION(0:ny+1,1:nz,nxl:nxr) :: work_ffty_vec
823	#endif
824	REAL, DIMENSION(1:nz,0:ny,nxl:nxr) :: f_in
825	REAL, DIMENSION(nnx,1:nz,nys_x:nyn_x,pdims(1)) :: f_out
826	REAL, DIMENSION(nxl:nxr,1:nz,0:ny) :: work
827
828	!
829	!-- Carry out the FFT along y, where all data are present due to the
830	!-- 1d-decomposition along x. Resort the data in a way that x becomes
831	!-- the first index.
832	CALL cpu_log( log_point_s(7), 'fft_y_1d', 'start' )
833
834	IF ( host(1:3) == 'nec' ) THEN
835	#if defined( __nec )
836	!
837	!-- Code optimized for vector processors
838	!$OMP PARALLEL PRIVATE ( i, j, k )
839	!$OMP DO
840	DO i = nxl, nxr
841
842	DO j = 0, ny
843	DO k = 1, nz
844	work_ffty_vec(j,k,i) = f_in(k,j,i)
845	ENDDO
846	ENDDO
847
848	CALL fft_y_m( work_ffty_vec(:,:,i), ny+1, 'forward' )
849
850	ENDDO
851
852	!$OMP DO
853	DO k = 1, nz
854	DO j = 0, ny
855	DO i = nxl, nxr
856	work(i,k,j) = work_ffty_vec(j,k,i)
857	ENDDO
858	ENDDO
859	ENDDO
860	!$OMP END PARALLEL
861	#endif
862
863	ELSE
864
865	!
866	!-- Cache optimized code.
867	!-- The i-(x-)direction is split into a strided outer loop and an inner
868	!-- loop for better cache performance
869	!$OMP PARALLEL PRIVATE (i,iend,iouter,ir,j,k,work_ffty)
870	!$OMP DO
871	DO iouter = nxl, nxr, stridex
872
873	iend = MIN( iouter+stridex-1, nxr ) ! Upper bound for inner i loop
874
875	DO k = 1, nz
876
877	DO i = iouter, iend
878
879	ir = i-iouter+1 ! counter within a stride
880	DO j = 0, ny
881	work_ffty(j,ir) = f_in(k,j,i)
882	ENDDO
883	!
884	!-- FFT along y
885	CALL fft_y_1d( work_ffty(:,ir), 'forward' )
886
887	ENDDO
888
889	!
890	!-- Resort
891	DO j = 0, ny
892	DO i = iouter, iend
893	work(i,k,j) = work_ffty(j,i-iouter+1)
894	ENDDO
895	ENDDO
896
897	ENDDO
898
899	ENDDO
900	!$OMP END PARALLEL
901
902	ENDIF
903	CALL cpu_log( log_point_s(7), 'fft_y_1d', 'pause' )
904
905	!
906	!-- Transpose array
907	#if defined( __parallel )
908	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
909	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
910	CALL MPI_ALLTOALL( work(nxl,1,0), sendrecvcount_xy, MPI_REAL, &
911	f_out(1,1,nys_x,1), sendrecvcount_xy, MPI_REAL, &
912	comm1dx, ierr )
913	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
914	#endif
915
916	END SUBROUTINE ffty_tr_yx
917
918
919	SUBROUTINE tr_xy_ffty( f_in, f_out )
920
921	!------------------------------------------------------------------------------!
922	! Transposition x --> y with a subsequent backward Fourier transformation for
923	! a 1d-decomposition along x
924	!------------------------------------------------------------------------------!
925
926	USE control_parameters
927	USE cpulog
928	USE indices
929	USE interfaces
930	USE pegrid
931	USE transpose_indices
932
933	IMPLICIT NONE
934
935	INTEGER :: i, iend, iouter, ir, j, k
936	INTEGER, PARAMETER :: stridex = 4
937
938	REAL, DIMENSION(0:ny,stridex) :: work_ffty
939	#if defined( __nec )
940	REAL, DIMENSION(0:ny+1,1:nz,nxl:nxr) :: work_ffty_vec
941	#endif
942	REAL, DIMENSION(nnx,1:nz,nys_x:nyn_x,pdims(1)) :: f_in
943	REAL, DIMENSION(1:nz,0:ny,nxl:nxr) :: f_out
944	REAL, DIMENSION(nxl:nxr,1:nz,0:ny) :: work
945
946	!
947	!-- Transpose array
948	#if defined( __parallel )
949	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
950	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
951	CALL MPI_ALLTOALL( f_in(1,1,nys_x,1), sendrecvcount_xy, MPI_REAL, &
952	work(nxl,1,0), sendrecvcount_xy, MPI_REAL, &
953	comm1dx, ierr )
954	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
955	#endif
956
957	!
958	!-- Resort the data in a way that y becomes the first index and carry out the
959	!-- backward fft along y.
960	CALL cpu_log( log_point_s(7), 'fft_y_1d', 'continue' )
961
962	IF ( host(1:3) == 'nec' ) THEN
963	#if defined( __nec )
964	!
965	!-- Code optimized for vector processors
966	!$OMP PARALLEL PRIVATE ( i, j, k )
967	!$OMP DO
968	DO k = 1, nz
969	DO j = 0, ny
970	DO i = nxl, nxr
971	work_ffty_vec(j,k,i) = work(i,k,j)
972	ENDDO
973	ENDDO
974	ENDDO
975
976	!$OMP DO
977	DO i = nxl, nxr
978
979	CALL fft_y_m( work_ffty_vec(:,:,i), ny+1, 'backward' )
980
981	DO j = 0, ny
982	DO k = 1, nz
983	f_out(k,j,i) = work_ffty_vec(j,k,i)
984	ENDDO
985	ENDDO
986
987	ENDDO
988	!$OMP END PARALLEL
989	#endif
990
991	ELSE
992
993	!
994	!-- Cache optimized code.
995	!-- The i-(x-)direction is split into a strided outer loop and an inner
996	!-- loop for better cache performance
997	!$OMP PARALLEL PRIVATE ( i, iend, iouter, ir, j, k, work_ffty )
998	!$OMP DO
999	DO iouter = nxl, nxr, stridex
1000
1001	iend = MIN( iouter+stridex-1, nxr ) ! Upper bound for inner i loop
1002
1003	DO k = 1, nz
1004	!
1005	!-- Resort
1006	DO j = 0, ny
1007	DO i = iouter, iend
1008	work_ffty(j,i-iouter+1) = work(i,k,j)
1009	ENDDO
1010	ENDDO
1011
1012	DO i = iouter, iend
1013
1014	!
1015	!-- FFT along y
1016	ir = i-iouter+1 ! counter within a stride
1017	CALL fft_y_1d( work_ffty(:,ir), 'backward' )
1018
1019	DO j = 0, ny
1020	f_out(k,j,i) = work_ffty(j,ir)
1021	ENDDO
1022	ENDDO
1023
1024	ENDDO
1025
1026	ENDDO
1027	!$OMP END PARALLEL
1028
1029	ENDIF
1030
1031	CALL cpu_log( log_point_s(7), 'fft_y_1d', 'stop' )
1032
1033	END SUBROUTINE tr_xy_ffty
1034
1035
1036	SUBROUTINE fftx_tri_fftx( ar )
1037
1038	!------------------------------------------------------------------------------!
1039	! FFT along x, solution of the tridiagonal system and backward FFT for
1040	! a 1d-decomposition along x
1041	!
1042	! WARNING: this subroutine may still not work for hybrid parallelization
1043	! with OpenMP (for possible necessary changes see the original
1044	! routine poisfft_hybrid, developed by Klaus Ketelsen, May 2002)
1045	!------------------------------------------------------------------------------!
1046
1047	USE control_parameters
1048	USE cpulog
1049	USE grid_variables
1050	USE indices
1051	USE interfaces
1052	USE pegrid
1053	USE transpose_indices
1054
1055	IMPLICIT NONE
1056
1057	INTEGER :: i, j, k, m, n, omp_get_thread_num, tn
1058
1059	REAL, DIMENSION(0:nx) :: work_fftx
1060	REAL, DIMENSION(0:nx,1:nz) :: work_trix
1061	REAL, DIMENSION(nnx,1:nz,nys_x:nyn_x,pdims(1)) :: ar
1062	REAL, DIMENSION(:,:,:,:), ALLOCATABLE :: tri
1063
1064
1065	CALL cpu_log( log_point_s(33), 'fft_x_1d + tridia', 'start' )
1066
1067	ALLOCATE( tri(5,0:nx,0:nz-1,0:threads_per_task-1) )
1068
1069	tn = 0 ! Default thread number in case of one thread
1070	!$OMP PARALLEL DO PRIVATE ( i, j, k, m, n, tn, work_fftx, work_trix )
1071	DO j = nys_x, nyn_x
1072
1073	!$ tn = omp_get_thread_num()
1074
1075	IF ( host(1:3) == 'nec' ) THEN
1076	!
1077	!-- Code optimized for vector processors
1078	DO k = 1, nz
1079
1080	m = 0
1081	DO n = 1, pdims(1)
1082	DO i = 1, nnx
1083	work_trix(m,k) = ar(i,k,j,n)
1084	m = m + 1
1085	ENDDO
1086	ENDDO
1087
1088	ENDDO
1089
1090	CALL fft_x_m( work_trix, 'forward' )
1091
1092	ELSE
1093	!
1094	!-- Cache optimized code
1095	DO k = 1, nz
1096
1097	m = 0
1098	DO n = 1, pdims(1)
1099	DO i = 1, nnx
1100	work_fftx(m) = ar(i,k,j,n)
1101	m = m + 1
1102	ENDDO
1103	ENDDO
1104
1105	CALL fft_x_1d( work_fftx, 'forward' )
1106
1107	DO i = 0, nx
1108	work_trix(i,k) = work_fftx(i)
1109	ENDDO
1110
1111	ENDDO
1112
1113	ENDIF
1114
1115	!
1116	!-- Solve the linear equation system
1117	CALL tridia_1dd( ddx2, ddy2, nx, ny, j, work_trix, tri(:,:,:,tn) )
1118
1119	IF ( host(1:3) == 'nec' ) THEN
1120	!
1121	!-- Code optimized for vector processors
1122	CALL fft_x_m( work_trix, 'backward' )
1123
1124	DO k = 1, nz
1125
1126	m = 0
1127	DO n = 1, pdims(1)
1128	DO i = 1, nnx
1129	ar(i,k,j,n) = work_trix(m,k)
1130	m = m + 1
1131	ENDDO
1132	ENDDO
1133
1134	ENDDO
1135
1136	ELSE
1137	!
1138	!-- Cache optimized code
1139	DO k = 1, nz
1140
1141	DO i = 0, nx
1142	work_fftx(i) = work_trix(i,k)
1143	ENDDO
1144
1145	CALL fft_x_1d( work_fftx, 'backward' )
1146
1147	m = 0
1148	DO n = 1, pdims(1)
1149	DO i = 1, nnx
1150	ar(i,k,j,n) = work_fftx(m)
1151	m = m + 1
1152	ENDDO
1153	ENDDO
1154
1155	ENDDO
1156
1157	ENDIF
1158
1159	ENDDO
1160
1161	DEALLOCATE( tri )
1162
1163	CALL cpu_log( log_point_s(33), 'fft_x_1d + tridia', 'stop' )
1164
1165	END SUBROUTINE fftx_tri_fftx
1166
1167
1168	SUBROUTINE fftx_tr_xy( f_in, f_out )
1169
1170	!------------------------------------------------------------------------------!
1171	! Fourier-transformation along x with subsequent transposition x --> y for
1172	! a 1d-decomposition along y
1173	!
1174	! ATTENTION: The NEC-branch of this routine may significantly profit from
1175	! further optimizations. So far, performance is much worse than
1176	! for routine ffty_tr_yx (more than three times slower).
1177	!------------------------------------------------------------------------------!
1178
1179	USE control_parameters
1180	USE cpulog
1181	USE indices
1182	USE interfaces
1183	USE pegrid
1184	USE transpose_indices
1185
1186	IMPLICIT NONE
1187
1188	INTEGER :: i, j, k
1189
1190	REAL, DIMENSION(0:nx,1:nz,nys:nyn) :: work_fftx
1191	REAL, DIMENSION(1:nz,nys:nyn,0:nx) :: f_in
1192	REAL, DIMENSION(nny,1:nz,nxl_y:nxr_y,pdims(2)) :: f_out
1193	REAL, DIMENSION(nys:nyn,1:nz,0:nx) :: work
1194
1195	!
1196	!-- Carry out the FFT along x, where all data are present due to the
1197	!-- 1d-decomposition along y. Resort the data in a way that y becomes
1198	!-- the first index.
1199	CALL cpu_log( log_point_s(4), 'fft_x_1d', 'start' )
1200
1201	IF ( host(1:3) == 'nec' ) THEN
1202	!
1203	!-- Code for vector processors
1204	!$OMP PARALLEL PRIVATE ( i, j, k )
1205	!$OMP DO
1206	DO i = 0, nx
1207
1208	DO j = nys, nyn
1209	DO k = 1, nz
1210	work_fftx(i,k,j) = f_in(k,j,i)
1211	ENDDO
1212	ENDDO
1213
1214	ENDDO
1215
1216	!$OMP DO
1217	DO j = nys, nyn
1218
1219	CALL fft_x_m( work_fftx(:,:,j), 'forward' )
1220
1221	DO k = 1, nz
1222	DO i = 0, nx
1223	work(j,k,i) = work_fftx(i,k,j)
1224	ENDDO
1225	ENDDO
1226
1227	ENDDO
1228	!$OMP END PARALLEL
1229
1230	ELSE
1231
1232	!
1233	!-- Cache optimized code (there might be still a potential for better
1234	!-- optimization).
1235	!$OMP PARALLEL PRIVATE (i,j,k)
1236	!$OMP DO
1237	DO i = 0, nx
1238
1239	DO j = nys, nyn
1240	DO k = 1, nz
1241	work_fftx(i,k,j) = f_in(k,j,i)
1242	ENDDO
1243	ENDDO
1244
1245	ENDDO
1246
1247	!$OMP DO
1248	DO j = nys, nyn
1249	DO k = 1, nz
1250
1251	CALL fft_x_1d( work_fftx(0:nx,k,j), 'forward' )
1252
1253	DO i = 0, nx
1254	work(j,k,i) = work_fftx(i,k,j)
1255	ENDDO
1256	ENDDO
1257
1258	ENDDO
1259	!$OMP END PARALLEL
1260
1261	ENDIF
1262	CALL cpu_log( log_point_s(4), 'fft_x_1d', 'pause' )
1263
1264	!
1265	!-- Transpose array
1266	#if defined( __parallel )
1267	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
1268	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
1269	CALL MPI_ALLTOALL( work(nys,1,0), sendrecvcount_xy, MPI_REAL, &
1270	f_out(1,1,nxl_y,1), sendrecvcount_xy, MPI_REAL, &
1271	comm1dy, ierr )
1272	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
1273	#endif
1274
1275	END SUBROUTINE fftx_tr_xy
1276
1277
1278	SUBROUTINE tr_yx_fftx( f_in, f_out )
1279
1280	!------------------------------------------------------------------------------!
1281	! Transposition y --> x with a subsequent backward Fourier transformation for
1282	! a 1d-decomposition along x
1283	!------------------------------------------------------------------------------!
1284
1285	USE control_parameters
1286	USE cpulog
1287	USE indices
1288	USE interfaces
1289	USE pegrid
1290	USE transpose_indices
1291
1292	IMPLICIT NONE
1293
1294	INTEGER :: i, j, k
1295
1296	REAL, DIMENSION(0:nx,1:nz,nys:nyn) :: work_fftx
1297	REAL, DIMENSION(nny,1:nz,nxl_y:nxr_y,pdims(2)) :: f_in
1298	REAL, DIMENSION(1:nz,nys:nyn,0:nx) :: f_out
1299	REAL, DIMENSION(nys:nyn,1:nz,0:nx) :: work
1300
1301	!
1302	!-- Transpose array
1303	#if defined( __parallel )
1304	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
1305	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
1306	CALL MPI_ALLTOALL( f_in(1,1,nxl_y,1), sendrecvcount_xy, MPI_REAL, &
1307	work(nys,1,0), sendrecvcount_xy, MPI_REAL, &
1308	comm1dy, ierr )
1309	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
1310	#endif
1311
1312	!
1313	!-- Carry out the FFT along x, where all data are present due to the
1314	!-- 1d-decomposition along y. Resort the data in a way that y becomes
1315	!-- the first index.
1316	CALL cpu_log( log_point_s(4), 'fft_x_1d', 'continue' )
1317
1318	IF ( host(1:3) == 'nec' ) THEN
1319	!
1320	!-- Code optimized for vector processors
1321	!$OMP PARALLEL PRIVATE ( i, j, k )
1322	!$OMP DO
1323	DO j = nys, nyn
1324
1325	DO k = 1, nz
1326	DO i = 0, nx
1327	work_fftx(i,k,j) = work(j,k,i)
1328	ENDDO
1329	ENDDO
1330
1331	CALL fft_x_m( work_fftx(:,:,j), 'backward' )
1332
1333	ENDDO
1334
1335	!$OMP DO
1336	DO i = 0, nx
1337	DO j = nys, nyn
1338	DO k = 1, nz
1339	f_out(k,j,i) = work_fftx(i,k,j)
1340	ENDDO
1341	ENDDO
1342	ENDDO
1343	!$OMP END PARALLEL
1344
1345	ELSE
1346
1347	!
1348	!-- Cache optimized code (there might be still a potential for better
1349	!-- optimization).
1350	!$OMP PARALLEL PRIVATE (i,j,k)
1351	!$OMP DO
1352	DO j = nys, nyn
1353	DO k = 1, nz
1354
1355	DO i = 0, nx
1356	work_fftx(i,k,j) = work(j,k,i)
1357	ENDDO
1358
1359	CALL fft_x_1d( work_fftx(0:nx,k,j), 'backward' )
1360
1361	ENDDO
1362	ENDDO
1363
1364	!$OMP DO
1365	DO i = 0, nx
1366	DO j = nys, nyn
1367	DO k = 1, nz
1368	f_out(k,j,i) = work_fftx(i,k,j)
1369	ENDDO
1370	ENDDO
1371	ENDDO
1372	!$OMP END PARALLEL
1373
1374	ENDIF
1375	CALL cpu_log( log_point_s(4), 'fft_x_1d', 'stop' )
1376
1377	END SUBROUTINE tr_yx_fftx
1378
1379
1380	SUBROUTINE ffty_tri_ffty( ar )
1381
1382	!------------------------------------------------------------------------------!
1383	! FFT along y, solution of the tridiagonal system and backward FFT for
1384	! a 1d-decomposition along y
1385	!
1386	! WARNING: this subroutine may still not work for hybrid parallelization
1387	! with OpenMP (for possible necessary changes see the original
1388	! routine poisfft_hybrid, developed by Klaus Ketelsen, May 2002)
1389	!------------------------------------------------------------------------------!
1390
1391	USE control_parameters
1392	USE cpulog
1393	USE grid_variables
1394	USE indices
1395	USE interfaces
1396	USE pegrid
1397	USE transpose_indices
1398
1399	IMPLICIT NONE
1400
1401	INTEGER :: i, j, k, m, n, omp_get_thread_num, tn
1402
1403	REAL, DIMENSION(0:ny) :: work_ffty
1404	REAL, DIMENSION(0:ny,1:nz) :: work_triy
1405	REAL, DIMENSION(nny,1:nz,nxl_y:nxr_y,pdims(2)) :: ar
1406	REAL, DIMENSION(:,:,:,:), ALLOCATABLE :: tri
1407
1408
1409	CALL cpu_log( log_point_s(39), 'fft_y_1d + tridia', 'start' )
1410
1411	ALLOCATE( tri(5,0:ny,0:nz-1,0:threads_per_task-1) )
1412
1413	tn = 0 ! Default thread number in case of one thread
1414	!$OMP PARALLEL DO PRIVATE ( i, j, k, m, n, tn, work_ffty, work_triy )
1415	DO i = nxl_y, nxr_y
1416
1417	!$ tn = omp_get_thread_num()
1418
1419	IF ( host(1:3) == 'nec' ) THEN
1420	!
1421	!-- Code optimized for vector processors
1422	DO k = 1, nz
1423
1424	m = 0
1425	DO n = 1, pdims(2)
1426	DO j = 1, nny
1427	work_triy(m,k) = ar(j,k,i,n)
1428	m = m + 1
1429	ENDDO
1430	ENDDO
1431
1432	ENDDO
1433
1434	CALL fft_y_m( work_triy, ny, 'forward' )
1435
1436	ELSE
1437	!
1438	!-- Cache optimized code
1439	DO k = 1, nz
1440
1441	m = 0
1442	DO n = 1, pdims(2)
1443	DO j = 1, nny
1444	work_ffty(m) = ar(j,k,i,n)
1445	m = m + 1
1446	ENDDO
1447	ENDDO
1448
1449	CALL fft_y_1d( work_ffty, 'forward' )
1450
1451	DO j = 0, ny
1452	work_triy(j,k) = work_ffty(j)
1453	ENDDO
1454
1455	ENDDO
1456
1457	ENDIF
1458
1459	!
1460	!-- Solve the linear equation system
1461	CALL tridia_1dd( ddy2, ddx2, ny, nx, i, work_triy, tri(:,:,:,tn) )
1462
1463	IF ( host(1:3) == 'nec' ) THEN
1464	!
1465	!-- Code optimized for vector processors
1466	CALL fft_y_m( work_triy, ny, 'backward' )
1467
1468	DO k = 1, nz
1469
1470	m = 0
1471	DO n = 1, pdims(2)
1472	DO j = 1, nny
1473	ar(j,k,i,n) = work_triy(m,k)
1474	m = m + 1
1475	ENDDO
1476	ENDDO
1477
1478	ENDDO
1479
1480	ELSE
1481	!
1482	!-- Cache optimized code
1483	DO k = 1, nz
1484
1485	DO j = 0, ny
1486	work_ffty(j) = work_triy(j,k)
1487	ENDDO
1488
1489	CALL fft_y_1d( work_ffty, 'backward' )
1490
1491	m = 0
1492	DO n = 1, pdims(2)
1493	DO j = 1, nny
1494	ar(j,k,i,n) = work_ffty(m)
1495	m = m + 1
1496	ENDDO
1497	ENDDO
1498
1499	ENDDO
1500
1501	ENDIF
1502
1503	ENDDO
1504
1505	DEALLOCATE( tri )
1506
1507	CALL cpu_log( log_point_s(39), 'fft_y_1d + tridia', 'stop' )
1508
1509	END SUBROUTINE ffty_tri_ffty
1510
1511	#endif
1512
1513	END MODULE poisfft_mod

Note: See TracBrowser for help on using the repository browser.

Download in other formats:

| Impressum | ©Leibniz Universität Hannover |