Home

Context Navigation

source: palm/trunk/SOURCE/poisfft.f90 @ 1318

Last change on this file since 1318 was 1318, checked in by raasch, 10 years ago
former files/routines cpu_log and cpu_statistics combined to one module, which also includes the former data module cpulog from the modules-file, module interfaces removed
Property svn:keywords set to `Id`
File size: 42.3 KB

Line
1	MODULE poisfft_mod
2
3	!--------------------------------------------------------------------------------!
4	! This file is part of PALM.
5	!
6	! PALM is free software: you can redistribute it and/or modify it under the terms
7	! of the GNU General Public License as published by the Free Software Foundation,
8	! either version 3 of the License, or (at your option) any later version.
9	!
10	! PALM is distributed in the hope that it will be useful, but WITHOUT ANY
11	! WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
12	! A PARTICULAR PURPOSE. See the GNU General Public License for more details.
13	!
14	! You should have received a copy of the GNU General Public License along with
15	! PALM. If not, see <http://www.gnu.org/licenses/>.
16	!
17	! Copyright 1997-2014 Leibniz Universitaet Hannover
18	!--------------------------------------------------------------------------------!
19	!
20	! Current revisions:
21	! -----------------
22	! module interfaces removed
23	!
24	! Former revisions:
25	! -----------------
26	! $Id: poisfft.f90 1318 2014-03-17 13:35:16Z raasch $
27	!
28	! 1306 2014-03-13 14:30:59Z raasch
29	! openmp sections removed from the overlap branch,
30	! second argument removed from parameter list
31	!
32	! 1216 2013-08-26 09:31:42Z raasch
33	! resorting of arrays moved to separate routines resort_for_...,
34	! one argument, used as temporary work array, removed from all transpose
35	! routines
36	! overlapping fft / transposition implemented
37	!
38	! 1212 2013-08-15 08:46:27Z raasch
39	! tridia routines moved to seperate module tridia_solver
40	!
41	! 1208 2013-08-13 06:41:49Z raasch
42	! acc-update clauses added for "ar" so that ffts other than cufft can also be
43	! used (although they are not ported and will give a poor performance)
44	!
45	! 1111 2013-03-08 23:54:10Z raasch
46	! further openACC porting of non-parallel (MPI) branch:
47	! tridiagonal routines split into extermal subroutines (instead using CONTAINS),
48	! no distinction between parallel/non-parallel in poisfft and tridia any more,
49	! tridia routines moved to end of file because of probable bug in PGI compiler 12.5
50	! (otherwise "invalid device function" is indicated during runtime),
51	! optimization of tridia routines: constant elements and coefficients of tri are
52	! stored in seperate arrays ddzuw and tric, last dimension of tri reduced from 5
53	! to 2,
54	! poisfft_init is now called internally from poisfft, maketri is called from
55	! poisfft_init,
56	! ibc_p_b = 2 removed
57	!
58	! 1106 2013-03-04 05:31:38Z raasch
59	! routines fftx, ffty, fftxp, fftyp removed, calls replaced by fft_x, fft_y,
60	! in the 1D-decomposition routines fft_x, ffty are replaced by fft_x_1d,
61	! fft_y_1d
62	!
63	! 1103 2013-02-20 02:15:53Z raasch
64	! tri, ar, and ar1 arguments in tridia-routines (2d) are removed because they
65	! sometimes cause segmentation faults with intel 12.1 compiler
66	!
67	! 1092 2013-02-02 11:24:22Z raasch
68	! unused variables removed
69	!
70	! 1036 2012-10-22 13:43:42Z raasch
71	! code put under GPL (PALM 3.9)
72	!
73	! 2012-09-21 07:03:55Z raasch
74	! FLOAT type conversion replaced by REAL
75	!
76	! 1003 2012-09-14 14:35:53Z raasch
77	! indices nxa, nya, etc. replaced by nx, ny, etc.
78	!
79	! 940 2012-07-09 14:31:00Z raasch
80	! special handling of tri-array as an argument in tridia_1dd routines switched
81	! off because it caused segmentation faults with intel 12.1 compiler
82	!
83	! 877 2012-04-03 11:21:44Z suehring
84	! Bugfix: Avoid divisions by zero in case of using a 'neumann' bc for the
85	! pressure at the top of the model domain.
86	!
87	! 809 2012-01-30 13:32:58Z maronga
88	! Bugfix: replaced .AND. and .NOT. with && and ! in the preprocessor directives
89	!
90	! 807 2012-01-25 11:53:51Z maronga
91	! New cpp directive "__check" implemented which is used by check_namelist_files
92	! (most of the code is unneeded by check_namelist_files).
93	!
94	! 763 2011-10-06 09:32:09Z suehring
95	! Comment added concerning the last change.
96	!
97	! 761 2011-10-05 17:58:52Z suehring
98	! Bugfix: Avoid divisions by zero in case of using a 'neumann' bc for the
99	! pressure at the top of the model domain.
100	!
101	! 696 2011-03-18 07:03:49Z raasch
102	! work_fftx removed from PRIVATE clauses in fftx_tr_xy and tr_yx_fftx
103	!
104	! 683 2011-02-09 14:25:15Z raasch
105	! openMP parallelization for 2d-domain-decomposition
106	!
107	! 667 2010-12-23 12:06:00Z suehring/gryschka
108	! ddzu replaced by ddzu_pres due to changes in zu(0)
109	!
110	! 622 2010-12-10 08:08:13Z raasch
111	! optional barriers included in order to speed up collective operations
112	!
113	! 377 2009-09-04 11:09:00Z raasch
114	! __lcmuk changed to __lc to avoid problems with Intel compiler on sgi-ice
115	!
116	! 164 2008-05-15 08:46:15Z raasch
117	! Arguments removed from transpose routines
118	!
119	! 128 2007-10-26 13:11:14Z raasch
120	! Bugfix: wavenumber calculation for even nx in routines maketri
121	!
122	! 85 2007-05-11 09:35:14Z raasch
123	! Bugfix: work_fft*_vec removed from some PRIVATE-declarations
124	!
125	! 76 2007-03-29 00:58:32Z raasch
126	! Tridiagonal coefficients adjusted for Neumann boundary conditions both at
127	! the bottom and the top.
128	!
129	! RCS Log replace by Id keyword, revision history cleaned up
130	!
131	! Revision 1.24 2006/08/04 15:00:24 raasch
132	! Default setting of the thread number tn in case of not using OpenMP
133	!
134	! Revision 1.23 2006/02/23 12:48:38 raasch
135	! Additional compiler directive in routine tridia_1dd for preventing loop
136	! exchange on NEC-SX6
137	!
138	! Revision 1.20 2004/04/30 12:38:09 raasch
139	! Parts of former poisfft_hybrid moved to this subroutine,
140	! former subroutine changed to a module, renaming of FFT-subroutines and
141	! -module, FFTs completely substituted by calls of fft_x and fft_y,
142	! NAG fft used in the non-parallel case completely removed, l in maketri
143	! is now a 1d-array, variables passed by modules instead of using parameter
144	! lists, enlarged transposition arrays introduced
145	!
146	! Revision 1.1 1997/07/24 11:24:14 raasch
147	! Initial revision
148	!
149	!
150	! Description:
151	! ------------
152	! Original version by Stephan Siano (pois3d), as of July 23, 1996
153	! Adapted for 2D-domain-decomposition by Siegfried Raasch, July 3, 1997
154	!
155	! Solves the Poisson equation with a 2D spectral method
156	! d^2 p / dx^2 + d^2 p / dy^2 + d^2 p / dz^2 = s
157	!
158	! Input:
159	! real ar contains (nnz,nny,nnx) elements of the velocity divergence,
160	! starting from (1,nys,nxl)
161	!
162	! Output:
163	! real ar contains the solution for perturbation pressure p
164	!------------------------------------------------------------------------------!
165
166	USE fft_xy
167	USE indices
168	USE transpose_indices
169	USE tridia_solver
170
171	IMPLICIT NONE
172
173	LOGICAL, SAVE :: poisfft_initialized = .FALSE.
174
175	PRIVATE
176
177	#if ! defined ( __check )
178	PUBLIC poisfft, poisfft_init
179
180	INTERFACE poisfft
181	MODULE PROCEDURE poisfft
182	END INTERFACE poisfft
183
184	INTERFACE poisfft_init
185	MODULE PROCEDURE poisfft_init
186	END INTERFACE poisfft_init
187	#else
188	PUBLIC poisfft_init
189
190	INTERFACE poisfft_init
191	MODULE PROCEDURE poisfft_init
192	END INTERFACE poisfft_init
193	#endif
194
195	CONTAINS
196
197	SUBROUTINE poisfft_init
198
199	USE arrays_3d, ONLY: ddzu_pres, ddzw
200
201	IMPLICIT NONE
202
203	INTEGER :: k
204
205
206	CALL fft_init
207
208	CALL tridia_init
209
210	poisfft_initialized = .TRUE.
211
212	END SUBROUTINE poisfft_init
213
214
215	#if ! defined ( __check )
216	SUBROUTINE poisfft( ar )
217
218	USE control_parameters, ONLY : fft_method, transpose_compute_overlap
219	USE cpulog
220	USE pegrid
221
222	IMPLICIT NONE
223
224	INTEGER :: ii, iind, inew, jj, jind, jnew, ki, kk, knew, n, nblk, &
225	nnx_y, nny_z, nnz_t, nnz_x, nxl_y_bound, nxr_y_bound
226	INTEGER, DIMENSION(4) :: isave
227
228	REAL, DIMENSION(1:nz,nys:nyn,nxl:nxr) :: ar
229	!$acc declare create( ar_inv )
230	REAL, DIMENSION(nys:nyn,nxl:nxr,1:nz) :: ar_inv
231
232	REAL, DIMENSION(:,:,:), ALLOCATABLE :: ar1, f_in, f_inv, f_out_y, &
233	f_out_z
234
235
236	CALL cpu_log( log_point_s(3), 'poisfft', 'start' )
237
238	IF ( .NOT. poisfft_initialized ) CALL poisfft_init
239
240	!
241	!-- Two-dimensional Fourier Transformation in x- and y-direction.
242	IF ( pdims(2) == 1 .AND. pdims(1) > 1 ) THEN
243
244	!
245	!-- 1d-domain-decomposition along x:
246	!-- FFT along y and transposition y --> x
247	CALL ffty_tr_yx( ar, ar )
248
249	!
250	!-- FFT along x, solving the tridiagonal system and backward FFT
251	CALL fftx_tri_fftx( ar )
252
253	!
254	!-- Transposition x --> y and backward FFT along y
255	CALL tr_xy_ffty( ar, ar )
256
257	ELSEIF ( pdims(1) == 1 .AND. pdims(2) > 1 ) THEN
258
259	!
260	!-- 1d-domain-decomposition along y:
261	!-- FFT along x and transposition x --> y
262	CALL fftx_tr_xy( ar, ar )
263
264	!
265	!-- FFT along y, solving the tridiagonal system and backward FFT
266	CALL ffty_tri_ffty( ar )
267
268	!
269	!-- Transposition y --> x and backward FFT along x
270	CALL tr_yx_fftx( ar, ar )
271
272	ELSEIF ( .NOT. transpose_compute_overlap ) THEN
273
274	!
275	!-- 2d-domain-decomposition or no decomposition (1 PE run)
276	!-- Transposition z --> x
277	CALL cpu_log( log_point_s(5), 'transpo forward', 'start' )
278	CALL resort_for_zx( ar, ar_inv )
279	CALL transpose_zx( ar_inv, ar )
280	CALL cpu_log( log_point_s(5), 'transpo forward', 'pause' )
281
282	CALL cpu_log( log_point_s(4), 'fft_x', 'start' )
283	IF ( fft_method /= 'system-specific' ) THEN
284	!$acc update host( ar )
285	ENDIF
286	CALL fft_x( ar, 'forward' )
287	IF ( fft_method /= 'system-specific' ) THEN
288	!$acc update device( ar )
289	ENDIF
290	CALL cpu_log( log_point_s(4), 'fft_x', 'pause' )
291
292	!
293	!-- Transposition x --> y
294	CALL cpu_log( log_point_s(5), 'transpo forward', 'continue' )
295	CALL resort_for_xy( ar, ar_inv )
296	CALL transpose_xy( ar_inv, ar )
297	CALL cpu_log( log_point_s(5), 'transpo forward', 'pause' )
298
299	CALL cpu_log( log_point_s(7), 'fft_y', 'start' )
300	IF ( fft_method /= 'system-specific' ) THEN
301	!$acc update host( ar )
302	ENDIF
303	CALL fft_y( ar, 'forward', ar_tr = ar, &
304	nxl_y_bound = nxl_y, nxr_y_bound = nxr_y, &
305	nxl_y_l = nxl_y, nxr_y_l = nxr_y )
306	IF ( fft_method /= 'system-specific' ) THEN
307	!$acc update device( ar )
308	ENDIF
309	CALL cpu_log( log_point_s(7), 'fft_y', 'pause' )
310
311	!
312	!-- Transposition y --> z
313	CALL cpu_log( log_point_s(5), 'transpo forward', 'continue' )
314	CALL resort_for_yz( ar, ar_inv )
315	CALL transpose_yz( ar_inv, ar )
316	CALL cpu_log( log_point_s(5), 'transpo forward', 'stop' )
317
318	!
319	!-- Solve the tridiagonal equation system along z
320	CALL cpu_log( log_point_s(6), 'tridia', 'start' )
321	CALL tridia_substi( ar )
322	CALL cpu_log( log_point_s(6), 'tridia', 'stop' )
323
324	!
325	!-- Inverse Fourier Transformation
326	!-- Transposition z --> y
327	CALL cpu_log( log_point_s(8), 'transpo invers', 'start' )
328	CALL transpose_zy( ar, ar_inv )
329	CALL resort_for_zy( ar_inv, ar )
330	CALL cpu_log( log_point_s(8), 'transpo invers', 'pause' )
331
332	CALL cpu_log( log_point_s(7), 'fft_y', 'continue' )
333	IF ( fft_method /= 'system-specific' ) THEN
334	!$acc update host( ar )
335	ENDIF
336	CALL fft_y( ar, 'backward', ar_tr = ar, &
337	nxl_y_bound = nxl_y, nxr_y_bound = nxr_y, &
338	nxl_y_l = nxl_y, nxr_y_l = nxr_y )
339	IF ( fft_method /= 'system-specific' ) THEN
340	!$acc update device( ar )
341	ENDIF
342	CALL cpu_log( log_point_s(7), 'fft_y', 'stop' )
343
344	!
345	!-- Transposition y --> x
346	CALL cpu_log( log_point_s(8), 'transpo invers', 'continue' )
347	CALL transpose_yx( ar, ar_inv )
348	CALL resort_for_yx( ar_inv, ar )
349	CALL cpu_log( log_point_s(8), 'transpo invers', 'pause' )
350
351	CALL cpu_log( log_point_s(4), 'fft_x', 'continue' )
352	IF ( fft_method /= 'system-specific' ) THEN
353	!$acc update host( ar )
354	ENDIF
355	CALL fft_x( ar, 'backward' )
356	IF ( fft_method /= 'system-specific' ) THEN
357	!$acc update device( ar )
358	ENDIF
359	CALL cpu_log( log_point_s(4), 'fft_x', 'stop' )
360
361	!
362	!-- Transposition x --> z
363	CALL cpu_log( log_point_s(8), 'transpo invers', 'continue' )
364	CALL transpose_xz( ar, ar_inv )
365	CALL resort_for_xz( ar_inv, ar )
366	CALL cpu_log( log_point_s(8), 'transpo invers', 'stop' )
367
368	ELSE
369
370	!
371	!-- 2d-domain-decomposition or no decomposition (1 PE run) with
372	!-- overlapping transposition / fft
373	!-- cputime logging must not use barriers, which would prevent overlapping
374	ALLOCATE( f_out_y(0:ny,nxl_y:nxr_y,nzb_y:nzt_y), &
375	f_out_z(0:nx,nys_x:nyn_x,nzb_x:nzt_x) )
376	!
377	!-- Transposition z --> x + subsequent fft along x
378	ALLOCATE( f_inv(nys:nyn,nxl:nxr,1:nz) )
379	CALL resort_for_zx( ar, f_inv )
380	!
381	!-- Save original indices and gridpoint counter
382	isave(1) = nz
383	isave(2) = nzb_x
384	isave(3) = nzt_x
385	isave(4) = sendrecvcount_zx
386	!
387	!-- Set new indices for transformation
388	nblk = nz / pdims(1)
389	nz = pdims(1)
390	nnz_x = 1
391	nzb_x = 1 + myidx * nnz_x
392	nzt_x = ( myidx + 1 ) * nnz_x
393	sendrecvcount_zx = nnx * nny * nnz_x
394
395	ALLOCATE( ar1(0:nx,nys_x:nyn_x,nzb_x:nzt_x) )
396	ALLOCATE( f_in(nys:nyn,nxl:nxr,1:nz) )
397
398	DO kk = 1, nblk
399
400	IF ( kk == 1 ) THEN
401	CALL cpu_log( log_point_s(5), 'transpo forward', 'start', cpu_log_nowait )
402	ELSE
403	CALL cpu_log( log_point_s(5), 'transpo forward', 'continue', cpu_log_nowait )
404	ENDIF
405
406	DO knew = 1, nz
407	ki = kk + nblk * ( knew - 1 )
408	f_in(:,:,knew) = f_inv(:,:,ki)
409	ENDDO
410
411	CALL transpose_zx( f_in, ar1(:,:,:))
412	CALL cpu_log( log_point_s(5), 'transpo forward', 'pause' )
413
414	IF ( kk == 1 ) THEN
415	CALL cpu_log( log_point_s(4), 'fft_x', 'start', cpu_log_nowait )
416	ELSE
417	CALL cpu_log( log_point_s(4), 'fft_x', 'continue', cpu_log_nowait )
418	ENDIF
419
420	n = isave(2) + kk - 1
421	CALL fft_x( ar1(:,:,:), 'forward', ar_2d = f_out_z(:,:,n))
422	CALL cpu_log( log_point_s(4), 'fft_x', 'pause' )
423
424	ENDDO
425	!
426	!-- Restore original indices/counters
427	nz = isave(1)
428	nzb_x = isave(2)
429	nzt_x = isave(3)
430	sendrecvcount_zx = isave(4)
431
432	DEALLOCATE( ar1, f_in, f_inv )
433
434	!
435	!-- Transposition x --> y + subsequent fft along y
436	ALLOCATE( f_inv(nys_x:nyn_x,nzb_x:nzt_x,0:nx) )
437	CALL resort_for_xy( f_out_z, f_inv )
438	!
439	!-- Save original indices and gridpoint counter
440	isave(1) = nx
441	isave(2) = nxl_y
442	isave(3) = nxr_y
443	isave(4) = sendrecvcount_xy
444	!
445	!-- Set new indices for transformation
446	nblk = ( ( nx+1 ) / pdims(2) ) - 1
447	nx = pdims(2)
448	nnx_y = 1
449	nxl_y = myidy * nnx_y
450	nxr_y = ( myidy + 1 ) * nnx_y - 1
451	sendrecvcount_xy = nnx_y * ( nyn_x-nys_x+1 ) * ( nzt_x-nzb_x+1 )
452
453	ALLOCATE( ar1(0:ny,nxl_y:nxr_y,nzb_y:nzt_y) )
454	ALLOCATE( f_in(nys_x:nyn_x,nzb_x:nzt_x,0:nx) )
455
456	DO ii = 0, nblk
457
458	CALL cpu_log( log_point_s(5), 'transpo forward', 'continue', cpu_log_nowait )
459
460	DO inew = 0, nx-1
461	iind = ii + ( nblk + 1 ) * inew
462	f_in(:,:,inew) = f_inv(:,:,iind)
463	ENDDO
464
465	CALL transpose_xy( f_in, ar1(:,:,:) )
466
467	CALL cpu_log( log_point_s(5), 'transpo forward', 'pause' )
468
469	IF ( ii == 1 ) THEN
470	CALL cpu_log( log_point_s(7), 'fft_y', 'start', cpu_log_nowait )
471	ELSE
472	CALL cpu_log( log_point_s(7), 'fft_y', 'continue', cpu_log_nowait )
473	ENDIF
474
475	nxl_y_bound = isave(2)
476	nxr_y_bound = isave(3)
477	n = isave(2) + ii
478	CALL fft_y( ar1(:,:,:), 'forward', ar_tr = f_out_y, &
479	nxl_y_bound = nxl_y_bound, nxr_y_bound = nxr_y_bound, &
480	nxl_y_l = n, nxr_y_l = n )
481
482	CALL cpu_log( log_point_s(7), 'fft_y', 'pause' )
483
484	ENDDO
485	!
486	!-- Restore original indices/counters
487	nx = isave(1)
488	nxl_y = isave(2)
489	nxr_y = isave(3)
490	sendrecvcount_xy = isave(4)
491
492	DEALLOCATE( ar1, f_in, f_inv )
493
494	!
495	!-- Transposition y --> z + subsequent tridia + resort for z --> y
496	ALLOCATE( f_inv(nxl_y:nxr_y,nzb_y:nzt_y,0:ny) )
497	CALL resort_for_yz( f_out_y, f_inv )
498	!
499	!-- Save original indices and gridpoint counter
500	isave(1) = ny
501	isave(2) = nys_z
502	isave(3) = nyn_z
503	isave(4) = sendrecvcount_yz
504	!
505	!-- Set new indices for transformation
506	nblk = ( ( ny+1 ) / pdims(1) ) - 1
507	ny = pdims(1)
508	nny_z = 1
509	nys_z = myidx * nny_z
510	nyn_z = ( myidx + 1 ) * nny_z - 1
511	sendrecvcount_yz = ( nxr_y-nxl_y+1 ) * nny_z * ( nzt_y-nzb_y+1 )
512
513	ALLOCATE( ar1(nxl_z:nxr_z,nys_z:nyn_z,1:nz) )
514	ALLOCATE( f_in(nxl_y:nxr_y,nzb_y:nzt_y,0:ny) )
515
516	DO jj = 0, nblk
517	!
518	!-- Forward Fourier Transformation
519	!-- Transposition y --> z
520	CALL cpu_log( log_point_s(5), 'transpo forward', 'continue', cpu_log_nowait )
521
522	DO jnew = 0, ny-1
523	jind = jj + ( nblk + 1 ) * jnew
524	f_in(:,:,jnew) = f_inv(:,:,jind)
525	ENDDO
526
527	CALL transpose_yz( f_in, ar1(:,:,:) )
528
529	IF ( jj == nblk ) THEN
530	CALL cpu_log( log_point_s(5), 'transpo forward', 'stop' )
531	ELSE
532	CALL cpu_log( log_point_s(5), 'transpo forward', 'pause' )
533	ENDIF
534
535	!
536	!-- Solve the tridiagonal equation system along z
537	CALL cpu_log( log_point_s(6), 'tridia', 'start', cpu_log_nowait )
538
539	n = isave(2) + jj
540	CALL tridia_substi_overlap( ar1(:,:,:), n )
541
542	CALL cpu_log( log_point_s(6), 'tridia', 'stop' )
543
544	!
545	!-- Inverse Fourier Transformation
546	!-- Transposition z --> y
547	!-- Only one thread should call MPI routines, therefore forward and
548	!-- backward tranpose are in the same section
549	IF ( jj == 0 ) THEN
550	CALL cpu_log( log_point_s(8), 'transpo invers', 'start', cpu_log_nowait )
551	ELSE
552	CALL cpu_log( log_point_s(8), 'transpo invers', 'continue', cpu_log_nowait )
553	ENDIF
554
555	CALL transpose_zy( ar1(:,:,:), f_in )
556
557	DO jnew = 0, ny-1
558	jind = jj + ( nblk + 1 ) * jnew
559	f_inv(:,:,jind) = f_in(:,:,jnew)
560	ENDDO
561
562	CALL cpu_log( log_point_s(8), 'transpo invers', 'pause' )
563
564	ENDDO
565	!
566	!-- Restore original indices/counters
567	ny = isave(1)
568	nys_z = isave(2)
569	nyn_z = isave(3)
570	sendrecvcount_yz = isave(4)
571
572	CALL resort_for_zy( f_inv, f_out_y )
573
574	DEALLOCATE( ar1, f_in, f_inv )
575
576	!
577	!-- fft along y backward + subsequent transposition y --> x
578	ALLOCATE( f_inv(nys_x:nyn_x,nzb_x:nzt_x,0:nx) )
579	!
580	!-- Save original indices and gridpoint counter
581	isave(1) = nx
582	isave(2) = nxl_y
583	isave(3) = nxr_y
584	isave(4) = sendrecvcount_xy
585	!
586	!-- Set new indices for transformation
587	nblk = (( nx+1 ) / pdims(2) ) - 1
588	nx = pdims(2)
589	nnx_y = 1
590	nxl_y = myidy * nnx_y
591	nxr_y = ( myidy + 1 ) * nnx_y - 1
592	sendrecvcount_xy = nnx_y * ( nyn_x-nys_x+1 ) * ( nzt_x-nzb_x+1 )
593
594	ALLOCATE( ar1(0:ny,nxl_y:nxr_y,nzb_y:nzt_y) )
595	ALLOCATE( f_in(nys_x:nyn_x,nzb_x:nzt_x,0:nx) )
596
597	DO ii = 0, nblk
598
599	CALL cpu_log( log_point_s(7), 'fft_y', 'continue', cpu_log_nowait )
600
601	n = isave(2) + ii
602	nxl_y_bound = isave(2)
603	nxr_y_bound = isave(3)
604
605	CALL fft_y( ar1(:,:,:), 'backward', ar_tr = f_out_y, &
606	nxl_y_bound = nxl_y_bound, nxr_y_bound = nxr_y_bound, &
607	nxl_y_l = n, nxr_y_l = n )
608
609	IF ( ii == nblk ) THEN
610	CALL cpu_log( log_point_s(7), 'fft_y', 'stop' )
611	ELSE
612	CALL cpu_log( log_point_s(7), 'fft_y', 'pause' )
613	ENDIF
614
615	CALL cpu_log( log_point_s(8), 'transpo invers', 'continue', cpu_log_nowait )
616
617	CALL transpose_yx( ar1(:,:,:), f_in )
618
619	DO inew = 0, nx-1
620	iind = ii + (nblk+1) * inew
621	f_inv(:,:,iind) = f_in(:,:,inew)
622	ENDDO
623
624	CALL cpu_log( log_point_s(8), 'transpo invers', 'pause' )
625
626	ENDDO
627	!
628	!-- Restore original indices/counters
629	nx = isave(1)
630	nxl_y = isave(2)
631	nxr_y = isave(3)
632	sendrecvcount_xy = isave(4)
633
634	CALL resort_for_yx( f_inv, f_out_z )
635
636	DEALLOCATE( ar1, f_in, f_inv )
637
638	!
639	!-- fft along x backward + subsequent final transposition x --> z
640	ALLOCATE( f_inv(nys:nyn,nxl:nxr,1:nz) )
641	!
642	!-- Save original indices and gridpoint counter
643	isave(1) = nz
644	isave(2) = nzb_x
645	isave(3) = nzt_x
646	isave(4) = sendrecvcount_zx
647	!
648	!-- Set new indices for transformation
649	nblk = nz / pdims(1)
650	nz = pdims(1)
651	nnz_x = 1
652	nzb_x = 1 + myidx * nnz_x
653	nzt_x = ( myidx + 1 ) * nnz_x
654	sendrecvcount_zx = nnx * nny * nnz_x
655
656	ALLOCATE( ar1(0:nx,nys_x:nyn_x,nzb_x:nzt_x) )
657	ALLOCATE( f_in(nys:nyn,nxl:nxr,1:nz) )
658
659	DO kk = 1, nblk
660
661	CALL cpu_log( log_point_s(4), 'fft_x', 'continue', cpu_log_nowait )
662
663	n = isave(2) + kk - 1
664	CALL fft_x( ar1(:,:,:), 'backward', f_out_z(:,:,n))
665
666	IF ( kk == nblk ) THEN
667	CALL cpu_log( log_point_s(4), 'fft_x', 'stop' )
668	ELSE
669	CALL cpu_log( log_point_s(4), 'fft_x', 'pause' )
670	ENDIF
671
672	CALL cpu_log( log_point_s(8), 'transpo invers', 'continue', cpu_log_nowait )
673
674	CALL transpose_xz( ar1(:,:,:), f_in )
675
676	DO knew = 1, nz
677	ki = kk + nblk * (knew-1)
678	f_inv(:,:,ki) = f_in(:,:,knew)
679	ENDDO
680
681	IF ( kk == nblk ) THEN
682	CALL cpu_log( log_point_s(8), 'transpo invers', 'stop' )
683	ELSE
684	CALL cpu_log( log_point_s(8), 'transpo invers', 'pause' )
685	ENDIF
686
687	ENDDO
688	!
689	!-- Restore original indices/counters
690	nz = isave(1)
691	nzb_x = isave(2)
692	nzt_x = isave(3)
693	sendrecvcount_zx = isave(4)
694
695	CALL resort_for_xz( f_inv, ar )
696
697	DEALLOCATE( ar1, f_in, f_inv )
698
699	ENDIF
700
701	CALL cpu_log( log_point_s(3), 'poisfft', 'stop' )
702
703	END SUBROUTINE poisfft
704
705
706
707	SUBROUTINE ffty_tr_yx( f_in, f_out )
708
709	!------------------------------------------------------------------------------!
710	! Fourier-transformation along y with subsequent transposition y --> x for
711	! a 1d-decomposition along x
712	!
713	! ATTENTION: The performance of this routine is much faster on the NEC-SX6,
714	! if the first index of work_ffty_vec is odd. Otherwise
715	! memory bank conflicts may occur (especially if the index is a
716	! multiple of 128). That's why work_ffty_vec is dimensioned as
717	! 0:ny+1.
718	! Of course, this will not work if users are using an odd number
719	! of gridpoints along y.
720	!------------------------------------------------------------------------------!
721
722	USE control_parameters
723	USE cpulog
724	USE indices
725	USE pegrid
726	USE transpose_indices
727
728	IMPLICIT NONE
729
730	INTEGER :: i, iend, iouter, ir, j, k
731	INTEGER, PARAMETER :: stridex = 4
732
733	REAL, DIMENSION(0:ny,stridex) :: work_ffty
734	#if defined( __nec )
735	REAL, DIMENSION(0:ny+1,1:nz,nxl:nxr) :: work_ffty_vec
736	#endif
737	REAL, DIMENSION(1:nz,0:ny,nxl:nxr) :: f_in
738	REAL, DIMENSION(nnx,1:nz,nys_x:nyn_x,pdims(1)) :: f_out
739	REAL, DIMENSION(nxl:nxr,1:nz,0:ny) :: work
740
741	!
742	!-- Carry out the FFT along y, where all data are present due to the
743	!-- 1d-decomposition along x. Resort the data in a way that x becomes
744	!-- the first index.
745	CALL cpu_log( log_point_s(7), 'fft_y_1d', 'start' )
746
747	IF ( host(1:3) == 'nec' ) THEN
748	#if defined( __nec )
749	!
750	!-- Code optimized for vector processors
751	!$OMP PARALLEL PRIVATE ( i, j, k )
752	!$OMP DO
753	DO i = nxl, nxr
754
755	DO j = 0, ny
756	DO k = 1, nz
757	work_ffty_vec(j,k,i) = f_in(k,j,i)
758	ENDDO
759	ENDDO
760
761	CALL fft_y_m( work_ffty_vec(:,:,i), ny+1, 'forward' )
762
763	ENDDO
764
765	!$OMP DO
766	DO k = 1, nz
767	DO j = 0, ny
768	DO i = nxl, nxr
769	work(i,k,j) = work_ffty_vec(j,k,i)
770	ENDDO
771	ENDDO
772	ENDDO
773	!$OMP END PARALLEL
774	#endif
775
776	ELSE
777
778	!
779	!-- Cache optimized code.
780	!-- The i-(x-)direction is split into a strided outer loop and an inner
781	!-- loop for better cache performance
782	!$OMP PARALLEL PRIVATE (i,iend,iouter,ir,j,k,work_ffty)
783	!$OMP DO
784	DO iouter = nxl, nxr, stridex
785
786	iend = MIN( iouter+stridex-1, nxr ) ! Upper bound for inner i loop
787
788	DO k = 1, nz
789
790	DO i = iouter, iend
791
792	ir = i-iouter+1 ! counter within a stride
793	DO j = 0, ny
794	work_ffty(j,ir) = f_in(k,j,i)
795	ENDDO
796	!
797	!-- FFT along y
798	CALL fft_y_1d( work_ffty(:,ir), 'forward' )
799
800	ENDDO
801
802	!
803	!-- Resort
804	DO j = 0, ny
805	DO i = iouter, iend
806	work(i,k,j) = work_ffty(j,i-iouter+1)
807	ENDDO
808	ENDDO
809
810	ENDDO
811
812	ENDDO
813	!$OMP END PARALLEL
814
815	ENDIF
816	CALL cpu_log( log_point_s(7), 'fft_y_1d', 'pause' )
817
818	!
819	!-- Transpose array
820	#if defined( __parallel )
821	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
822	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
823	CALL MPI_ALLTOALL( work(nxl,1,0), sendrecvcount_xy, MPI_REAL, &
824	f_out(1,1,nys_x,1), sendrecvcount_xy, MPI_REAL, &
825	comm1dx, ierr )
826	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
827	#endif
828
829	END SUBROUTINE ffty_tr_yx
830
831
832	SUBROUTINE tr_xy_ffty( f_in, f_out )
833
834	!------------------------------------------------------------------------------!
835	! Transposition x --> y with a subsequent backward Fourier transformation for
836	! a 1d-decomposition along x
837	!------------------------------------------------------------------------------!
838
839	USE control_parameters
840	USE cpulog
841	USE indices
842	USE pegrid
843	USE transpose_indices
844
845	IMPLICIT NONE
846
847	INTEGER :: i, iend, iouter, ir, j, k
848	INTEGER, PARAMETER :: stridex = 4
849
850	REAL, DIMENSION(0:ny,stridex) :: work_ffty
851	#if defined( __nec )
852	REAL, DIMENSION(0:ny+1,1:nz,nxl:nxr) :: work_ffty_vec
853	#endif
854	REAL, DIMENSION(nnx,1:nz,nys_x:nyn_x,pdims(1)) :: f_in
855	REAL, DIMENSION(1:nz,0:ny,nxl:nxr) :: f_out
856	REAL, DIMENSION(nxl:nxr,1:nz,0:ny) :: work
857
858	!
859	!-- Transpose array
860	#if defined( __parallel )
861	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
862	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
863	CALL MPI_ALLTOALL( f_in(1,1,nys_x,1), sendrecvcount_xy, MPI_REAL, &
864	work(nxl,1,0), sendrecvcount_xy, MPI_REAL, &
865	comm1dx, ierr )
866	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
867	#endif
868
869	!
870	!-- Resort the data in a way that y becomes the first index and carry out the
871	!-- backward fft along y.
872	CALL cpu_log( log_point_s(7), 'fft_y_1d', 'continue' )
873
874	IF ( host(1:3) == 'nec' ) THEN
875	#if defined( __nec )
876	!
877	!-- Code optimized for vector processors
878	!$OMP PARALLEL PRIVATE ( i, j, k )
879	!$OMP DO
880	DO k = 1, nz
881	DO j = 0, ny
882	DO i = nxl, nxr
883	work_ffty_vec(j,k,i) = work(i,k,j)
884	ENDDO
885	ENDDO
886	ENDDO
887
888	!$OMP DO
889	DO i = nxl, nxr
890
891	CALL fft_y_m( work_ffty_vec(:,:,i), ny+1, 'backward' )
892
893	DO j = 0, ny
894	DO k = 1, nz
895	f_out(k,j,i) = work_ffty_vec(j,k,i)
896	ENDDO
897	ENDDO
898
899	ENDDO
900	!$OMP END PARALLEL
901	#endif
902
903	ELSE
904
905	!
906	!-- Cache optimized code.
907	!-- The i-(x-)direction is split into a strided outer loop and an inner
908	!-- loop for better cache performance
909	!$OMP PARALLEL PRIVATE ( i, iend, iouter, ir, j, k, work_ffty )
910	!$OMP DO
911	DO iouter = nxl, nxr, stridex
912
913	iend = MIN( iouter+stridex-1, nxr ) ! Upper bound for inner i loop
914
915	DO k = 1, nz
916	!
917	!-- Resort
918	DO j = 0, ny
919	DO i = iouter, iend
920	work_ffty(j,i-iouter+1) = work(i,k,j)
921	ENDDO
922	ENDDO
923
924	DO i = iouter, iend
925
926	!
927	!-- FFT along y
928	ir = i-iouter+1 ! counter within a stride
929	CALL fft_y_1d( work_ffty(:,ir), 'backward' )
930
931	DO j = 0, ny
932	f_out(k,j,i) = work_ffty(j,ir)
933	ENDDO
934	ENDDO
935
936	ENDDO
937
938	ENDDO
939	!$OMP END PARALLEL
940
941	ENDIF
942
943	CALL cpu_log( log_point_s(7), 'fft_y_1d', 'stop' )
944
945	END SUBROUTINE tr_xy_ffty
946
947
948	SUBROUTINE fftx_tri_fftx( ar )
949
950	!------------------------------------------------------------------------------!
951	! FFT along x, solution of the tridiagonal system and backward FFT for
952	! a 1d-decomposition along x
953	!
954	! WARNING: this subroutine may still not work for hybrid parallelization
955	! with OpenMP (for possible necessary changes see the original
956	! routine poisfft_hybrid, developed by Klaus Ketelsen, May 2002)
957	!------------------------------------------------------------------------------!
958
959	USE control_parameters
960	USE cpulog
961	USE grid_variables
962	USE indices
963	USE pegrid
964	USE transpose_indices
965
966	IMPLICIT NONE
967
968	INTEGER :: i, j, k, m, n, omp_get_thread_num, tn
969
970	REAL, DIMENSION(0:nx) :: work_fftx
971	REAL, DIMENSION(0:nx,1:nz) :: work_trix
972	REAL, DIMENSION(nnx,1:nz,nys_x:nyn_x,pdims(1)) :: ar
973	REAL, DIMENSION(:,:,:,:), ALLOCATABLE :: tri
974
975
976	CALL cpu_log( log_point_s(33), 'fft_x_1d + tridia', 'start' )
977
978	ALLOCATE( tri(5,0:nx,0:nz-1,0:threads_per_task-1) )
979
980	tn = 0 ! Default thread number in case of one thread
981	!$OMP PARALLEL DO PRIVATE ( i, j, k, m, n, tn, work_fftx, work_trix )
982	DO j = nys_x, nyn_x
983
984	!$ tn = omp_get_thread_num()
985
986	IF ( host(1:3) == 'nec' ) THEN
987	!
988	!-- Code optimized for vector processors
989	DO k = 1, nz
990
991	m = 0
992	DO n = 1, pdims(1)
993	DO i = 1, nnx
994	work_trix(m,k) = ar(i,k,j,n)
995	m = m + 1
996	ENDDO
997	ENDDO
998
999	ENDDO
1000
1001	CALL fft_x_m( work_trix, 'forward' )
1002
1003	ELSE
1004	!
1005	!-- Cache optimized code
1006	DO k = 1, nz
1007
1008	m = 0
1009	DO n = 1, pdims(1)
1010	DO i = 1, nnx
1011	work_fftx(m) = ar(i,k,j,n)
1012	m = m + 1
1013	ENDDO
1014	ENDDO
1015
1016	CALL fft_x_1d( work_fftx, 'forward' )
1017
1018	DO i = 0, nx
1019	work_trix(i,k) = work_fftx(i)
1020	ENDDO
1021
1022	ENDDO
1023
1024	ENDIF
1025
1026	!
1027	!-- Solve the linear equation system
1028	CALL tridia_1dd( ddx2, ddy2, nx, ny, j, work_trix, tri(:,:,:,tn) )
1029
1030	IF ( host(1:3) == 'nec' ) THEN
1031	!
1032	!-- Code optimized for vector processors
1033	CALL fft_x_m( work_trix, 'backward' )
1034
1035	DO k = 1, nz
1036
1037	m = 0
1038	DO n = 1, pdims(1)
1039	DO i = 1, nnx
1040	ar(i,k,j,n) = work_trix(m,k)
1041	m = m + 1
1042	ENDDO
1043	ENDDO
1044
1045	ENDDO
1046
1047	ELSE
1048	!
1049	!-- Cache optimized code
1050	DO k = 1, nz
1051
1052	DO i = 0, nx
1053	work_fftx(i) = work_trix(i,k)
1054	ENDDO
1055
1056	CALL fft_x_1d( work_fftx, 'backward' )
1057
1058	m = 0
1059	DO n = 1, pdims(1)
1060	DO i = 1, nnx
1061	ar(i,k,j,n) = work_fftx(m)
1062	m = m + 1
1063	ENDDO
1064	ENDDO
1065
1066	ENDDO
1067
1068	ENDIF
1069
1070	ENDDO
1071
1072	DEALLOCATE( tri )
1073
1074	CALL cpu_log( log_point_s(33), 'fft_x_1d + tridia', 'stop' )
1075
1076	END SUBROUTINE fftx_tri_fftx
1077
1078
1079	SUBROUTINE fftx_tr_xy( f_in, f_out )
1080
1081	!------------------------------------------------------------------------------!
1082	! Fourier-transformation along x with subsequent transposition x --> y for
1083	! a 1d-decomposition along y
1084	!
1085	! ATTENTION: The NEC-branch of this routine may significantly profit from
1086	! further optimizations. So far, performance is much worse than
1087	! for routine ffty_tr_yx (more than three times slower).
1088	!------------------------------------------------------------------------------!
1089
1090	USE control_parameters
1091	USE cpulog
1092	USE indices
1093	USE pegrid
1094	USE transpose_indices
1095
1096	IMPLICIT NONE
1097
1098	INTEGER :: i, j, k
1099
1100	REAL, DIMENSION(0:nx,1:nz,nys:nyn) :: work_fftx
1101	REAL, DIMENSION(1:nz,nys:nyn,0:nx) :: f_in
1102	REAL, DIMENSION(nny,1:nz,nxl_y:nxr_y,pdims(2)) :: f_out
1103	REAL, DIMENSION(nys:nyn,1:nz,0:nx) :: work
1104
1105	!
1106	!-- Carry out the FFT along x, where all data are present due to the
1107	!-- 1d-decomposition along y. Resort the data in a way that y becomes
1108	!-- the first index.
1109	CALL cpu_log( log_point_s(4), 'fft_x_1d', 'start' )
1110
1111	IF ( host(1:3) == 'nec' ) THEN
1112	!
1113	!-- Code for vector processors
1114	!$OMP PARALLEL PRIVATE ( i, j, k )
1115	!$OMP DO
1116	DO i = 0, nx
1117
1118	DO j = nys, nyn
1119	DO k = 1, nz
1120	work_fftx(i,k,j) = f_in(k,j,i)
1121	ENDDO
1122	ENDDO
1123
1124	ENDDO
1125
1126	!$OMP DO
1127	DO j = nys, nyn
1128
1129	CALL fft_x_m( work_fftx(:,:,j), 'forward' )
1130
1131	DO k = 1, nz
1132	DO i = 0, nx
1133	work(j,k,i) = work_fftx(i,k,j)
1134	ENDDO
1135	ENDDO
1136
1137	ENDDO
1138	!$OMP END PARALLEL
1139
1140	ELSE
1141
1142	!
1143	!-- Cache optimized code (there might be still a potential for better
1144	!-- optimization).
1145	!$OMP PARALLEL PRIVATE (i,j,k)
1146	!$OMP DO
1147	DO i = 0, nx
1148
1149	DO j = nys, nyn
1150	DO k = 1, nz
1151	work_fftx(i,k,j) = f_in(k,j,i)
1152	ENDDO
1153	ENDDO
1154
1155	ENDDO
1156
1157	!$OMP DO
1158	DO j = nys, nyn
1159	DO k = 1, nz
1160
1161	CALL fft_x_1d( work_fftx(0:nx,k,j), 'forward' )
1162
1163	DO i = 0, nx
1164	work(j,k,i) = work_fftx(i,k,j)
1165	ENDDO
1166	ENDDO
1167
1168	ENDDO
1169	!$OMP END PARALLEL
1170
1171	ENDIF
1172	CALL cpu_log( log_point_s(4), 'fft_x_1d', 'pause' )
1173
1174	!
1175	!-- Transpose array
1176	#if defined( __parallel )
1177	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
1178	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
1179	CALL MPI_ALLTOALL( work(nys,1,0), sendrecvcount_xy, MPI_REAL, &
1180	f_out(1,1,nxl_y,1), sendrecvcount_xy, MPI_REAL, &
1181	comm1dy, ierr )
1182	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
1183	#endif
1184
1185	END SUBROUTINE fftx_tr_xy
1186
1187
1188	SUBROUTINE tr_yx_fftx( f_in, f_out )
1189
1190	!------------------------------------------------------------------------------!
1191	! Transposition y --> x with a subsequent backward Fourier transformation for
1192	! a 1d-decomposition along x
1193	!------------------------------------------------------------------------------!
1194
1195	USE control_parameters
1196	USE cpulog
1197	USE indices
1198	USE pegrid
1199	USE transpose_indices
1200
1201	IMPLICIT NONE
1202
1203	INTEGER :: i, j, k
1204
1205	REAL, DIMENSION(0:nx,1:nz,nys:nyn) :: work_fftx
1206	REAL, DIMENSION(nny,1:nz,nxl_y:nxr_y,pdims(2)) :: f_in
1207	REAL, DIMENSION(1:nz,nys:nyn,0:nx) :: f_out
1208	REAL, DIMENSION(nys:nyn,1:nz,0:nx) :: work
1209
1210	!
1211	!-- Transpose array
1212	#if defined( __parallel )
1213	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
1214	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
1215	CALL MPI_ALLTOALL( f_in(1,1,nxl_y,1), sendrecvcount_xy, MPI_REAL, &
1216	work(nys,1,0), sendrecvcount_xy, MPI_REAL, &
1217	comm1dy, ierr )
1218	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
1219	#endif
1220
1221	!
1222	!-- Carry out the FFT along x, where all data are present due to the
1223	!-- 1d-decomposition along y. Resort the data in a way that y becomes
1224	!-- the first index.
1225	CALL cpu_log( log_point_s(4), 'fft_x_1d', 'continue' )
1226
1227	IF ( host(1:3) == 'nec' ) THEN
1228	!
1229	!-- Code optimized for vector processors
1230	!$OMP PARALLEL PRIVATE ( i, j, k )
1231	!$OMP DO
1232	DO j = nys, nyn
1233
1234	DO k = 1, nz
1235	DO i = 0, nx
1236	work_fftx(i,k,j) = work(j,k,i)
1237	ENDDO
1238	ENDDO
1239
1240	CALL fft_x_m( work_fftx(:,:,j), 'backward' )
1241
1242	ENDDO
1243
1244	!$OMP DO
1245	DO i = 0, nx
1246	DO j = nys, nyn
1247	DO k = 1, nz
1248	f_out(k,j,i) = work_fftx(i,k,j)
1249	ENDDO
1250	ENDDO
1251	ENDDO
1252	!$OMP END PARALLEL
1253
1254	ELSE
1255
1256	!
1257	!-- Cache optimized code (there might be still a potential for better
1258	!-- optimization).
1259	!$OMP PARALLEL PRIVATE (i,j,k)
1260	!$OMP DO
1261	DO j = nys, nyn
1262	DO k = 1, nz
1263
1264	DO i = 0, nx
1265	work_fftx(i,k,j) = work(j,k,i)
1266	ENDDO
1267
1268	CALL fft_x_1d( work_fftx(0:nx,k,j), 'backward' )
1269
1270	ENDDO
1271	ENDDO
1272
1273	!$OMP DO
1274	DO i = 0, nx
1275	DO j = nys, nyn
1276	DO k = 1, nz
1277	f_out(k,j,i) = work_fftx(i,k,j)
1278	ENDDO
1279	ENDDO
1280	ENDDO
1281	!$OMP END PARALLEL
1282
1283	ENDIF
1284	CALL cpu_log( log_point_s(4), 'fft_x_1d', 'stop' )
1285
1286	END SUBROUTINE tr_yx_fftx
1287
1288
1289	SUBROUTINE ffty_tri_ffty( ar )
1290
1291	!------------------------------------------------------------------------------!
1292	! FFT along y, solution of the tridiagonal system and backward FFT for
1293	! a 1d-decomposition along y
1294	!
1295	! WARNING: this subroutine may still not work for hybrid parallelization
1296	! with OpenMP (for possible necessary changes see the original
1297	! routine poisfft_hybrid, developed by Klaus Ketelsen, May 2002)
1298	!------------------------------------------------------------------------------!
1299
1300	USE control_parameters
1301	USE cpulog
1302	USE grid_variables
1303	USE indices
1304	USE pegrid
1305	USE transpose_indices
1306
1307	IMPLICIT NONE
1308
1309	INTEGER :: i, j, k, m, n, omp_get_thread_num, tn
1310
1311	REAL, DIMENSION(0:ny) :: work_ffty
1312	REAL, DIMENSION(0:ny,1:nz) :: work_triy
1313	REAL, DIMENSION(nny,1:nz,nxl_y:nxr_y,pdims(2)) :: ar
1314	REAL, DIMENSION(:,:,:,:), ALLOCATABLE :: tri
1315
1316
1317	CALL cpu_log( log_point_s(39), 'fft_y_1d + tridia', 'start' )
1318
1319	ALLOCATE( tri(5,0:ny,0:nz-1,0:threads_per_task-1) )
1320
1321	tn = 0 ! Default thread number in case of one thread
1322	!$OMP PARALLEL DO PRIVATE ( i, j, k, m, n, tn, work_ffty, work_triy )
1323	DO i = nxl_y, nxr_y
1324
1325	!$ tn = omp_get_thread_num()
1326
1327	IF ( host(1:3) == 'nec' ) THEN
1328	!
1329	!-- Code optimized for vector processors
1330	DO k = 1, nz
1331
1332	m = 0
1333	DO n = 1, pdims(2)
1334	DO j = 1, nny
1335	work_triy(m,k) = ar(j,k,i,n)
1336	m = m + 1
1337	ENDDO
1338	ENDDO
1339
1340	ENDDO
1341
1342	CALL fft_y_m( work_triy, ny, 'forward' )
1343
1344	ELSE
1345	!
1346	!-- Cache optimized code
1347	DO k = 1, nz
1348
1349	m = 0
1350	DO n = 1, pdims(2)
1351	DO j = 1, nny
1352	work_ffty(m) = ar(j,k,i,n)
1353	m = m + 1
1354	ENDDO
1355	ENDDO
1356
1357	CALL fft_y_1d( work_ffty, 'forward' )
1358
1359	DO j = 0, ny
1360	work_triy(j,k) = work_ffty(j)
1361	ENDDO
1362
1363	ENDDO
1364
1365	ENDIF
1366
1367	!
1368	!-- Solve the linear equation system
1369	CALL tridia_1dd( ddy2, ddx2, ny, nx, i, work_triy, tri(:,:,:,tn) )
1370
1371	IF ( host(1:3) == 'nec' ) THEN
1372	!
1373	!-- Code optimized for vector processors
1374	CALL fft_y_m( work_triy, ny, 'backward' )
1375
1376	DO k = 1, nz
1377
1378	m = 0
1379	DO n = 1, pdims(2)
1380	DO j = 1, nny
1381	ar(j,k,i,n) = work_triy(m,k)
1382	m = m + 1
1383	ENDDO
1384	ENDDO
1385
1386	ENDDO
1387
1388	ELSE
1389	!
1390	!-- Cache optimized code
1391	DO k = 1, nz
1392
1393	DO j = 0, ny
1394	work_ffty(j) = work_triy(j,k)
1395	ENDDO
1396
1397	CALL fft_y_1d( work_ffty, 'backward' )
1398
1399	m = 0
1400	DO n = 1, pdims(2)
1401	DO j = 1, nny
1402	ar(j,k,i,n) = work_ffty(m)
1403	m = m + 1
1404	ENDDO
1405	ENDDO
1406
1407	ENDDO
1408
1409	ENDIF
1410
1411	ENDDO
1412
1413	DEALLOCATE( tri )
1414
1415	CALL cpu_log( log_point_s(39), 'fft_y_1d + tridia', 'stop' )
1416
1417	END SUBROUTINE ffty_tri_ffty
1418
1419	#endif
1420
1421	END MODULE poisfft_mod

Note: See TracBrowser for help on using the repository browser.

Download in other formats:

| Impressum | ©Leibniz Universität Hannover |