Home

Context Navigation

source: palm/trunk/SOURCE/poisfft.f90 @ 1319

Last change on this file since 1319 was 1319, checked in by raasch, 10 years ago
last commit documented
Property svn:keywords set to `Id`
File size: 42.3 KB

Line
1	MODULE poisfft_mod
2
3	!--------------------------------------------------------------------------------!
4	! This file is part of PALM.
5	!
6	! PALM is free software: you can redistribute it and/or modify it under the terms
7	! of the GNU General Public License as published by the Free Software Foundation,
8	! either version 3 of the License, or (at your option) any later version.
9	!
10	! PALM is distributed in the hope that it will be useful, but WITHOUT ANY
11	! WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
12	! A PARTICULAR PURPOSE. See the GNU General Public License for more details.
13	!
14	! You should have received a copy of the GNU General Public License along with
15	! PALM. If not, see <http://www.gnu.org/licenses/>.
16	!
17	! Copyright 1997-2014 Leibniz Universitaet Hannover
18	!--------------------------------------------------------------------------------!
19	!
20	! Current revisions:
21	! -----------------
22	!
23	!
24	! Former revisions:
25	! -----------------
26	! $Id: poisfft.f90 1319 2014-03-17 15:08:44Z raasch $
27	!
28	! 1318 2014-03-17 13:35:16Z raasch
29	! module interfaces removed
30	!
31	! 1306 2014-03-13 14:30:59Z raasch
32	! openmp sections removed from the overlap branch,
33	! second argument removed from parameter list
34	!
35	! 1216 2013-08-26 09:31:42Z raasch
36	! resorting of arrays moved to separate routines resort_for_...,
37	! one argument, used as temporary work array, removed from all transpose
38	! routines
39	! overlapping fft / transposition implemented
40	!
41	! 1212 2013-08-15 08:46:27Z raasch
42	! tridia routines moved to seperate module tridia_solver
43	!
44	! 1208 2013-08-13 06:41:49Z raasch
45	! acc-update clauses added for "ar" so that ffts other than cufft can also be
46	! used (although they are not ported and will give a poor performance)
47	!
48	! 1111 2013-03-08 23:54:10Z raasch
49	! further openACC porting of non-parallel (MPI) branch:
50	! tridiagonal routines split into extermal subroutines (instead using CONTAINS),
51	! no distinction between parallel/non-parallel in poisfft and tridia any more,
52	! tridia routines moved to end of file because of probable bug in PGI compiler 12.5
53	! (otherwise "invalid device function" is indicated during runtime),
54	! optimization of tridia routines: constant elements and coefficients of tri are
55	! stored in seperate arrays ddzuw and tric, last dimension of tri reduced from 5
56	! to 2,
57	! poisfft_init is now called internally from poisfft, maketri is called from
58	! poisfft_init,
59	! ibc_p_b = 2 removed
60	!
61	! 1106 2013-03-04 05:31:38Z raasch
62	! routines fftx, ffty, fftxp, fftyp removed, calls replaced by fft_x, fft_y,
63	! in the 1D-decomposition routines fft_x, ffty are replaced by fft_x_1d,
64	! fft_y_1d
65	!
66	! 1103 2013-02-20 02:15:53Z raasch
67	! tri, ar, and ar1 arguments in tridia-routines (2d) are removed because they
68	! sometimes cause segmentation faults with intel 12.1 compiler
69	!
70	! 1092 2013-02-02 11:24:22Z raasch
71	! unused variables removed
72	!
73	! 1036 2012-10-22 13:43:42Z raasch
74	! code put under GPL (PALM 3.9)
75	!
76	! 2012-09-21 07:03:55Z raasch
77	! FLOAT type conversion replaced by REAL
78	!
79	! 1003 2012-09-14 14:35:53Z raasch
80	! indices nxa, nya, etc. replaced by nx, ny, etc.
81	!
82	! 940 2012-07-09 14:31:00Z raasch
83	! special handling of tri-array as an argument in tridia_1dd routines switched
84	! off because it caused segmentation faults with intel 12.1 compiler
85	!
86	! 877 2012-04-03 11:21:44Z suehring
87	! Bugfix: Avoid divisions by zero in case of using a 'neumann' bc for the
88	! pressure at the top of the model domain.
89	!
90	! 809 2012-01-30 13:32:58Z maronga
91	! Bugfix: replaced .AND. and .NOT. with && and ! in the preprocessor directives
92	!
93	! 807 2012-01-25 11:53:51Z maronga
94	! New cpp directive "__check" implemented which is used by check_namelist_files
95	! (most of the code is unneeded by check_namelist_files).
96	!
97	! 763 2011-10-06 09:32:09Z suehring
98	! Comment added concerning the last change.
99	!
100	! 761 2011-10-05 17:58:52Z suehring
101	! Bugfix: Avoid divisions by zero in case of using a 'neumann' bc for the
102	! pressure at the top of the model domain.
103	!
104	! 696 2011-03-18 07:03:49Z raasch
105	! work_fftx removed from PRIVATE clauses in fftx_tr_xy and tr_yx_fftx
106	!
107	! 683 2011-02-09 14:25:15Z raasch
108	! openMP parallelization for 2d-domain-decomposition
109	!
110	! 667 2010-12-23 12:06:00Z suehring/gryschka
111	! ddzu replaced by ddzu_pres due to changes in zu(0)
112	!
113	! 622 2010-12-10 08:08:13Z raasch
114	! optional barriers included in order to speed up collective operations
115	!
116	! 377 2009-09-04 11:09:00Z raasch
117	! __lcmuk changed to __lc to avoid problems with Intel compiler on sgi-ice
118	!
119	! 164 2008-05-15 08:46:15Z raasch
120	! Arguments removed from transpose routines
121	!
122	! 128 2007-10-26 13:11:14Z raasch
123	! Bugfix: wavenumber calculation for even nx in routines maketri
124	!
125	! 85 2007-05-11 09:35:14Z raasch
126	! Bugfix: work_fft*_vec removed from some PRIVATE-declarations
127	!
128	! 76 2007-03-29 00:58:32Z raasch
129	! Tridiagonal coefficients adjusted for Neumann boundary conditions both at
130	! the bottom and the top.
131	!
132	! RCS Log replace by Id keyword, revision history cleaned up
133	!
134	! Revision 1.24 2006/08/04 15:00:24 raasch
135	! Default setting of the thread number tn in case of not using OpenMP
136	!
137	! Revision 1.23 2006/02/23 12:48:38 raasch
138	! Additional compiler directive in routine tridia_1dd for preventing loop
139	! exchange on NEC-SX6
140	!
141	! Revision 1.20 2004/04/30 12:38:09 raasch
142	! Parts of former poisfft_hybrid moved to this subroutine,
143	! former subroutine changed to a module, renaming of FFT-subroutines and
144	! -module, FFTs completely substituted by calls of fft_x and fft_y,
145	! NAG fft used in the non-parallel case completely removed, l in maketri
146	! is now a 1d-array, variables passed by modules instead of using parameter
147	! lists, enlarged transposition arrays introduced
148	!
149	! Revision 1.1 1997/07/24 11:24:14 raasch
150	! Initial revision
151	!
152	!
153	! Description:
154	! ------------
155	! Original version by Stephan Siano (pois3d), as of July 23, 1996
156	! Adapted for 2D-domain-decomposition by Siegfried Raasch, July 3, 1997
157	!
158	! Solves the Poisson equation with a 2D spectral method
159	! d^2 p / dx^2 + d^2 p / dy^2 + d^2 p / dz^2 = s
160	!
161	! Input:
162	! real ar contains (nnz,nny,nnx) elements of the velocity divergence,
163	! starting from (1,nys,nxl)
164	!
165	! Output:
166	! real ar contains the solution for perturbation pressure p
167	!------------------------------------------------------------------------------!
168
169	USE fft_xy
170	USE indices
171	USE transpose_indices
172	USE tridia_solver
173
174	IMPLICIT NONE
175
176	LOGICAL, SAVE :: poisfft_initialized = .FALSE.
177
178	PRIVATE
179
180	#if ! defined ( __check )
181	PUBLIC poisfft, poisfft_init
182
183	INTERFACE poisfft
184	MODULE PROCEDURE poisfft
185	END INTERFACE poisfft
186
187	INTERFACE poisfft_init
188	MODULE PROCEDURE poisfft_init
189	END INTERFACE poisfft_init
190	#else
191	PUBLIC poisfft_init
192
193	INTERFACE poisfft_init
194	MODULE PROCEDURE poisfft_init
195	END INTERFACE poisfft_init
196	#endif
197
198	CONTAINS
199
200	SUBROUTINE poisfft_init
201
202	USE arrays_3d, ONLY: ddzu_pres, ddzw
203
204	IMPLICIT NONE
205
206	INTEGER :: k
207
208
209	CALL fft_init
210
211	CALL tridia_init
212
213	poisfft_initialized = .TRUE.
214
215	END SUBROUTINE poisfft_init
216
217
218	#if ! defined ( __check )
219	SUBROUTINE poisfft( ar )
220
221	USE control_parameters, ONLY : fft_method, transpose_compute_overlap
222	USE cpulog
223	USE pegrid
224
225	IMPLICIT NONE
226
227	INTEGER :: ii, iind, inew, jj, jind, jnew, ki, kk, knew, n, nblk, &
228	nnx_y, nny_z, nnz_t, nnz_x, nxl_y_bound, nxr_y_bound
229	INTEGER, DIMENSION(4) :: isave
230
231	REAL, DIMENSION(1:nz,nys:nyn,nxl:nxr) :: ar
232	!$acc declare create( ar_inv )
233	REAL, DIMENSION(nys:nyn,nxl:nxr,1:nz) :: ar_inv
234
235	REAL, DIMENSION(:,:,:), ALLOCATABLE :: ar1, f_in, f_inv, f_out_y, &
236	f_out_z
237
238
239	CALL cpu_log( log_point_s(3), 'poisfft', 'start' )
240
241	IF ( .NOT. poisfft_initialized ) CALL poisfft_init
242
243	!
244	!-- Two-dimensional Fourier Transformation in x- and y-direction.
245	IF ( pdims(2) == 1 .AND. pdims(1) > 1 ) THEN
246
247	!
248	!-- 1d-domain-decomposition along x:
249	!-- FFT along y and transposition y --> x
250	CALL ffty_tr_yx( ar, ar )
251
252	!
253	!-- FFT along x, solving the tridiagonal system and backward FFT
254	CALL fftx_tri_fftx( ar )
255
256	!
257	!-- Transposition x --> y and backward FFT along y
258	CALL tr_xy_ffty( ar, ar )
259
260	ELSEIF ( pdims(1) == 1 .AND. pdims(2) > 1 ) THEN
261
262	!
263	!-- 1d-domain-decomposition along y:
264	!-- FFT along x and transposition x --> y
265	CALL fftx_tr_xy( ar, ar )
266
267	!
268	!-- FFT along y, solving the tridiagonal system and backward FFT
269	CALL ffty_tri_ffty( ar )
270
271	!
272	!-- Transposition y --> x and backward FFT along x
273	CALL tr_yx_fftx( ar, ar )
274
275	ELSEIF ( .NOT. transpose_compute_overlap ) THEN
276
277	!
278	!-- 2d-domain-decomposition or no decomposition (1 PE run)
279	!-- Transposition z --> x
280	CALL cpu_log( log_point_s(5), 'transpo forward', 'start' )
281	CALL resort_for_zx( ar, ar_inv )
282	CALL transpose_zx( ar_inv, ar )
283	CALL cpu_log( log_point_s(5), 'transpo forward', 'pause' )
284
285	CALL cpu_log( log_point_s(4), 'fft_x', 'start' )
286	IF ( fft_method /= 'system-specific' ) THEN
287	!$acc update host( ar )
288	ENDIF
289	CALL fft_x( ar, 'forward' )
290	IF ( fft_method /= 'system-specific' ) THEN
291	!$acc update device( ar )
292	ENDIF
293	CALL cpu_log( log_point_s(4), 'fft_x', 'pause' )
294
295	!
296	!-- Transposition x --> y
297	CALL cpu_log( log_point_s(5), 'transpo forward', 'continue' )
298	CALL resort_for_xy( ar, ar_inv )
299	CALL transpose_xy( ar_inv, ar )
300	CALL cpu_log( log_point_s(5), 'transpo forward', 'pause' )
301
302	CALL cpu_log( log_point_s(7), 'fft_y', 'start' )
303	IF ( fft_method /= 'system-specific' ) THEN
304	!$acc update host( ar )
305	ENDIF
306	CALL fft_y( ar, 'forward', ar_tr = ar, &
307	nxl_y_bound = nxl_y, nxr_y_bound = nxr_y, &
308	nxl_y_l = nxl_y, nxr_y_l = nxr_y )
309	IF ( fft_method /= 'system-specific' ) THEN
310	!$acc update device( ar )
311	ENDIF
312	CALL cpu_log( log_point_s(7), 'fft_y', 'pause' )
313
314	!
315	!-- Transposition y --> z
316	CALL cpu_log( log_point_s(5), 'transpo forward', 'continue' )
317	CALL resort_for_yz( ar, ar_inv )
318	CALL transpose_yz( ar_inv, ar )
319	CALL cpu_log( log_point_s(5), 'transpo forward', 'stop' )
320
321	!
322	!-- Solve the tridiagonal equation system along z
323	CALL cpu_log( log_point_s(6), 'tridia', 'start' )
324	CALL tridia_substi( ar )
325	CALL cpu_log( log_point_s(6), 'tridia', 'stop' )
326
327	!
328	!-- Inverse Fourier Transformation
329	!-- Transposition z --> y
330	CALL cpu_log( log_point_s(8), 'transpo invers', 'start' )
331	CALL transpose_zy( ar, ar_inv )
332	CALL resort_for_zy( ar_inv, ar )
333	CALL cpu_log( log_point_s(8), 'transpo invers', 'pause' )
334
335	CALL cpu_log( log_point_s(7), 'fft_y', 'continue' )
336	IF ( fft_method /= 'system-specific' ) THEN
337	!$acc update host( ar )
338	ENDIF
339	CALL fft_y( ar, 'backward', ar_tr = ar, &
340	nxl_y_bound = nxl_y, nxr_y_bound = nxr_y, &
341	nxl_y_l = nxl_y, nxr_y_l = nxr_y )
342	IF ( fft_method /= 'system-specific' ) THEN
343	!$acc update device( ar )
344	ENDIF
345	CALL cpu_log( log_point_s(7), 'fft_y', 'stop' )
346
347	!
348	!-- Transposition y --> x
349	CALL cpu_log( log_point_s(8), 'transpo invers', 'continue' )
350	CALL transpose_yx( ar, ar_inv )
351	CALL resort_for_yx( ar_inv, ar )
352	CALL cpu_log( log_point_s(8), 'transpo invers', 'pause' )
353
354	CALL cpu_log( log_point_s(4), 'fft_x', 'continue' )
355	IF ( fft_method /= 'system-specific' ) THEN
356	!$acc update host( ar )
357	ENDIF
358	CALL fft_x( ar, 'backward' )
359	IF ( fft_method /= 'system-specific' ) THEN
360	!$acc update device( ar )
361	ENDIF
362	CALL cpu_log( log_point_s(4), 'fft_x', 'stop' )
363
364	!
365	!-- Transposition x --> z
366	CALL cpu_log( log_point_s(8), 'transpo invers', 'continue' )
367	CALL transpose_xz( ar, ar_inv )
368	CALL resort_for_xz( ar_inv, ar )
369	CALL cpu_log( log_point_s(8), 'transpo invers', 'stop' )
370
371	ELSE
372
373	!
374	!-- 2d-domain-decomposition or no decomposition (1 PE run) with
375	!-- overlapping transposition / fft
376	!-- cputime logging must not use barriers, which would prevent overlapping
377	ALLOCATE( f_out_y(0:ny,nxl_y:nxr_y,nzb_y:nzt_y), &
378	f_out_z(0:nx,nys_x:nyn_x,nzb_x:nzt_x) )
379	!
380	!-- Transposition z --> x + subsequent fft along x
381	ALLOCATE( f_inv(nys:nyn,nxl:nxr,1:nz) )
382	CALL resort_for_zx( ar, f_inv )
383	!
384	!-- Save original indices and gridpoint counter
385	isave(1) = nz
386	isave(2) = nzb_x
387	isave(3) = nzt_x
388	isave(4) = sendrecvcount_zx
389	!
390	!-- Set new indices for transformation
391	nblk = nz / pdims(1)
392	nz = pdims(1)
393	nnz_x = 1
394	nzb_x = 1 + myidx * nnz_x
395	nzt_x = ( myidx + 1 ) * nnz_x
396	sendrecvcount_zx = nnx * nny * nnz_x
397
398	ALLOCATE( ar1(0:nx,nys_x:nyn_x,nzb_x:nzt_x) )
399	ALLOCATE( f_in(nys:nyn,nxl:nxr,1:nz) )
400
401	DO kk = 1, nblk
402
403	IF ( kk == 1 ) THEN
404	CALL cpu_log( log_point_s(5), 'transpo forward', 'start', cpu_log_nowait )
405	ELSE
406	CALL cpu_log( log_point_s(5), 'transpo forward', 'continue', cpu_log_nowait )
407	ENDIF
408
409	DO knew = 1, nz
410	ki = kk + nblk * ( knew - 1 )
411	f_in(:,:,knew) = f_inv(:,:,ki)
412	ENDDO
413
414	CALL transpose_zx( f_in, ar1(:,:,:))
415	CALL cpu_log( log_point_s(5), 'transpo forward', 'pause' )
416
417	IF ( kk == 1 ) THEN
418	CALL cpu_log( log_point_s(4), 'fft_x', 'start', cpu_log_nowait )
419	ELSE
420	CALL cpu_log( log_point_s(4), 'fft_x', 'continue', cpu_log_nowait )
421	ENDIF
422
423	n = isave(2) + kk - 1
424	CALL fft_x( ar1(:,:,:), 'forward', ar_2d = f_out_z(:,:,n))
425	CALL cpu_log( log_point_s(4), 'fft_x', 'pause' )
426
427	ENDDO
428	!
429	!-- Restore original indices/counters
430	nz = isave(1)
431	nzb_x = isave(2)
432	nzt_x = isave(3)
433	sendrecvcount_zx = isave(4)
434
435	DEALLOCATE( ar1, f_in, f_inv )
436
437	!
438	!-- Transposition x --> y + subsequent fft along y
439	ALLOCATE( f_inv(nys_x:nyn_x,nzb_x:nzt_x,0:nx) )
440	CALL resort_for_xy( f_out_z, f_inv )
441	!
442	!-- Save original indices and gridpoint counter
443	isave(1) = nx
444	isave(2) = nxl_y
445	isave(3) = nxr_y
446	isave(4) = sendrecvcount_xy
447	!
448	!-- Set new indices for transformation
449	nblk = ( ( nx+1 ) / pdims(2) ) - 1
450	nx = pdims(2)
451	nnx_y = 1
452	nxl_y = myidy * nnx_y
453	nxr_y = ( myidy + 1 ) * nnx_y - 1
454	sendrecvcount_xy = nnx_y * ( nyn_x-nys_x+1 ) * ( nzt_x-nzb_x+1 )
455
456	ALLOCATE( ar1(0:ny,nxl_y:nxr_y,nzb_y:nzt_y) )
457	ALLOCATE( f_in(nys_x:nyn_x,nzb_x:nzt_x,0:nx) )
458
459	DO ii = 0, nblk
460
461	CALL cpu_log( log_point_s(5), 'transpo forward', 'continue', cpu_log_nowait )
462
463	DO inew = 0, nx-1
464	iind = ii + ( nblk + 1 ) * inew
465	f_in(:,:,inew) = f_inv(:,:,iind)
466	ENDDO
467
468	CALL transpose_xy( f_in, ar1(:,:,:) )
469
470	CALL cpu_log( log_point_s(5), 'transpo forward', 'pause' )
471
472	IF ( ii == 1 ) THEN
473	CALL cpu_log( log_point_s(7), 'fft_y', 'start', cpu_log_nowait )
474	ELSE
475	CALL cpu_log( log_point_s(7), 'fft_y', 'continue', cpu_log_nowait )
476	ENDIF
477
478	nxl_y_bound = isave(2)
479	nxr_y_bound = isave(3)
480	n = isave(2) + ii
481	CALL fft_y( ar1(:,:,:), 'forward', ar_tr = f_out_y, &
482	nxl_y_bound = nxl_y_bound, nxr_y_bound = nxr_y_bound, &
483	nxl_y_l = n, nxr_y_l = n )
484
485	CALL cpu_log( log_point_s(7), 'fft_y', 'pause' )
486
487	ENDDO
488	!
489	!-- Restore original indices/counters
490	nx = isave(1)
491	nxl_y = isave(2)
492	nxr_y = isave(3)
493	sendrecvcount_xy = isave(4)
494
495	DEALLOCATE( ar1, f_in, f_inv )
496
497	!
498	!-- Transposition y --> z + subsequent tridia + resort for z --> y
499	ALLOCATE( f_inv(nxl_y:nxr_y,nzb_y:nzt_y,0:ny) )
500	CALL resort_for_yz( f_out_y, f_inv )
501	!
502	!-- Save original indices and gridpoint counter
503	isave(1) = ny
504	isave(2) = nys_z
505	isave(3) = nyn_z
506	isave(4) = sendrecvcount_yz
507	!
508	!-- Set new indices for transformation
509	nblk = ( ( ny+1 ) / pdims(1) ) - 1
510	ny = pdims(1)
511	nny_z = 1
512	nys_z = myidx * nny_z
513	nyn_z = ( myidx + 1 ) * nny_z - 1
514	sendrecvcount_yz = ( nxr_y-nxl_y+1 ) * nny_z * ( nzt_y-nzb_y+1 )
515
516	ALLOCATE( ar1(nxl_z:nxr_z,nys_z:nyn_z,1:nz) )
517	ALLOCATE( f_in(nxl_y:nxr_y,nzb_y:nzt_y,0:ny) )
518
519	DO jj = 0, nblk
520	!
521	!-- Forward Fourier Transformation
522	!-- Transposition y --> z
523	CALL cpu_log( log_point_s(5), 'transpo forward', 'continue', cpu_log_nowait )
524
525	DO jnew = 0, ny-1
526	jind = jj + ( nblk + 1 ) * jnew
527	f_in(:,:,jnew) = f_inv(:,:,jind)
528	ENDDO
529
530	CALL transpose_yz( f_in, ar1(:,:,:) )
531
532	IF ( jj == nblk ) THEN
533	CALL cpu_log( log_point_s(5), 'transpo forward', 'stop' )
534	ELSE
535	CALL cpu_log( log_point_s(5), 'transpo forward', 'pause' )
536	ENDIF
537
538	!
539	!-- Solve the tridiagonal equation system along z
540	CALL cpu_log( log_point_s(6), 'tridia', 'start', cpu_log_nowait )
541
542	n = isave(2) + jj
543	CALL tridia_substi_overlap( ar1(:,:,:), n )
544
545	CALL cpu_log( log_point_s(6), 'tridia', 'stop' )
546
547	!
548	!-- Inverse Fourier Transformation
549	!-- Transposition z --> y
550	!-- Only one thread should call MPI routines, therefore forward and
551	!-- backward tranpose are in the same section
552	IF ( jj == 0 ) THEN
553	CALL cpu_log( log_point_s(8), 'transpo invers', 'start', cpu_log_nowait )
554	ELSE
555	CALL cpu_log( log_point_s(8), 'transpo invers', 'continue', cpu_log_nowait )
556	ENDIF
557
558	CALL transpose_zy( ar1(:,:,:), f_in )
559
560	DO jnew = 0, ny-1
561	jind = jj + ( nblk + 1 ) * jnew
562	f_inv(:,:,jind) = f_in(:,:,jnew)
563	ENDDO
564
565	CALL cpu_log( log_point_s(8), 'transpo invers', 'pause' )
566
567	ENDDO
568	!
569	!-- Restore original indices/counters
570	ny = isave(1)
571	nys_z = isave(2)
572	nyn_z = isave(3)
573	sendrecvcount_yz = isave(4)
574
575	CALL resort_for_zy( f_inv, f_out_y )
576
577	DEALLOCATE( ar1, f_in, f_inv )
578
579	!
580	!-- fft along y backward + subsequent transposition y --> x
581	ALLOCATE( f_inv(nys_x:nyn_x,nzb_x:nzt_x,0:nx) )
582	!
583	!-- Save original indices and gridpoint counter
584	isave(1) = nx
585	isave(2) = nxl_y
586	isave(3) = nxr_y
587	isave(4) = sendrecvcount_xy
588	!
589	!-- Set new indices for transformation
590	nblk = (( nx+1 ) / pdims(2) ) - 1
591	nx = pdims(2)
592	nnx_y = 1
593	nxl_y = myidy * nnx_y
594	nxr_y = ( myidy + 1 ) * nnx_y - 1
595	sendrecvcount_xy = nnx_y * ( nyn_x-nys_x+1 ) * ( nzt_x-nzb_x+1 )
596
597	ALLOCATE( ar1(0:ny,nxl_y:nxr_y,nzb_y:nzt_y) )
598	ALLOCATE( f_in(nys_x:nyn_x,nzb_x:nzt_x,0:nx) )
599
600	DO ii = 0, nblk
601
602	CALL cpu_log( log_point_s(7), 'fft_y', 'continue', cpu_log_nowait )
603
604	n = isave(2) + ii
605	nxl_y_bound = isave(2)
606	nxr_y_bound = isave(3)
607
608	CALL fft_y( ar1(:,:,:), 'backward', ar_tr = f_out_y, &
609	nxl_y_bound = nxl_y_bound, nxr_y_bound = nxr_y_bound, &
610	nxl_y_l = n, nxr_y_l = n )
611
612	IF ( ii == nblk ) THEN
613	CALL cpu_log( log_point_s(7), 'fft_y', 'stop' )
614	ELSE
615	CALL cpu_log( log_point_s(7), 'fft_y', 'pause' )
616	ENDIF
617
618	CALL cpu_log( log_point_s(8), 'transpo invers', 'continue', cpu_log_nowait )
619
620	CALL transpose_yx( ar1(:,:,:), f_in )
621
622	DO inew = 0, nx-1
623	iind = ii + (nblk+1) * inew
624	f_inv(:,:,iind) = f_in(:,:,inew)
625	ENDDO
626
627	CALL cpu_log( log_point_s(8), 'transpo invers', 'pause' )
628
629	ENDDO
630	!
631	!-- Restore original indices/counters
632	nx = isave(1)
633	nxl_y = isave(2)
634	nxr_y = isave(3)
635	sendrecvcount_xy = isave(4)
636
637	CALL resort_for_yx( f_inv, f_out_z )
638
639	DEALLOCATE( ar1, f_in, f_inv )
640
641	!
642	!-- fft along x backward + subsequent final transposition x --> z
643	ALLOCATE( f_inv(nys:nyn,nxl:nxr,1:nz) )
644	!
645	!-- Save original indices and gridpoint counter
646	isave(1) = nz
647	isave(2) = nzb_x
648	isave(3) = nzt_x
649	isave(4) = sendrecvcount_zx
650	!
651	!-- Set new indices for transformation
652	nblk = nz / pdims(1)
653	nz = pdims(1)
654	nnz_x = 1
655	nzb_x = 1 + myidx * nnz_x
656	nzt_x = ( myidx + 1 ) * nnz_x
657	sendrecvcount_zx = nnx * nny * nnz_x
658
659	ALLOCATE( ar1(0:nx,nys_x:nyn_x,nzb_x:nzt_x) )
660	ALLOCATE( f_in(nys:nyn,nxl:nxr,1:nz) )
661
662	DO kk = 1, nblk
663
664	CALL cpu_log( log_point_s(4), 'fft_x', 'continue', cpu_log_nowait )
665
666	n = isave(2) + kk - 1
667	CALL fft_x( ar1(:,:,:), 'backward', f_out_z(:,:,n))
668
669	IF ( kk == nblk ) THEN
670	CALL cpu_log( log_point_s(4), 'fft_x', 'stop' )
671	ELSE
672	CALL cpu_log( log_point_s(4), 'fft_x', 'pause' )
673	ENDIF
674
675	CALL cpu_log( log_point_s(8), 'transpo invers', 'continue', cpu_log_nowait )
676
677	CALL transpose_xz( ar1(:,:,:), f_in )
678
679	DO knew = 1, nz
680	ki = kk + nblk * (knew-1)
681	f_inv(:,:,ki) = f_in(:,:,knew)
682	ENDDO
683
684	IF ( kk == nblk ) THEN
685	CALL cpu_log( log_point_s(8), 'transpo invers', 'stop' )
686	ELSE
687	CALL cpu_log( log_point_s(8), 'transpo invers', 'pause' )
688	ENDIF
689
690	ENDDO
691	!
692	!-- Restore original indices/counters
693	nz = isave(1)
694	nzb_x = isave(2)
695	nzt_x = isave(3)
696	sendrecvcount_zx = isave(4)
697
698	CALL resort_for_xz( f_inv, ar )
699
700	DEALLOCATE( ar1, f_in, f_inv )
701
702	ENDIF
703
704	CALL cpu_log( log_point_s(3), 'poisfft', 'stop' )
705
706	END SUBROUTINE poisfft
707
708
709
710	SUBROUTINE ffty_tr_yx( f_in, f_out )
711
712	!------------------------------------------------------------------------------!
713	! Fourier-transformation along y with subsequent transposition y --> x for
714	! a 1d-decomposition along x
715	!
716	! ATTENTION: The performance of this routine is much faster on the NEC-SX6,
717	! if the first index of work_ffty_vec is odd. Otherwise
718	! memory bank conflicts may occur (especially if the index is a
719	! multiple of 128). That's why work_ffty_vec is dimensioned as
720	! 0:ny+1.
721	! Of course, this will not work if users are using an odd number
722	! of gridpoints along y.
723	!------------------------------------------------------------------------------!
724
725	USE control_parameters
726	USE cpulog
727	USE indices
728	USE pegrid
729	USE transpose_indices
730
731	IMPLICIT NONE
732
733	INTEGER :: i, iend, iouter, ir, j, k
734	INTEGER, PARAMETER :: stridex = 4
735
736	REAL, DIMENSION(0:ny,stridex) :: work_ffty
737	#if defined( __nec )
738	REAL, DIMENSION(0:ny+1,1:nz,nxl:nxr) :: work_ffty_vec
739	#endif
740	REAL, DIMENSION(1:nz,0:ny,nxl:nxr) :: f_in
741	REAL, DIMENSION(nnx,1:nz,nys_x:nyn_x,pdims(1)) :: f_out
742	REAL, DIMENSION(nxl:nxr,1:nz,0:ny) :: work
743
744	!
745	!-- Carry out the FFT along y, where all data are present due to the
746	!-- 1d-decomposition along x. Resort the data in a way that x becomes
747	!-- the first index.
748	CALL cpu_log( log_point_s(7), 'fft_y_1d', 'start' )
749
750	IF ( host(1:3) == 'nec' ) THEN
751	#if defined( __nec )
752	!
753	!-- Code optimized for vector processors
754	!$OMP PARALLEL PRIVATE ( i, j, k )
755	!$OMP DO
756	DO i = nxl, nxr
757
758	DO j = 0, ny
759	DO k = 1, nz
760	work_ffty_vec(j,k,i) = f_in(k,j,i)
761	ENDDO
762	ENDDO
763
764	CALL fft_y_m( work_ffty_vec(:,:,i), ny+1, 'forward' )
765
766	ENDDO
767
768	!$OMP DO
769	DO k = 1, nz
770	DO j = 0, ny
771	DO i = nxl, nxr
772	work(i,k,j) = work_ffty_vec(j,k,i)
773	ENDDO
774	ENDDO
775	ENDDO
776	!$OMP END PARALLEL
777	#endif
778
779	ELSE
780
781	!
782	!-- Cache optimized code.
783	!-- The i-(x-)direction is split into a strided outer loop and an inner
784	!-- loop for better cache performance
785	!$OMP PARALLEL PRIVATE (i,iend,iouter,ir,j,k,work_ffty)
786	!$OMP DO
787	DO iouter = nxl, nxr, stridex
788
789	iend = MIN( iouter+stridex-1, nxr ) ! Upper bound for inner i loop
790
791	DO k = 1, nz
792
793	DO i = iouter, iend
794
795	ir = i-iouter+1 ! counter within a stride
796	DO j = 0, ny
797	work_ffty(j,ir) = f_in(k,j,i)
798	ENDDO
799	!
800	!-- FFT along y
801	CALL fft_y_1d( work_ffty(:,ir), 'forward' )
802
803	ENDDO
804
805	!
806	!-- Resort
807	DO j = 0, ny
808	DO i = iouter, iend
809	work(i,k,j) = work_ffty(j,i-iouter+1)
810	ENDDO
811	ENDDO
812
813	ENDDO
814
815	ENDDO
816	!$OMP END PARALLEL
817
818	ENDIF
819	CALL cpu_log( log_point_s(7), 'fft_y_1d', 'pause' )
820
821	!
822	!-- Transpose array
823	#if defined( __parallel )
824	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
825	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
826	CALL MPI_ALLTOALL( work(nxl,1,0), sendrecvcount_xy, MPI_REAL, &
827	f_out(1,1,nys_x,1), sendrecvcount_xy, MPI_REAL, &
828	comm1dx, ierr )
829	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
830	#endif
831
832	END SUBROUTINE ffty_tr_yx
833
834
835	SUBROUTINE tr_xy_ffty( f_in, f_out )
836
837	!------------------------------------------------------------------------------!
838	! Transposition x --> y with a subsequent backward Fourier transformation for
839	! a 1d-decomposition along x
840	!------------------------------------------------------------------------------!
841
842	USE control_parameters
843	USE cpulog
844	USE indices
845	USE pegrid
846	USE transpose_indices
847
848	IMPLICIT NONE
849
850	INTEGER :: i, iend, iouter, ir, j, k
851	INTEGER, PARAMETER :: stridex = 4
852
853	REAL, DIMENSION(0:ny,stridex) :: work_ffty
854	#if defined( __nec )
855	REAL, DIMENSION(0:ny+1,1:nz,nxl:nxr) :: work_ffty_vec
856	#endif
857	REAL, DIMENSION(nnx,1:nz,nys_x:nyn_x,pdims(1)) :: f_in
858	REAL, DIMENSION(1:nz,0:ny,nxl:nxr) :: f_out
859	REAL, DIMENSION(nxl:nxr,1:nz,0:ny) :: work
860
861	!
862	!-- Transpose array
863	#if defined( __parallel )
864	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
865	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
866	CALL MPI_ALLTOALL( f_in(1,1,nys_x,1), sendrecvcount_xy, MPI_REAL, &
867	work(nxl,1,0), sendrecvcount_xy, MPI_REAL, &
868	comm1dx, ierr )
869	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
870	#endif
871
872	!
873	!-- Resort the data in a way that y becomes the first index and carry out the
874	!-- backward fft along y.
875	CALL cpu_log( log_point_s(7), 'fft_y_1d', 'continue' )
876
877	IF ( host(1:3) == 'nec' ) THEN
878	#if defined( __nec )
879	!
880	!-- Code optimized for vector processors
881	!$OMP PARALLEL PRIVATE ( i, j, k )
882	!$OMP DO
883	DO k = 1, nz
884	DO j = 0, ny
885	DO i = nxl, nxr
886	work_ffty_vec(j,k,i) = work(i,k,j)
887	ENDDO
888	ENDDO
889	ENDDO
890
891	!$OMP DO
892	DO i = nxl, nxr
893
894	CALL fft_y_m( work_ffty_vec(:,:,i), ny+1, 'backward' )
895
896	DO j = 0, ny
897	DO k = 1, nz
898	f_out(k,j,i) = work_ffty_vec(j,k,i)
899	ENDDO
900	ENDDO
901
902	ENDDO
903	!$OMP END PARALLEL
904	#endif
905
906	ELSE
907
908	!
909	!-- Cache optimized code.
910	!-- The i-(x-)direction is split into a strided outer loop and an inner
911	!-- loop for better cache performance
912	!$OMP PARALLEL PRIVATE ( i, iend, iouter, ir, j, k, work_ffty )
913	!$OMP DO
914	DO iouter = nxl, nxr, stridex
915
916	iend = MIN( iouter+stridex-1, nxr ) ! Upper bound for inner i loop
917
918	DO k = 1, nz
919	!
920	!-- Resort
921	DO j = 0, ny
922	DO i = iouter, iend
923	work_ffty(j,i-iouter+1) = work(i,k,j)
924	ENDDO
925	ENDDO
926
927	DO i = iouter, iend
928
929	!
930	!-- FFT along y
931	ir = i-iouter+1 ! counter within a stride
932	CALL fft_y_1d( work_ffty(:,ir), 'backward' )
933
934	DO j = 0, ny
935	f_out(k,j,i) = work_ffty(j,ir)
936	ENDDO
937	ENDDO
938
939	ENDDO
940
941	ENDDO
942	!$OMP END PARALLEL
943
944	ENDIF
945
946	CALL cpu_log( log_point_s(7), 'fft_y_1d', 'stop' )
947
948	END SUBROUTINE tr_xy_ffty
949
950
951	SUBROUTINE fftx_tri_fftx( ar )
952
953	!------------------------------------------------------------------------------!
954	! FFT along x, solution of the tridiagonal system and backward FFT for
955	! a 1d-decomposition along x
956	!
957	! WARNING: this subroutine may still not work for hybrid parallelization
958	! with OpenMP (for possible necessary changes see the original
959	! routine poisfft_hybrid, developed by Klaus Ketelsen, May 2002)
960	!------------------------------------------------------------------------------!
961
962	USE control_parameters
963	USE cpulog
964	USE grid_variables
965	USE indices
966	USE pegrid
967	USE transpose_indices
968
969	IMPLICIT NONE
970
971	INTEGER :: i, j, k, m, n, omp_get_thread_num, tn
972
973	REAL, DIMENSION(0:nx) :: work_fftx
974	REAL, DIMENSION(0:nx,1:nz) :: work_trix
975	REAL, DIMENSION(nnx,1:nz,nys_x:nyn_x,pdims(1)) :: ar
976	REAL, DIMENSION(:,:,:,:), ALLOCATABLE :: tri
977
978
979	CALL cpu_log( log_point_s(33), 'fft_x_1d + tridia', 'start' )
980
981	ALLOCATE( tri(5,0:nx,0:nz-1,0:threads_per_task-1) )
982
983	tn = 0 ! Default thread number in case of one thread
984	!$OMP PARALLEL DO PRIVATE ( i, j, k, m, n, tn, work_fftx, work_trix )
985	DO j = nys_x, nyn_x
986
987	!$ tn = omp_get_thread_num()
988
989	IF ( host(1:3) == 'nec' ) THEN
990	!
991	!-- Code optimized for vector processors
992	DO k = 1, nz
993
994	m = 0
995	DO n = 1, pdims(1)
996	DO i = 1, nnx
997	work_trix(m,k) = ar(i,k,j,n)
998	m = m + 1
999	ENDDO
1000	ENDDO
1001
1002	ENDDO
1003
1004	CALL fft_x_m( work_trix, 'forward' )
1005
1006	ELSE
1007	!
1008	!-- Cache optimized code
1009	DO k = 1, nz
1010
1011	m = 0
1012	DO n = 1, pdims(1)
1013	DO i = 1, nnx
1014	work_fftx(m) = ar(i,k,j,n)
1015	m = m + 1
1016	ENDDO
1017	ENDDO
1018
1019	CALL fft_x_1d( work_fftx, 'forward' )
1020
1021	DO i = 0, nx
1022	work_trix(i,k) = work_fftx(i)
1023	ENDDO
1024
1025	ENDDO
1026
1027	ENDIF
1028
1029	!
1030	!-- Solve the linear equation system
1031	CALL tridia_1dd( ddx2, ddy2, nx, ny, j, work_trix, tri(:,:,:,tn) )
1032
1033	IF ( host(1:3) == 'nec' ) THEN
1034	!
1035	!-- Code optimized for vector processors
1036	CALL fft_x_m( work_trix, 'backward' )
1037
1038	DO k = 1, nz
1039
1040	m = 0
1041	DO n = 1, pdims(1)
1042	DO i = 1, nnx
1043	ar(i,k,j,n) = work_trix(m,k)
1044	m = m + 1
1045	ENDDO
1046	ENDDO
1047
1048	ENDDO
1049
1050	ELSE
1051	!
1052	!-- Cache optimized code
1053	DO k = 1, nz
1054
1055	DO i = 0, nx
1056	work_fftx(i) = work_trix(i,k)
1057	ENDDO
1058
1059	CALL fft_x_1d( work_fftx, 'backward' )
1060
1061	m = 0
1062	DO n = 1, pdims(1)
1063	DO i = 1, nnx
1064	ar(i,k,j,n) = work_fftx(m)
1065	m = m + 1
1066	ENDDO
1067	ENDDO
1068
1069	ENDDO
1070
1071	ENDIF
1072
1073	ENDDO
1074
1075	DEALLOCATE( tri )
1076
1077	CALL cpu_log( log_point_s(33), 'fft_x_1d + tridia', 'stop' )
1078
1079	END SUBROUTINE fftx_tri_fftx
1080
1081
1082	SUBROUTINE fftx_tr_xy( f_in, f_out )
1083
1084	!------------------------------------------------------------------------------!
1085	! Fourier-transformation along x with subsequent transposition x --> y for
1086	! a 1d-decomposition along y
1087	!
1088	! ATTENTION: The NEC-branch of this routine may significantly profit from
1089	! further optimizations. So far, performance is much worse than
1090	! for routine ffty_tr_yx (more than three times slower).
1091	!------------------------------------------------------------------------------!
1092
1093	USE control_parameters
1094	USE cpulog
1095	USE indices
1096	USE pegrid
1097	USE transpose_indices
1098
1099	IMPLICIT NONE
1100
1101	INTEGER :: i, j, k
1102
1103	REAL, DIMENSION(0:nx,1:nz,nys:nyn) :: work_fftx
1104	REAL, DIMENSION(1:nz,nys:nyn,0:nx) :: f_in
1105	REAL, DIMENSION(nny,1:nz,nxl_y:nxr_y,pdims(2)) :: f_out
1106	REAL, DIMENSION(nys:nyn,1:nz,0:nx) :: work
1107
1108	!
1109	!-- Carry out the FFT along x, where all data are present due to the
1110	!-- 1d-decomposition along y. Resort the data in a way that y becomes
1111	!-- the first index.
1112	CALL cpu_log( log_point_s(4), 'fft_x_1d', 'start' )
1113
1114	IF ( host(1:3) == 'nec' ) THEN
1115	!
1116	!-- Code for vector processors
1117	!$OMP PARALLEL PRIVATE ( i, j, k )
1118	!$OMP DO
1119	DO i = 0, nx
1120
1121	DO j = nys, nyn
1122	DO k = 1, nz
1123	work_fftx(i,k,j) = f_in(k,j,i)
1124	ENDDO
1125	ENDDO
1126
1127	ENDDO
1128
1129	!$OMP DO
1130	DO j = nys, nyn
1131
1132	CALL fft_x_m( work_fftx(:,:,j), 'forward' )
1133
1134	DO k = 1, nz
1135	DO i = 0, nx
1136	work(j,k,i) = work_fftx(i,k,j)
1137	ENDDO
1138	ENDDO
1139
1140	ENDDO
1141	!$OMP END PARALLEL
1142
1143	ELSE
1144
1145	!
1146	!-- Cache optimized code (there might be still a potential for better
1147	!-- optimization).
1148	!$OMP PARALLEL PRIVATE (i,j,k)
1149	!$OMP DO
1150	DO i = 0, nx
1151
1152	DO j = nys, nyn
1153	DO k = 1, nz
1154	work_fftx(i,k,j) = f_in(k,j,i)
1155	ENDDO
1156	ENDDO
1157
1158	ENDDO
1159
1160	!$OMP DO
1161	DO j = nys, nyn
1162	DO k = 1, nz
1163
1164	CALL fft_x_1d( work_fftx(0:nx,k,j), 'forward' )
1165
1166	DO i = 0, nx
1167	work(j,k,i) = work_fftx(i,k,j)
1168	ENDDO
1169	ENDDO
1170
1171	ENDDO
1172	!$OMP END PARALLEL
1173
1174	ENDIF
1175	CALL cpu_log( log_point_s(4), 'fft_x_1d', 'pause' )
1176
1177	!
1178	!-- Transpose array
1179	#if defined( __parallel )
1180	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
1181	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
1182	CALL MPI_ALLTOALL( work(nys,1,0), sendrecvcount_xy, MPI_REAL, &
1183	f_out(1,1,nxl_y,1), sendrecvcount_xy, MPI_REAL, &
1184	comm1dy, ierr )
1185	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
1186	#endif
1187
1188	END SUBROUTINE fftx_tr_xy
1189
1190
1191	SUBROUTINE tr_yx_fftx( f_in, f_out )
1192
1193	!------------------------------------------------------------------------------!
1194	! Transposition y --> x with a subsequent backward Fourier transformation for
1195	! a 1d-decomposition along x
1196	!------------------------------------------------------------------------------!
1197
1198	USE control_parameters
1199	USE cpulog
1200	USE indices
1201	USE pegrid
1202	USE transpose_indices
1203
1204	IMPLICIT NONE
1205
1206	INTEGER :: i, j, k
1207
1208	REAL, DIMENSION(0:nx,1:nz,nys:nyn) :: work_fftx
1209	REAL, DIMENSION(nny,1:nz,nxl_y:nxr_y,pdims(2)) :: f_in
1210	REAL, DIMENSION(1:nz,nys:nyn,0:nx) :: f_out
1211	REAL, DIMENSION(nys:nyn,1:nz,0:nx) :: work
1212
1213	!
1214	!-- Transpose array
1215	#if defined( __parallel )
1216	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
1217	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
1218	CALL MPI_ALLTOALL( f_in(1,1,nxl_y,1), sendrecvcount_xy, MPI_REAL, &
1219	work(nys,1,0), sendrecvcount_xy, MPI_REAL, &
1220	comm1dy, ierr )
1221	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
1222	#endif
1223
1224	!
1225	!-- Carry out the FFT along x, where all data are present due to the
1226	!-- 1d-decomposition along y. Resort the data in a way that y becomes
1227	!-- the first index.
1228	CALL cpu_log( log_point_s(4), 'fft_x_1d', 'continue' )
1229
1230	IF ( host(1:3) == 'nec' ) THEN
1231	!
1232	!-- Code optimized for vector processors
1233	!$OMP PARALLEL PRIVATE ( i, j, k )
1234	!$OMP DO
1235	DO j = nys, nyn
1236
1237	DO k = 1, nz
1238	DO i = 0, nx
1239	work_fftx(i,k,j) = work(j,k,i)
1240	ENDDO
1241	ENDDO
1242
1243	CALL fft_x_m( work_fftx(:,:,j), 'backward' )
1244
1245	ENDDO
1246
1247	!$OMP DO
1248	DO i = 0, nx
1249	DO j = nys, nyn
1250	DO k = 1, nz
1251	f_out(k,j,i) = work_fftx(i,k,j)
1252	ENDDO
1253	ENDDO
1254	ENDDO
1255	!$OMP END PARALLEL
1256
1257	ELSE
1258
1259	!
1260	!-- Cache optimized code (there might be still a potential for better
1261	!-- optimization).
1262	!$OMP PARALLEL PRIVATE (i,j,k)
1263	!$OMP DO
1264	DO j = nys, nyn
1265	DO k = 1, nz
1266
1267	DO i = 0, nx
1268	work_fftx(i,k,j) = work(j,k,i)
1269	ENDDO
1270
1271	CALL fft_x_1d( work_fftx(0:nx,k,j), 'backward' )
1272
1273	ENDDO
1274	ENDDO
1275
1276	!$OMP DO
1277	DO i = 0, nx
1278	DO j = nys, nyn
1279	DO k = 1, nz
1280	f_out(k,j,i) = work_fftx(i,k,j)
1281	ENDDO
1282	ENDDO
1283	ENDDO
1284	!$OMP END PARALLEL
1285
1286	ENDIF
1287	CALL cpu_log( log_point_s(4), 'fft_x_1d', 'stop' )
1288
1289	END SUBROUTINE tr_yx_fftx
1290
1291
1292	SUBROUTINE ffty_tri_ffty( ar )
1293
1294	!------------------------------------------------------------------------------!
1295	! FFT along y, solution of the tridiagonal system and backward FFT for
1296	! a 1d-decomposition along y
1297	!
1298	! WARNING: this subroutine may still not work for hybrid parallelization
1299	! with OpenMP (for possible necessary changes see the original
1300	! routine poisfft_hybrid, developed by Klaus Ketelsen, May 2002)
1301	!------------------------------------------------------------------------------!
1302
1303	USE control_parameters
1304	USE cpulog
1305	USE grid_variables
1306	USE indices
1307	USE pegrid
1308	USE transpose_indices
1309
1310	IMPLICIT NONE
1311
1312	INTEGER :: i, j, k, m, n, omp_get_thread_num, tn
1313
1314	REAL, DIMENSION(0:ny) :: work_ffty
1315	REAL, DIMENSION(0:ny,1:nz) :: work_triy
1316	REAL, DIMENSION(nny,1:nz,nxl_y:nxr_y,pdims(2)) :: ar
1317	REAL, DIMENSION(:,:,:,:), ALLOCATABLE :: tri
1318
1319
1320	CALL cpu_log( log_point_s(39), 'fft_y_1d + tridia', 'start' )
1321
1322	ALLOCATE( tri(5,0:ny,0:nz-1,0:threads_per_task-1) )
1323
1324	tn = 0 ! Default thread number in case of one thread
1325	!$OMP PARALLEL DO PRIVATE ( i, j, k, m, n, tn, work_ffty, work_triy )
1326	DO i = nxl_y, nxr_y
1327
1328	!$ tn = omp_get_thread_num()
1329
1330	IF ( host(1:3) == 'nec' ) THEN
1331	!
1332	!-- Code optimized for vector processors
1333	DO k = 1, nz
1334
1335	m = 0
1336	DO n = 1, pdims(2)
1337	DO j = 1, nny
1338	work_triy(m,k) = ar(j,k,i,n)
1339	m = m + 1
1340	ENDDO
1341	ENDDO
1342
1343	ENDDO
1344
1345	CALL fft_y_m( work_triy, ny, 'forward' )
1346
1347	ELSE
1348	!
1349	!-- Cache optimized code
1350	DO k = 1, nz
1351
1352	m = 0
1353	DO n = 1, pdims(2)
1354	DO j = 1, nny
1355	work_ffty(m) = ar(j,k,i,n)
1356	m = m + 1
1357	ENDDO
1358	ENDDO
1359
1360	CALL fft_y_1d( work_ffty, 'forward' )
1361
1362	DO j = 0, ny
1363	work_triy(j,k) = work_ffty(j)
1364	ENDDO
1365
1366	ENDDO
1367
1368	ENDIF
1369
1370	!
1371	!-- Solve the linear equation system
1372	CALL tridia_1dd( ddy2, ddx2, ny, nx, i, work_triy, tri(:,:,:,tn) )
1373
1374	IF ( host(1:3) == 'nec' ) THEN
1375	!
1376	!-- Code optimized for vector processors
1377	CALL fft_y_m( work_triy, ny, 'backward' )
1378
1379	DO k = 1, nz
1380
1381	m = 0
1382	DO n = 1, pdims(2)
1383	DO j = 1, nny
1384	ar(j,k,i,n) = work_triy(m,k)
1385	m = m + 1
1386	ENDDO
1387	ENDDO
1388
1389	ENDDO
1390
1391	ELSE
1392	!
1393	!-- Cache optimized code
1394	DO k = 1, nz
1395
1396	DO j = 0, ny
1397	work_ffty(j) = work_triy(j,k)
1398	ENDDO
1399
1400	CALL fft_y_1d( work_ffty, 'backward' )
1401
1402	m = 0
1403	DO n = 1, pdims(2)
1404	DO j = 1, nny
1405	ar(j,k,i,n) = work_ffty(m)
1406	m = m + 1
1407	ENDDO
1408	ENDDO
1409
1410	ENDDO
1411
1412	ENDIF
1413
1414	ENDDO
1415
1416	DEALLOCATE( tri )
1417
1418	CALL cpu_log( log_point_s(39), 'fft_y_1d + tridia', 'stop' )
1419
1420	END SUBROUTINE ffty_tri_ffty
1421
1422	#endif
1423
1424	END MODULE poisfft_mod

Note: See TracBrowser for help on using the repository browser.

Download in other formats:

| Impressum | ©Leibniz Universität Hannover |