Home

Context Navigation

source: palm/trunk/SOURCE/poisfft.f90 @ 1357

Last change on this file since 1357 was 1321, checked in by raasch, 11 years ago
last commit documented
Property svn:keywords set to `Id`
File size: 44.8 KB

Line
1	MODULE poisfft_mod
2
3	!--------------------------------------------------------------------------------!
4	! This file is part of PALM.
5	!
6	! PALM is free software: you can redistribute it and/or modify it under the terms
7	! of the GNU General Public License as published by the Free Software Foundation,
8	! either version 3 of the License, or (at your option) any later version.
9	!
10	! PALM is distributed in the hope that it will be useful, but WITHOUT ANY
11	! WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
12	! A PARTICULAR PURPOSE. See the GNU General Public License for more details.
13	!
14	! You should have received a copy of the GNU General Public License along with
15	! PALM. If not, see <http://www.gnu.org/licenses/>.
16	!
17	! Copyright 1997-2014 Leibniz Universitaet Hannover
18	!--------------------------------------------------------------------------------!
19	!
20	! Current revisions:
21	! -----------------
22	!
23	!
24	! Former revisions:
25	! -----------------
26	! $Id: poisfft.f90 1321 2014-03-20 09:40:40Z witha $
27	!
28	! 1320 2014-03-20 08:40:49Z raasch
29	! ONLY-attribute added to USE-statements,
30	! kind-parameters added to all INTEGER and REAL declaration statements,
31	! kinds are defined in new module kinds,
32	! old module precision_kind is removed,
33	! revision history before 2012 removed,
34	! comment fields (!:) to be used for variable explanations added to
35	! all variable declaration statements
36	!
37	! 1318 2014-03-17 13:35:16Z raasch
38	! module interfaces removed
39	!
40	! 1306 2014-03-13 14:30:59Z raasch
41	! openmp sections removed from the overlap branch,
42	! second argument removed from parameter list
43	!
44	! 1216 2013-08-26 09:31:42Z raasch
45	! resorting of arrays moved to separate routines resort_for_...,
46	! one argument, used as temporary work array, removed from all transpose
47	! routines
48	! overlapping fft / transposition implemented
49	!
50	! 1212 2013-08-15 08:46:27Z raasch
51	! tridia routines moved to seperate module tridia_solver
52	!
53	! 1208 2013-08-13 06:41:49Z raasch
54	! acc-update clauses added for "ar" so that ffts other than cufft can also be
55	! used (although they are not ported and will give a poor performance)
56	!
57	! 1111 2013-03-08 23:54:10Z raasch
58	! further openACC porting of non-parallel (MPI) branch:
59	! tridiagonal routines split into extermal subroutines (instead using CONTAINS),
60	! no distinction between parallel/non-parallel in poisfft and tridia any more,
61	! tridia routines moved to end of file because of probable bug in PGI compiler 12.5
62	! (otherwise "invalid device function" is indicated during runtime),
63	! optimization of tridia routines: constant elements and coefficients of tri are
64	! stored in seperate arrays ddzuw and tric, last dimension of tri reduced from 5
65	! to 2,
66	! poisfft_init is now called internally from poisfft, maketri is called from
67	! poisfft_init,
68	! ibc_p_b = 2 removed
69	!
70	! 1106 2013-03-04 05:31:38Z raasch
71	! routines fftx, ffty, fftxp, fftyp removed, calls replaced by fft_x, fft_y,
72	! in the 1D-decomposition routines fft_x, ffty are replaced by fft_x_1d,
73	! fft_y_1d
74	!
75	! 1103 2013-02-20 02:15:53Z raasch
76	! tri, ar, and ar1 arguments in tridia-routines (2d) are removed because they
77	! sometimes cause segmentation faults with intel 12.1 compiler
78	!
79	! 1092 2013-02-02 11:24:22Z raasch
80	! unused variables removed
81	!
82	! 1036 2012-10-22 13:43:42Z raasch
83	! code put under GPL (PALM 3.9)
84	!
85	! 2012-09-21 07:03:55Z raasch
86	! FLOAT type conversion replaced by REAL
87	!
88	! 1003 2012-09-14 14:35:53Z raasch
89	! indices nxa, nya, etc. replaced by nx, ny, etc.
90	!
91	! 940 2012-07-09 14:31:00Z raasch
92	! special handling of tri-array as an argument in tridia_1dd routines switched
93	! off because it caused segmentation faults with intel 12.1 compiler
94	!
95	! 877 2012-04-03 11:21:44Z suehring
96	! Bugfix: Avoid divisions by zero in case of using a 'neumann' bc for the
97	! pressure at the top of the model domain.
98	!
99	! 809 2012-01-30 13:32:58Z maronga
100	! Bugfix: replaced .AND. and .NOT. with && and ! in the preprocessor directives
101	!
102	! 807 2012-01-25 11:53:51Z maronga
103	! New cpp directive "__check" implemented which is used by check_namelist_files
104	! (most of the code is unneeded by check_namelist_files).
105	!
106	! Revision 1.1 1997/07/24 11:24:14 raasch
107	! Initial revision
108	!
109	!
110	! Description:
111	! ------------
112	! Original version by Stephan Siano (pois3d), as of July 23, 1996
113	! Adapted for 2D-domain-decomposition by Siegfried Raasch, July 3, 1997
114	!
115	! Solves the Poisson equation with a 2D spectral method
116	! d^2 p / dx^2 + d^2 p / dy^2 + d^2 p / dz^2 = s
117	!
118	! Input:
119	! real ar contains (nnz,nny,nnx) elements of the velocity divergence,
120	! starting from (1,nys,nxl)
121	!
122	! Output:
123	! real ar contains the solution for perturbation pressure p
124	!------------------------------------------------------------------------------!
125
126	USE fft_xy, &
127	ONLY: fft_init, fft_y, fft_y_1d, fft_y_m, fft_x, fft_x_1d, fft_x_m
128
129	USE indices, &
130	ONLY: nnx, nny, nx, nxl, nxr, ny, nys, nyn, nz
131
132	USE transpose_indices, &
133	ONLY: nxl_y, nxl_z, nxr_y, nxr_z, nys_x, nys_z, nyn_x, nyn_z, nzb_x, &
134	nzb_y, nzt_x, nzt_y
135
136	USE tridia_solver, &
137	ONLY: tridia_1dd, tridia_init, tridia_substi, tridia_substi_overlap
138
139	IMPLICIT NONE
140
141	LOGICAL, SAVE :: poisfft_initialized = .FALSE.
142
143	PRIVATE
144
145	#if ! defined ( __check )
146	PUBLIC poisfft, poisfft_init
147
148	INTERFACE poisfft
149	MODULE PROCEDURE poisfft
150	END INTERFACE poisfft
151
152	INTERFACE poisfft_init
153	MODULE PROCEDURE poisfft_init
154	END INTERFACE poisfft_init
155	#else
156	PUBLIC poisfft_init
157
158	INTERFACE poisfft_init
159	MODULE PROCEDURE poisfft_init
160	END INTERFACE poisfft_init
161	#endif
162
163	CONTAINS
164
165	SUBROUTINE poisfft_init
166
167	USE arrays_3d, &
168	ONLY: ddzu_pres, ddzw
169
170	USE kinds
171
172	IMPLICIT NONE
173
174	INTEGER(iwp) :: k !:
175
176
177	CALL fft_init
178
179	CALL tridia_init
180
181	poisfft_initialized = .TRUE.
182
183	END SUBROUTINE poisfft_init
184
185
186	#if ! defined ( __check )
187	SUBROUTINE poisfft( ar )
188
189	USE control_parameters, &
190	ONLY: fft_method, transpose_compute_overlap
191
192	USE cpulog, &
193	ONLY: cpu_log, cpu_log_nowait, log_point_s
194
195	USE kinds
196
197	USE pegrid
198
199	IMPLICIT NONE
200
201	INTEGER(iwp) :: ii !:
202	INTEGER(iwp) :: iind !:
203	INTEGER(iwp) :: inew !:
204	INTEGER(iwp) :: jj !:
205	INTEGER(iwp) :: jind !:
206	INTEGER(iwp) :: jnew !:
207	INTEGER(iwp) :: ki !:
208	INTEGER(iwp) :: kk !:
209	INTEGER(iwp) :: knew !:
210	INTEGER(iwp) :: n !:
211	INTEGER(iwp) :: nblk !:
212	INTEGER(iwp) :: nnx_y !:
213	INTEGER(iwp) :: nny_z !:
214	INTEGER(iwp) :: nnz_t !:
215	INTEGER(iwp) :: nnz_x !:
216	INTEGER(iwp) :: nxl_y_bound !:
217	INTEGER(iwp) :: nxr_y_bound !:
218
219	INTEGER(iwp), DIMENSION(4) :: isave !:
220
221	REAL(wp), DIMENSION(1:nz,nys:nyn,nxl:nxr) :: ar !:
222	!$acc declare create( ar_inv )
223	REAL(wp), DIMENSION(nys:nyn,nxl:nxr,1:nz) :: ar_inv !:
224
225	REAL(wp), DIMENSION(:,:,:), ALLOCATABLE :: ar1 !:
226	REAL(wp), DIMENSION(:,:,:), ALLOCATABLE :: f_in !:
227	REAL(wp), DIMENSION(:,:,:), ALLOCATABLE :: f_inv !:
228	REAL(wp), DIMENSION(:,:,:), ALLOCATABLE :: f_out_y !:
229	REAL(wp), DIMENSION(:,:,:), ALLOCATABLE :: f_out_z !:
230
231
232	CALL cpu_log( log_point_s(3), 'poisfft', 'start' )
233
234	IF ( .NOT. poisfft_initialized ) CALL poisfft_init
235
236	!
237	!-- Two-dimensional Fourier Transformation in x- and y-direction.
238	IF ( pdims(2) == 1 .AND. pdims(1) > 1 ) THEN
239
240	!
241	!-- 1d-domain-decomposition along x:
242	!-- FFT along y and transposition y --> x
243	CALL ffty_tr_yx( ar, ar )
244
245	!
246	!-- FFT along x, solving the tridiagonal system and backward FFT
247	CALL fftx_tri_fftx( ar )
248
249	!
250	!-- Transposition x --> y and backward FFT along y
251	CALL tr_xy_ffty( ar, ar )
252
253	ELSEIF ( pdims(1) == 1 .AND. pdims(2) > 1 ) THEN
254
255	!
256	!-- 1d-domain-decomposition along y:
257	!-- FFT along x and transposition x --> y
258	CALL fftx_tr_xy( ar, ar )
259
260	!
261	!-- FFT along y, solving the tridiagonal system and backward FFT
262	CALL ffty_tri_ffty( ar )
263
264	!
265	!-- Transposition y --> x and backward FFT along x
266	CALL tr_yx_fftx( ar, ar )
267
268	ELSEIF ( .NOT. transpose_compute_overlap ) THEN
269
270	!
271	!-- 2d-domain-decomposition or no decomposition (1 PE run)
272	!-- Transposition z --> x
273	CALL cpu_log( log_point_s(5), 'transpo forward', 'start' )
274	CALL resort_for_zx( ar, ar_inv )
275	CALL transpose_zx( ar_inv, ar )
276	CALL cpu_log( log_point_s(5), 'transpo forward', 'pause' )
277
278	CALL cpu_log( log_point_s(4), 'fft_x', 'start' )
279	IF ( fft_method /= 'system-specific' ) THEN
280	!$acc update host( ar )
281	ENDIF
282	CALL fft_x( ar, 'forward' )
283	IF ( fft_method /= 'system-specific' ) THEN
284	!$acc update device( ar )
285	ENDIF
286	CALL cpu_log( log_point_s(4), 'fft_x', 'pause' )
287
288	!
289	!-- Transposition x --> y
290	CALL cpu_log( log_point_s(5), 'transpo forward', 'continue' )
291	CALL resort_for_xy( ar, ar_inv )
292	CALL transpose_xy( ar_inv, ar )
293	CALL cpu_log( log_point_s(5), 'transpo forward', 'pause' )
294
295	CALL cpu_log( log_point_s(7), 'fft_y', 'start' )
296	IF ( fft_method /= 'system-specific' ) THEN
297	!$acc update host( ar )
298	ENDIF
299	CALL fft_y( ar, 'forward', ar_tr = ar, &
300	nxl_y_bound = nxl_y, nxr_y_bound = nxr_y, &
301	nxl_y_l = nxl_y, nxr_y_l = nxr_y )
302	IF ( fft_method /= 'system-specific' ) THEN
303	!$acc update device( ar )
304	ENDIF
305	CALL cpu_log( log_point_s(7), 'fft_y', 'pause' )
306
307	!
308	!-- Transposition y --> z
309	CALL cpu_log( log_point_s(5), 'transpo forward', 'continue' )
310	CALL resort_for_yz( ar, ar_inv )
311	CALL transpose_yz( ar_inv, ar )
312	CALL cpu_log( log_point_s(5), 'transpo forward', 'stop' )
313
314	!
315	!-- Solve the tridiagonal equation system along z
316	CALL cpu_log( log_point_s(6), 'tridia', 'start' )
317	CALL tridia_substi( ar )
318	CALL cpu_log( log_point_s(6), 'tridia', 'stop' )
319
320	!
321	!-- Inverse Fourier Transformation
322	!-- Transposition z --> y
323	CALL cpu_log( log_point_s(8), 'transpo invers', 'start' )
324	CALL transpose_zy( ar, ar_inv )
325	CALL resort_for_zy( ar_inv, ar )
326	CALL cpu_log( log_point_s(8), 'transpo invers', 'pause' )
327
328	CALL cpu_log( log_point_s(7), 'fft_y', 'continue' )
329	IF ( fft_method /= 'system-specific' ) THEN
330	!$acc update host( ar )
331	ENDIF
332	CALL fft_y( ar, 'backward', ar_tr = ar, &
333	nxl_y_bound = nxl_y, nxr_y_bound = nxr_y, &
334	nxl_y_l = nxl_y, nxr_y_l = nxr_y )
335	IF ( fft_method /= 'system-specific' ) THEN
336	!$acc update device( ar )
337	ENDIF
338	CALL cpu_log( log_point_s(7), 'fft_y', 'stop' )
339
340	!
341	!-- Transposition y --> x
342	CALL cpu_log( log_point_s(8), 'transpo invers', 'continue' )
343	CALL transpose_yx( ar, ar_inv )
344	CALL resort_for_yx( ar_inv, ar )
345	CALL cpu_log( log_point_s(8), 'transpo invers', 'pause' )
346
347	CALL cpu_log( log_point_s(4), 'fft_x', 'continue' )
348	IF ( fft_method /= 'system-specific' ) THEN
349	!$acc update host( ar )
350	ENDIF
351	CALL fft_x( ar, 'backward' )
352	IF ( fft_method /= 'system-specific' ) THEN
353	!$acc update device( ar )
354	ENDIF
355	CALL cpu_log( log_point_s(4), 'fft_x', 'stop' )
356
357	!
358	!-- Transposition x --> z
359	CALL cpu_log( log_point_s(8), 'transpo invers', 'continue' )
360	CALL transpose_xz( ar, ar_inv )
361	CALL resort_for_xz( ar_inv, ar )
362	CALL cpu_log( log_point_s(8), 'transpo invers', 'stop' )
363
364	ELSE
365
366	!
367	!-- 2d-domain-decomposition or no decomposition (1 PE run) with
368	!-- overlapping transposition / fft
369	!-- cputime logging must not use barriers, which would prevent overlapping
370	ALLOCATE( f_out_y(0:ny,nxl_y:nxr_y,nzb_y:nzt_y), &
371	f_out_z(0:nx,nys_x:nyn_x,nzb_x:nzt_x) )
372	!
373	!-- Transposition z --> x + subsequent fft along x
374	ALLOCATE( f_inv(nys:nyn,nxl:nxr,1:nz) )
375	CALL resort_for_zx( ar, f_inv )
376	!
377	!-- Save original indices and gridpoint counter
378	isave(1) = nz
379	isave(2) = nzb_x
380	isave(3) = nzt_x
381	isave(4) = sendrecvcount_zx
382	!
383	!-- Set new indices for transformation
384	nblk = nz / pdims(1)
385	nz = pdims(1)
386	nnz_x = 1
387	nzb_x = 1 + myidx * nnz_x
388	nzt_x = ( myidx + 1 ) * nnz_x
389	sendrecvcount_zx = nnx * nny * nnz_x
390
391	ALLOCATE( ar1(0:nx,nys_x:nyn_x,nzb_x:nzt_x) )
392	ALLOCATE( f_in(nys:nyn,nxl:nxr,1:nz) )
393
394	DO kk = 1, nblk
395
396	IF ( kk == 1 ) THEN
397	CALL cpu_log( log_point_s(5), 'transpo forward', 'start', cpu_log_nowait )
398	ELSE
399	CALL cpu_log( log_point_s(5), 'transpo forward', 'continue', cpu_log_nowait )
400	ENDIF
401
402	DO knew = 1, nz
403	ki = kk + nblk * ( knew - 1 )
404	f_in(:,:,knew) = f_inv(:,:,ki)
405	ENDDO
406
407	CALL transpose_zx( f_in, ar1(:,:,:))
408	CALL cpu_log( log_point_s(5), 'transpo forward', 'pause' )
409
410	IF ( kk == 1 ) THEN
411	CALL cpu_log( log_point_s(4), 'fft_x', 'start', cpu_log_nowait )
412	ELSE
413	CALL cpu_log( log_point_s(4), 'fft_x', 'continue', cpu_log_nowait )
414	ENDIF
415
416	n = isave(2) + kk - 1
417	CALL fft_x( ar1(:,:,:), 'forward', ar_2d = f_out_z(:,:,n))
418	CALL cpu_log( log_point_s(4), 'fft_x', 'pause' )
419
420	ENDDO
421	!
422	!-- Restore original indices/counters
423	nz = isave(1)
424	nzb_x = isave(2)
425	nzt_x = isave(3)
426	sendrecvcount_zx = isave(4)
427
428	DEALLOCATE( ar1, f_in, f_inv )
429
430	!
431	!-- Transposition x --> y + subsequent fft along y
432	ALLOCATE( f_inv(nys_x:nyn_x,nzb_x:nzt_x,0:nx) )
433	CALL resort_for_xy( f_out_z, f_inv )
434	!
435	!-- Save original indices and gridpoint counter
436	isave(1) = nx
437	isave(2) = nxl_y
438	isave(3) = nxr_y
439	isave(4) = sendrecvcount_xy
440	!
441	!-- Set new indices for transformation
442	nblk = ( ( nx+1 ) / pdims(2) ) - 1
443	nx = pdims(2)
444	nnx_y = 1
445	nxl_y = myidy * nnx_y
446	nxr_y = ( myidy + 1 ) * nnx_y - 1
447	sendrecvcount_xy = nnx_y * ( nyn_x-nys_x+1 ) * ( nzt_x-nzb_x+1 )
448
449	ALLOCATE( ar1(0:ny,nxl_y:nxr_y,nzb_y:nzt_y) )
450	ALLOCATE( f_in(nys_x:nyn_x,nzb_x:nzt_x,0:nx) )
451
452	DO ii = 0, nblk
453
454	CALL cpu_log( log_point_s(5), 'transpo forward', 'continue', cpu_log_nowait )
455
456	DO inew = 0, nx-1
457	iind = ii + ( nblk + 1 ) * inew
458	f_in(:,:,inew) = f_inv(:,:,iind)
459	ENDDO
460
461	CALL transpose_xy( f_in, ar1(:,:,:) )
462
463	CALL cpu_log( log_point_s(5), 'transpo forward', 'pause' )
464
465	IF ( ii == 1 ) THEN
466	CALL cpu_log( log_point_s(7), 'fft_y', 'start', cpu_log_nowait )
467	ELSE
468	CALL cpu_log( log_point_s(7), 'fft_y', 'continue', cpu_log_nowait )
469	ENDIF
470
471	nxl_y_bound = isave(2)
472	nxr_y_bound = isave(3)
473	n = isave(2) + ii
474	CALL fft_y( ar1(:,:,:), 'forward', ar_tr = f_out_y, &
475	nxl_y_bound = nxl_y_bound, nxr_y_bound = nxr_y_bound, &
476	nxl_y_l = n, nxr_y_l = n )
477
478	CALL cpu_log( log_point_s(7), 'fft_y', 'pause' )
479
480	ENDDO
481	!
482	!-- Restore original indices/counters
483	nx = isave(1)
484	nxl_y = isave(2)
485	nxr_y = isave(3)
486	sendrecvcount_xy = isave(4)
487
488	DEALLOCATE( ar1, f_in, f_inv )
489
490	!
491	!-- Transposition y --> z + subsequent tridia + resort for z --> y
492	ALLOCATE( f_inv(nxl_y:nxr_y,nzb_y:nzt_y,0:ny) )
493	CALL resort_for_yz( f_out_y, f_inv )
494	!
495	!-- Save original indices and gridpoint counter
496	isave(1) = ny
497	isave(2) = nys_z
498	isave(3) = nyn_z
499	isave(4) = sendrecvcount_yz
500	!
501	!-- Set new indices for transformation
502	nblk = ( ( ny+1 ) / pdims(1) ) - 1
503	ny = pdims(1)
504	nny_z = 1
505	nys_z = myidx * nny_z
506	nyn_z = ( myidx + 1 ) * nny_z - 1
507	sendrecvcount_yz = ( nxr_y-nxl_y+1 ) * nny_z * ( nzt_y-nzb_y+1 )
508
509	ALLOCATE( ar1(nxl_z:nxr_z,nys_z:nyn_z,1:nz) )
510	ALLOCATE( f_in(nxl_y:nxr_y,nzb_y:nzt_y,0:ny) )
511
512	DO jj = 0, nblk
513	!
514	!-- Forward Fourier Transformation
515	!-- Transposition y --> z
516	CALL cpu_log( log_point_s(5), 'transpo forward', 'continue', cpu_log_nowait )
517
518	DO jnew = 0, ny-1
519	jind = jj + ( nblk + 1 ) * jnew
520	f_in(:,:,jnew) = f_inv(:,:,jind)
521	ENDDO
522
523	CALL transpose_yz( f_in, ar1(:,:,:) )
524
525	IF ( jj == nblk ) THEN
526	CALL cpu_log( log_point_s(5), 'transpo forward', 'stop' )
527	ELSE
528	CALL cpu_log( log_point_s(5), 'transpo forward', 'pause' )
529	ENDIF
530
531	!
532	!-- Solve the tridiagonal equation system along z
533	CALL cpu_log( log_point_s(6), 'tridia', 'start', cpu_log_nowait )
534
535	n = isave(2) + jj
536	CALL tridia_substi_overlap( ar1(:,:,:), n )
537
538	CALL cpu_log( log_point_s(6), 'tridia', 'stop' )
539
540	!
541	!-- Inverse Fourier Transformation
542	!-- Transposition z --> y
543	!-- Only one thread should call MPI routines, therefore forward and
544	!-- backward tranpose are in the same section
545	IF ( jj == 0 ) THEN
546	CALL cpu_log( log_point_s(8), 'transpo invers', 'start', cpu_log_nowait )
547	ELSE
548	CALL cpu_log( log_point_s(8), 'transpo invers', 'continue', cpu_log_nowait )
549	ENDIF
550
551	CALL transpose_zy( ar1(:,:,:), f_in )
552
553	DO jnew = 0, ny-1
554	jind = jj + ( nblk + 1 ) * jnew
555	f_inv(:,:,jind) = f_in(:,:,jnew)
556	ENDDO
557
558	CALL cpu_log( log_point_s(8), 'transpo invers', 'pause' )
559
560	ENDDO
561	!
562	!-- Restore original indices/counters
563	ny = isave(1)
564	nys_z = isave(2)
565	nyn_z = isave(3)
566	sendrecvcount_yz = isave(4)
567
568	CALL resort_for_zy( f_inv, f_out_y )
569
570	DEALLOCATE( ar1, f_in, f_inv )
571
572	!
573	!-- fft along y backward + subsequent transposition y --> x
574	ALLOCATE( f_inv(nys_x:nyn_x,nzb_x:nzt_x,0:nx) )
575	!
576	!-- Save original indices and gridpoint counter
577	isave(1) = nx
578	isave(2) = nxl_y
579	isave(3) = nxr_y
580	isave(4) = sendrecvcount_xy
581	!
582	!-- Set new indices for transformation
583	nblk = (( nx+1 ) / pdims(2) ) - 1
584	nx = pdims(2)
585	nnx_y = 1
586	nxl_y = myidy * nnx_y
587	nxr_y = ( myidy + 1 ) * nnx_y - 1
588	sendrecvcount_xy = nnx_y * ( nyn_x-nys_x+1 ) * ( nzt_x-nzb_x+1 )
589
590	ALLOCATE( ar1(0:ny,nxl_y:nxr_y,nzb_y:nzt_y) )
591	ALLOCATE( f_in(nys_x:nyn_x,nzb_x:nzt_x,0:nx) )
592
593	DO ii = 0, nblk
594
595	CALL cpu_log( log_point_s(7), 'fft_y', 'continue', cpu_log_nowait )
596
597	n = isave(2) + ii
598	nxl_y_bound = isave(2)
599	nxr_y_bound = isave(3)
600
601	CALL fft_y( ar1(:,:,:), 'backward', ar_tr = f_out_y, &
602	nxl_y_bound = nxl_y_bound, nxr_y_bound = nxr_y_bound, &
603	nxl_y_l = n, nxr_y_l = n )
604
605	IF ( ii == nblk ) THEN
606	CALL cpu_log( log_point_s(7), 'fft_y', 'stop' )
607	ELSE
608	CALL cpu_log( log_point_s(7), 'fft_y', 'pause' )
609	ENDIF
610
611	CALL cpu_log( log_point_s(8), 'transpo invers', 'continue', cpu_log_nowait )
612
613	CALL transpose_yx( ar1(:,:,:), f_in )
614
615	DO inew = 0, nx-1
616	iind = ii + (nblk+1) * inew
617	f_inv(:,:,iind) = f_in(:,:,inew)
618	ENDDO
619
620	CALL cpu_log( log_point_s(8), 'transpo invers', 'pause' )
621
622	ENDDO
623	!
624	!-- Restore original indices/counters
625	nx = isave(1)
626	nxl_y = isave(2)
627	nxr_y = isave(3)
628	sendrecvcount_xy = isave(4)
629
630	CALL resort_for_yx( f_inv, f_out_z )
631
632	DEALLOCATE( ar1, f_in, f_inv )
633
634	!
635	!-- fft along x backward + subsequent final transposition x --> z
636	ALLOCATE( f_inv(nys:nyn,nxl:nxr,1:nz) )
637	!
638	!-- Save original indices and gridpoint counter
639	isave(1) = nz
640	isave(2) = nzb_x
641	isave(3) = nzt_x
642	isave(4) = sendrecvcount_zx
643	!
644	!-- Set new indices for transformation
645	nblk = nz / pdims(1)
646	nz = pdims(1)
647	nnz_x = 1
648	nzb_x = 1 + myidx * nnz_x
649	nzt_x = ( myidx + 1 ) * nnz_x
650	sendrecvcount_zx = nnx * nny * nnz_x
651
652	ALLOCATE( ar1(0:nx,nys_x:nyn_x,nzb_x:nzt_x) )
653	ALLOCATE( f_in(nys:nyn,nxl:nxr,1:nz) )
654
655	DO kk = 1, nblk
656
657	CALL cpu_log( log_point_s(4), 'fft_x', 'continue', cpu_log_nowait )
658
659	n = isave(2) + kk - 1
660	CALL fft_x( ar1(:,:,:), 'backward', f_out_z(:,:,n))
661
662	IF ( kk == nblk ) THEN
663	CALL cpu_log( log_point_s(4), 'fft_x', 'stop' )
664	ELSE
665	CALL cpu_log( log_point_s(4), 'fft_x', 'pause' )
666	ENDIF
667
668	CALL cpu_log( log_point_s(8), 'transpo invers', 'continue', cpu_log_nowait )
669
670	CALL transpose_xz( ar1(:,:,:), f_in )
671
672	DO knew = 1, nz
673	ki = kk + nblk * (knew-1)
674	f_inv(:,:,ki) = f_in(:,:,knew)
675	ENDDO
676
677	IF ( kk == nblk ) THEN
678	CALL cpu_log( log_point_s(8), 'transpo invers', 'stop' )
679	ELSE
680	CALL cpu_log( log_point_s(8), 'transpo invers', 'pause' )
681	ENDIF
682
683	ENDDO
684	!
685	!-- Restore original indices/counters
686	nz = isave(1)
687	nzb_x = isave(2)
688	nzt_x = isave(3)
689	sendrecvcount_zx = isave(4)
690
691	CALL resort_for_xz( f_inv, ar )
692
693	DEALLOCATE( ar1, f_in, f_inv )
694
695	ENDIF
696
697	CALL cpu_log( log_point_s(3), 'poisfft', 'stop' )
698
699	END SUBROUTINE poisfft
700
701
702
703	SUBROUTINE ffty_tr_yx( f_in, f_out )
704
705	!------------------------------------------------------------------------------!
706	! Fourier-transformation along y with subsequent transposition y --> x for
707	! a 1d-decomposition along x
708	!
709	! ATTENTION: The performance of this routine is much faster on the NEC-SX6,
710	! if the first index of work_ffty_vec is odd. Otherwise
711	! memory bank conflicts may occur (especially if the index is a
712	! multiple of 128). That's why work_ffty_vec is dimensioned as
713	! 0:ny+1.
714	! Of course, this will not work if users are using an odd number
715	! of gridpoints along y.
716	!------------------------------------------------------------------------------!
717
718	USE control_parameters, &
719	ONLY: host
720
721	USE cpulog, &
722	ONLY: cpu_log, log_point_s
723
724	USE kinds
725
726	USE pegrid
727
728	IMPLICIT NONE
729
730	INTEGER(iwp) :: i !:
731	INTEGER(iwp) :: iend !:
732	INTEGER(iwp) :: iouter !:
733	INTEGER(iwp) :: ir !:
734	INTEGER(iwp) :: j !:
735	INTEGER(iwp) :: k !:
736
737	INTEGER(iwp), PARAMETER :: stridex = 4 !:
738
739	REAL(wp), DIMENSION(0:ny,stridex) :: work_ffty !:
740	#if defined( __nec )
741	REAL(wp), DIMENSION(0:ny+1,1:nz,nxl:nxr) :: work_ffty_vec !:
742	#endif
743	REAL(wp), DIMENSION(1:nz,0:ny,nxl:nxr) :: f_in !:
744	REAL(wp), DIMENSION(nnx,1:nz,nys_x:nyn_x,pdims(1)) :: f_out !:
745	REAL(wp), DIMENSION(nxl:nxr,1:nz,0:ny) :: work !:
746
747	!
748	!-- Carry out the FFT along y, where all data are present due to the
749	!-- 1d-decomposition along x. Resort the data in a way that x becomes
750	!-- the first index.
751	CALL cpu_log( log_point_s(7), 'fft_y_1d', 'start' )
752
753	IF ( host(1:3) == 'nec' ) THEN
754	#if defined( __nec )
755	!
756	!-- Code optimized for vector processors
757	!$OMP PARALLEL PRIVATE ( i, j, k )
758	!$OMP DO
759	DO i = nxl, nxr
760
761	DO j = 0, ny
762	DO k = 1, nz
763	work_ffty_vec(j,k,i) = f_in(k,j,i)
764	ENDDO
765	ENDDO
766
767	CALL fft_y_m( work_ffty_vec(:,:,i), ny+1, 'forward' )
768
769	ENDDO
770
771	!$OMP DO
772	DO k = 1, nz
773	DO j = 0, ny
774	DO i = nxl, nxr
775	work(i,k,j) = work_ffty_vec(j,k,i)
776	ENDDO
777	ENDDO
778	ENDDO
779	!$OMP END PARALLEL
780	#endif
781
782	ELSE
783
784	!
785	!-- Cache optimized code.
786	!-- The i-(x-)direction is split into a strided outer loop and an inner
787	!-- loop for better cache performance
788	!$OMP PARALLEL PRIVATE (i,iend,iouter,ir,j,k,work_ffty)
789	!$OMP DO
790	DO iouter = nxl, nxr, stridex
791
792	iend = MIN( iouter+stridex-1, nxr ) ! Upper bound for inner i loop
793
794	DO k = 1, nz
795
796	DO i = iouter, iend
797
798	ir = i-iouter+1 ! counter within a stride
799	DO j = 0, ny
800	work_ffty(j,ir) = f_in(k,j,i)
801	ENDDO
802	!
803	!-- FFT along y
804	CALL fft_y_1d( work_ffty(:,ir), 'forward' )
805
806	ENDDO
807
808	!
809	!-- Resort
810	DO j = 0, ny
811	DO i = iouter, iend
812	work(i,k,j) = work_ffty(j,i-iouter+1)
813	ENDDO
814	ENDDO
815
816	ENDDO
817
818	ENDDO
819	!$OMP END PARALLEL
820
821	ENDIF
822	CALL cpu_log( log_point_s(7), 'fft_y_1d', 'pause' )
823
824	!
825	!-- Transpose array
826	#if defined( __parallel )
827	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
828	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
829	CALL MPI_ALLTOALL( work(nxl,1,0), sendrecvcount_xy, MPI_REAL, &
830	f_out(1,1,nys_x,1), sendrecvcount_xy, MPI_REAL, &
831	comm1dx, ierr )
832	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
833	#endif
834
835	END SUBROUTINE ffty_tr_yx
836
837
838	SUBROUTINE tr_xy_ffty( f_in, f_out )
839
840	!------------------------------------------------------------------------------!
841	! Transposition x --> y with a subsequent backward Fourier transformation for
842	! a 1d-decomposition along x
843	!------------------------------------------------------------------------------!
844
845	USE control_parameters, &
846	ONLY: host
847
848	USE cpulog, &
849	ONLY: cpu_log, log_point_s
850
851	USE kinds
852
853	USE pegrid
854
855	IMPLICIT NONE
856
857	INTEGER(iwp) :: i !:
858	INTEGER(iwp) :: iend !:
859	INTEGER(iwp) :: iouter !:
860	INTEGER(iwp) :: ir !:
861	INTEGER(iwp) :: j !:
862	INTEGER(iwp) :: k !:
863
864	INTEGER(iwp), PARAMETER :: stridex = 4 !:
865
866	REAL(wp), DIMENSION(0:ny,stridex) :: work_ffty !:
867	#if defined( __nec )
868	REAL(wp), DIMENSION(0:ny+1,1:nz,nxl:nxr) :: work_ffty_vec !:
869	#endif
870	REAL(wp), DIMENSION(nnx,1:nz,nys_x:nyn_x,pdims(1)) :: f_in !:
871	REAL(wp), DIMENSION(1:nz,0:ny,nxl:nxr) :: f_out !:
872	REAL(wp), DIMENSION(nxl:nxr,1:nz,0:ny) :: work !:
873
874	!
875	!-- Transpose array
876	#if defined( __parallel )
877	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
878	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
879	CALL MPI_ALLTOALL( f_in(1,1,nys_x,1), sendrecvcount_xy, MPI_REAL, &
880	work(nxl,1,0), sendrecvcount_xy, MPI_REAL, &
881	comm1dx, ierr )
882	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
883	#endif
884
885	!
886	!-- Resort the data in a way that y becomes the first index and carry out the
887	!-- backward fft along y.
888	CALL cpu_log( log_point_s(7), 'fft_y_1d', 'continue' )
889
890	IF ( host(1:3) == 'nec' ) THEN
891	#if defined( __nec )
892	!
893	!-- Code optimized for vector processors
894	!$OMP PARALLEL PRIVATE ( i, j, k )
895	!$OMP DO
896	DO k = 1, nz
897	DO j = 0, ny
898	DO i = nxl, nxr
899	work_ffty_vec(j,k,i) = work(i,k,j)
900	ENDDO
901	ENDDO
902	ENDDO
903
904	!$OMP DO
905	DO i = nxl, nxr
906
907	CALL fft_y_m( work_ffty_vec(:,:,i), ny+1, 'backward' )
908
909	DO j = 0, ny
910	DO k = 1, nz
911	f_out(k,j,i) = work_ffty_vec(j,k,i)
912	ENDDO
913	ENDDO
914
915	ENDDO
916	!$OMP END PARALLEL
917	#endif
918
919	ELSE
920
921	!
922	!-- Cache optimized code.
923	!-- The i-(x-)direction is split into a strided outer loop and an inner
924	!-- loop for better cache performance
925	!$OMP PARALLEL PRIVATE ( i, iend, iouter, ir, j, k, work_ffty )
926	!$OMP DO
927	DO iouter = nxl, nxr, stridex
928
929	iend = MIN( iouter+stridex-1, nxr ) ! Upper bound for inner i loop
930
931	DO k = 1, nz
932	!
933	!-- Resort
934	DO j = 0, ny
935	DO i = iouter, iend
936	work_ffty(j,i-iouter+1) = work(i,k,j)
937	ENDDO
938	ENDDO
939
940	DO i = iouter, iend
941
942	!
943	!-- FFT along y
944	ir = i-iouter+1 ! counter within a stride
945	CALL fft_y_1d( work_ffty(:,ir), 'backward' )
946
947	DO j = 0, ny
948	f_out(k,j,i) = work_ffty(j,ir)
949	ENDDO
950	ENDDO
951
952	ENDDO
953
954	ENDDO
955	!$OMP END PARALLEL
956
957	ENDIF
958
959	CALL cpu_log( log_point_s(7), 'fft_y_1d', 'stop' )
960
961	END SUBROUTINE tr_xy_ffty
962
963
964	SUBROUTINE fftx_tri_fftx( ar )
965
966	!------------------------------------------------------------------------------!
967	! FFT along x, solution of the tridiagonal system and backward FFT for
968	! a 1d-decomposition along x
969	!
970	! WARNING: this subroutine may still not work for hybrid parallelization
971	! with OpenMP (for possible necessary changes see the original
972	! routine poisfft_hybrid, developed by Klaus Ketelsen, May 2002)
973	!------------------------------------------------------------------------------!
974
975	USE control_parameters, &
976	ONLY: host
977
978	USE cpulog, &
979	ONLY: cpu_log, log_point_s
980
981	USE grid_variables, &
982	ONLY: ddx2, ddy2
983
984	USE kinds
985
986	USE pegrid
987
988	IMPLICIT NONE
989
990	INTEGER(iwp) :: i !:
991	INTEGER(iwp) :: j !:
992	INTEGER(iwp) :: k !:
993	INTEGER(iwp) :: m !:
994	INTEGER(iwp) :: n !:
995	INTEGER(iwp) :: omp_get_thread_num !:
996	INTEGER(iwp) :: tn !:
997
998	REAL(wp), DIMENSION(0:nx) :: work_fftx !:
999	REAL(wp), DIMENSION(0:nx,1:nz) :: work_trix !:
1000	REAL(wp), DIMENSION(nnx,1:nz,nys_x:nyn_x,pdims(1)) :: ar !:
1001	REAL(wp), DIMENSION(:,:,:,:), ALLOCATABLE :: tri !:
1002
1003
1004	CALL cpu_log( log_point_s(33), 'fft_x_1d + tridia', 'start' )
1005
1006	ALLOCATE( tri(5,0:nx,0:nz-1,0:threads_per_task-1) )
1007
1008	tn = 0 ! Default thread number in case of one thread
1009	!$OMP PARALLEL DO PRIVATE ( i, j, k, m, n, tn, work_fftx, work_trix )
1010	DO j = nys_x, nyn_x
1011
1012	!$ tn = omp_get_thread_num()
1013
1014	IF ( host(1:3) == 'nec' ) THEN
1015	!
1016	!-- Code optimized for vector processors
1017	DO k = 1, nz
1018
1019	m = 0
1020	DO n = 1, pdims(1)
1021	DO i = 1, nnx
1022	work_trix(m,k) = ar(i,k,j,n)
1023	m = m + 1
1024	ENDDO
1025	ENDDO
1026
1027	ENDDO
1028
1029	CALL fft_x_m( work_trix, 'forward' )
1030
1031	ELSE
1032	!
1033	!-- Cache optimized code
1034	DO k = 1, nz
1035
1036	m = 0
1037	DO n = 1, pdims(1)
1038	DO i = 1, nnx
1039	work_fftx(m) = ar(i,k,j,n)
1040	m = m + 1
1041	ENDDO
1042	ENDDO
1043
1044	CALL fft_x_1d( work_fftx, 'forward' )
1045
1046	DO i = 0, nx
1047	work_trix(i,k) = work_fftx(i)
1048	ENDDO
1049
1050	ENDDO
1051
1052	ENDIF
1053
1054	!
1055	!-- Solve the linear equation system
1056	CALL tridia_1dd( ddx2, ddy2, nx, ny, j, work_trix, tri(:,:,:,tn) )
1057
1058	IF ( host(1:3) == 'nec' ) THEN
1059	!
1060	!-- Code optimized for vector processors
1061	CALL fft_x_m( work_trix, 'backward' )
1062
1063	DO k = 1, nz
1064
1065	m = 0
1066	DO n = 1, pdims(1)
1067	DO i = 1, nnx
1068	ar(i,k,j,n) = work_trix(m,k)
1069	m = m + 1
1070	ENDDO
1071	ENDDO
1072
1073	ENDDO
1074
1075	ELSE
1076	!
1077	!-- Cache optimized code
1078	DO k = 1, nz
1079
1080	DO i = 0, nx
1081	work_fftx(i) = work_trix(i,k)
1082	ENDDO
1083
1084	CALL fft_x_1d( work_fftx, 'backward' )
1085
1086	m = 0
1087	DO n = 1, pdims(1)
1088	DO i = 1, nnx
1089	ar(i,k,j,n) = work_fftx(m)
1090	m = m + 1
1091	ENDDO
1092	ENDDO
1093
1094	ENDDO
1095
1096	ENDIF
1097
1098	ENDDO
1099
1100	DEALLOCATE( tri )
1101
1102	CALL cpu_log( log_point_s(33), 'fft_x_1d + tridia', 'stop' )
1103
1104	END SUBROUTINE fftx_tri_fftx
1105
1106
1107	SUBROUTINE fftx_tr_xy( f_in, f_out )
1108
1109	!------------------------------------------------------------------------------!
1110	! Fourier-transformation along x with subsequent transposition x --> y for
1111	! a 1d-decomposition along y
1112	!
1113	! ATTENTION: The NEC-branch of this routine may significantly profit from
1114	! further optimizations. So far, performance is much worse than
1115	! for routine ffty_tr_yx (more than three times slower).
1116	!------------------------------------------------------------------------------!
1117
1118	USE control_parameters, &
1119	ONLY: host
1120
1121	USE cpulog, &
1122	ONLY: cpu_log, log_point_s
1123
1124	USE kinds
1125
1126	USE pegrid
1127
1128	IMPLICIT NONE
1129
1130	INTEGER(iwp) :: i !:
1131	INTEGER(iwp) :: j !:
1132	INTEGER(iwp) :: k !:
1133
1134	REAL(wp), DIMENSION(0:nx,1:nz,nys:nyn) :: work_fftx !:
1135	REAL(wp), DIMENSION(1:nz,nys:nyn,0:nx) :: f_in !:
1136	REAL(wp), DIMENSION(nny,1:nz,nxl_y:nxr_y,pdims(2)) :: f_out !:
1137	REAL(wp), DIMENSION(nys:nyn,1:nz,0:nx) :: work !:
1138
1139	!
1140	!-- Carry out the FFT along x, where all data are present due to the
1141	!-- 1d-decomposition along y. Resort the data in a way that y becomes
1142	!-- the first index.
1143	CALL cpu_log( log_point_s(4), 'fft_x_1d', 'start' )
1144
1145	IF ( host(1:3) == 'nec' ) THEN
1146	!
1147	!-- Code for vector processors
1148	!$OMP PARALLEL PRIVATE ( i, j, k )
1149	!$OMP DO
1150	DO i = 0, nx
1151
1152	DO j = nys, nyn
1153	DO k = 1, nz
1154	work_fftx(i,k,j) = f_in(k,j,i)
1155	ENDDO
1156	ENDDO
1157
1158	ENDDO
1159
1160	!$OMP DO
1161	DO j = nys, nyn
1162
1163	CALL fft_x_m( work_fftx(:,:,j), 'forward' )
1164
1165	DO k = 1, nz
1166	DO i = 0, nx
1167	work(j,k,i) = work_fftx(i,k,j)
1168	ENDDO
1169	ENDDO
1170
1171	ENDDO
1172	!$OMP END PARALLEL
1173
1174	ELSE
1175
1176	!
1177	!-- Cache optimized code (there might be still a potential for better
1178	!-- optimization).
1179	!$OMP PARALLEL PRIVATE (i,j,k)
1180	!$OMP DO
1181	DO i = 0, nx
1182
1183	DO j = nys, nyn
1184	DO k = 1, nz
1185	work_fftx(i,k,j) = f_in(k,j,i)
1186	ENDDO
1187	ENDDO
1188
1189	ENDDO
1190
1191	!$OMP DO
1192	DO j = nys, nyn
1193	DO k = 1, nz
1194
1195	CALL fft_x_1d( work_fftx(0:nx,k,j), 'forward' )
1196
1197	DO i = 0, nx
1198	work(j,k,i) = work_fftx(i,k,j)
1199	ENDDO
1200	ENDDO
1201
1202	ENDDO
1203	!$OMP END PARALLEL
1204
1205	ENDIF
1206	CALL cpu_log( log_point_s(4), 'fft_x_1d', 'pause' )
1207
1208	!
1209	!-- Transpose array
1210	#if defined( __parallel )
1211	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
1212	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
1213	CALL MPI_ALLTOALL( work(nys,1,0), sendrecvcount_xy, MPI_REAL, &
1214	f_out(1,1,nxl_y,1), sendrecvcount_xy, MPI_REAL, &
1215	comm1dy, ierr )
1216	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
1217	#endif
1218
1219	END SUBROUTINE fftx_tr_xy
1220
1221
1222	SUBROUTINE tr_yx_fftx( f_in, f_out )
1223
1224	!------------------------------------------------------------------------------!
1225	! Transposition y --> x with a subsequent backward Fourier transformation for
1226	! a 1d-decomposition along x
1227	!------------------------------------------------------------------------------!
1228
1229	USE control_parameters, &
1230	ONLY: host
1231
1232	USE cpulog, &
1233	ONLY: cpu_log, log_point_s
1234
1235	USE kinds
1236
1237	USE pegrid
1238
1239	IMPLICIT NONE
1240
1241	INTEGER(iwp) :: i !:
1242	INTEGER(iwp) :: j !:
1243	INTEGER(iwp) :: k !:
1244
1245	REAL(wp), DIMENSION(0:nx,1:nz,nys:nyn) :: work_fftx !:
1246	REAL(wp), DIMENSION(nny,1:nz,nxl_y:nxr_y,pdims(2)) :: f_in !:
1247	REAL(wp), DIMENSION(1:nz,nys:nyn,0:nx) :: f_out !:
1248	REAL(wp), DIMENSION(nys:nyn,1:nz,0:nx) :: work !:
1249
1250	!
1251	!-- Transpose array
1252	#if defined( __parallel )
1253	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
1254	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
1255	CALL MPI_ALLTOALL( f_in(1,1,nxl_y,1), sendrecvcount_xy, MPI_REAL, &
1256	work(nys,1,0), sendrecvcount_xy, MPI_REAL, &
1257	comm1dy, ierr )
1258	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
1259	#endif
1260
1261	!
1262	!-- Carry out the FFT along x, where all data are present due to the
1263	!-- 1d-decomposition along y. Resort the data in a way that y becomes
1264	!-- the first index.
1265	CALL cpu_log( log_point_s(4), 'fft_x_1d', 'continue' )
1266
1267	IF ( host(1:3) == 'nec' ) THEN
1268	!
1269	!-- Code optimized for vector processors
1270	!$OMP PARALLEL PRIVATE ( i, j, k )
1271	!$OMP DO
1272	DO j = nys, nyn
1273
1274	DO k = 1, nz
1275	DO i = 0, nx
1276	work_fftx(i,k,j) = work(j,k,i)
1277	ENDDO
1278	ENDDO
1279
1280	CALL fft_x_m( work_fftx(:,:,j), 'backward' )
1281
1282	ENDDO
1283
1284	!$OMP DO
1285	DO i = 0, nx
1286	DO j = nys, nyn
1287	DO k = 1, nz
1288	f_out(k,j,i) = work_fftx(i,k,j)
1289	ENDDO
1290	ENDDO
1291	ENDDO
1292	!$OMP END PARALLEL
1293
1294	ELSE
1295
1296	!
1297	!-- Cache optimized code (there might be still a potential for better
1298	!-- optimization).
1299	!$OMP PARALLEL PRIVATE (i,j,k)
1300	!$OMP DO
1301	DO j = nys, nyn
1302	DO k = 1, nz
1303
1304	DO i = 0, nx
1305	work_fftx(i,k,j) = work(j,k,i)
1306	ENDDO
1307
1308	CALL fft_x_1d( work_fftx(0:nx,k,j), 'backward' )
1309
1310	ENDDO
1311	ENDDO
1312
1313	!$OMP DO
1314	DO i = 0, nx
1315	DO j = nys, nyn
1316	DO k = 1, nz
1317	f_out(k,j,i) = work_fftx(i,k,j)
1318	ENDDO
1319	ENDDO
1320	ENDDO
1321	!$OMP END PARALLEL
1322
1323	ENDIF
1324	CALL cpu_log( log_point_s(4), 'fft_x_1d', 'stop' )
1325
1326	END SUBROUTINE tr_yx_fftx
1327
1328
1329	SUBROUTINE ffty_tri_ffty( ar )
1330
1331	!------------------------------------------------------------------------------!
1332	! FFT along y, solution of the tridiagonal system and backward FFT for
1333	! a 1d-decomposition along y
1334	!
1335	! WARNING: this subroutine may still not work for hybrid parallelization
1336	! with OpenMP (for possible necessary changes see the original
1337	! routine poisfft_hybrid, developed by Klaus Ketelsen, May 2002)
1338	!------------------------------------------------------------------------------!
1339
1340	USE control_parameters, &
1341	ONLY: host
1342
1343	USE cpulog, &
1344	ONLY: cpu_log, log_point_s
1345
1346	USE grid_variables, &
1347	ONLY: ddx2, ddy2
1348
1349	USE kinds
1350
1351	USE pegrid
1352
1353	IMPLICIT NONE
1354
1355	INTEGER(iwp) :: i !:
1356	INTEGER(iwp) :: j !:
1357	INTEGER(iwp) :: k !:
1358	INTEGER(iwp) :: m !:
1359	INTEGER(iwp) :: n !:
1360	INTEGER(iwp) :: omp_get_thread_num !:
1361	INTEGER(iwp) :: tn !:
1362
1363	REAL(wp), DIMENSION(0:ny) :: work_ffty !:
1364	REAL(wp), DIMENSION(0:ny,1:nz) :: work_triy !:
1365	REAL(wp), DIMENSION(nny,1:nz,nxl_y:nxr_y,pdims(2)) :: ar !:
1366	REAL(wp), DIMENSION(:,:,:,:), ALLOCATABLE :: tri !:
1367
1368
1369	CALL cpu_log( log_point_s(39), 'fft_y_1d + tridia', 'start' )
1370
1371	ALLOCATE( tri(5,0:ny,0:nz-1,0:threads_per_task-1) )
1372
1373	tn = 0 ! Default thread number in case of one thread
1374	!$OMP PARALLEL DO PRIVATE ( i, j, k, m, n, tn, work_ffty, work_triy )
1375	DO i = nxl_y, nxr_y
1376
1377	!$ tn = omp_get_thread_num()
1378
1379	IF ( host(1:3) == 'nec' ) THEN
1380	!
1381	!-- Code optimized for vector processors
1382	DO k = 1, nz
1383
1384	m = 0
1385	DO n = 1, pdims(2)
1386	DO j = 1, nny
1387	work_triy(m,k) = ar(j,k,i,n)
1388	m = m + 1
1389	ENDDO
1390	ENDDO
1391
1392	ENDDO
1393
1394	CALL fft_y_m( work_triy, ny, 'forward' )
1395
1396	ELSE
1397	!
1398	!-- Cache optimized code
1399	DO k = 1, nz
1400
1401	m = 0
1402	DO n = 1, pdims(2)
1403	DO j = 1, nny
1404	work_ffty(m) = ar(j,k,i,n)
1405	m = m + 1
1406	ENDDO
1407	ENDDO
1408
1409	CALL fft_y_1d( work_ffty, 'forward' )
1410
1411	DO j = 0, ny
1412	work_triy(j,k) = work_ffty(j)
1413	ENDDO
1414
1415	ENDDO
1416
1417	ENDIF
1418
1419	!
1420	!-- Solve the linear equation system
1421	CALL tridia_1dd( ddy2, ddx2, ny, nx, i, work_triy, tri(:,:,:,tn) )
1422
1423	IF ( host(1:3) == 'nec' ) THEN
1424	!
1425	!-- Code optimized for vector processors
1426	CALL fft_y_m( work_triy, ny, 'backward' )
1427
1428	DO k = 1, nz
1429
1430	m = 0
1431	DO n = 1, pdims(2)
1432	DO j = 1, nny
1433	ar(j,k,i,n) = work_triy(m,k)
1434	m = m + 1
1435	ENDDO
1436	ENDDO
1437
1438	ENDDO
1439
1440	ELSE
1441	!
1442	!-- Cache optimized code
1443	DO k = 1, nz
1444
1445	DO j = 0, ny
1446	work_ffty(j) = work_triy(j,k)
1447	ENDDO
1448
1449	CALL fft_y_1d( work_ffty, 'backward' )
1450
1451	m = 0
1452	DO n = 1, pdims(2)
1453	DO j = 1, nny
1454	ar(j,k,i,n) = work_ffty(m)
1455	m = m + 1
1456	ENDDO
1457	ENDDO
1458
1459	ENDDO
1460
1461	ENDIF
1462
1463	ENDDO
1464
1465	DEALLOCATE( tri )
1466
1467	CALL cpu_log( log_point_s(39), 'fft_y_1d + tridia', 'stop' )
1468
1469	END SUBROUTINE ffty_tri_ffty
1470
1471	#endif
1472
1473	END MODULE poisfft_mod

Note: See TracBrowser for help on using the repository browser.

Download in other formats:

| Impressum | ©Leibniz Universität Hannover |