Home

Context Navigation

source: palm/trunk/SOURCE/poisfft_mod.f90 @ 4438

Last change on this file since 4438 was 4429, checked in by raasch, 5 years ago
serial (non-MPI) test case added, several bugfixes for the serial mode
Property svn:keywords set to `Id`
File size: 44.0 KB

Line
1	!> @file poisfft_mod.f90
2	!------------------------------------------------------------------------------!
3	! This file is part of the PALM model system.
4	!
5	! PALM is free software: you can redistribute it and/or modify it under the
6	! terms of the GNU General Public License as published by the Free Software
7	! Foundation, either version 3 of the License, or (at your option) any later
8	! version.
9	!
10	! PALM is distributed in the hope that it will be useful, but WITHOUT ANY
11	! WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
12	! A PARTICULAR PURPOSE. See the GNU General Public License for more details.
13	!
14	! You should have received a copy of the GNU General Public License along with
15	! PALM. If not, see <http://www.gnu.org/licenses/>.
16	!
17	! Copyright 1997-2020 Leibniz Universitaet Hannover
18	!------------------------------------------------------------------------------!
19	!
20	! Current revisions:
21	! -----------------
22	!
23	!
24	! Former revisions:
25	! -----------------
26	! $Id: poisfft_mod.f90 4429 2020-02-27 15:24:30Z suehring $
27	! Statements added to avoid compile errors due to unused dummy arguments in serial mode
28	!
29	! 4366 2020-01-09 08:12:43Z raasch
30	! modification concerning NEC vectorizatio
31	!
32	! 4360 2020-01-07 11:25:50Z suehring
33	! Corrected "Former revisions" section
34	!
35	! 3690 2019-01-22 22:56:42Z knoop
36	! OpenACC port for SPEC
37	!
38	! Revision 1.1 1997/07/24 11:24:14 raasch
39	! Initial revision
40	!
41	!
42	! Description:
43	! ------------
44	!> Solves the Poisson equation with a 2D spectral method
45	!> d^2 p / dx^2 + d^2 p / dy^2 + d^2 p / dz^2 = s
46	!>
47	!> Input:
48	!> real ar contains (nnz,nny,nnx) elements of the velocity divergence,
49	!> starting from (1,nys,nxl)
50	!>
51	!> Output:
52	!> real ar contains the solution for perturbation pressure p
53	!------------------------------------------------------------------------------!
54	MODULE poisfft_mod
55
56
57	USE fft_xy, &
58	ONLY: fft_init, fft_y, fft_y_1d, fft_y_m, fft_x, fft_x_1d, fft_x_m, &
59	temperton_fft_vec
60
61	USE indices, &
62	ONLY: nnx, nny, nx, nxl, nxr, ny, nys, nyn, nz
63
64	USE transpose_indices, &
65	ONLY: nxl_y, nxl_z, nxr_y, nxr_z, nys_x, nys_z, nyn_x, nyn_z, nzb_x, &
66	nzb_y, nzt_x, nzt_y
67
68	USE tridia_solver, &
69	ONLY: tridia_1dd, tridia_init, tridia_substi, tridia_substi_overlap
70
71	IMPLICIT NONE
72
73	LOGICAL, SAVE :: poisfft_initialized = .FALSE.
74
75	PRIVATE
76
77	PUBLIC poisfft, poisfft_init
78
79	INTERFACE poisfft
80	MODULE PROCEDURE poisfft
81	END INTERFACE poisfft
82
83	INTERFACE poisfft_init
84	MODULE PROCEDURE poisfft_init
85	END INTERFACE poisfft_init
86
87
88	CONTAINS
89
90	!------------------------------------------------------------------------------!
91	! Description:
92	! ------------
93	!> Setup coefficients for FFT and the tridiagonal solver
94	!------------------------------------------------------------------------------!
95	SUBROUTINE poisfft_init
96
97	IMPLICIT NONE
98
99
100	CALL fft_init
101
102	CALL tridia_init
103
104	poisfft_initialized = .TRUE.
105
106	END SUBROUTINE poisfft_init
107
108
109
110	!------------------------------------------------------------------------------!
111	! Description:
112	! ------------
113	!> Two-dimensional Fourier Transformation in x- and y-direction.
114	!------------------------------------------------------------------------------!
115	SUBROUTINE poisfft( ar )
116
117	USE control_parameters, &
118	ONLY: transpose_compute_overlap
119
120	USE cpulog, &
121	ONLY: cpu_log, cpu_log_nowait, log_point_s
122
123	USE kinds
124
125	USE pegrid
126
127	IMPLICIT NONE
128
129	INTEGER(iwp) :: ii !<
130	INTEGER(iwp) :: iind !<
131	INTEGER(iwp) :: inew !<
132	INTEGER(iwp) :: jj !<
133	INTEGER(iwp) :: jind !<
134	INTEGER(iwp) :: jnew !<
135	INTEGER(iwp) :: ki !<
136	INTEGER(iwp) :: kk !<
137	INTEGER(iwp) :: knew !<
138	INTEGER(iwp) :: n !<
139	INTEGER(iwp) :: nblk !<
140	INTEGER(iwp) :: nnx_y !<
141	INTEGER(iwp) :: nny_z !<
142	INTEGER(iwp) :: nnz_x !<
143	INTEGER(iwp) :: nxl_y_bound !<
144	INTEGER(iwp) :: nxr_y_bound !<
145
146	INTEGER(iwp), DIMENSION(4) :: isave !<
147
148	REAL(wp), DIMENSION(1:nz,nys:nyn,nxl:nxr) :: ar !<
149	REAL(wp), DIMENSION(nys:nyn,nxl:nxr,1:nz) :: ar_inv !<
150
151	#define __acc_fft_device ( defined( _OPENACC ) && ( defined ( __cuda_fft ) ) )
152	#if __acc_fft_device
153	!$ACC DECLARE CREATE(ar_inv)
154	#endif
155
156	REAL(wp), DIMENSION(:,:,:), ALLOCATABLE :: ar1 !<
157	REAL(wp), DIMENSION(:,:,:), ALLOCATABLE :: f_in !<
158	REAL(wp), DIMENSION(:,:,:), ALLOCATABLE :: f_inv !<
159	REAL(wp), DIMENSION(:,:,:), ALLOCATABLE :: f_out_y !<
160	REAL(wp), DIMENSION(:,:,:), ALLOCATABLE :: f_out_z !<
161
162
163	CALL cpu_log( log_point_s(3), 'poisfft', 'start' )
164
165	IF ( .NOT. poisfft_initialized ) CALL poisfft_init
166
167	#if !__acc_fft_device
168	!$ACC UPDATE HOST(ar)
169	#endif
170
171	#ifndef _OPENACC
172	!
173	!-- Two-dimensional Fourier Transformation in x- and y-direction.
174	IF ( pdims(2) == 1 .AND. pdims(1) > 1 ) THEN
175
176	!
177	!-- 1d-domain-decomposition along x:
178	!-- FFT along y and transposition y --> x
179	CALL ffty_tr_yx( ar, ar )
180
181	!
182	!-- FFT along x, solving the tridiagonal system and backward FFT
183	CALL fftx_tri_fftx( ar )
184
185	!
186	!-- Transposition x --> y and backward FFT along y
187	CALL tr_xy_ffty( ar, ar )
188
189	ELSEIF ( pdims(1) == 1 .AND. pdims(2) > 1 ) THEN
190
191	!
192	!-- 1d-domain-decomposition along y:
193	!-- FFT along x and transposition x --> y
194	CALL fftx_tr_xy( ar, ar )
195
196	!
197	!-- FFT along y, solving the tridiagonal system and backward FFT
198	CALL ffty_tri_ffty( ar )
199
200	!
201	!-- Transposition y --> x and backward FFT along x
202	CALL tr_yx_fftx( ar, ar )
203
204	ELSEIF ( .NOT. transpose_compute_overlap ) THEN
205	#endif
206
207	!
208	!-- 2d-domain-decomposition or no decomposition (1 PE run)
209	!-- Transposition z --> x
210	CALL cpu_log( log_point_s(5), 'transpo forward', 'start' )
211	CALL resort_for_zx( ar, ar_inv )
212	CALL transpose_zx( ar_inv, ar )
213	CALL cpu_log( log_point_s(5), 'transpo forward', 'pause' )
214
215	CALL cpu_log( log_point_s(4), 'fft_x', 'start' )
216	IF ( temperton_fft_vec ) THEN
217	!
218	!-- Vector version outputs a transformed array ar_inv that does not require resorting
219	!-- (which is done for ar further below)
220	CALL fft_x( ar, 'forward', ar_inv=ar_inv)
221	ELSE
222	CALL fft_x( ar, 'forward')
223	ENDIF
224	CALL cpu_log( log_point_s(4), 'fft_x', 'pause' )
225
226	!
227	!-- Transposition x --> y
228	CALL cpu_log( log_point_s(5), 'transpo forward', 'continue' )
229	IF( .NOT. temperton_fft_vec ) CALL resort_for_xy( ar, ar_inv )
230	CALL transpose_xy( ar_inv, ar )
231	CALL cpu_log( log_point_s(5), 'transpo forward', 'pause' )
232
233	CALL cpu_log( log_point_s(7), 'fft_y', 'start' )
234	IF ( temperton_fft_vec ) THEN
235	!
236	!-- Input array ar_inv from fft_x can be directly used here.
237	!-- The output (also in array ar_inv) does not require resorting below.
238	CALL fft_y( ar, 'forward', ar_inv = ar_inv, nxl_y_bound = nxl_y, nxr_y_bound = nxr_y, &
239	nxl_y_l = nxl_y, nxr_y_l = nxr_y )
240	ELSE
241	CALL fft_y( ar, 'forward', ar_tr = ar, nxl_y_bound = nxl_y, nxr_y_bound = nxr_y, &
242	nxl_y_l = nxl_y, nxr_y_l = nxr_y )
243	ENDIF
244	CALL cpu_log( log_point_s(7), 'fft_y', 'pause' )
245
246	!
247	!-- Transposition y --> z
248	CALL cpu_log( log_point_s(5), 'transpo forward', 'continue' )
249	IF ( .NOT. temperton_fft_vec ) CALL resort_for_yz( ar, ar_inv )
250	CALL transpose_yz( ar_inv, ar )
251	CALL cpu_log( log_point_s(5), 'transpo forward', 'stop' )
252
253	!
254	!-- Solve the tridiagonal equation system along z
255	CALL cpu_log( log_point_s(6), 'tridia', 'start' )
256	CALL tridia_substi( ar )
257	CALL cpu_log( log_point_s(6), 'tridia', 'stop' )
258
259	!
260	!-- Inverse Fourier Transformation
261	!-- Transposition z --> y
262	CALL cpu_log( log_point_s(8), 'transpo invers', 'start' )
263	CALL transpose_zy( ar, ar_inv )
264	!
265	!-- The fft_y below (vector branch) can directly process ar_inv (i.e. does not require a
266	!-- resorting)
267	IF ( .NOT. temperton_fft_vec ) CALL resort_for_zy( ar_inv, ar )
268	CALL cpu_log( log_point_s(8), 'transpo invers', 'pause' )
269
270	CALL cpu_log( log_point_s(7), 'fft_y', 'continue' )
271	IF ( temperton_fft_vec ) THEN
272	!
273	!-- Output array ar_inv can be used as input to the below fft_x routine without resorting
274	CALL fft_y( ar, 'backward', ar_inv = ar_inv, nxl_y_bound = nxl_y, nxr_y_bound = nxr_y,&
275	nxl_y_l = nxl_y, nxr_y_l = nxr_y )
276	ELSE
277	CALL fft_y( ar, 'backward', ar_tr = ar, nxl_y_bound = nxl_y, nxr_y_bound = nxr_y, &
278	nxl_y_l = nxl_y, nxr_y_l = nxr_y )
279	ENDIF
280
281	CALL cpu_log( log_point_s(7), 'fft_y', 'stop' )
282
283	!
284	!-- Transposition y --> x
285	CALL cpu_log( log_point_s(8), 'transpo invers', 'continue' )
286	CALL transpose_yx( ar, ar_inv )
287	IF ( .NOT. temperton_fft_vec ) CALL resort_for_yx( ar_inv, ar )
288	CALL cpu_log( log_point_s(8), 'transpo invers', 'pause' )
289
290	CALL cpu_log( log_point_s(4), 'fft_x', 'continue' )
291	IF ( temperton_fft_vec ) THEN
292	CALL fft_x( ar, 'backward', ar_inv=ar_inv )
293	ELSE
294	CALL fft_x( ar, 'backward' )
295	ENDIF
296	CALL cpu_log( log_point_s(4), 'fft_x', 'stop' )
297
298	!
299	!-- Transposition x --> z
300	CALL cpu_log( log_point_s(8), 'transpo invers', 'continue' )
301	CALL transpose_xz( ar, ar_inv )
302	CALL resort_for_xz( ar_inv, ar )
303	CALL cpu_log( log_point_s(8), 'transpo invers', 'stop' )
304
305	#ifndef _OPENACC
306	ELSE
307
308	!
309	!-- 2d-domain-decomposition or no decomposition (1 PE run) with
310	!-- overlapping transposition / fft
311	!-- cputime logging must not use barriers, which would prevent overlapping
312	ALLOCATE( f_out_y(0:ny,nxl_y:nxr_y,nzb_y:nzt_y), &
313	f_out_z(0:nx,nys_x:nyn_x,nzb_x:nzt_x) )
314	!
315	!-- Transposition z --> x + subsequent fft along x
316	ALLOCATE( f_inv(nys:nyn,nxl:nxr,1:nz) )
317	CALL resort_for_zx( ar, f_inv )
318	!
319	!-- Save original indices and gridpoint counter
320	isave(1) = nz
321	isave(2) = nzb_x
322	isave(3) = nzt_x
323	isave(4) = sendrecvcount_zx
324	!
325	!-- Set new indices for transformation
326	nblk = nz / pdims(1)
327	nz = pdims(1)
328	nnz_x = 1
329	nzb_x = 1 + myidx * nnz_x
330	nzt_x = ( myidx + 1 ) * nnz_x
331	sendrecvcount_zx = nnx * nny * nnz_x
332
333	ALLOCATE( ar1(0:nx,nys_x:nyn_x,nzb_x:nzt_x) )
334	ALLOCATE( f_in(nys:nyn,nxl:nxr,1:nz) )
335
336	DO kk = 1, nblk
337
338	IF ( kk == 1 ) THEN
339	CALL cpu_log( log_point_s(5), 'transpo forward', 'start', cpu_log_nowait )
340	ELSE
341	CALL cpu_log( log_point_s(5), 'transpo forward', 'continue', cpu_log_nowait )
342	ENDIF
343
344	DO knew = 1, nz
345	ki = kk + nblk * ( knew - 1 )
346	f_in(:,:,knew) = f_inv(:,:,ki)
347	ENDDO
348
349	CALL transpose_zx( f_in, ar1(:,:,:))
350	CALL cpu_log( log_point_s(5), 'transpo forward', 'pause' )
351
352	IF ( kk == 1 ) THEN
353	CALL cpu_log( log_point_s(4), 'fft_x', 'start', cpu_log_nowait )
354	ELSE
355	CALL cpu_log( log_point_s(4), 'fft_x', 'continue', cpu_log_nowait )
356	ENDIF
357
358	n = isave(2) + kk - 1
359	CALL fft_x( ar1(:,:,:), 'forward', ar_2d = f_out_z(:,:,n))
360	CALL cpu_log( log_point_s(4), 'fft_x', 'pause' )
361
362	ENDDO
363	!
364	!-- Restore original indices/counters
365	nz = isave(1)
366	nzb_x = isave(2)
367	nzt_x = isave(3)
368	sendrecvcount_zx = isave(4)
369
370	DEALLOCATE( ar1, f_in, f_inv )
371
372	!
373	!-- Transposition x --> y + subsequent fft along y
374	ALLOCATE( f_inv(nys_x:nyn_x,nzb_x:nzt_x,0:nx) )
375	CALL resort_for_xy( f_out_z, f_inv )
376	!
377	!-- Save original indices and gridpoint counter
378	isave(1) = nx
379	isave(2) = nxl_y
380	isave(3) = nxr_y
381	isave(4) = sendrecvcount_xy
382	!
383	!-- Set new indices for transformation
384	nblk = ( ( nx+1 ) / pdims(2) ) - 1
385	nx = pdims(2)
386	nnx_y = 1
387	nxl_y = myidy * nnx_y
388	nxr_y = ( myidy + 1 ) * nnx_y - 1
389	sendrecvcount_xy = nnx_y * ( nyn_x-nys_x+1 ) * ( nzt_x-nzb_x+1 )
390
391	ALLOCATE( ar1(0:ny,nxl_y:nxr_y,nzb_y:nzt_y) )
392	ALLOCATE( f_in(nys_x:nyn_x,nzb_x:nzt_x,0:nx) )
393
394	DO ii = 0, nblk
395
396	CALL cpu_log( log_point_s(5), 'transpo forward', 'continue', cpu_log_nowait )
397
398	DO inew = 0, nx-1
399	iind = ii + ( nblk + 1 ) * inew
400	f_in(:,:,inew) = f_inv(:,:,iind)
401	ENDDO
402
403	CALL transpose_xy( f_in, ar1(:,:,:) )
404
405	CALL cpu_log( log_point_s(5), 'transpo forward', 'pause' )
406
407	IF ( ii == 1 ) THEN
408	CALL cpu_log( log_point_s(7), 'fft_y', 'start', cpu_log_nowait )
409	ELSE
410	CALL cpu_log( log_point_s(7), 'fft_y', 'continue', cpu_log_nowait )
411	ENDIF
412
413	nxl_y_bound = isave(2)
414	nxr_y_bound = isave(3)
415	n = isave(2) + ii
416	CALL fft_y( ar1(:,:,:), 'forward', ar_tr = f_out_y, &
417	nxl_y_bound = nxl_y_bound, nxr_y_bound = nxr_y_bound, &
418	nxl_y_l = n, nxr_y_l = n )
419
420	CALL cpu_log( log_point_s(7), 'fft_y', 'pause' )
421
422	ENDDO
423	!
424	!-- Restore original indices/counters
425	nx = isave(1)
426	nxl_y = isave(2)
427	nxr_y = isave(3)
428	sendrecvcount_xy = isave(4)
429
430	DEALLOCATE( ar1, f_in, f_inv )
431
432	!
433	!-- Transposition y --> z + subsequent tridia + resort for z --> y
434	ALLOCATE( f_inv(nxl_y:nxr_y,nzb_y:nzt_y,0:ny) )
435	CALL resort_for_yz( f_out_y, f_inv )
436	!
437	!-- Save original indices and gridpoint counter
438	isave(1) = ny
439	isave(2) = nys_z
440	isave(3) = nyn_z
441	isave(4) = sendrecvcount_yz
442	!
443	!-- Set new indices for transformation
444	nblk = ( ( ny+1 ) / pdims(1) ) - 1
445	ny = pdims(1)
446	nny_z = 1
447	nys_z = myidx * nny_z
448	nyn_z = ( myidx + 1 ) * nny_z - 1
449	sendrecvcount_yz = ( nxr_y-nxl_y+1 ) * nny_z * ( nzt_y-nzb_y+1 )
450
451	ALLOCATE( ar1(nxl_z:nxr_z,nys_z:nyn_z,1:nz) )
452	ALLOCATE( f_in(nxl_y:nxr_y,nzb_y:nzt_y,0:ny) )
453
454	DO jj = 0, nblk
455	!
456	!-- Forward Fourier Transformation
457	!-- Transposition y --> z
458	CALL cpu_log( log_point_s(5), 'transpo forward', 'continue', cpu_log_nowait )
459
460	DO jnew = 0, ny-1
461	jind = jj + ( nblk + 1 ) * jnew
462	f_in(:,:,jnew) = f_inv(:,:,jind)
463	ENDDO
464
465	CALL transpose_yz( f_in, ar1(:,:,:) )
466
467	IF ( jj == nblk ) THEN
468	CALL cpu_log( log_point_s(5), 'transpo forward', 'stop' )
469	ELSE
470	CALL cpu_log( log_point_s(5), 'transpo forward', 'pause' )
471	ENDIF
472
473	!
474	!-- Solve the tridiagonal equation system along z
475	CALL cpu_log( log_point_s(6), 'tridia', 'start', cpu_log_nowait )
476
477	n = isave(2) + jj
478	CALL tridia_substi_overlap( ar1(:,:,:), n )
479
480	CALL cpu_log( log_point_s(6), 'tridia', 'stop' )
481
482	!
483	!-- Inverse Fourier Transformation
484	!-- Transposition z --> y
485	!-- Only one thread should call MPI routines, therefore forward and
486	!-- backward tranpose are in the same section
487	IF ( jj == 0 ) THEN
488	CALL cpu_log( log_point_s(8), 'transpo invers', 'start', cpu_log_nowait )
489	ELSE
490	CALL cpu_log( log_point_s(8), 'transpo invers', 'continue', cpu_log_nowait )
491	ENDIF
492
493	CALL transpose_zy( ar1(:,:,:), f_in )
494
495	DO jnew = 0, ny-1
496	jind = jj + ( nblk + 1 ) * jnew
497	f_inv(:,:,jind) = f_in(:,:,jnew)
498	ENDDO
499
500	CALL cpu_log( log_point_s(8), 'transpo invers', 'pause' )
501
502	ENDDO
503	!
504	!-- Restore original indices/counters
505	ny = isave(1)
506	nys_z = isave(2)
507	nyn_z = isave(3)
508	sendrecvcount_yz = isave(4)
509
510	CALL resort_for_zy( f_inv, f_out_y )
511
512	DEALLOCATE( ar1, f_in, f_inv )
513
514	!
515	!-- fft along y backward + subsequent transposition y --> x
516	ALLOCATE( f_inv(nys_x:nyn_x,nzb_x:nzt_x,0:nx) )
517	!
518	!-- Save original indices and gridpoint counter
519	isave(1) = nx
520	isave(2) = nxl_y
521	isave(3) = nxr_y
522	isave(4) = sendrecvcount_xy
523	!
524	!-- Set new indices for transformation
525	nblk = (( nx+1 ) / pdims(2) ) - 1
526	nx = pdims(2)
527	nnx_y = 1
528	nxl_y = myidy * nnx_y
529	nxr_y = ( myidy + 1 ) * nnx_y - 1
530	sendrecvcount_xy = nnx_y * ( nyn_x-nys_x+1 ) * ( nzt_x-nzb_x+1 )
531
532	ALLOCATE( ar1(0:ny,nxl_y:nxr_y,nzb_y:nzt_y) )
533	ALLOCATE( f_in(nys_x:nyn_x,nzb_x:nzt_x,0:nx) )
534
535	DO ii = 0, nblk
536
537	CALL cpu_log( log_point_s(7), 'fft_y', 'continue', cpu_log_nowait )
538
539	n = isave(2) + ii
540	nxl_y_bound = isave(2)
541	nxr_y_bound = isave(3)
542
543	CALL fft_y( ar1(:,:,:), 'backward', ar_tr = f_out_y, &
544	nxl_y_bound = nxl_y_bound, nxr_y_bound = nxr_y_bound, &
545	nxl_y_l = n, nxr_y_l = n )
546
547	IF ( ii == nblk ) THEN
548	CALL cpu_log( log_point_s(7), 'fft_y', 'stop' )
549	ELSE
550	CALL cpu_log( log_point_s(7), 'fft_y', 'pause' )
551	ENDIF
552
553	CALL cpu_log( log_point_s(8), 'transpo invers', 'continue', cpu_log_nowait )
554
555	CALL transpose_yx( ar1(:,:,:), f_in )
556
557	DO inew = 0, nx-1
558	iind = ii + (nblk+1) * inew
559	f_inv(:,:,iind) = f_in(:,:,inew)
560	ENDDO
561
562	CALL cpu_log( log_point_s(8), 'transpo invers', 'pause' )
563
564	ENDDO
565	!
566	!-- Restore original indices/counters
567	nx = isave(1)
568	nxl_y = isave(2)
569	nxr_y = isave(3)
570	sendrecvcount_xy = isave(4)
571
572	CALL resort_for_yx( f_inv, f_out_z )
573
574	DEALLOCATE( ar1, f_in, f_inv )
575
576	!
577	!-- fft along x backward + subsequent final transposition x --> z
578	ALLOCATE( f_inv(nys:nyn,nxl:nxr,1:nz) )
579	!
580	!-- Save original indices and gridpoint counter
581	isave(1) = nz
582	isave(2) = nzb_x
583	isave(3) = nzt_x
584	isave(4) = sendrecvcount_zx
585	!
586	!-- Set new indices for transformation
587	nblk = nz / pdims(1)
588	nz = pdims(1)
589	nnz_x = 1
590	nzb_x = 1 + myidx * nnz_x
591	nzt_x = ( myidx + 1 ) * nnz_x
592	sendrecvcount_zx = nnx * nny * nnz_x
593
594	ALLOCATE( ar1(0:nx,nys_x:nyn_x,nzb_x:nzt_x) )
595	ALLOCATE( f_in(nys:nyn,nxl:nxr,1:nz) )
596
597	DO kk = 1, nblk
598
599	CALL cpu_log( log_point_s(4), 'fft_x', 'continue', cpu_log_nowait )
600
601	n = isave(2) + kk - 1
602	CALL fft_x( ar1(:,:,:), 'backward', f_out_z(:,:,n))
603
604	IF ( kk == nblk ) THEN
605	CALL cpu_log( log_point_s(4), 'fft_x', 'stop' )
606	ELSE
607	CALL cpu_log( log_point_s(4), 'fft_x', 'pause' )
608	ENDIF
609
610	CALL cpu_log( log_point_s(8), 'transpo invers', 'continue', cpu_log_nowait )
611
612	CALL transpose_xz( ar1(:,:,:), f_in )
613
614	DO knew = 1, nz
615	ki = kk + nblk * (knew-1)
616	f_inv(:,:,ki) = f_in(:,:,knew)
617	ENDDO
618
619	IF ( kk == nblk ) THEN
620	CALL cpu_log( log_point_s(8), 'transpo invers', 'stop' )
621	ELSE
622	CALL cpu_log( log_point_s(8), 'transpo invers', 'pause' )
623	ENDIF
624
625	ENDDO
626	!
627	!-- Restore original indices/counters
628	nz = isave(1)
629	nzb_x = isave(2)
630	nzt_x = isave(3)
631	sendrecvcount_zx = isave(4)
632
633	CALL resort_for_xz( f_inv, ar )
634
635	DEALLOCATE( ar1, f_in, f_inv )
636
637	ENDIF
638	#endif
639
640	#if !__acc_fft_device
641	!$ACC UPDATE DEVICE(ar)
642	#endif
643
644	CALL cpu_log( log_point_s(3), 'poisfft', 'stop' )
645
646	END SUBROUTINE poisfft
647
648
649	!------------------------------------------------------------------------------!
650	! Description:
651	! ------------
652	!> Fourier-transformation along y with subsequent transposition y --> x for
653	!> a 1d-decomposition along x.
654	!>
655	!> @attention The performance of this routine is much faster on the NEC-SX6,
656	!> if the first index of work_ffty_vec is odd. Otherwise
657	!> memory bank conflicts may occur (especially if the index is a
658	!> multiple of 128). That's why work_ffty_vec is dimensioned as
659	!> 0:ny+1.
660	!> Of course, this will not work if users are using an odd number
661	!> of gridpoints along y.
662	!------------------------------------------------------------------------------!
663	SUBROUTINE ffty_tr_yx( f_in, f_out )
664
665	USE control_parameters, &
666	ONLY: loop_optimization
667
668	USE cpulog, &
669	ONLY: cpu_log, log_point_s
670
671	USE kinds
672
673	USE pegrid
674
675	IMPLICIT NONE
676
677	INTEGER(iwp) :: i !<
678	INTEGER(iwp) :: iend !<
679	INTEGER(iwp) :: iouter !<
680	INTEGER(iwp) :: ir !<
681	INTEGER(iwp) :: j !<
682	INTEGER(iwp) :: k !<
683
684	INTEGER(iwp), PARAMETER :: stridex = 4 !<
685
686	REAL(wp), DIMENSION(1:nz,0:ny,nxl:nxr) :: f_in !<
687	REAL(wp), DIMENSION(nnx,1:nz,nys_x:nyn_x,pdims(1)) :: f_out !<
688	REAL(wp), DIMENSION(nxl:nxr,1:nz,0:ny) :: work !<
689
690	REAL(wp), DIMENSION(:,:), ALLOCATABLE :: work_ffty !<
691	REAL(wp), DIMENSION(:,:,:), ALLOCATABLE :: work_ffty_vec !<
692
693	!
694	!-- Carry out the FFT along y, where all data are present due to the
695	!-- 1d-decomposition along x. Resort the data in a way that x becomes
696	!-- the first index.
697	CALL cpu_log( log_point_s(7), 'fft_y_1d', 'start' )
698
699	IF ( loop_optimization == 'vector' ) THEN
700
701	ALLOCATE( work_ffty_vec(0:ny+1,1:nz,nxl:nxr) )
702	!
703	!-- Code optimized for vector processors
704	!$OMP PARALLEL PRIVATE ( i, j, k )
705	!$OMP DO
706	DO i = nxl, nxr
707
708	DO j = 0, ny
709	DO k = 1, nz
710	work_ffty_vec(j,k,i) = f_in(k,j,i)
711	ENDDO
712	ENDDO
713
714	CALL fft_y_m( work_ffty_vec(:,:,i), ny+1, 'forward' )
715
716	ENDDO
717
718	!$OMP DO
719	DO k = 1, nz
720	DO j = 0, ny
721	DO i = nxl, nxr
722	work(i,k,j) = work_ffty_vec(j,k,i)
723	ENDDO
724	ENDDO
725	ENDDO
726	!$OMP END PARALLEL
727
728	DEALLOCATE( work_ffty_vec )
729
730	ELSE
731	!
732	!-- Cache optimized code.
733	ALLOCATE( work_ffty(0:ny,stridex) )
734	!
735	!-- The i-(x-)direction is split into a strided outer loop and an inner
736	!-- loop for better cache performance
737	!$OMP PARALLEL PRIVATE (i,iend,iouter,ir,j,k,work_ffty)
738	!$OMP DO
739	DO iouter = nxl, nxr, stridex
740
741	iend = MIN( iouter+stridex-1, nxr ) ! Upper bound for inner i loop
742
743	DO k = 1, nz
744
745	DO i = iouter, iend
746
747	ir = i-iouter+1 ! counter within a stride
748	DO j = 0, ny
749	work_ffty(j,ir) = f_in(k,j,i)
750	ENDDO
751	!
752	!-- FFT along y
753	CALL fft_y_1d( work_ffty(:,ir), 'forward' )
754
755	ENDDO
756
757	!
758	!-- Resort
759	DO j = 0, ny
760	DO i = iouter, iend
761	work(i,k,j) = work_ffty(j,i-iouter+1)
762	ENDDO
763	ENDDO
764
765	ENDDO
766
767	ENDDO
768	!$OMP END PARALLEL
769
770	DEALLOCATE( work_ffty )
771
772	ENDIF
773
774	CALL cpu_log( log_point_s(7), 'fft_y_1d', 'pause' )
775
776	!
777	!-- Transpose array
778	#if defined( __parallel )
779	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
780	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
781	CALL MPI_ALLTOALL( work(nxl,1,0), sendrecvcount_xy, MPI_REAL, &
782	f_out(1,1,nys_x,1), sendrecvcount_xy, MPI_REAL, &
783	comm1dx, ierr )
784	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
785	#else
786	!
787	!-- Next line required to avoid compile error about unused dummy argument in serial mode
788	i = SIZE( f_out )
789	#endif
790
791	END SUBROUTINE ffty_tr_yx
792
793
794	!------------------------------------------------------------------------------!
795	! Description:
796	! ------------
797	!> Transposition x --> y with a subsequent backward Fourier transformation for
798	!> a 1d-decomposition along x
799	!------------------------------------------------------------------------------!
800	SUBROUTINE tr_xy_ffty( f_in, f_out )
801
802	USE control_parameters, &
803	ONLY: loop_optimization
804
805	USE cpulog, &
806	ONLY: cpu_log, log_point_s
807
808	USE kinds
809
810	USE pegrid
811
812	IMPLICIT NONE
813
814	INTEGER(iwp) :: i !<
815	INTEGER(iwp) :: iend !<
816	INTEGER(iwp) :: iouter !<
817	INTEGER(iwp) :: ir !<
818	INTEGER(iwp) :: j !<
819	INTEGER(iwp) :: k !<
820
821	INTEGER(iwp), PARAMETER :: stridex = 4 !<
822
823	REAL(wp), DIMENSION(nnx,1:nz,nys_x:nyn_x,pdims(1)) :: f_in !<
824	REAL(wp), DIMENSION(1:nz,0:ny,nxl:nxr) :: f_out !<
825	REAL(wp), DIMENSION(nxl:nxr,1:nz,0:ny) :: work !<
826
827	REAL(wp), DIMENSION(:,:), ALLOCATABLE :: work_ffty !<
828	REAL(wp), DIMENSION(:,:,:), ALLOCATABLE :: work_ffty_vec !<
829
830	!
831	!-- Transpose array
832	#if defined( __parallel )
833	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
834	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
835	CALL MPI_ALLTOALL( f_in(1,1,nys_x,1), sendrecvcount_xy, MPI_REAL, &
836	work(nxl,1,0), sendrecvcount_xy, MPI_REAL, &
837	comm1dx, ierr )
838	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
839	#else
840	!
841	!-- Next line required to avoid compile error about unused dummy argument in serial mode
842	i = SIZE( f_in )
843	#endif
844
845	!
846	!-- Resort the data in a way that y becomes the first index and carry out the
847	!-- backward fft along y.
848	CALL cpu_log( log_point_s(7), 'fft_y_1d', 'continue' )
849
850	IF ( loop_optimization == 'vector' ) THEN
851
852	ALLOCATE( work_ffty_vec(0:ny+1,1:nz,nxl:nxr) )
853	!
854	!-- Code optimized for vector processors
855	!$OMP PARALLEL PRIVATE ( i, j, k )
856	!$OMP DO
857	DO k = 1, nz
858	DO j = 0, ny
859	DO i = nxl, nxr
860	work_ffty_vec(j,k,i) = work(i,k,j)
861	ENDDO
862	ENDDO
863	ENDDO
864
865	!$OMP DO
866	DO i = nxl, nxr
867
868	CALL fft_y_m( work_ffty_vec(:,:,i), ny+1, 'backward' )
869
870	DO j = 0, ny
871	DO k = 1, nz
872	f_out(k,j,i) = work_ffty_vec(j,k,i)
873	ENDDO
874	ENDDO
875
876	ENDDO
877	!$OMP END PARALLEL
878
879	DEALLOCATE( work_ffty_vec )
880
881	ELSE
882	!
883	!-- Cache optimized code.
884	ALLOCATE( work_ffty(0:ny,stridex) )
885	!
886	!-- The i-(x-)direction is split into a strided outer loop and an inner
887	!-- loop for better cache performance
888	!$OMP PARALLEL PRIVATE ( i, iend, iouter, ir, j, k, work_ffty )
889	!$OMP DO
890	DO iouter = nxl, nxr, stridex
891
892	iend = MIN( iouter+stridex-1, nxr ) ! Upper bound for inner i loop
893
894	DO k = 1, nz
895	!
896	!-- Resort
897	DO j = 0, ny
898	DO i = iouter, iend
899	work_ffty(j,i-iouter+1) = work(i,k,j)
900	ENDDO
901	ENDDO
902
903	DO i = iouter, iend
904
905	!
906	!-- FFT along y
907	ir = i-iouter+1 ! counter within a stride
908	CALL fft_y_1d( work_ffty(:,ir), 'backward' )
909
910	DO j = 0, ny
911	f_out(k,j,i) = work_ffty(j,ir)
912	ENDDO
913	ENDDO
914
915	ENDDO
916
917	ENDDO
918	!$OMP END PARALLEL
919
920	DEALLOCATE( work_ffty )
921
922	ENDIF
923
924	CALL cpu_log( log_point_s(7), 'fft_y_1d', 'stop' )
925
926	END SUBROUTINE tr_xy_ffty
927
928
929	!------------------------------------------------------------------------------!
930	! Description:
931	! ------------
932	!> FFT along x, solution of the tridiagonal system and backward FFT for
933	!> a 1d-decomposition along x
934	!>
935	!> @warning this subroutine may still not work for hybrid parallelization
936	!> with OpenMP (for possible necessary changes see the original
937	!> routine poisfft_hybrid, developed by Klaus Ketelsen, May 2002)
938	!------------------------------------------------------------------------------!
939	SUBROUTINE fftx_tri_fftx( ar )
940
941	USE control_parameters, &
942	ONLY: loop_optimization
943
944	USE cpulog, &
945	ONLY: cpu_log, log_point_s
946
947	USE grid_variables, &
948	ONLY: ddx2, ddy2
949
950	USE kinds
951
952	USE pegrid
953
954	IMPLICIT NONE
955
956	INTEGER(iwp) :: i !<
957	INTEGER(iwp) :: j !<
958	INTEGER(iwp) :: k !<
959	INTEGER(iwp) :: m !<
960	INTEGER(iwp) :: n !<
961	!$ INTEGER(iwp) :: omp_get_thread_num !<
962	INTEGER(iwp) :: tn !<
963
964	REAL(wp), DIMENSION(0:nx) :: work_fftx !<
965	REAL(wp), DIMENSION(0:nx,1:nz) :: work_trix !<
966	REAL(wp), DIMENSION(nnx,1:nz,nys_x:nyn_x,pdims(1)) :: ar !<
967	REAL(wp), DIMENSION(:,:,:,:), ALLOCATABLE :: tri !<
968
969
970	CALL cpu_log( log_point_s(33), 'fft_x_1d + tridia', 'start' )
971
972	ALLOCATE( tri(5,0:nx,0:nz-1,0:threads_per_task-1) )
973
974	tn = 0 ! Default thread number in case of one thread
975	!$OMP PARALLEL DO PRIVATE ( i, j, k, m, n, tn, work_fftx, work_trix )
976	DO j = nys_x, nyn_x
977
978	!$ tn = omp_get_thread_num()
979
980	IF ( loop_optimization == 'vector' ) THEN
981	!
982	!-- Code optimized for vector processors
983	DO k = 1, nz
984
985	m = 0
986	DO n = 1, pdims(1)
987	DO i = 1, nnx
988	work_trix(m,k) = ar(i,k,j,n)
989	m = m + 1
990	ENDDO
991	ENDDO
992
993	ENDDO
994
995	CALL fft_x_m( work_trix, 'forward' )
996
997	ELSE
998	!
999	!-- Cache optimized code
1000	DO k = 1, nz
1001
1002	m = 0
1003	DO n = 1, pdims(1)
1004	DO i = 1, nnx
1005	work_fftx(m) = ar(i,k,j,n)
1006	m = m + 1
1007	ENDDO
1008	ENDDO
1009
1010	CALL fft_x_1d( work_fftx, 'forward' )
1011
1012	DO i = 0, nx
1013	work_trix(i,k) = work_fftx(i)
1014	ENDDO
1015
1016	ENDDO
1017
1018	ENDIF
1019
1020	!
1021	!-- Solve the linear equation system
1022	CALL tridia_1dd( ddx2, ddy2, nx, ny, j, work_trix, tri(:,:,:,tn) )
1023
1024	IF ( loop_optimization == 'vector' ) THEN
1025	!
1026	!-- Code optimized for vector processors
1027	CALL fft_x_m( work_trix, 'backward' )
1028
1029	DO k = 1, nz
1030
1031	m = 0
1032	DO n = 1, pdims(1)
1033	DO i = 1, nnx
1034	ar(i,k,j,n) = work_trix(m,k)
1035	m = m + 1
1036	ENDDO
1037	ENDDO
1038
1039	ENDDO
1040
1041	ELSE
1042	!
1043	!-- Cache optimized code
1044	DO k = 1, nz
1045
1046	DO i = 0, nx
1047	work_fftx(i) = work_trix(i,k)
1048	ENDDO
1049
1050	CALL fft_x_1d( work_fftx, 'backward' )
1051
1052	m = 0
1053	DO n = 1, pdims(1)
1054	DO i = 1, nnx
1055	ar(i,k,j,n) = work_fftx(m)
1056	m = m + 1
1057	ENDDO
1058	ENDDO
1059
1060	ENDDO
1061
1062	ENDIF
1063
1064	ENDDO
1065
1066	DEALLOCATE( tri )
1067
1068	CALL cpu_log( log_point_s(33), 'fft_x_1d + tridia', 'stop' )
1069
1070	END SUBROUTINE fftx_tri_fftx
1071
1072
1073	!------------------------------------------------------------------------------!
1074	! Description:
1075	! ------------
1076	!> Fourier-transformation along x with subsequent transposition x --> y for
1077	!> a 1d-decomposition along y.
1078	!>
1079	!> @attention NEC-branch of this routine may significantly profit from
1080	!> further optimizations. So far, performance is much worse than
1081	!> for routine ffty_tr_yx (more than three times slower).
1082	!------------------------------------------------------------------------------!
1083	SUBROUTINE fftx_tr_xy( f_in, f_out )
1084
1085
1086	USE control_parameters, &
1087	ONLY: loop_optimization
1088
1089	USE cpulog, &
1090	ONLY: cpu_log, log_point_s
1091
1092	USE kinds
1093
1094	USE pegrid
1095
1096	IMPLICIT NONE
1097
1098	INTEGER(iwp) :: i !<
1099	INTEGER(iwp) :: j !<
1100	INTEGER(iwp) :: k !<
1101
1102	REAL(wp), DIMENSION(0:nx,1:nz,nys:nyn) :: work_fftx !<
1103	REAL(wp), DIMENSION(1:nz,nys:nyn,0:nx) :: f_in !<
1104	REAL(wp), DIMENSION(nny,1:nz,nxl_y:nxr_y,pdims(2)) :: f_out !<
1105	REAL(wp), DIMENSION(nys:nyn,1:nz,0:nx) :: work !<
1106
1107	!
1108	!-- Carry out the FFT along x, where all data are present due to the
1109	!-- 1d-decomposition along y. Resort the data in a way that y becomes
1110	!-- the first index.
1111	CALL cpu_log( log_point_s(4), 'fft_x_1d', 'start' )
1112
1113	IF ( loop_optimization == 'vector' ) THEN
1114	!
1115	!-- Code for vector processors
1116	!$OMP PARALLEL PRIVATE ( i, j, k )
1117	!$OMP DO
1118	DO i = 0, nx
1119
1120	DO j = nys, nyn
1121	DO k = 1, nz
1122	work_fftx(i,k,j) = f_in(k,j,i)
1123	ENDDO
1124	ENDDO
1125
1126	ENDDO
1127
1128	!$OMP DO
1129	DO j = nys, nyn
1130
1131	CALL fft_x_m( work_fftx(:,:,j), 'forward' )
1132
1133	DO k = 1, nz
1134	DO i = 0, nx
1135	work(j,k,i) = work_fftx(i,k,j)
1136	ENDDO
1137	ENDDO
1138
1139	ENDDO
1140	!$OMP END PARALLEL
1141
1142	ELSE
1143
1144	!
1145	!-- Cache optimized code (there might be still a potential for better
1146	!-- optimization).
1147	!$OMP PARALLEL PRIVATE (i,j,k)
1148	!$OMP DO
1149	DO i = 0, nx
1150
1151	DO j = nys, nyn
1152	DO k = 1, nz
1153	work_fftx(i,k,j) = f_in(k,j,i)
1154	ENDDO
1155	ENDDO
1156
1157	ENDDO
1158
1159	!$OMP DO
1160	DO j = nys, nyn
1161	DO k = 1, nz
1162
1163	CALL fft_x_1d( work_fftx(0:nx,k,j), 'forward' )
1164
1165	DO i = 0, nx
1166	work(j,k,i) = work_fftx(i,k,j)
1167	ENDDO
1168	ENDDO
1169
1170	ENDDO
1171	!$OMP END PARALLEL
1172
1173	ENDIF
1174	CALL cpu_log( log_point_s(4), 'fft_x_1d', 'pause' )
1175
1176	!
1177	!-- Transpose array
1178	#if defined( __parallel )
1179	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
1180	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
1181	CALL MPI_ALLTOALL( work(nys,1,0), sendrecvcount_xy, MPI_REAL, &
1182	f_out(1,1,nxl_y,1), sendrecvcount_xy, MPI_REAL, &
1183	comm1dy, ierr )
1184	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
1185	#else
1186	!
1187	!-- Next line required to avoid compile error about unused dummy argument in serial mode
1188	i = SIZE( f_out )
1189	#endif
1190
1191	END SUBROUTINE fftx_tr_xy
1192
1193
1194	!------------------------------------------------------------------------------!
1195	! Description:
1196	! ------------
1197	!> Transposition y --> x with a subsequent backward Fourier transformation for
1198	!> a 1d-decomposition along x.
1199	!------------------------------------------------------------------------------!
1200	SUBROUTINE tr_yx_fftx( f_in, f_out )
1201
1202
1203	USE control_parameters, &
1204	ONLY: loop_optimization
1205
1206	USE cpulog, &
1207	ONLY: cpu_log, log_point_s
1208
1209	USE kinds
1210
1211	USE pegrid
1212
1213	IMPLICIT NONE
1214
1215	INTEGER(iwp) :: i !<
1216	INTEGER(iwp) :: j !<
1217	INTEGER(iwp) :: k !<
1218
1219	REAL(wp), DIMENSION(0:nx,1:nz,nys:nyn) :: work_fftx !<
1220	REAL(wp), DIMENSION(nny,1:nz,nxl_y:nxr_y,pdims(2)) :: f_in !<
1221	REAL(wp), DIMENSION(1:nz,nys:nyn,0:nx) :: f_out !<
1222	REAL(wp), DIMENSION(nys:nyn,1:nz,0:nx) :: work !<
1223
1224
1225	!
1226	!-- Transpose array
1227	#if defined( __parallel )
1228	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
1229	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
1230	CALL MPI_ALLTOALL( f_in(1,1,nxl_y,1), sendrecvcount_xy, MPI_REAL, &
1231	work(nys,1,0), sendrecvcount_xy, MPI_REAL, &
1232	comm1dy, ierr )
1233	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
1234	#else
1235	!
1236	!-- Next line required to avoid compile error about unused dummy argument in serial mode
1237	i = SIZE( f_in )
1238	#endif
1239
1240	!
1241	!-- Carry out the FFT along x, where all data are present due to the
1242	!-- 1d-decomposition along y. Resort the data in a way that y becomes
1243	!-- the first index.
1244	CALL cpu_log( log_point_s(4), 'fft_x_1d', 'continue' )
1245
1246	IF ( loop_optimization == 'vector' ) THEN
1247	!
1248	!-- Code optimized for vector processors
1249	!$OMP PARALLEL PRIVATE ( i, j, k )
1250	!$OMP DO
1251	DO j = nys, nyn
1252
1253	DO k = 1, nz
1254	DO i = 0, nx
1255	work_fftx(i,k,j) = work(j,k,i)
1256	ENDDO
1257	ENDDO
1258
1259	CALL fft_x_m( work_fftx(:,:,j), 'backward' )
1260
1261	ENDDO
1262
1263	!$OMP DO
1264	DO i = 0, nx
1265	DO j = nys, nyn
1266	DO k = 1, nz
1267	f_out(k,j,i) = work_fftx(i,k,j)
1268	ENDDO
1269	ENDDO
1270	ENDDO
1271	!$OMP END PARALLEL
1272
1273	ELSE
1274
1275	!
1276	!-- Cache optimized code (there might be still a potential for better
1277	!-- optimization).
1278	!$OMP PARALLEL PRIVATE (i,j,k)
1279	!$OMP DO
1280	DO j = nys, nyn
1281	DO k = 1, nz
1282
1283	DO i = 0, nx
1284	work_fftx(i,k,j) = work(j,k,i)
1285	ENDDO
1286
1287	CALL fft_x_1d( work_fftx(0:nx,k,j), 'backward' )
1288
1289	ENDDO
1290	ENDDO
1291
1292	!$OMP DO
1293	DO i = 0, nx
1294	DO j = nys, nyn
1295	DO k = 1, nz
1296	f_out(k,j,i) = work_fftx(i,k,j)
1297	ENDDO
1298	ENDDO
1299	ENDDO
1300	!$OMP END PARALLEL
1301
1302	ENDIF
1303	CALL cpu_log( log_point_s(4), 'fft_x_1d', 'stop' )
1304
1305	END SUBROUTINE tr_yx_fftx
1306
1307
1308	!------------------------------------------------------------------------------!
1309	! Description:
1310	! ------------
1311	!> FFT along y, solution of the tridiagonal system and backward FFT for
1312	!> a 1d-decomposition along y.
1313	!>
1314	!> @warning this subroutine may still not work for hybrid parallelization
1315	!> with OpenMP (for possible necessary changes see the original
1316	!> routine poisfft_hybrid, developed by Klaus Ketelsen, May 2002)
1317	!------------------------------------------------------------------------------!
1318	SUBROUTINE ffty_tri_ffty( ar )
1319
1320
1321	USE control_parameters, &
1322	ONLY: loop_optimization
1323
1324	USE cpulog, &
1325	ONLY: cpu_log, log_point_s
1326
1327	USE grid_variables, &
1328	ONLY: ddx2, ddy2
1329
1330	USE kinds
1331
1332	USE pegrid
1333
1334	IMPLICIT NONE
1335
1336	INTEGER(iwp) :: i !<
1337	INTEGER(iwp) :: j !<
1338	INTEGER(iwp) :: k !<
1339	INTEGER(iwp) :: m !<
1340	INTEGER(iwp) :: n !<
1341	!$ INTEGER(iwp) :: omp_get_thread_num !<
1342	INTEGER(iwp) :: tn !<
1343
1344	REAL(wp), DIMENSION(0:ny) :: work_ffty !<
1345	REAL(wp), DIMENSION(0:ny,1:nz) :: work_triy !<
1346	REAL(wp), DIMENSION(nny,1:nz,nxl_y:nxr_y,pdims(2)) :: ar !<
1347	REAL(wp), DIMENSION(:,:,:,:), ALLOCATABLE :: tri !<
1348
1349
1350	CALL cpu_log( log_point_s(39), 'fft_y_1d + tridia', 'start' )
1351
1352	ALLOCATE( tri(5,0:ny,0:nz-1,0:threads_per_task-1) )
1353
1354	tn = 0 ! Default thread number in case of one thread
1355	!$OMP PARALLEL DO PRIVATE ( i, j, k, m, n, tn, work_ffty, work_triy )
1356	DO i = nxl_y, nxr_y
1357
1358	!$ tn = omp_get_thread_num()
1359
1360	IF ( loop_optimization == 'vector' ) THEN
1361	!
1362	!-- Code optimized for vector processors
1363	DO k = 1, nz
1364
1365	m = 0
1366	DO n = 1, pdims(2)
1367	DO j = 1, nny
1368	work_triy(m,k) = ar(j,k,i,n)
1369	m = m + 1
1370	ENDDO
1371	ENDDO
1372
1373	ENDDO
1374
1375	CALL fft_y_m( work_triy, ny, 'forward' )
1376
1377	ELSE
1378	!
1379	!-- Cache optimized code
1380	DO k = 1, nz
1381
1382	m = 0
1383	DO n = 1, pdims(2)
1384	DO j = 1, nny
1385	work_ffty(m) = ar(j,k,i,n)
1386	m = m + 1
1387	ENDDO
1388	ENDDO
1389
1390	CALL fft_y_1d( work_ffty, 'forward' )
1391
1392	DO j = 0, ny
1393	work_triy(j,k) = work_ffty(j)
1394	ENDDO
1395
1396	ENDDO
1397
1398	ENDIF
1399
1400	!
1401	!-- Solve the linear equation system
1402	CALL tridia_1dd( ddy2, ddx2, ny, nx, i, work_triy, tri(:,:,:,tn) )
1403
1404	IF ( loop_optimization == 'vector' ) THEN
1405	!
1406	!-- Code optimized for vector processors
1407	CALL fft_y_m( work_triy, ny, 'backward' )
1408
1409	DO k = 1, nz
1410
1411	m = 0
1412	DO n = 1, pdims(2)
1413	DO j = 1, nny
1414	ar(j,k,i,n) = work_triy(m,k)
1415	m = m + 1
1416	ENDDO
1417	ENDDO
1418
1419	ENDDO
1420
1421	ELSE
1422	!
1423	!-- Cache optimized code
1424	DO k = 1, nz
1425
1426	DO j = 0, ny
1427	work_ffty(j) = work_triy(j,k)
1428	ENDDO
1429
1430	CALL fft_y_1d( work_ffty, 'backward' )
1431
1432	m = 0
1433	DO n = 1, pdims(2)
1434	DO j = 1, nny
1435	ar(j,k,i,n) = work_ffty(m)
1436	m = m + 1
1437	ENDDO
1438	ENDDO
1439
1440	ENDDO
1441
1442	ENDIF
1443
1444	ENDDO
1445
1446	DEALLOCATE( tri )
1447
1448	CALL cpu_log( log_point_s(39), 'fft_y_1d + tridia', 'stop' )
1449
1450	END SUBROUTINE ffty_tri_ffty
1451
1452	END MODULE poisfft_mod

Note: See TracBrowser for help on using the repository browser.

Download in other formats:

| Impressum | ©Leibniz Universität Hannover |