Home

Context Navigation

source: palm/trunk/SOURCE/poisfft_mod.f90 @ 4763

Last change on this file since 4763 was 4717, checked in by pavelkrc, 4 years ago
Fixes and optimizations of OpenMP parallelization, formatting of OpenMP directives
Property svn:keywords set to `Id`
File size: 45.7 KB

Line
1	!> @file poisfft_mod.f90
2	!--------------------------------------------------------------------------------------------------!
3	! This file is part of the PALM model system.
4	!
5	! PALM is free software: you can redistribute it and/or modify it under the terms of the GNU General
6	! Public License as published by the Free Software Foundation, either version 3 of the License, or
7	! (at your option) any later version.
8	!
9	! PALM is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the
10	! implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General
11	! Public License for more details.
12	!
13	! You should have received a copy of the GNU General Public License along with PALM. If not, see
14	! <http://www.gnu.org/licenses/>.
15	!
16	! Copyright 1997-2020 Leibniz Universitaet Hannover
17	!--------------------------------------------------------------------------------------------------!
18	!
19	!
20	! Current revisions:
21	! -----------------
22	!
23	!
24	! Former revisions:
25	! -----------------
26	! $Id: poisfft_mod.f90 4717 2020-09-30 22:27:40Z suehring $
27	! Formatting of OpenMP directives (J. Resler)
28	!
29	! 4671 2020-09-09 20:27:58Z pavelkrc
30	! OMP bugfix
31	!
32	! 4649 2020-08-25 12:11:17Z raasch
33	! File re-formatted to follow the PALM coding standard
34	!
35	! 4429 2020-02-27 15:24:30Z raasch
36	! Statements added to avoid compile errors due to unused dummy arguments in serial mode
37	!
38	! 4366 2020-01-09 08:12:43Z raasch
39	! Modification concerning NEC vectorizatio
40	!
41	! 4360 2020-01-07 11:25:50Z suehring
42	! Corrected "Former revisions" section
43	!
44	! 3690 2019-01-22 22:56:42Z knoop
45	! OpenACC port for SPEC
46	!
47	! Revision 1.1 1997/07/24 11:24:14 raasch
48	! Initial revision
49	!
50	!
51	!--------------------------------------------------------------------------------------------------!
52	! Description:
53	! ------------
54	!> Solves the Poisson equation with a 2D spectral method
55	!> d^2 p / dx^2 + d^2 p / dy^2 + d^2 p / dz^2 = s
56	!>
57	!> Input:
58	!> real ar contains (nnz,nny,nnx) elements of the velocity divergence, starting from (1,nys,nxl)
59	!>
60	!> Output:
61	!> real ar contains the solution for perturbation pressure p
62	!--------------------------------------------------------------------------------------------------!
63	MODULE poisfft_mod
64
65
66	USE fft_xy, &
67	ONLY: fft_init, &
68	fft_y, &
69	fft_y_1d, &
70	fft_y_m, &
71	fft_x, &
72	fft_x_1d, &
73	fft_x_m, &
74	temperton_fft_vec
75
76	USE indices, &
77	ONLY: nnx, &
78	nny, &
79	nx, &
80	nxl, &
81	nxr, &
82	ny, &
83	nys, &
84	nyn, &
85	nz
86
87	USE transpose_indices, &
88	ONLY: nxl_y, &
89	nxl_z, &
90	nxr_y, &
91	nxr_z, &
92	nys_x, &
93	nys_z, &
94	nyn_x, &
95	nyn_z, &
96	nzb_x, &
97	nzb_y, &
98	nzt_x, &
99	nzt_y
100
101	USE tridia_solver, &
102	ONLY: tridia_1dd, &
103	tridia_init, &
104	tridia_substi, &
105	tridia_substi_overlap
106
107	IMPLICIT NONE
108
109	LOGICAL, SAVE :: poisfft_initialized = .FALSE. !<
110
111	PRIVATE
112
113	PUBLIC poisfft, poisfft_init
114
115	INTERFACE poisfft
116	MODULE PROCEDURE poisfft
117	END INTERFACE poisfft
118
119	INTERFACE poisfft_init
120	MODULE PROCEDURE poisfft_init
121	END INTERFACE poisfft_init
122
123
124	CONTAINS
125
126	!--------------------------------------------------------------------------------------------------!
127	! Description:
128	! ------------
129	!> Setup coefficients for FFT and the tridiagonal solver
130	!--------------------------------------------------------------------------------------------------!
131	SUBROUTINE poisfft_init
132
133	IMPLICIT NONE
134
135
136	CALL fft_init
137
138	CALL tridia_init
139
140	poisfft_initialized = .TRUE.
141
142	END SUBROUTINE poisfft_init
143
144
145
146	!--------------------------------------------------------------------------------------------------!
147	! Description:
148	! ------------
149	!> Two-dimensional Fourier Transformation in x- and y-direction.
150	!--------------------------------------------------------------------------------------------------!
151	SUBROUTINE poisfft( ar )
152
153	USE control_parameters, &
154	ONLY: transpose_compute_overlap
155
156	USE cpulog, &
157	ONLY: cpu_log, &
158	cpu_log_nowait, &
159	log_point_s
160
161	USE kinds
162
163	USE pegrid
164
165	IMPLICIT NONE
166
167	INTEGER(iwp) :: ii !<
168	INTEGER(iwp) :: iind !<
169	INTEGER(iwp) :: inew !<
170	INTEGER(iwp) :: jj !<
171	INTEGER(iwp) :: jind !<
172	INTEGER(iwp) :: jnew !<
173	INTEGER(iwp) :: ki !<
174	INTEGER(iwp) :: kk !<
175	INTEGER(iwp) :: knew !<
176	INTEGER(iwp) :: n !<
177	INTEGER(iwp) :: nblk !<
178	INTEGER(iwp) :: nnx_y !<
179	INTEGER(iwp) :: nny_z !<
180	INTEGER(iwp) :: nnz_x !<
181	INTEGER(iwp) :: nxl_y_bound !<
182	INTEGER(iwp) :: nxr_y_bound !<
183
184	INTEGER(iwp), DIMENSION(4) :: isave !<
185
186	REAL(wp), DIMENSION(1:nz,nys:nyn,nxl:nxr) :: ar !<
187	REAL(wp), DIMENSION(nys:nyn,nxl:nxr,1:nz) :: ar_inv !<
188
189	#define __acc_fft_device ( defined( _OPENACC ) && ( defined ( __cuda_fft ) ) )
190	#if __acc_fft_device
191	!$ACC DECLARE CREATE(ar_inv)
192	#endif
193
194	REAL(wp), DIMENSION(:,:,:), ALLOCATABLE :: ar1 !<
195	REAL(wp), DIMENSION(:,:,:), ALLOCATABLE :: f_in !<
196	REAL(wp), DIMENSION(:,:,:), ALLOCATABLE :: f_inv !<
197	REAL(wp), DIMENSION(:,:,:), ALLOCATABLE :: f_out_y !<
198	REAL(wp), DIMENSION(:,:,:), ALLOCATABLE :: f_out_z !<
199
200
201	CALL cpu_log( log_point_s(3), 'poisfft', 'start' )
202
203	IF ( .NOT. poisfft_initialized ) CALL poisfft_init
204
205	#if !__acc_fft_device
206	!$ACC UPDATE HOST(ar)
207	#endif
208
209	#ifndef _OPENACC
210	!
211	!-- Two-dimensional Fourier Transformation in x- and y-direction.
212	IF ( pdims(2) == 1 .AND. pdims(1) > 1 ) THEN
213
214	!
215	!-- 1d-domain-decomposition along x:
216	!-- FFT along y and transposition y --> x
217	CALL ffty_tr_yx( ar, ar )
218
219	!
220	!-- FFT along x, solving the tridiagonal system and backward FFT
221	CALL fftx_tri_fftx( ar )
222
223	!
224	!-- Transposition x --> y and backward FFT along y
225	CALL tr_xy_ffty( ar, ar )
226
227	ELSEIF ( pdims(1) == 1 .AND. pdims(2) > 1 ) THEN
228
229	!
230	!-- 1d-domain-decomposition along y:
231	!-- FFT along x and transposition x --> y
232	CALL fftx_tr_xy( ar, ar )
233
234	!
235	!-- FFT along y, solving the tridiagonal system and backward FFT
236	CALL ffty_tri_ffty( ar )
237
238	!
239	!-- Transposition y --> x and backward FFT along x
240	CALL tr_yx_fftx( ar, ar )
241
242	ELSEIF ( .NOT. transpose_compute_overlap ) THEN
243	#endif
244
245	!
246	!-- 2d-domain-decomposition or no decomposition (1 PE run)
247	!-- Transposition z --> x
248	CALL cpu_log( log_point_s(5), 'transpo forward', 'start' )
249	CALL resort_for_zx( ar, ar_inv )
250	CALL transpose_zx( ar_inv, ar )
251	CALL cpu_log( log_point_s(5), 'transpo forward', 'pause' )
252
253	CALL cpu_log( log_point_s(4), 'fft_x', 'start' )
254	IF ( temperton_fft_vec ) THEN
255	!
256	!-- Vector version outputs a transformed array ar_inv that does not require resorting
257	!-- (which is done for ar further below)
258	CALL fft_x( ar, 'forward', ar_inv=ar_inv)
259	ELSE
260	CALL fft_x( ar, 'forward')
261	ENDIF
262	CALL cpu_log( log_point_s(4), 'fft_x', 'pause' )
263
264	!
265	!-- Transposition x --> y
266	CALL cpu_log( log_point_s(5), 'transpo forward', 'continue' )
267	IF( .NOT. temperton_fft_vec ) CALL resort_for_xy( ar, ar_inv )
268	CALL transpose_xy( ar_inv, ar )
269	CALL cpu_log( log_point_s(5), 'transpo forward', 'pause' )
270
271	CALL cpu_log( log_point_s(7), 'fft_y', 'start' )
272	IF ( temperton_fft_vec ) THEN
273	!
274	!-- Input array ar_inv from fft_x can be directly used here.
275	!-- The output (also in array ar_inv) does not require resorting below.
276	CALL fft_y( ar, 'forward', ar_inv = ar_inv, nxl_y_bound = nxl_y, nxr_y_bound = nxr_y, &
277	nxl_y_l = nxl_y, nxr_y_l = nxr_y )
278	ELSE
279	CALL fft_y( ar, 'forward', ar_tr = ar, nxl_y_bound = nxl_y, nxr_y_bound = nxr_y, &
280	nxl_y_l = nxl_y, nxr_y_l = nxr_y )
281	ENDIF
282	CALL cpu_log( log_point_s(7), 'fft_y', 'pause' )
283
284	!
285	!-- Transposition y --> z
286	CALL cpu_log( log_point_s(5), 'transpo forward', 'continue' )
287	IF ( .NOT. temperton_fft_vec ) CALL resort_for_yz( ar, ar_inv )
288	CALL transpose_yz( ar_inv, ar )
289	CALL cpu_log( log_point_s(5), 'transpo forward', 'stop' )
290
291	!
292	!-- Solve the tridiagonal equation system along z
293	CALL cpu_log( log_point_s(6), 'tridia', 'start' )
294	CALL tridia_substi( ar )
295	CALL cpu_log( log_point_s(6), 'tridia', 'stop' )
296
297	!
298	!-- Inverse Fourier Transformation
299	!-- Transposition z --> y
300	CALL cpu_log( log_point_s(8), 'transpo invers', 'start' )
301	CALL transpose_zy( ar, ar_inv )
302	!
303	!-- The fft_y below (vector branch) can directly process ar_inv (i.e. does not require a
304	!-- resorting)
305	IF ( .NOT. temperton_fft_vec ) CALL resort_for_zy( ar_inv, ar )
306	CALL cpu_log( log_point_s(8), 'transpo invers', 'pause' )
307
308	CALL cpu_log( log_point_s(7), 'fft_y', 'continue' )
309	IF ( temperton_fft_vec ) THEN
310	!
311	!-- Output array ar_inv can be used as input to the below fft_x routine without resorting
312	CALL fft_y( ar, 'backward', ar_inv = ar_inv, nxl_y_bound = nxl_y, nxr_y_bound = nxr_y, &
313	nxl_y_l = nxl_y, nxr_y_l = nxr_y )
314	ELSE
315	CALL fft_y( ar, 'backward', ar_tr = ar, nxl_y_bound = nxl_y, nxr_y_bound = nxr_y, &
316	nxl_y_l = nxl_y, nxr_y_l = nxr_y )
317	ENDIF
318
319	CALL cpu_log( log_point_s(7), 'fft_y', 'stop' )
320
321	!
322	!-- Transposition y --> x
323	CALL cpu_log( log_point_s(8), 'transpo invers', 'continue' )
324	CALL transpose_yx( ar, ar_inv )
325	IF ( .NOT. temperton_fft_vec ) CALL resort_for_yx( ar_inv, ar )
326	CALL cpu_log( log_point_s(8), 'transpo invers', 'pause' )
327
328	CALL cpu_log( log_point_s(4), 'fft_x', 'continue' )
329	IF ( temperton_fft_vec ) THEN
330	CALL fft_x( ar, 'backward', ar_inv=ar_inv )
331	ELSE
332	CALL fft_x( ar, 'backward' )
333	ENDIF
334	CALL cpu_log( log_point_s(4), 'fft_x', 'stop' )
335
336	!
337	!-- Transposition x --> z
338	CALL cpu_log( log_point_s(8), 'transpo invers', 'continue' )
339	CALL transpose_xz( ar, ar_inv )
340	CALL resort_for_xz( ar_inv, ar )
341	CALL cpu_log( log_point_s(8), 'transpo invers', 'stop' )
342
343	#ifndef _OPENACC
344	ELSE
345
346	!
347	!-- 2d-domain-decomposition or no decomposition (1 PE run) with overlapping transposition / fft
348	!-- cputime logging must not use barriers, which would prevent overlapping
349	ALLOCATE( f_out_y(0:ny,nxl_y:nxr_y,nzb_y:nzt_y), &
350	f_out_z(0:nx,nys_x:nyn_x,nzb_x:nzt_x) )
351	!
352	!-- Transposition z --> x + subsequent fft along x
353	ALLOCATE( f_inv(nys:nyn,nxl:nxr,1:nz) )
354	CALL resort_for_zx( ar, f_inv )
355	!
356	!-- Save original indices and gridpoint counter
357	isave(1) = nz
358	isave(2) = nzb_x
359	isave(3) = nzt_x
360	isave(4) = sendrecvcount_zx
361	!
362	!-- Set new indices for transformation
363	nblk = nz / pdims(1)
364	nz = pdims(1)
365	nnz_x = 1
366	nzb_x = 1 + myidx * nnz_x
367	nzt_x = ( myidx + 1 ) * nnz_x
368	sendrecvcount_zx = nnx * nny * nnz_x
369
370	ALLOCATE( ar1(0:nx,nys_x:nyn_x,nzb_x:nzt_x) )
371	ALLOCATE( f_in(nys:nyn,nxl:nxr,1:nz) )
372
373	DO kk = 1, nblk
374
375	IF ( kk == 1 ) THEN
376	CALL cpu_log( log_point_s(5), 'transpo forward', 'start', cpu_log_nowait )
377	ELSE
378	CALL cpu_log( log_point_s(5), 'transpo forward', 'continue', cpu_log_nowait )
379	ENDIF
380
381	DO knew = 1, nz
382	ki = kk + nblk * ( knew - 1 )
383	f_in(:,:,knew) = f_inv(:,:,ki)
384	ENDDO
385
386	CALL transpose_zx( f_in, ar1(:,:,:))
387	CALL cpu_log( log_point_s(5), 'transpo forward', 'pause' )
388
389	IF ( kk == 1 ) THEN
390	CALL cpu_log( log_point_s(4), 'fft_x', 'start', cpu_log_nowait )
391	ELSE
392	CALL cpu_log( log_point_s(4), 'fft_x', 'continue', cpu_log_nowait )
393	ENDIF
394
395	n = isave(2) + kk - 1
396	CALL fft_x( ar1(:,:,:), 'forward', ar_2d = f_out_z(:,:,n) )
397	CALL cpu_log( log_point_s(4), 'fft_x', 'pause' )
398
399	ENDDO
400	!
401	!-- Restore original indices/counters
402	nz = isave(1)
403	nzb_x = isave(2)
404	nzt_x = isave(3)
405	sendrecvcount_zx = isave(4)
406
407	DEALLOCATE( ar1, f_in, f_inv )
408
409	!
410	!-- Transposition x --> y + subsequent fft along y
411	ALLOCATE( f_inv(nys_x:nyn_x,nzb_x:nzt_x,0:nx) )
412	CALL resort_for_xy( f_out_z, f_inv )
413	!
414	!-- Save original indices and gridpoint counter
415	isave(1) = nx
416	isave(2) = nxl_y
417	isave(3) = nxr_y
418	isave(4) = sendrecvcount_xy
419	!
420	!-- Set new indices for transformation
421	nblk = ( ( nx+1 ) / pdims(2) ) - 1
422	nx = pdims(2)
423	nnx_y = 1
424	nxl_y = myidy * nnx_y
425	nxr_y = ( myidy + 1 ) * nnx_y - 1
426	sendrecvcount_xy = nnx_y * ( nyn_x-nys_x+1 ) * ( nzt_x-nzb_x+1 )
427
428	ALLOCATE( ar1(0:ny,nxl_y:nxr_y,nzb_y:nzt_y) )
429	ALLOCATE( f_in(nys_x:nyn_x,nzb_x:nzt_x,0:nx) )
430
431	DO ii = 0, nblk
432
433	CALL cpu_log( log_point_s(5), 'transpo forward', 'continue', cpu_log_nowait )
434
435	DO inew = 0, nx-1
436	iind = ii + ( nblk + 1 ) * inew
437	f_in(:,:,inew) = f_inv(:,:,iind)
438	ENDDO
439
440	CALL transpose_xy( f_in, ar1(:,:,:) )
441
442	CALL cpu_log( log_point_s(5), 'transpo forward', 'pause' )
443
444	IF ( ii == 1 ) THEN
445	CALL cpu_log( log_point_s(7), 'fft_y', 'start', cpu_log_nowait )
446	ELSE
447	CALL cpu_log( log_point_s(7), 'fft_y', 'continue', cpu_log_nowait )
448	ENDIF
449
450	nxl_y_bound = isave(2)
451	nxr_y_bound = isave(3)
452	n = isave(2) + ii
453	CALL fft_y( ar1(:,:,:), 'forward', ar_tr = f_out_y, nxl_y_bound = nxl_y_bound, &
454	nxr_y_bound = nxr_y_bound, nxl_y_l = n, nxr_y_l = n )
455
456	CALL cpu_log( log_point_s(7), 'fft_y', 'pause' )
457
458	ENDDO
459	!
460	!-- Restore original indices/counters
461	nx = isave(1)
462	nxl_y = isave(2)
463	nxr_y = isave(3)
464	sendrecvcount_xy = isave(4)
465
466	DEALLOCATE( ar1, f_in, f_inv )
467
468	!
469	!-- Transposition y --> z + subsequent tridia + resort for z --> y
470	ALLOCATE( f_inv(nxl_y:nxr_y,nzb_y:nzt_y,0:ny) )
471	CALL resort_for_yz( f_out_y, f_inv )
472	!
473	!-- Save original indices and gridpoint counter
474	isave(1) = ny
475	isave(2) = nys_z
476	isave(3) = nyn_z
477	isave(4) = sendrecvcount_yz
478	!
479	!-- Set new indices for transformation
480	nblk = ( ( ny+1 ) / pdims(1) ) - 1
481	ny = pdims(1)
482	nny_z = 1
483	nys_z = myidx * nny_z
484	nyn_z = ( myidx + 1 ) * nny_z - 1
485	sendrecvcount_yz = ( nxr_y-nxl_y+1 ) * nny_z * ( nzt_y-nzb_y+1 )
486
487	ALLOCATE( ar1(nxl_z:nxr_z,nys_z:nyn_z,1:nz) )
488	ALLOCATE( f_in(nxl_y:nxr_y,nzb_y:nzt_y,0:ny) )
489
490	DO jj = 0, nblk
491	!
492	!-- Forward Fourier Transformation
493	!-- Transposition y --> z
494	CALL cpu_log( log_point_s(5), 'transpo forward', 'continue', cpu_log_nowait )
495
496	DO jnew = 0, ny-1
497	jind = jj + ( nblk + 1 ) * jnew
498	f_in(:,:,jnew) = f_inv(:,:,jind)
499	ENDDO
500
501	CALL transpose_yz( f_in, ar1(:,:,:) )
502
503	IF ( jj == nblk ) THEN
504	CALL cpu_log( log_point_s(5), 'transpo forward', 'stop' )
505	ELSE
506	CALL cpu_log( log_point_s(5), 'transpo forward', 'pause' )
507	ENDIF
508
509	!
510	!-- Solve the tridiagonal equation system along z
511	CALL cpu_log( log_point_s(6), 'tridia', 'start', cpu_log_nowait )
512
513	n = isave(2) + jj
514	CALL tridia_substi_overlap( ar1(:,:,:), n )
515
516	CALL cpu_log( log_point_s(6), 'tridia', 'stop' )
517
518	!
519	!-- Inverse Fourier Transformation
520	!-- Transposition z --> y
521	!-- Only one thread should call MPI routines, therefore forward and backward tranpose are in
522	!-- the same section
523	IF ( jj == 0 ) THEN
524	CALL cpu_log( log_point_s(8), 'transpo invers', 'start', cpu_log_nowait )
525	ELSE
526	CALL cpu_log( log_point_s(8), 'transpo invers', 'continue', cpu_log_nowait )
527	ENDIF
528
529	CALL transpose_zy( ar1(:,:,:), f_in )
530
531	DO jnew = 0, ny-1
532	jind = jj + ( nblk + 1 ) * jnew
533	f_inv(:,:,jind) = f_in(:,:,jnew)
534	ENDDO
535
536	CALL cpu_log( log_point_s(8), 'transpo invers', 'pause' )
537
538	ENDDO
539	!
540	!-- Restore original indices/counters
541	ny = isave(1)
542	nys_z = isave(2)
543	nyn_z = isave(3)
544	sendrecvcount_yz = isave(4)
545
546	CALL resort_for_zy( f_inv, f_out_y )
547
548	DEALLOCATE( ar1, f_in, f_inv )
549
550	!
551	!-- fft along y backward + subsequent transposition y --> x
552	ALLOCATE( f_inv(nys_x:nyn_x,nzb_x:nzt_x,0:nx) )
553	!
554	!-- Save original indices and gridpoint counter
555	isave(1) = nx
556	isave(2) = nxl_y
557	isave(3) = nxr_y
558	isave(4) = sendrecvcount_xy
559	!
560	!-- Set new indices for transformation
561	nblk = ( ( nx+1 ) / pdims(2) ) - 1
562	nx = pdims(2)
563	nnx_y = 1
564	nxl_y = myidy * nnx_y
565	nxr_y = ( myidy + 1 ) * nnx_y - 1
566	sendrecvcount_xy = nnx_y * ( nyn_x-nys_x+1 ) * ( nzt_x-nzb_x+1 )
567
568	ALLOCATE( ar1(0:ny,nxl_y:nxr_y,nzb_y:nzt_y) )
569	ALLOCATE( f_in(nys_x:nyn_x,nzb_x:nzt_x,0:nx) )
570
571	DO ii = 0, nblk
572
573	CALL cpu_log( log_point_s(7), 'fft_y', 'continue', cpu_log_nowait )
574
575	n = isave(2) + ii
576	nxl_y_bound = isave(2)
577	nxr_y_bound = isave(3)
578
579	CALL fft_y( ar1(:,:,:), 'backward', ar_tr = f_out_y, nxl_y_bound = nxl_y_bound, &
580	nxr_y_bound = nxr_y_bound, nxl_y_l = n, nxr_y_l = n )
581
582	IF ( ii == nblk ) THEN
583	CALL cpu_log( log_point_s(7), 'fft_y', 'stop' )
584	ELSE
585	CALL cpu_log( log_point_s(7), 'fft_y', 'pause' )
586	ENDIF
587
588	CALL cpu_log( log_point_s(8), 'transpo invers', 'continue', cpu_log_nowait )
589
590	CALL transpose_yx( ar1(:,:,:), f_in )
591
592	DO inew = 0, nx-1
593	iind = ii + (nblk+1) * inew
594	f_inv(:,:,iind) = f_in(:,:,inew)
595	ENDDO
596
597	CALL cpu_log( log_point_s(8), 'transpo invers', 'pause' )
598
599	ENDDO
600	!
601	!-- Restore original indices/counters
602	nx = isave(1)
603	nxl_y = isave(2)
604	nxr_y = isave(3)
605	sendrecvcount_xy = isave(4)
606
607	CALL resort_for_yx( f_inv, f_out_z )
608
609	DEALLOCATE( ar1, f_in, f_inv )
610
611	!
612	!-- fft along x backward + subsequent final transposition x --> z
613	ALLOCATE( f_inv(nys:nyn,nxl:nxr,1:nz) )
614	!
615	!-- Save original indices and gridpoint counter
616	isave(1) = nz
617	isave(2) = nzb_x
618	isave(3) = nzt_x
619	isave(4) = sendrecvcount_zx
620	!
621	!-- Set new indices for transformation
622	nblk = nz / pdims(1)
623	nz = pdims(1)
624	nnz_x = 1
625	nzb_x = 1 + myidx * nnz_x
626	nzt_x = ( myidx + 1 ) * nnz_x
627	sendrecvcount_zx = nnx * nny * nnz_x
628
629	ALLOCATE( ar1(0:nx,nys_x:nyn_x,nzb_x:nzt_x) )
630	ALLOCATE( f_in(nys:nyn,nxl:nxr,1:nz) )
631
632	DO kk = 1, nblk
633
634	CALL cpu_log( log_point_s(4), 'fft_x', 'continue', cpu_log_nowait )
635
636	n = isave(2) + kk - 1
637	CALL fft_x( ar1(:,:,:), 'backward', f_out_z(:,:,n) )
638
639	IF ( kk == nblk ) THEN
640	CALL cpu_log( log_point_s(4), 'fft_x', 'stop' )
641	ELSE
642	CALL cpu_log( log_point_s(4), 'fft_x', 'pause' )
643	ENDIF
644
645	CALL cpu_log( log_point_s(8), 'transpo invers', 'continue', cpu_log_nowait )
646
647	CALL transpose_xz( ar1(:,:,:), f_in )
648
649	DO knew = 1, nz
650	ki = kk + nblk * ( knew - 1 )
651	f_inv(:,:,ki) = f_in(:,:,knew)
652	ENDDO
653
654	IF ( kk == nblk ) THEN
655	CALL cpu_log( log_point_s(8), 'transpo invers', 'stop' )
656	ELSE
657	CALL cpu_log( log_point_s(8), 'transpo invers', 'pause' )
658	ENDIF
659
660	ENDDO
661	!
662	!-- Restore original indices/counters
663	nz = isave(1)
664	nzb_x = isave(2)
665	nzt_x = isave(3)
666	sendrecvcount_zx = isave(4)
667
668	CALL resort_for_xz( f_inv, ar )
669
670	DEALLOCATE( ar1, f_in, f_inv )
671
672	ENDIF
673	#endif
674
675	#if !__acc_fft_device
676	!$ACC UPDATE DEVICE(ar)
677	#endif
678
679	CALL cpu_log( log_point_s(3), 'poisfft', 'stop' )
680
681	END SUBROUTINE poisfft
682
683
684	!--------------------------------------------------------------------------------------------------!
685	! Description:
686	! ------------
687	!> Fourier-transformation along y with subsequent transposition y --> x for a 1d-decomposition
688	!> along x.
689	!>
690	!> @attention The performance of this routine is much faster on the NEC-SX6, if the first index of
691	!> work_ffty_vec is odd. Otherwise memory bank conflicts may occur (especially if the
692	!> index is a multiple of 128). That's why work_ffty_vec is dimensioned as 0:ny+1.
693	!> Of course, this will not work if users are using an odd number of gridpoints along y.
694	!--------------------------------------------------------------------------------------------------!
695	SUBROUTINE ffty_tr_yx( f_in, f_out )
696
697	USE control_parameters, &
698	ONLY: loop_optimization
699
700	USE cpulog, &
701	ONLY: cpu_log, &
702	log_point_s
703
704	USE kinds
705
706	USE pegrid
707
708	IMPLICIT NONE
709
710	INTEGER(iwp) :: i !<
711	INTEGER(iwp) :: iend !<
712	INTEGER(iwp) :: iouter !<
713	INTEGER(iwp) :: ir !<
714	INTEGER(iwp) :: j !<
715	INTEGER(iwp) :: k !<
716
717	INTEGER(iwp), PARAMETER :: stridex = 4 !<
718
719	REAL(wp), DIMENSION(1:nz,0:ny,nxl:nxr) :: f_in !<
720	REAL(wp), DIMENSION(nnx,1:nz,nys_x:nyn_x,pdims(1)) :: f_out !<
721	REAL(wp), DIMENSION(nxl:nxr,1:nz,0:ny) :: work !<
722
723	REAL(wp), DIMENSION(:,:), ALLOCATABLE :: work_ffty !<
724	REAL(wp), DIMENSION(:,:,:), ALLOCATABLE :: work_ffty_vec !<
725
726	!
727	!-- Carry out the FFT along y, where all data are present due to the 1d-decomposition along x.
728	!-- Resort the data in a way that x becomes the first index.
729	CALL cpu_log( log_point_s(7), 'fft_y_1d', 'start' )
730
731	IF ( loop_optimization == 'vector' ) THEN
732
733	ALLOCATE( work_ffty_vec(0:ny+1,1:nz,nxl:nxr) )
734	!
735	!-- Code optimized for vector processors
736	!$OMP PARALLEL PRIVATE ( i, j, k )
737	!$OMP DO
738	DO i = nxl, nxr
739
740	DO j = 0, ny
741	DO k = 1, nz
742	work_ffty_vec(j,k,i) = f_in(k,j,i)
743	ENDDO
744	ENDDO
745
746	CALL fft_y_m( work_ffty_vec(:,:,i), ny+1, 'forward' )
747
748	ENDDO
749
750	!$OMP DO
751	DO k = 1, nz
752	DO j = 0, ny
753	DO i = nxl, nxr
754	work(i,k,j) = work_ffty_vec(j,k,i)
755	ENDDO
756	ENDDO
757	ENDDO
758	!$OMP END PARALLEL
759
760	DEALLOCATE( work_ffty_vec )
761
762	ELSE
763	!
764	!-- Cache optimized code.
765	ALLOCATE( work_ffty(0:ny,stridex) )
766	!
767	!-- The i-(x-)direction is split into a strided outer loop and an inner loop for better cache
768	!-- performance
769	!$OMP PARALLEL PRIVATE (i,iend,iouter,ir,j,k,work_ffty)
770	!$OMP DO
771	DO iouter = nxl, nxr, stridex
772
773	iend = MIN( iouter+stridex-1, nxr ) ! Upper bound for inner i loop
774
775	DO k = 1, nz
776
777	DO i = iouter, iend
778
779	ir = i-iouter+1 ! Counter within a stride
780	DO j = 0, ny
781	work_ffty(j,ir) = f_in(k,j,i)
782	ENDDO
783	!
784	!-- FFT along y
785	CALL fft_y_1d( work_ffty(:,ir), 'forward' )
786
787	ENDDO
788
789	!
790	!-- Resort
791	DO j = 0, ny
792	DO i = iouter, iend
793	work(i,k,j) = work_ffty(j,i-iouter+1)
794	ENDDO
795	ENDDO
796
797	ENDDO
798
799	ENDDO
800	!$OMP END PARALLEL
801
802	DEALLOCATE( work_ffty )
803
804	ENDIF
805
806	CALL cpu_log( log_point_s(7), 'fft_y_1d', 'pause' )
807
808	!
809	!-- Transpose array
810	#if defined( __parallel )
811	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
812	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
813	CALL MPI_ALLTOALL( work(nxl,1,0), sendrecvcount_xy, MPI_REAL, f_out(1,1,nys_x,1), &
814	sendrecvcount_xy, MPI_REAL, comm1dx, ierr )
815	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
816	#else
817	!
818	!-- Next line required to avoid compile error about unused dummy argument in serial mode
819	i = SIZE( f_out )
820	#endif
821
822	END SUBROUTINE ffty_tr_yx
823
824
825	!--------------------------------------------------------------------------------------------------!
826	! Description:
827	! ------------
828	!> Transposition x --> y with a subsequent backward Fourier transformation for a 1d-decomposition
829	!> along x
830	!--------------------------------------------------------------------------------------------------!
831	SUBROUTINE tr_xy_ffty( f_in, f_out )
832
833	USE control_parameters, &
834	ONLY: loop_optimization
835
836	USE cpulog, &
837	ONLY: cpu_log, &
838	log_point_s
839
840	USE kinds
841
842	USE pegrid
843
844	IMPLICIT NONE
845
846	INTEGER(iwp) :: i !<
847	INTEGER(iwp) :: iend !<
848	INTEGER(iwp) :: iouter !<
849	INTEGER(iwp) :: ir !<
850	INTEGER(iwp) :: j !<
851	INTEGER(iwp) :: k !<
852
853	INTEGER(iwp), PARAMETER :: stridex = 4 !<
854
855	REAL(wp), DIMENSION(nnx,1:nz,nys_x:nyn_x,pdims(1)) :: f_in !<
856	REAL(wp), DIMENSION(1:nz,0:ny,nxl:nxr) :: f_out !<
857	REAL(wp), DIMENSION(nxl:nxr,1:nz,0:ny) :: work !<
858
859	REAL(wp), DIMENSION(:,:), ALLOCATABLE :: work_ffty !<
860	REAL(wp), DIMENSION(:,:,:), ALLOCATABLE :: work_ffty_vec !<
861
862	!
863	!-- Transpose array
864	#if defined( __parallel )
865	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
866	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
867	CALL MPI_ALLTOALL( f_in(1,1,nys_x,1), sendrecvcount_xy, MPI_REAL, work(nxl,1,0), &
868	sendrecvcount_xy, MPI_REAL, comm1dx, ierr )
869	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
870	#else
871	!
872	!-- Next line required to avoid compile error about unused dummy argument in serial mode
873	i = SIZE( f_in )
874	#endif
875
876	!
877	!-- Resort the data in a way that y becomes the first index and carry out the backward fft along y.
878	CALL cpu_log( log_point_s(7), 'fft_y_1d', 'continue' )
879
880	IF ( loop_optimization == 'vector' ) THEN
881
882	ALLOCATE( work_ffty_vec(0:ny+1,1:nz,nxl:nxr) )
883	!
884	!-- Code optimized for vector processors
885	!$OMP PARALLEL PRIVATE ( i, j, k )
886	!$OMP DO
887	DO k = 1, nz
888	DO j = 0, ny
889	DO i = nxl, nxr
890	work_ffty_vec(j,k,i) = work(i,k,j)
891	ENDDO
892	ENDDO
893	ENDDO
894
895	!$OMP DO
896	DO i = nxl, nxr
897
898	CALL fft_y_m( work_ffty_vec(:,:,i), ny+1, 'backward' )
899
900	DO j = 0, ny
901	DO k = 1, nz
902	f_out(k,j,i) = work_ffty_vec(j,k,i)
903	ENDDO
904	ENDDO
905
906	ENDDO
907	!$OMP END PARALLEL
908
909	DEALLOCATE( work_ffty_vec )
910
911	ELSE
912	!
913	!-- Cache optimized code.
914	ALLOCATE( work_ffty(0:ny,stridex) )
915	!
916	!-- The i-(x-)direction is split into a strided outer loop and an inner loop for better cache
917	!-- performance
918	!$OMP PARALLEL PRIVATE ( i, iend, iouter, ir, j, k, work_ffty )
919	!$OMP DO
920	DO iouter = nxl, nxr, stridex
921
922	iend = MIN( iouter+stridex-1, nxr ) ! Upper bound for inner i loop
923
924	DO k = 1, nz
925	!
926	!-- Resort
927	DO j = 0, ny
928	DO i = iouter, iend
929	work_ffty(j,i-iouter+1) = work(i,k,j)
930	ENDDO
931	ENDDO
932
933	DO i = iouter, iend
934
935	!
936	!-- FFT along y
937	ir = i-iouter+1 ! Counter within a stride
938	CALL fft_y_1d( work_ffty(:,ir), 'backward' )
939
940	DO j = 0, ny
941	f_out(k,j,i) = work_ffty(j,ir)
942	ENDDO
943	ENDDO
944
945	ENDDO
946
947	ENDDO
948	!$OMP END PARALLEL
949
950	DEALLOCATE( work_ffty )
951
952	ENDIF
953
954	CALL cpu_log( log_point_s(7), 'fft_y_1d', 'stop' )
955
956	END SUBROUTINE tr_xy_ffty
957
958
959	!--------------------------------------------------------------------------------------------------!
960	! Description:
961	! ------------
962	!> FFT along x, solution of the tridiagonal system and backward FFT for a 1d-decomposition along x
963	!>
964	!> @warning This subroutine may still not work for hybrid parallelization with OpenMP (for possible
965	!> necessary changes see the original routine poisfft_hybrid, developed by Klaus Ketelsen,
966	!> May 2002)
967	!--------------------------------------------------------------------------------------------------!
968	SUBROUTINE fftx_tri_fftx( ar )
969
970	USE control_parameters, &
971	ONLY: loop_optimization
972
973	USE cpulog, &
974	ONLY: cpu_log, &
975	log_point_s
976
977	USE grid_variables, &
978	ONLY: ddx2, &
979	ddy2
980
981	USE kinds
982
983	USE pegrid
984
985	IMPLICIT NONE
986
987	INTEGER(iwp) :: i !<
988	INTEGER(iwp) :: j !<
989	INTEGER(iwp) :: k !<
990	INTEGER(iwp) :: m !<
991	INTEGER(iwp) :: n !<
992	!$ INTEGER(iwp) :: omp_get_thread_num !<
993	INTEGER(iwp) :: tn !<
994
995	REAL(wp), DIMENSION(0:nx) :: work_fftx !<
996	REAL(wp), DIMENSION(0:nx,1:nz) :: work_trix !<
997	REAL(wp), DIMENSION(nnx,1:nz,nys_x:nyn_x,pdims(1)) :: ar !<
998	REAL(wp), DIMENSION(:,:,:,:), ALLOCATABLE :: tri !<
999
1000
1001	CALL cpu_log( log_point_s(33), 'fft_x_1d + tridia', 'start' )
1002
1003	ALLOCATE( tri(5,0:nx,0:nz-1,0:threads_per_task-1) )
1004
1005	tn = 0 ! Default thread number in case of one thread
1006	!$OMP PARALLEL DO PRIVATE ( i, j, k, m, n, tn, work_fftx, work_trix )
1007	DO j = nys_x, nyn_x
1008
1009	!$ tn = omp_get_thread_num()
1010
1011	IF ( loop_optimization == 'vector' ) THEN
1012	!
1013	!-- Code optimized for vector processors
1014	DO k = 1, nz
1015
1016	m = 0
1017	DO n = 1, pdims(1)
1018	DO i = 1, nnx
1019	work_trix(m,k) = ar(i,k,j,n)
1020	m = m + 1
1021	ENDDO
1022	ENDDO
1023
1024	ENDDO
1025
1026	CALL fft_x_m( work_trix, 'forward' )
1027
1028	ELSE
1029	!
1030	!-- Cache optimized code
1031	DO k = 1, nz
1032
1033	m = 0
1034	DO n = 1, pdims(1)
1035	DO i = 1, nnx
1036	work_fftx(m) = ar(i,k,j,n)
1037	m = m + 1
1038	ENDDO
1039	ENDDO
1040
1041	CALL fft_x_1d( work_fftx, 'forward' )
1042
1043	DO i = 0, nx
1044	work_trix(i,k) = work_fftx(i)
1045	ENDDO
1046
1047	ENDDO
1048
1049	ENDIF
1050
1051	!
1052	!-- Solve the linear equation system
1053	CALL tridia_1dd( ddx2, ddy2, nx, ny, j, work_trix, tri(:,:,:,tn) )
1054
1055	IF ( loop_optimization == 'vector' ) THEN
1056	!
1057	!-- Code optimized for vector processors
1058	CALL fft_x_m( work_trix, 'backward' )
1059
1060	DO k = 1, nz
1061
1062	m = 0
1063	DO n = 1, pdims(1)
1064	DO i = 1, nnx
1065	ar(i,k,j,n) = work_trix(m,k)
1066	m = m + 1
1067	ENDDO
1068	ENDDO
1069
1070	ENDDO
1071
1072	ELSE
1073	!
1074	!-- Cache optimized code
1075	DO k = 1, nz
1076
1077	DO i = 0, nx
1078	work_fftx(i) = work_trix(i,k)
1079	ENDDO
1080
1081	CALL fft_x_1d( work_fftx, 'backward' )
1082
1083	m = 0
1084	DO n = 1, pdims(1)
1085	DO i = 1, nnx
1086	ar(i,k,j,n) = work_fftx(m)
1087	m = m + 1
1088	ENDDO
1089	ENDDO
1090
1091	ENDDO
1092
1093	ENDIF
1094
1095	ENDDO
1096
1097	DEALLOCATE( tri )
1098
1099	CALL cpu_log( log_point_s(33), 'fft_x_1d + tridia', 'stop' )
1100
1101	END SUBROUTINE fftx_tri_fftx
1102
1103
1104	!--------------------------------------------------------------------------------------------------!
1105	! Description:
1106	! ------------
1107	!> Fourier-transformation along x with subsequent transposition x --> y for a 1d-decomposition
1108	!> along y.
1109	!>
1110	!> @attention NEC-branch of this routine may significantly profit from further optimizations. So
1111	!> far, performance is much worse than for routine ffty_tr_yx (more than three times
1112	!> slower).
1113	!--------------------------------------------------------------------------------------------------!
1114	SUBROUTINE fftx_tr_xy( f_in, f_out )
1115
1116
1117	USE control_parameters, &
1118	ONLY: loop_optimization
1119
1120	USE cpulog, &
1121	ONLY: cpu_log, &
1122	log_point_s
1123
1124	USE kinds
1125
1126	USE pegrid
1127
1128	IMPLICIT NONE
1129
1130	INTEGER(iwp) :: i !<
1131	INTEGER(iwp) :: j !<
1132	INTEGER(iwp) :: k !<
1133
1134	REAL(wp), DIMENSION(0:nx,1:nz,nys:nyn) :: work_fftx !<
1135	REAL(wp), DIMENSION(1:nz,nys:nyn,0:nx) :: f_in !<
1136	REAL(wp), DIMENSION(nny,1:nz,nxl_y:nxr_y,pdims(2)) :: f_out !<
1137	REAL(wp), DIMENSION(nys:nyn,1:nz,0:nx) :: work !<
1138
1139	!
1140	!-- Carry out the FFT along x, where all data are present due to the 1d-decomposition along y.
1141	!-- Resort the data in a way that y becomes the first index.
1142	CALL cpu_log( log_point_s(4), 'fft_x_1d', 'start' )
1143
1144	IF ( loop_optimization == 'vector' ) THEN
1145	!
1146	!-- Code for vector processors
1147	!$OMP PARALLEL PRIVATE ( i, j, k )
1148	!$OMP DO
1149	DO i = 0, nx
1150
1151	DO j = nys, nyn
1152	DO k = 1, nz
1153	work_fftx(i,k,j) = f_in(k,j,i)
1154	ENDDO
1155	ENDDO
1156
1157	ENDDO
1158
1159	!$OMP DO
1160	DO j = nys, nyn
1161
1162	CALL fft_x_m( work_fftx(:,:,j), 'forward' )
1163
1164	DO k = 1, nz
1165	DO i = 0, nx
1166	work(j,k,i) = work_fftx(i,k,j)
1167	ENDDO
1168	ENDDO
1169
1170	ENDDO
1171	!$OMP END PARALLEL
1172
1173	ELSE
1174
1175	!
1176	!-- Cache optimized code (there might still be a potential for better optimization).
1177	!$OMP PARALLEL PRIVATE (i,j,k)
1178	!$OMP DO
1179	DO i = 0, nx
1180
1181	DO j = nys, nyn
1182	DO k = 1, nz
1183	work_fftx(i,k,j) = f_in(k,j,i)
1184	ENDDO
1185	ENDDO
1186
1187	ENDDO
1188
1189	!$OMP DO
1190	DO j = nys, nyn
1191	DO k = 1, nz
1192
1193	CALL fft_x_1d( work_fftx(0:nx,k,j), 'forward' )
1194
1195	DO i = 0, nx
1196	work(j,k,i) = work_fftx(i,k,j)
1197	ENDDO
1198	ENDDO
1199
1200	ENDDO
1201	!$OMP END PARALLEL
1202
1203	ENDIF
1204	CALL cpu_log( log_point_s(4), 'fft_x_1d', 'pause' )
1205
1206	!
1207	!-- Transpose array
1208	#if defined( __parallel )
1209	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
1210	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
1211	CALL MPI_ALLTOALL( work(nys,1,0), sendrecvcount_xy, MPI_REAL, f_out(1,1,nxl_y,1), &
1212	sendrecvcount_xy, MPI_REAL, comm1dy, ierr )
1213	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
1214	#else
1215	!
1216	!-- Next line required to avoid compile error about unused dummy argument in serial mode
1217	i = SIZE( f_out )
1218	#endif
1219
1220	END SUBROUTINE fftx_tr_xy
1221
1222
1223	!--------------------------------------------------------------------------------------------------!
1224	! Description:
1225	! ------------
1226	!> Transposition y --> x with a subsequent backward Fourier transformation for a 1d-decomposition
1227	!> along x.
1228	!--------------------------------------------------------------------------------------------------!
1229	SUBROUTINE tr_yx_fftx( f_in, f_out )
1230
1231
1232	USE control_parameters, &
1233	ONLY: loop_optimization
1234
1235	USE cpulog, &
1236	ONLY: cpu_log, &
1237	log_point_s
1238
1239	USE kinds
1240
1241	USE pegrid
1242
1243	IMPLICIT NONE
1244
1245	INTEGER(iwp) :: i !<
1246	INTEGER(iwp) :: j !<
1247	INTEGER(iwp) :: k !<
1248
1249	REAL(wp), DIMENSION(nny,1:nz,nxl_y:nxr_y,pdims(2)) :: f_in !<
1250	REAL(wp), DIMENSION(1:nz,nys:nyn,0:nx) :: f_out !<
1251	REAL(wp), DIMENSION(nys:nyn,1:nz,0:nx) :: work !<
1252	REAL(wp), DIMENSION(0:nx,1:nz,nys:nyn) :: work_fftx !<
1253
1254
1255	!
1256	!-- Transpose array
1257	#if defined( __parallel )
1258	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
1259	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
1260	CALL MPI_ALLTOALL( f_in(1,1,nxl_y,1), sendrecvcount_xy, MPI_REAL, work(nys,1,0), &
1261	sendrecvcount_xy, MPI_REAL, comm1dy, ierr )
1262	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
1263	#else
1264	!
1265	!-- Next line required to avoid compile error about unused dummy argument in serial mode
1266	i = SIZE( f_in )
1267	#endif
1268
1269	!
1270	!-- Carry out the FFT along x, where all data are present due to the 1d-decomposition along y.
1271	!-- Resort the data in a way that y becomes the first index.
1272	CALL cpu_log( log_point_s(4), 'fft_x_1d', 'continue' )
1273
1274	IF ( loop_optimization == 'vector' ) THEN
1275	!
1276	!-- Code optimized for vector processors
1277	!$OMP PARALLEL PRIVATE ( i, j, k )
1278	!$OMP DO
1279	DO j = nys, nyn
1280
1281	DO k = 1, nz
1282	DO i = 0, nx
1283	work_fftx(i,k,j) = work(j,k,i)
1284	ENDDO
1285	ENDDO
1286
1287	CALL fft_x_m( work_fftx(:,:,j), 'backward' )
1288
1289	ENDDO
1290
1291	!$OMP DO
1292	DO i = 0, nx
1293	DO j = nys, nyn
1294	DO k = 1, nz
1295	f_out(k,j,i) = work_fftx(i,k,j)
1296	ENDDO
1297	ENDDO
1298	ENDDO
1299	!$OMP END PARALLEL
1300
1301	ELSE
1302
1303	!
1304	!-- Cache optimized code (there might be still a potential for better optimization).
1305	!$OMP PARALLEL PRIVATE (i,j,k)
1306	!$OMP DO
1307	DO j = nys, nyn
1308	DO k = 1, nz
1309
1310	DO i = 0, nx
1311	work_fftx(i,k,j) = work(j,k,i)
1312	ENDDO
1313
1314	CALL fft_x_1d( work_fftx(0:nx,k,j), 'backward' )
1315
1316	ENDDO
1317	ENDDO
1318
1319	!$OMP DO
1320	DO i = 0, nx
1321	DO j = nys, nyn
1322	DO k = 1, nz
1323	f_out(k,j,i) = work_fftx(i,k,j)
1324	ENDDO
1325	ENDDO
1326	ENDDO
1327	!$OMP END PARALLEL
1328
1329	ENDIF
1330	CALL cpu_log( log_point_s(4), 'fft_x_1d', 'stop' )
1331
1332	END SUBROUTINE tr_yx_fftx
1333
1334
1335	!--------------------------------------------------------------------------------------------------!
1336	! Description:
1337	! ------------
1338	!> FFT along y, solution of the tridiagonal system and backward FFT for a 1d-decomposition along y.
1339	!>
1340	!> @warning This subroutine may still not work for hybrid parallelization with OpenMP (for possible
1341	!> necessary changes see the original routine poisfft_hybrid, developed by Klaus Ketelsen,
1342	!> May 2002)
1343	!--------------------------------------------------------------------------------------------------!
1344	SUBROUTINE ffty_tri_ffty( ar )
1345
1346
1347	USE control_parameters, &
1348	ONLY: loop_optimization
1349
1350	USE cpulog, &
1351	ONLY: cpu_log, &
1352	log_point_s
1353
1354	USE grid_variables, &
1355	ONLY: ddx2, &
1356	ddy2
1357
1358	USE kinds
1359
1360	USE pegrid
1361
1362	IMPLICIT NONE
1363
1364	INTEGER(iwp) :: i !<
1365	INTEGER(iwp) :: j !<
1366	INTEGER(iwp) :: k !<
1367	INTEGER(iwp) :: m !<
1368	INTEGER(iwp) :: n !<
1369	!$ INTEGER(iwp) :: omp_get_thread_num !<
1370	INTEGER(iwp) :: tn !<
1371
1372	REAL(wp), DIMENSION(0:ny) :: work_ffty !<
1373	REAL(wp), DIMENSION(0:ny,1:nz) :: work_triy !<
1374	REAL(wp), DIMENSION(nny,1:nz,nxl_y:nxr_y,pdims(2)) :: ar !<
1375	REAL(wp), DIMENSION(:,:,:,:), ALLOCATABLE :: tri !<
1376
1377
1378	CALL cpu_log( log_point_s(39), 'fft_y_1d + tridia', 'start' )
1379
1380	ALLOCATE( tri(5,0:ny,0:nz-1,0:threads_per_task-1) )
1381
1382	tn = 0 ! Default thread number in case of one thread
1383	!$OMP PARALLEL DO PRIVATE ( i, j, k, m, n, tn, work_ffty, work_triy )
1384	DO i = nxl_y, nxr_y
1385
1386	!$ tn = omp_get_thread_num()
1387
1388	IF ( loop_optimization == 'vector' ) THEN
1389	!
1390	!-- Code optimized for vector processors
1391	DO k = 1, nz
1392
1393	m = 0
1394	DO n = 1, pdims(2)
1395	DO j = 1, nny
1396	work_triy(m,k) = ar(j,k,i,n)
1397	m = m + 1
1398	ENDDO
1399	ENDDO
1400
1401	ENDDO
1402
1403	CALL fft_y_m( work_triy, ny, 'forward' )
1404
1405	ELSE
1406	!
1407	!-- Cache optimized code
1408	DO k = 1, nz
1409
1410	m = 0
1411	DO n = 1, pdims(2)
1412	DO j = 1, nny
1413	work_ffty(m) = ar(j,k,i,n)
1414	m = m + 1
1415	ENDDO
1416	ENDDO
1417
1418	CALL fft_y_1d( work_ffty, 'forward' )
1419
1420	DO j = 0, ny
1421	work_triy(j,k) = work_ffty(j)
1422	ENDDO
1423
1424	ENDDO
1425
1426	ENDIF
1427
1428	!
1429	!-- Solve the linear equation system
1430	CALL tridia_1dd( ddy2, ddx2, ny, nx, i, work_triy, tri(:,:,:,tn) )
1431
1432	IF ( loop_optimization == 'vector' ) THEN
1433	!
1434	!-- Code optimized for vector processors
1435	CALL fft_y_m( work_triy, ny, 'backward' )
1436
1437	DO k = 1, nz
1438
1439	m = 0
1440	DO n = 1, pdims(2)
1441	DO j = 1, nny
1442	ar(j,k,i,n) = work_triy(m,k)
1443	m = m + 1
1444	ENDDO
1445	ENDDO
1446
1447	ENDDO
1448
1449	ELSE
1450	!
1451	!-- Cache optimized code
1452	DO k = 1, nz
1453
1454	DO j = 0, ny
1455	work_ffty(j) = work_triy(j,k)
1456	ENDDO
1457
1458	CALL fft_y_1d( work_ffty, 'backward' )
1459
1460	m = 0
1461	DO n = 1, pdims(2)
1462	DO j = 1, nny
1463	ar(j,k,i,n) = work_ffty(m)
1464	m = m + 1
1465	ENDDO
1466	ENDDO
1467
1468	ENDDO
1469
1470	ENDIF
1471
1472	ENDDO
1473
1474	DEALLOCATE( tri )
1475
1476	CALL cpu_log( log_point_s(39), 'fft_y_1d + tridia', 'stop' )
1477
1478	END SUBROUTINE ffty_tri_ffty
1479
1480	END MODULE poisfft_mod

Note: See TracBrowser for help on using the repository browser.

Download in other formats:

| Impressum | ©Leibniz Universität Hannover |