Home

Context Navigation

source: palm/trunk/SOURCE/poisfft_mod.f90 @ 4700

Last change on this file since 4700 was 4671, checked in by pavelkrc, 4 years ago
Radiative transfer model RTM version 4.1
Property svn:keywords set to `Id`
File size: 45.5 KB

Line
1	!> @file poisfft_mod.f90
2	!--------------------------------------------------------------------------------------------------!
3	! This file is part of the PALM model system.
4	!
5	! PALM is free software: you can redistribute it and/or modify it under the terms of the GNU General
6	! Public License as published by the Free Software Foundation, either version 3 of the License, or
7	! (at your option) any later version.
8	!
9	! PALM is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the
10	! implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General
11	! Public License for more details.
12	!
13	! You should have received a copy of the GNU General Public License along with PALM. If not, see
14	! <http://www.gnu.org/licenses/>.
15	!
16	! Copyright 1997-2020 Leibniz Universitaet Hannover
17	!--------------------------------------------------------------------------------------------------!
18	!
19	!
20	! Current revisions:
21	! -----------------
22	!
23	!
24	! Former revisions:
25	! -----------------
26	! $Id: poisfft_mod.f90 4671 2020-09-09 20:27:58Z raasch $
27	! OMP bugfix
28	!
29	! 4649 2020-08-25 12:11:17Z raasch
30	! File re-formatted to follow the PALM coding standard
31	!
32	! 4429 2020-02-27 15:24:30Z raasch
33	! Statements added to avoid compile errors due to unused dummy arguments in serial mode
34	!
35	! 4366 2020-01-09 08:12:43Z raasch
36	! Modification concerning NEC vectorization
37	!
38	! 4360 2020-01-07 11:25:50Z suehring
39	! Corrected "Former revisions" section
40	!
41	! 3690 2019-01-22 22:56:42Z knoop
42	! OpenACC port for SPEC
43	!
44	! Revision 1.1 1997/07/24 11:24:14 raasch
45	! Initial revision
46	!
47	!
48	!--------------------------------------------------------------------------------------------------!
49	! Description:
50	! ------------
51	!> Solves the Poisson equation with a 2D spectral method
52	!> d^2 p / dx^2 + d^2 p / dy^2 + d^2 p / dz^2 = s
53	!>
54	!> Input:
55	!> real ar contains (nnz,nny,nnx) elements of the velocity divergence, starting from (1,nys,nxl)
56	!>
57	!> Output:
58	!> real ar contains the solution for perturbation pressure p
59	!--------------------------------------------------------------------------------------------------!
60	MODULE poisfft_mod
61
62
63	USE fft_xy, &
64	ONLY: fft_init, &
65	fft_y, &
66	fft_y_1d, &
67	fft_y_m, &
68	fft_x, &
69	fft_x_1d, &
70	fft_x_m, &
71	temperton_fft_vec
72
73	USE indices, &
74	ONLY: nnx, &
75	nny, &
76	nx, &
77	nxl, &
78	nxr, &
79	ny, &
80	nys, &
81	nyn, &
82	nz
83
84	USE transpose_indices, &
85	ONLY: nxl_y, &
86	nxl_z, &
87	nxr_y, &
88	nxr_z, &
89	nys_x, &
90	nys_z, &
91	nyn_x, &
92	nyn_z, &
93	nzb_x, &
94	nzb_y, &
95	nzt_x, &
96	nzt_y
97
98	USE tridia_solver, &
99	ONLY: tridia_1dd, &
100	tridia_init, &
101	tridia_substi, &
102	tridia_substi_overlap
103
104	IMPLICIT NONE
105
106	LOGICAL, SAVE :: poisfft_initialized = .FALSE. !<
107
108	PRIVATE
109
110	PUBLIC poisfft, poisfft_init
111
112	INTERFACE poisfft
113	MODULE PROCEDURE poisfft
114	END INTERFACE poisfft
115
116	INTERFACE poisfft_init
117	MODULE PROCEDURE poisfft_init
118	END INTERFACE poisfft_init
119
120
121	CONTAINS
122
123	!--------------------------------------------------------------------------------------------------!
124	! Description:
125	! ------------
126	!> Setup coefficients for FFT and the tridiagonal solver
127	!--------------------------------------------------------------------------------------------------!
SUBROUTINE poisfft_init
!
!-- One-time setup of the Poisson solver: initializes the FFT routines (fft_init) and the
!-- tridiagonal solver (tridia_init) and afterwards sets the module flag poisfft_initialized, which
!-- poisfft checks in order to decide whether this routine still has to be called.

   IMPLICIT NONE

   CALL fft_init
   CALL tridia_init

!
!-- Remember that the setup has been carried out
   poisfft_initialized = .TRUE.

END SUBROUTINE poisfft_init
140
141
142
143	!--------------------------------------------------------------------------------------------------!
144	! Description:
145	! ------------
146	!> Two-dimensional Fourier Transformation in x- and y-direction.
147	!--------------------------------------------------------------------------------------------------!
SUBROUTINE poisfft( ar )
!
!-- Driver of the FFT-based pressure solver: ar is transformed forward along x and y, the
!-- tridiagonal system along z is solved, and the result is transformed back, so that ar is
!-- overwritten by the solution.
!-- Without _OPENACC one of four code paths is chosen:
!--    pdims(2) == 1 and pdims(1) > 1 : 1d-domain-decomposition along x
!--    pdims(1) == 1 and pdims(2) > 1 : 1d-domain-decomposition along y
!--    otherwise, no overlap          : 2d-domain-decomposition or no decomposition
!--    otherwise, overlap             : as before, but transpositions and FFTs are carried out
!--                                     block by block (transpose_compute_overlap = .TRUE.)
!-- If _OPENACC is defined, only the third path is compiled.
!-- If the module has not been initialized yet, poisfft_init is called first.

   USE control_parameters, &
      ONLY:  transpose_compute_overlap

   USE cpulog, &
      ONLY:  cpu_log, &
             cpu_log_nowait, &
             log_point_s

   USE kinds

   USE pegrid

   IMPLICIT NONE

   INTEGER(iwp) ::  ii           !< block index of the overlapped x <--> y transpositions (0..nblk)
   INTEGER(iwp) ::  iind         !< third index of f_inv that belongs to block ii and local index inew
   INTEGER(iwp) ::  inew         !< third index of the per-block array f_in (x <--> y steps)
   INTEGER(iwp) ::  jj           !< block index of the overlapped y <--> z transpositions (0..nblk)
   INTEGER(iwp) ::  jind         !< third index of f_inv that belongs to block jj and local index jnew
   INTEGER(iwp) ::  jnew         !< third index of the per-block array f_in (y <--> z steps)
   INTEGER(iwp) ::  ki           !< third index of f_inv that belongs to block kk and local index knew
   INTEGER(iwp) ::  kk           !< block index of the overlapped z <--> x transpositions (1..nblk)
   INTEGER(iwp) ::  knew         !< third index of the per-block array f_in (z <--> x steps)
   INTEGER(iwp) ::  n            !< index (based on the saved original lower bound) of the current block
   INTEGER(iwp) ::  nblk         !< number of blocks (z <--> x) or highest block index (x <--> y, y <--> z)
   INTEGER(iwp) ::  nnx_y        !< temporary number of x-indices per block (set to 1)
   INTEGER(iwp) ::  nny_z        !< temporary number of y-indices per block (set to 1)
   INTEGER(iwp) ::  nnz_x        !< temporary number of z-indices per block (set to 1)
   INTEGER(iwp) ::  nxl_y_bound  !< original value of nxl_y, handed over to fft_y
   INTEGER(iwp) ::  nxr_y_bound  !< original value of nxr_y, handed over to fft_y

   INTEGER(iwp), DIMENSION(4) ::  isave  !< saved original indices and send/receive count

   REAL(wp), DIMENSION(1:nz,nys:nyn,nxl:nxr) ::  ar      !< input data, overwritten by the solution
   REAL(wp), DIMENSION(nys:nyn,nxl:nxr,1:nz) ::  ar_inv  !< work array for resorted data

#define __acc_fft_device ( defined( _OPENACC ) && ( defined ( __cuda_fft ) ) )
#if __acc_fft_device
   !$ACC DECLARE CREATE(ar_inv)
#endif

   REAL(wp), DIMENSION(:,:,:), ALLOCATABLE ::  ar1      !< per-block work array (overlap branch)
   REAL(wp), DIMENSION(:,:,:), ALLOCATABLE ::  f_in     !< per-block transposition buffer (overlap branch)
   REAL(wp), DIMENSION(:,:,:), ALLOCATABLE ::  f_inv    !< resorted full array (overlap branch)
   REAL(wp), DIMENSION(:,:,:), ALLOCATABLE ::  f_out_y  !< data with y as first index (overlap branch)
   REAL(wp), DIMENSION(:,:,:), ALLOCATABLE ::  f_out_z  !< data with x as first index (overlap branch)


   CALL cpu_log( log_point_s(3), 'poisfft', 'start' )

   IF ( .NOT. poisfft_initialized )  CALL poisfft_init

#if !__acc_fft_device
   !$ACC UPDATE HOST(ar)
#endif

#ifndef _OPENACC
!
!-- Two-dimensional Fourier Transformation in x- and y-direction.
   IF ( pdims(2) == 1  .AND.  pdims(1) > 1 )  THEN

!
!--   1d-domain-decomposition along x:
!--   FFT along y and transposition y --> x
      CALL ffty_tr_yx( ar, ar )

!
!--   FFT along x, solving the tridiagonal system and backward FFT
      CALL fftx_tri_fftx( ar )

!
!--   Transposition x --> y and backward FFT along y
      CALL tr_xy_ffty( ar, ar )

   ELSEIF ( pdims(1) == 1  .AND.  pdims(2) > 1 )  THEN

!
!--   1d-domain-decomposition along y:
!--   FFT along x and transposition x --> y
      CALL fftx_tr_xy( ar, ar )

!
!--   FFT along y, solving the tridiagonal system and backward FFT
      CALL ffty_tri_ffty( ar )

!
!--   Transposition y --> x and backward FFT along x
      CALL tr_yx_fftx( ar, ar )

   ELSEIF ( .NOT. transpose_compute_overlap )  THEN
#endif

!
!--   2d-domain-decomposition or no decomposition (1 PE run)
!--   Transposition z --> x
      CALL cpu_log( log_point_s(5), 'transpo forward', 'start' )
      CALL resort_for_zx( ar, ar_inv )
      CALL transpose_zx( ar_inv, ar )
      CALL cpu_log( log_point_s(5), 'transpo forward', 'pause' )

      CALL cpu_log( log_point_s(4), 'fft_x', 'start' )
      IF ( temperton_fft_vec )  THEN
!
!--      Vector version outputs a transformed array ar_inv that does not require resorting
!--      (which is done for ar further below)
         CALL fft_x( ar, 'forward', ar_inv=ar_inv)
      ELSE
         CALL fft_x( ar, 'forward')
      ENDIF
      CALL cpu_log( log_point_s(4), 'fft_x', 'pause' )

!
!--   Transposition x --> y
      CALL cpu_log( log_point_s(5), 'transpo forward', 'continue' )
      IF( .NOT. temperton_fft_vec )  CALL resort_for_xy( ar, ar_inv )
      CALL transpose_xy( ar_inv, ar )
      CALL cpu_log( log_point_s(5), 'transpo forward', 'pause' )

      CALL cpu_log( log_point_s(7), 'fft_y', 'start' )
      IF ( temperton_fft_vec )  THEN
!
!--      Input array ar_inv from fft_x can be directly used here.
!--      The output (also in array ar_inv) does not require resorting below.
         CALL fft_y( ar, 'forward', ar_inv = ar_inv, nxl_y_bound = nxl_y, nxr_y_bound = nxr_y, &
                     nxl_y_l = nxl_y, nxr_y_l = nxr_y )
      ELSE
         CALL fft_y( ar, 'forward', ar_tr = ar, nxl_y_bound = nxl_y, nxr_y_bound = nxr_y, &
                     nxl_y_l = nxl_y, nxr_y_l = nxr_y )
      ENDIF
      CALL cpu_log( log_point_s(7), 'fft_y', 'pause' )

!
!--   Transposition y --> z
      CALL cpu_log( log_point_s(5), 'transpo forward', 'continue' )
      IF ( .NOT. temperton_fft_vec )  CALL resort_for_yz( ar, ar_inv )
      CALL transpose_yz( ar_inv, ar )
      CALL cpu_log( log_point_s(5), 'transpo forward', 'stop' )

!
!--   Solve the tridiagonal equation system along z
      CALL cpu_log( log_point_s(6), 'tridia', 'start' )
      CALL tridia_substi( ar )
      CALL cpu_log( log_point_s(6), 'tridia', 'stop' )

!
!--   Inverse Fourier Transformation
!--   Transposition z --> y
      CALL cpu_log( log_point_s(8), 'transpo invers', 'start' )
      CALL transpose_zy( ar, ar_inv )
!
!--   The fft_y below (vector branch) can directly process ar_inv (i.e. does not require a
!--   resorting)
      IF ( .NOT. temperton_fft_vec )  CALL resort_for_zy( ar_inv, ar )
      CALL cpu_log( log_point_s(8), 'transpo invers', 'pause' )

      CALL cpu_log( log_point_s(7), 'fft_y', 'continue' )
      IF ( temperton_fft_vec )  THEN
!
!--      Output array ar_inv can be used as input to the below fft_x routine without resorting
         CALL fft_y( ar, 'backward', ar_inv = ar_inv, nxl_y_bound = nxl_y, nxr_y_bound = nxr_y, &
                     nxl_y_l = nxl_y, nxr_y_l = nxr_y )
      ELSE
         CALL fft_y( ar, 'backward', ar_tr = ar, nxl_y_bound = nxl_y, nxr_y_bound = nxr_y, &
                     nxl_y_l = nxl_y, nxr_y_l = nxr_y )
      ENDIF

      CALL cpu_log( log_point_s(7), 'fft_y', 'stop' )

!
!--   Transposition y --> x
      CALL cpu_log( log_point_s(8), 'transpo invers', 'continue' )
      CALL transpose_yx( ar, ar_inv )
      IF ( .NOT. temperton_fft_vec )  CALL resort_for_yx( ar_inv, ar )
      CALL cpu_log( log_point_s(8), 'transpo invers', 'pause' )

      CALL cpu_log( log_point_s(4), 'fft_x', 'continue' )
      IF ( temperton_fft_vec )  THEN
         CALL fft_x( ar, 'backward', ar_inv=ar_inv )
      ELSE
         CALL fft_x( ar, 'backward' )
      ENDIF
      CALL cpu_log( log_point_s(4), 'fft_x', 'stop' )

!
!--   Transposition x --> z
      CALL cpu_log( log_point_s(8), 'transpo invers', 'continue' )
      CALL transpose_xz( ar, ar_inv )
      CALL resort_for_xz( ar_inv, ar )
      CALL cpu_log( log_point_s(8), 'transpo invers', 'stop' )

#ifndef _OPENACC
   ELSE

!
!--   2d-domain-decomposition or no decomposition (1 PE run) with overlapping transposition / fft
!--   cputime logging must not use barriers, which would prevent overlapping
!--   In each of the five steps below some module variables (array bounds and the send/receive
!--   count) are saved in isave, temporarily overwritten while the data are processed block by
!--   block, and restored afterwards.
      ALLOCATE( f_out_y(0:ny,nxl_y:nxr_y,nzb_y:nzt_y), &
                f_out_z(0:nx,nys_x:nyn_x,nzb_x:nzt_x) )
!
!--   Transposition z --> x + subsequent fft along x
      ALLOCATE( f_inv(nys:nyn,nxl:nxr,1:nz) )
      CALL resort_for_zx( ar, f_inv )
!
!--   Save original indices and gridpoint counter
      isave(1) = nz
      isave(2) = nzb_x
      isave(3) = nzt_x
      isave(4) = sendrecvcount_zx
!
!--   Set new indices for transformation
      nblk  = nz / pdims(1)
      nz    = pdims(1)
      nnz_x = 1
      nzb_x = 1 + myidx * nnz_x
      nzt_x = ( myidx + 1 ) * nnz_x
      sendrecvcount_zx = nnx * nny * nnz_x

      ALLOCATE( ar1(0:nx,nys_x:nyn_x,nzb_x:nzt_x) )
      ALLOCATE( f_in(nys:nyn,nxl:nxr,1:nz) )

      DO  kk = 1, nblk

         IF ( kk == 1 )  THEN
            CALL cpu_log( log_point_s(5), 'transpo forward', 'start', cpu_log_nowait )
         ELSE
            CALL cpu_log( log_point_s(5), 'transpo forward', 'continue', cpu_log_nowait )
         ENDIF

!
!--      Collect the levels kk, kk+nblk, kk+2*nblk, ... of f_inv in the per-block buffer
         DO  knew = 1, nz
            ki = kk + nblk * ( knew - 1 )
            f_in(:,:,knew) = f_inv(:,:,ki)
         ENDDO

         CALL transpose_zx( f_in, ar1(:,:,:))
         CALL cpu_log( log_point_s(5), 'transpo forward', 'pause' )

         IF ( kk == 1 )  THEN
            CALL cpu_log( log_point_s(4), 'fft_x', 'start', cpu_log_nowait )
         ELSE
            CALL cpu_log( log_point_s(4), 'fft_x', 'continue', cpu_log_nowait )
         ENDIF

!
!--      isave(2) holds the original nzb_x, i.e. n addresses the level of f_out_z for block kk
         n = isave(2) + kk - 1
         CALL fft_x( ar1(:,:,:), 'forward',  ar_2d = f_out_z(:,:,n) )
         CALL cpu_log( log_point_s(4), 'fft_x', 'pause' )

      ENDDO
!
!--   Restore original indices/counters
      nz               = isave(1)
      nzb_x            = isave(2)
      nzt_x            = isave(3)
      sendrecvcount_zx = isave(4)

      DEALLOCATE( ar1, f_in, f_inv )

!
!--   Transposition x --> y + subsequent fft along y
      ALLOCATE( f_inv(nys_x:nyn_x,nzb_x:nzt_x,0:nx) )
      CALL resort_for_xy( f_out_z, f_inv )
!
!--   Save original indices and gridpoint counter
      isave(1) = nx
      isave(2) = nxl_y
      isave(3) = nxr_y
      isave(4) = sendrecvcount_xy
!
!--   Set new indices for transformation
      nblk  = ( ( nx+1 ) / pdims(2) ) - 1
      nx    = pdims(2)
      nnx_y = 1
      nxl_y = myidy * nnx_y
      nxr_y = ( myidy + 1 ) * nnx_y - 1
      sendrecvcount_xy = nnx_y * ( nyn_x-nys_x+1 ) * ( nzt_x-nzb_x+1 )

      ALLOCATE( ar1(0:ny,nxl_y:nxr_y,nzb_y:nzt_y) )
      ALLOCATE( f_in(nys_x:nyn_x,nzb_x:nzt_x,0:nx) )

      DO  ii = 0, nblk

         CALL cpu_log( log_point_s(5), 'transpo forward', 'continue', cpu_log_nowait )

         DO  inew = 0, nx-1
            iind = ii + ( nblk + 1 ) * inew
            f_in(:,:,inew) = f_inv(:,:,iind)
         ENDDO

         CALL transpose_xy( f_in, ar1(:,:,:) )

         CALL cpu_log( log_point_s(5), 'transpo forward', 'pause' )

!
!--      The fft_y measurement has to be started in the first loop iteration, which is ii = 0
!--      (the loop runs from 0 to nblk), analogous to kk == 1 and jj == 0 in the other loops.
         IF ( ii == 0 )  THEN
            CALL cpu_log( log_point_s(7), 'fft_y', 'start', cpu_log_nowait )
         ELSE
            CALL cpu_log( log_point_s(7), 'fft_y', 'continue', cpu_log_nowait )
         ENDIF

         nxl_y_bound = isave(2)
         nxr_y_bound = isave(3)
         n           = isave(2) + ii
         CALL fft_y( ar1(:,:,:), 'forward', ar_tr = f_out_y, nxl_y_bound = nxl_y_bound, &
                     nxr_y_bound = nxr_y_bound, nxl_y_l = n, nxr_y_l = n )

         CALL cpu_log( log_point_s(7), 'fft_y', 'pause' )

      ENDDO
!
!--   Restore original indices/counters
      nx               = isave(1)
      nxl_y            = isave(2)
      nxr_y            = isave(3)
      sendrecvcount_xy = isave(4)

      DEALLOCATE( ar1, f_in, f_inv )

!
!--   Transposition y --> z + subsequent tridia + resort for z --> y
      ALLOCATE( f_inv(nxl_y:nxr_y,nzb_y:nzt_y,0:ny) )
      CALL resort_for_yz( f_out_y, f_inv )
!
!--   Save original indices and gridpoint counter
      isave(1) = ny
      isave(2) = nys_z
      isave(3) = nyn_z
      isave(4) = sendrecvcount_yz
!
!--   Set new indices for transformation
      nblk  = ( ( ny+1 ) / pdims(1) ) - 1
      ny    = pdims(1)
      nny_z = 1
      nys_z = myidx * nny_z
      nyn_z = ( myidx + 1 ) * nny_z - 1
      sendrecvcount_yz = ( nxr_y-nxl_y+1 ) * nny_z * ( nzt_y-nzb_y+1 )

      ALLOCATE( ar1(nxl_z:nxr_z,nys_z:nyn_z,1:nz) )
      ALLOCATE( f_in(nxl_y:nxr_y,nzb_y:nzt_y,0:ny) )

      DO  jj = 0, nblk
!
!--      Forward Fourier Transformation
!--      Transposition y --> z
         CALL cpu_log( log_point_s(5), 'transpo forward', 'continue', cpu_log_nowait )

         DO  jnew = 0, ny-1
            jind = jj + ( nblk + 1 ) * jnew
            f_in(:,:,jnew) = f_inv(:,:,jind)
         ENDDO

         CALL transpose_yz( f_in, ar1(:,:,:) )

         IF ( jj == nblk )  THEN
            CALL cpu_log( log_point_s(5), 'transpo forward', 'stop' )
         ELSE
            CALL cpu_log( log_point_s(5), 'transpo forward', 'pause' )
         ENDIF

!
!--      Solve the tridiagonal equation system along z
         CALL cpu_log( log_point_s(6), 'tridia', 'start', cpu_log_nowait )

         n = isave(2) + jj
         CALL tridia_substi_overlap( ar1(:,:,:), n )

         CALL cpu_log( log_point_s(6), 'tridia', 'stop' )

!
!--      Inverse Fourier Transformation
!--      Transposition z --> y
!--      Only one thread should call MPI routines, therefore forward and backward transpose are in
!--      the same section
         IF ( jj == 0 )  THEN
            CALL cpu_log( log_point_s(8), 'transpo invers', 'start', cpu_log_nowait )
         ELSE
            CALL cpu_log( log_point_s(8), 'transpo invers', 'continue', cpu_log_nowait )
         ENDIF

         CALL transpose_zy( ar1(:,:,:), f_in )

!
!--      Scatter the per-block result back to the positions it was collected from
         DO  jnew = 0, ny-1
            jind = jj + ( nblk + 1 ) * jnew
            f_inv(:,:,jind) = f_in(:,:,jnew)
         ENDDO

         CALL cpu_log( log_point_s(8), 'transpo invers', 'pause' )

      ENDDO
!
!--   Restore original indices/counters
      ny               = isave(1)
      nys_z            = isave(2)
      nyn_z            = isave(3)
      sendrecvcount_yz = isave(4)

      CALL resort_for_zy( f_inv, f_out_y )

      DEALLOCATE( ar1, f_in, f_inv )

!
!--   fft along y backward + subsequent transposition y --> x
      ALLOCATE( f_inv(nys_x:nyn_x,nzb_x:nzt_x,0:nx) )
!
!--   Save original indices and gridpoint counter
      isave(1) = nx
      isave(2) = nxl_y
      isave(3) = nxr_y
      isave(4) = sendrecvcount_xy
!
!--   Set new indices for transformation
      nblk  = ( ( nx+1 ) / pdims(2) ) - 1
      nx    = pdims(2)
      nnx_y = 1
      nxl_y = myidy * nnx_y
      nxr_y = ( myidy + 1 ) * nnx_y - 1
      sendrecvcount_xy = nnx_y * ( nyn_x-nys_x+1 ) * ( nzt_x-nzb_x+1 )

      ALLOCATE( ar1(0:ny,nxl_y:nxr_y,nzb_y:nzt_y) )
      ALLOCATE( f_in(nys_x:nyn_x,nzb_x:nzt_x,0:nx) )

      DO  ii = 0, nblk

         CALL cpu_log( log_point_s(7), 'fft_y', 'continue', cpu_log_nowait )

         n           = isave(2) + ii
         nxl_y_bound = isave(2)
         nxr_y_bound = isave(3)

         CALL fft_y( ar1(:,:,:), 'backward', ar_tr = f_out_y, nxl_y_bound = nxl_y_bound, &
                     nxr_y_bound = nxr_y_bound, nxl_y_l = n, nxr_y_l = n )

         IF ( ii == nblk )  THEN
            CALL cpu_log( log_point_s(7), 'fft_y', 'stop' )
         ELSE
            CALL cpu_log( log_point_s(7), 'fft_y', 'pause' )
         ENDIF

         CALL cpu_log( log_point_s(8), 'transpo invers', 'continue', cpu_log_nowait )

         CALL transpose_yx( ar1(:,:,:), f_in )

         DO  inew = 0, nx-1
            iind = ii + (nblk+1) * inew
            f_inv(:,:,iind) = f_in(:,:,inew)
         ENDDO

         CALL cpu_log( log_point_s(8), 'transpo invers', 'pause' )

      ENDDO
!
!--   Restore original indices/counters
      nx               = isave(1)
      nxl_y            = isave(2)
      nxr_y            = isave(3)
      sendrecvcount_xy = isave(4)

      CALL resort_for_yx( f_inv, f_out_z )

      DEALLOCATE( ar1, f_in, f_inv )

!
!--   fft along x backward + subsequent final transposition x --> z
      ALLOCATE( f_inv(nys:nyn,nxl:nxr,1:nz) )
!
!--   Save original indices and gridpoint counter
      isave(1) = nz
      isave(2) = nzb_x
      isave(3) = nzt_x
      isave(4) = sendrecvcount_zx
!
!--   Set new indices for transformation
      nblk  = nz / pdims(1)
      nz    = pdims(1)
      nnz_x = 1
      nzb_x = 1 + myidx * nnz_x
      nzt_x = ( myidx + 1 ) * nnz_x
      sendrecvcount_zx = nnx * nny * nnz_x

      ALLOCATE( ar1(0:nx,nys_x:nyn_x,nzb_x:nzt_x) )
      ALLOCATE( f_in(nys:nyn,nxl:nxr,1:nz) )

      DO  kk = 1, nblk

         CALL cpu_log( log_point_s(4), 'fft_x', 'continue', cpu_log_nowait )

         n = isave(2) + kk - 1
         CALL fft_x( ar1(:,:,:), 'backward', f_out_z(:,:,n) )

         IF ( kk == nblk )  THEN
            CALL cpu_log( log_point_s(4), 'fft_x', 'stop' )
         ELSE
            CALL cpu_log( log_point_s(4), 'fft_x', 'pause' )
         ENDIF

         CALL cpu_log( log_point_s(8), 'transpo invers', 'continue', cpu_log_nowait )

         CALL transpose_xz( ar1(:,:,:), f_in )

         DO  knew = 1, nz
            ki = kk + nblk * ( knew - 1 )
            f_inv(:,:,ki) = f_in(:,:,knew)
         ENDDO

         IF ( kk == nblk )  THEN
            CALL cpu_log( log_point_s(8), 'transpo invers', 'stop' )
         ELSE
            CALL cpu_log( log_point_s(8), 'transpo invers', 'pause' )
         ENDIF

      ENDDO
!
!--   Restore original indices/counters
      nz               = isave(1)
      nzb_x            = isave(2)
      nzt_x            = isave(3)
      sendrecvcount_zx = isave(4)

      CALL resort_for_xz( f_inv, ar )

      DEALLOCATE( ar1, f_in, f_inv )

   ENDIF
#endif

#if !__acc_fft_device
   !$ACC UPDATE DEVICE(ar)
#endif

   CALL cpu_log( log_point_s(3), 'poisfft', 'stop' )

END SUBROUTINE poisfft
679
680
681	!--------------------------------------------------------------------------------------------------!
682	! Description:
683	! ------------
684	!> Fourier-transformation along y with subsequent transposition y --> x for a 1d-decomposition
685	!> along x.
686	!>
687	!> @attention This routine runs much faster on the NEC-SX6 if the first index of
688	!> work_ffty_vec is odd. Otherwise memory bank conflicts may occur (especially if the
689	!> index is a multiple of 128). That's why work_ffty_vec is dimensioned as 0:ny+1.
690	!> Of course, this will not work if users are using an odd number of gridpoints along y.
691	!--------------------------------------------------------------------------------------------------!
SUBROUTINE ffty_tr_yx( f_in, f_out )
!
!-- Forward FFT along y of f_in, resorting of the transformed data into the local array work
!-- (x becomes the first index) and, if compiled with __parallel, exchange of work into f_out via
!-- MPI_ALLTOALL on comm1dx. Without __parallel f_out is not filled.

   USE control_parameters, &
      ONLY:  loop_optimization

   USE cpulog, &
      ONLY:  cpu_log, &
             log_point_s

   USE kinds

   USE pegrid

   IMPLICIT NONE

   INTEGER(iwp), PARAMETER ::  stridex = 4  !< number of x-indices treated together in the cache branch

   INTEGER(iwp) ::  i        !< index along x
   INTEGER(iwp) ::  i_first  !< first x-index of the current stride
   INTEGER(iwp) ::  i_last   !< last x-index of the current stride (limited to nxr)
   INTEGER(iwp) ::  i_loc    !< counter within a stride (1..stridex)
   INTEGER(iwp) ::  j        !< index along y
   INTEGER(iwp) ::  k        !< index along z

   REAL(wp), DIMENSION(1:nz,0:ny,nxl:nxr)             ::  f_in   !< data to be transformed
   REAL(wp), DIMENSION(nnx,1:nz,nys_x:nyn_x,pdims(1)) ::  f_out  !< receive buffer of the transposition
   REAL(wp), DIMENSION(nxl:nxr,1:nz,0:ny)             ::  work   !< transformed data, x as first index

   REAL(wp), DIMENSION(:,:),   ALLOCATABLE ::  work_ffty      !< y-lines of one stride (cache branch)
   REAL(wp), DIMENSION(:,:,:), ALLOCATABLE ::  work_ffty_vec  !< y as first index (vector branch)

!
!-- FFT along y (all data along y are locally available) followed by the resorting
   CALL cpu_log( log_point_s(7), 'fft_y_1d', 'start' )

   IF ( loop_optimization == 'vector' )  THEN
!
!--   Branch for vector processors: one call of fft_y_m per x-index
      ALLOCATE( work_ffty_vec(0:ny+1,1:nz,nxl:nxr) )

      !$OMP PARALLEL PRIVATE ( i, j, k )
      !$OMP DO
      DO  i = nxl, nxr

         DO  j = 0, ny
            work_ffty_vec(j,1:nz,i) = f_in(1:nz,j,i)
         END DO

         CALL fft_y_m( work_ffty_vec(:,:,i), ny+1, 'forward' )

      END DO

      !$OMP DO
      DO  k = 1, nz
         DO  j = 0, ny
            work(nxl:nxr,k,j) = work_ffty_vec(j,k,nxl:nxr)
         END DO
      END DO
      !$OMP END PARALLEL

      DEALLOCATE( work_ffty_vec )

   ELSE
!
!--   Cache branch: x is processed in strides of at most stridex indices; every y-line is
!--   transformed separately by fft_y_1d
      ALLOCATE( work_ffty(0:ny,stridex) )

      !$OMP PARALLEL PRIVATE ( i, i_first, i_last, i_loc, j, k, work_ffty )
      !$OMP DO
      DO  i_first = nxl, nxr, stridex

         i_last = MIN( i_first + stridex - 1, nxr )

         DO  k = 1, nz

            DO  i = i_first, i_last
               i_loc = i - i_first + 1
               work_ffty(0:ny,i_loc) = f_in(k,0:ny,i)
!
!--            FFT along y
               CALL fft_y_1d( work_ffty(:,i_loc), 'forward' )
            END DO

!
!--         Resort the stride into work
            DO  j = 0, ny
               work(i_first:i_last,k,j) = work_ffty(j,1:i_last-i_first+1)
            END DO

         END DO

      END DO
      !$OMP END PARALLEL

      DEALLOCATE( work_ffty )

   END IF

   CALL cpu_log( log_point_s(7), 'fft_y_1d', 'pause' )

!
!-- Transpose array
#if defined( __parallel )
   CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
   IF ( collective_wait )  CALL MPI_BARRIER( comm2d, ierr )
   CALL MPI_ALLTOALL( work(nxl,1,0), sendrecvcount_xy, MPI_REAL, f_out(1,1,nys_x,1), &
                      sendrecvcount_xy, MPI_REAL, comm1dx, ierr )
   CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
#else
!
!-- Next line required to avoid compile error about unused dummy argument in serial mode
   i = SIZE( f_out )
#endif

END SUBROUTINE ffty_tr_yx
820
821
822	!--------------------------------------------------------------------------------------------------!
823	! Description:
824	! ------------
825	!> Transposition x --> y with a subsequent backward Fourier transformation for a 1d-decomposition
826	!> along x
827	!--------------------------------------------------------------------------------------------------!
SUBROUTINE tr_xy_ffty( f_in, f_out )
!
!-- If compiled with __parallel, f_in is exchanged into the local array work via MPI_ALLTOALL on
!-- comm1dx. Afterwards work is resorted so that y becomes the first index, transformed backward
!-- along y, and stored in f_out. Without __parallel, work is not filled by this routine.

   USE control_parameters, &
      ONLY:  loop_optimization

   USE cpulog, &
      ONLY:  cpu_log, &
             log_point_s

   USE kinds

   USE pegrid

   IMPLICIT NONE

   INTEGER(iwp), PARAMETER ::  stridex = 4  !< number of x-indices treated together in the cache branch

   INTEGER(iwp) ::  i        !< index along x
   INTEGER(iwp) ::  i_first  !< first x-index of the current stride
   INTEGER(iwp) ::  i_last   !< last x-index of the current stride (limited to nxr)
   INTEGER(iwp) ::  i_loc    !< counter within a stride (1..stridex)
   INTEGER(iwp) ::  j        !< index along y
   INTEGER(iwp) ::  k        !< index along z

   REAL(wp), DIMENSION(nnx,1:nz,nys_x:nyn_x,pdims(1)) ::  f_in   !< send buffer of the transposition
   REAL(wp), DIMENSION(1:nz,0:ny,nxl:nxr)             ::  f_out  !< backward transformed data
   REAL(wp), DIMENSION(nxl:nxr,1:nz,0:ny)             ::  work   !< transposed data, x as first index

   REAL(wp), DIMENSION(:,:),   ALLOCATABLE ::  work_ffty      !< y-lines of one stride (cache branch)
   REAL(wp), DIMENSION(:,:,:), ALLOCATABLE ::  work_ffty_vec  !< y as first index (vector branch)

!
!-- Transpose array
#if defined( __parallel )
   CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
   IF ( collective_wait )  CALL MPI_BARRIER( comm2d, ierr )
   CALL MPI_ALLTOALL( f_in(1,1,nys_x,1), sendrecvcount_xy, MPI_REAL, work(nxl,1,0), &
                      sendrecvcount_xy, MPI_REAL, comm1dx, ierr )
   CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
#else
!
!-- Next line required to avoid compile error about unused dummy argument in serial mode
   i = SIZE( f_in )
#endif

!
!-- Resorting (y becomes the first index) and backward FFT along y
   CALL cpu_log( log_point_s(7), 'fft_y_1d', 'continue' )

   IF ( loop_optimization == 'vector' )  THEN
!
!--   Branch for vector processors: one call of fft_y_m per x-index
      ALLOCATE( work_ffty_vec(0:ny+1,1:nz,nxl:nxr) )

      !$OMP PARALLEL PRIVATE ( i, j, k )
      !$OMP DO
      DO  k = 1, nz
         DO  j = 0, ny
            work_ffty_vec(j,k,nxl:nxr) = work(nxl:nxr,k,j)
         END DO
      END DO

      !$OMP DO
      DO  i = nxl, nxr

         CALL fft_y_m( work_ffty_vec(:,:,i), ny+1, 'backward' )

         DO  j = 0, ny
            f_out(1:nz,j,i) = work_ffty_vec(j,1:nz,i)
         END DO

      END DO
      !$OMP END PARALLEL

      DEALLOCATE( work_ffty_vec )

   ELSE
!
!--   Cache branch: x is processed in strides of at most stridex indices; every y-line is
!--   transformed separately by fft_y_1d
      ALLOCATE( work_ffty(0:ny,stridex) )

      !$OMP PARALLEL PRIVATE ( i, i_first, i_last, i_loc, j, k, work_ffty )
      !$OMP DO
      DO  i_first = nxl, nxr, stridex

         i_last = MIN( i_first + stridex - 1, nxr )

         DO  k = 1, nz
!
!--         Resort the stride into the FFT work array
            DO  j = 0, ny
               work_ffty(j,1:i_last-i_first+1) = work(i_first:i_last,k,j)
            END DO

            DO  i = i_first, i_last
!
!--            FFT along y
               i_loc = i - i_first + 1
               CALL fft_y_1d( work_ffty(:,i_loc), 'backward' )

               f_out(k,0:ny,i) = work_ffty(0:ny,i_loc)
            END DO

         END DO

      END DO
      !$OMP END PARALLEL

      DEALLOCATE( work_ffty )

   END IF

   CALL cpu_log( log_point_s(7), 'fft_y_1d', 'stop' )

END SUBROUTINE tr_xy_ffty
954
955
956	!--------------------------------------------------------------------------------------------------!
957	! Description:
958	! ------------
959	!> FFT along x, solution of the tridiagonal system and backward FFT for a 1d-decomposition along x
960	!>
961	!> @warning This subroutine may still not work for hybrid parallelization with OpenMP (for possible
962	!> necessary changes see the original routine poisfft_hybrid, developed by Klaus Ketelsen,
963	!> May 2002)
964	!--------------------------------------------------------------------------------------------------!
SUBROUTINE fftx_tri_fftx( ar )
!
!-- For every j in nys_x:nyn_x the pdims(1) blocks of ar (4th dimension) are gathered into complete
!-- x-lines (0:nx), transformed forward along x, passed to tridia_1dd, transformed backward along x,
!-- and scattered back into ar.

   USE control_parameters, &
      ONLY:  loop_optimization

   USE cpulog, &
      ONLY:  cpu_log, &
             log_point_s

   USE grid_variables, &
      ONLY:  ddx2, &
             ddy2

   USE kinds

   USE pegrid

   IMPLICIT NONE

   INTEGER(iwp) ::  j                     !< index along y (parallelized loop)
   INTEGER(iwp) ::  k                     !< index along z
   INTEGER(iwp) ::  m                     !< position in the gathered x-line where block n starts
   INTEGER(iwp) ::  n                     !< block index (4th dimension of ar)
   !$ INTEGER(iwp) ::  omp_get_thread_num !< OpenMP library function
   INTEGER(iwp) ::  tn                    !< thread number, selects the thread's slice of tri

   LOGICAL ::  vector_branch  !< .TRUE. if loop_optimization == 'vector'

   REAL(wp), DIMENSION(0:nx)                          ::  work_fftx  !< one x-line (cache branch)
   REAL(wp), DIMENSION(0:nx,1:nz)                     ::  work_trix  !< all x-lines of one j
   REAL(wp), DIMENSION(nnx,1:nz,nys_x:nyn_x,pdims(1)) ::  ar         !< data, overwritten by the result
   REAL(wp), DIMENSION(:,:,:,:), ALLOCATABLE          ::  tri        !< work array for tridia_1dd


   CALL cpu_log( log_point_s(33), 'fft_x_1d + tridia', 'start' )

   ALLOCATE( tri(5,0:nx,0:nz-1,0:threads_per_task-1) )

   vector_branch = ( loop_optimization == 'vector' )

   tn = 0              ! Default thread number in case of one thread
   !$OMP PARALLEL DO PRIVATE ( j, k, m, n, tn, work_fftx, work_trix )
   DO  j = nys_x, nyn_x

      !$ tn = omp_get_thread_num()

!
!--   Gather the x-lines and transform them forward
      IF ( vector_branch )  THEN
!
!--      Vector processors: all levels are transformed by one call of fft_x_m
         DO  k = 1, nz
            DO  n = 1, pdims(1)
               m = ( n - 1 ) * nnx
               work_trix(m:m+nnx-1,k) = ar(:,k,j,n)
            END DO
         END DO

         CALL fft_x_m( work_trix, 'forward' )

      ELSE
!
!--      Cache branch: level by level with fft_x_1d
         DO  k = 1, nz

            DO  n = 1, pdims(1)
               m = ( n - 1 ) * nnx
               work_fftx(m:m+nnx-1) = ar(:,k,j,n)
            END DO

            CALL fft_x_1d( work_fftx, 'forward' )

            work_trix(:,k) = work_fftx

         END DO

      END IF

!
!--   Solve the linear equation system
      CALL tridia_1dd( ddx2, ddy2, nx, ny, j, work_trix, tri(:,:,:,tn) )

!
!--   Transform backward and scatter the x-lines to the blocks of ar
      IF ( vector_branch )  THEN

         CALL fft_x_m( work_trix, 'backward' )

         DO  k = 1, nz
            DO  n = 1, pdims(1)
               m = ( n - 1 ) * nnx
               ar(:,k,j,n) = work_trix(m:m+nnx-1,k)
            END DO
         END DO

      ELSE

         DO  k = 1, nz

            work_fftx = work_trix(:,k)

            CALL fft_x_1d( work_fftx, 'backward' )

            DO  n = 1, pdims(1)
               m = ( n - 1 ) * nnx
               ar(:,k,j,n) = work_fftx(m:m+nnx-1)
            END DO

         END DO

      END IF

   END DO

   DEALLOCATE( tri )

   CALL cpu_log( log_point_s(33), 'fft_x_1d + tridia', 'stop' )

END SUBROUTINE fftx_tri_fftx
1099
1100
1101	!--------------------------------------------------------------------------------------------------!
1102	! Description:
1103	! ------------
1104	!> Fourier-transformation along x with subsequent transposition x --> y for a 1d-decomposition
1105	!> along y.
1106	!>
1107	!> @attention NEC-branch of this routine may significantly profit from further optimizations. So
1108	!> far, performance is much worse than for routine ffty_tr_yx (more than three times
1109	!> slower).
1110	!--------------------------------------------------------------------------------------------------!
SUBROUTINE fftx_tr_xy( f_in, f_out )
!
!-- Forward FFT along x of f_in, resorting of the transformed data into the local array work
!-- (y becomes the first index) and, if compiled with __parallel, exchange of work into f_out via
!-- MPI_ALLTOALL on comm1dy. Without __parallel f_out is not filled.

   USE control_parameters, &
      ONLY:  loop_optimization

   USE cpulog, &
      ONLY:  cpu_log, &
             log_point_s

   USE kinds

   USE pegrid

   IMPLICIT NONE

   INTEGER(iwp) ::  i  !< index along x
   INTEGER(iwp) ::  j  !< index along y
   INTEGER(iwp) ::  k  !< index along z

   REAL(wp), DIMENSION(0:nx,1:nz,nys:nyn)             ::  work_fftx  !< f_in with x as first index
   REAL(wp), DIMENSION(1:nz,nys:nyn,0:nx)             ::  f_in       !< data to be transformed
   REAL(wp), DIMENSION(nny,1:nz,nxl_y:nxr_y,pdims(2)) ::  f_out      !< receive buffer of the transposition
   REAL(wp), DIMENSION(nys:nyn,1:nz,0:nx)             ::  work       !< transformed data, y as first index

!
!-- FFT along x (all data along x are locally available) followed by the resorting
   CALL cpu_log( log_point_s(4), 'fft_x_1d', 'start' )

   IF ( loop_optimization == 'vector' )  THEN
!
!--   Branch for vector processors: one call of fft_x_m per y-index
      !$OMP PARALLEL PRIVATE ( i, j, k )
      !$OMP DO
      DO  i = 0, nx
         DO  j = nys, nyn
            work_fftx(i,1:nz,j) = f_in(1:nz,j,i)
         END DO
      END DO

      !$OMP DO
      DO  j = nys, nyn

         CALL fft_x_m( work_fftx(:,:,j), 'forward' )

         DO  k = 1, nz
            work(j,k,0:nx) = work_fftx(0:nx,k,j)
         END DO

      END DO
      !$OMP END PARALLEL

   ELSE
!
!--   Cache branch (there might still be a potential for better optimization): every x-line is
!--   transformed separately by fft_x_1d
      !$OMP PARALLEL PRIVATE (i,j,k)
      !$OMP DO
      DO  i = 0, nx
         DO  j = nys, nyn
            work_fftx(i,1:nz,j) = f_in(1:nz,j,i)
         END DO
      END DO

      !$OMP DO
      DO  j = nys, nyn
         DO  k = 1, nz

            CALL fft_x_1d( work_fftx(0:nx,k,j), 'forward' )

            work(j,k,0:nx) = work_fftx(0:nx,k,j)

         END DO
      END DO
      !$OMP END PARALLEL

   END IF
   CALL cpu_log( log_point_s(4), 'fft_x_1d', 'pause' )

!
!-- Transpose array
#if defined( __parallel )
   CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
   IF ( collective_wait )  CALL MPI_BARRIER( comm2d, ierr )
   CALL MPI_ALLTOALL( work(nys,1,0), sendrecvcount_xy, MPI_REAL, f_out(1,1,nxl_y,1), &
                      sendrecvcount_xy, MPI_REAL, comm1dy, ierr )
   CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
#else
!
!-- Next line required to avoid compile error about unused dummy argument in serial mode
   i = SIZE( f_out )
#endif

END SUBROUTINE fftx_tr_xy
1218
1219
1220	!--------------------------------------------------------------------------------------------------!
1221	! Description:
1222	! ------------
1223	!> Transposition y --> x with a subsequent backward Fourier transformation for a 1d-decomposition
1224	!> along y.
1225	!--------------------------------------------------------------------------------------------------!
SUBROUTINE tr_yx_fftx( f_in, f_out )

    USE control_parameters,                                                                    &
        ONLY:  loop_optimization

    USE cpulog,                                                                                &
        ONLY:  cpu_log,                                                                        &
               log_point_s

    USE kinds

    USE pegrid

    IMPLICIT NONE

    INTEGER(iwp) ::  i  !< running index along x (0:nx)
    INTEGER(iwp) ::  j  !< running index along y (nys:nyn)
    INTEGER(iwp) ::  k  !< running index along z (1:nz)

    LOGICAL ::  use_vector_code  !< .TRUE. if loop_optimization == 'vector'

    REAL(wp), DIMENSION(nny,1:nz,nxl_y:nxr_y,pdims(2)) ::  f_in       !< send buffer of the transposition
    REAL(wp), DIMENSION(1:nz,nys:nyn,0:nx)             ::  f_out      !< result, z is the first index
    REAL(wp), DIMENSION(nys:nyn,1:nz,0:nx)             ::  work       !< receive buffer, y is the first index
    REAL(wp), DIMENSION(0:nx,1:nz,nys:nyn)             ::  work_fftx  !< data to be transformed, x is the first index

!
!-- Transpose array
#if defined( __parallel )
    CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
    IF ( collective_wait )  CALL MPI_BARRIER( comm2d, ierr )
    CALL MPI_ALLTOALL( f_in(1,1,nxl_y,1), sendrecvcount_xy, MPI_REAL, work(nys,1,0),           &
                       sendrecvcount_xy, MPI_REAL, comm1dy, ierr )
    CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
#else
!
!-- Next line required to avoid compile error about unused dummy argument in serial mode
    i = SIZE( f_in )
#endif

!
!-- Carry out the backward FFT along x, where all data are present due to the 1d-decomposition
!-- along y. For this, the data are resorted so that x becomes the first index. Afterwards they are
!-- resorted once more so that z is the first index of the output array.
    CALL cpu_log( log_point_s(4), 'fft_x_1d', 'continue' )

!
!-- The branch decision is taken once, outside of the parallel region, so that every thread follows
!-- the same path and meets the same worksharing loops.
    use_vector_code = ( loop_optimization == 'vector' )

    !$OMP PARALLEL PRIVATE ( i, j, k )

    IF ( use_vector_code )  THEN
!
!--    Code optimized for vector processors: fft_x_m is handed the complete (x,z)-slab of row j
       !$OMP DO
       DO  j = nys, nyn

          DO  k = 1, nz
             DO  i = 0, nx
                work_fftx(i,k,j) = work(j,k,i)
             ENDDO
          ENDDO

          CALL fft_x_m( work_fftx(:,:,j), 'backward' )

       ENDDO

    ELSE
!
!--    Cache optimized code (there might be still a potential for better optimization):
!--    fft_x_1d is called separately for every x-line (k,j)
       !$OMP DO
       DO  j = nys, nyn
          DO  k = 1, nz

             DO  i = 0, nx
                work_fftx(i,k,j) = work(j,k,i)
             ENDDO

             CALL fft_x_1d( work_fftx(0:nx,k,j), 'backward' )

          ENDDO
       ENDDO

    ENDIF

!
!-- Store the transformed data in the output array. This step is common to both variants.
    !$OMP DO
    DO  i = 0, nx
       DO  j = nys, nyn
          DO  k = 1, nz
             f_out(k,j,i) = work_fftx(i,k,j)
          ENDDO
       ENDDO
    ENDDO

    !$OMP END PARALLEL

    CALL cpu_log( log_point_s(4), 'fft_x_1d', 'stop' )

END SUBROUTINE tr_yx_fftx
1330
1331
1332	!--------------------------------------------------------------------------------------------------!
1333	! Description:
1334	! ------------
1335	!> FFT along y, solution of the tridiagonal system and backward FFT for a 1d-decomposition along y.
1336	!>
1337	!> @warning This subroutine may still not work for hybrid parallelization with OpenMP (for possible
1338	!> necessary changes see the original routine poisfft_hybrid, developed by Klaus Ketelsen,
1339	!> May 2002)
1340	!--------------------------------------------------------------------------------------------------!
SUBROUTINE ffty_tri_ffty( ar )

    USE control_parameters,                                                                    &
        ONLY:  loop_optimization

    USE cpulog,                                                                                &
        ONLY:  cpu_log,                                                                        &
               log_point_s

    USE grid_variables,                                                                        &
        ONLY:  ddx2,                                                                           &
               ddy2

    USE kinds

    USE pegrid

    IMPLICIT NONE

    INTEGER(iwp) ::  i   !< running index over nxl_y:nxr_y
    INTEGER(iwp) ::  k   !< running index over 1:nz
    INTEGER(iwp) ::  m   !< offset within 0:ny of the block of nny values belonging to n
    INTEGER(iwp) ::  n   !< running index over the last dimension of ar (1:pdims(2))
    !$ INTEGER(iwp) ::  omp_get_thread_num  !< OpenMP library function
    INTEGER(iwp) ::  tn  !< thread number, selects the slice of tri used by this thread

    LOGICAL ::  use_vector_code  !< .TRUE. if loop_optimization == 'vector'

    REAL(wp), DIMENSION(nny,1:nz,nxl_y:nxr_y,pdims(2)) ::  ar         !< data, overwritten with the result
    REAL(wp), DIMENSION(0:ny)                          ::  work_ffty  !< one complete line along y
    REAL(wp), DIMENSION(0:ny,1:nz)                     ::  work_triy  !< all lines along y for one index i
    REAL(wp), DIMENSION(:,:,:,:), ALLOCATABLE          ::  tri        !< work array passed to tridia_1dd


    CALL cpu_log( log_point_s(39), 'fft_y_1d + tridia', 'start' )

    ALLOCATE( tri(5,0:ny,0:nz-1,0:threads_per_task-1) )

    use_vector_code = ( loop_optimization == 'vector' )

    tn = 0   ! Default thread number in case of one thread
    !$OMP PARALLEL DO PRIVATE ( i, k, m, n, tn, work_ffty, work_triy )
    DO  i = nxl_y, nxr_y

       !$ tn = omp_get_thread_num()

!
!--    Forward FFT. The pdims(2) blocks of nny values stored in ar are concatenated to one line of
!--    length ny+1, block n starting at offset (n-1)*nny.
       IF ( use_vector_code )  THEN
!
!--       Code optimized for vector processors: collect all levels first, then hand them to
!--       fft_y_m in one call
          DO  k = 1, nz
             DO  n = 1, pdims(2)
                m = ( n - 1 ) * nny
                work_triy(m:m+nny-1,k) = ar(1:nny,k,i,n)
             ENDDO
          ENDDO

          CALL fft_y_m( work_triy, ny, 'forward' )

       ELSE
!
!--       Cache optimized code: each level is transformed separately with fft_y_1d
          DO  k = 1, nz

             DO  n = 1, pdims(2)
                m = ( n - 1 ) * nny
                work_ffty(m:m+nny-1) = ar(1:nny,k,i,n)
             ENDDO

             CALL fft_y_1d( work_ffty, 'forward' )

             work_triy(0:ny,k) = work_ffty(0:ny)

          ENDDO

       ENDIF

!
!--    Solve the linear equation system
       CALL tridia_1dd( ddy2, ddx2, ny, nx, i, work_triy, tri(:,:,:,tn) )

!
!--    Backward FFT and redistribution of the lines to the blocks of ar
       IF ( use_vector_code )  THEN
!
!--       Code optimized for vector processors
          CALL fft_y_m( work_triy, ny, 'backward' )

          DO  k = 1, nz
             DO  n = 1, pdims(2)
                m = ( n - 1 ) * nny
                ar(1:nny,k,i,n) = work_triy(m:m+nny-1,k)
             ENDDO
          ENDDO

       ELSE
!
!--       Cache optimized code
          DO  k = 1, nz

             work_ffty(0:ny) = work_triy(0:ny,k)

             CALL fft_y_1d( work_ffty, 'backward' )

             DO  n = 1, pdims(2)
                m = ( n - 1 ) * nny
                ar(1:nny,k,i,n) = work_ffty(m:m+nny-1)
             ENDDO

          ENDDO

       ENDIF

    ENDDO

    DEALLOCATE( tri )

    CALL cpu_log( log_point_s(39), 'fft_y_1d + tridia', 'stop' )

END SUBROUTINE ffty_tri_ffty
1476
1477	END MODULE poisfft_mod

Note: See TracBrowser for help on using the repository browser.

Download in other formats:

| Impressum | ©Leibniz Universität Hannover |