Home

Context Navigation

source: palm/trunk/SOURCE/poisfft_mod.f90 @ 4670

Last change on this file since 4670 was 4649, checked in by raasch, 4 years ago
files re-formatted to follow the PALM coding standard
Property svn:keywords set to `Id`
File size: 45.5 KB

Line
1	!> @file poisfft_mod.f90
2	!--------------------------------------------------------------------------------------------------!
3	! This file is part of the PALM model system.
4	!
5	! PALM is free software: you can redistribute it and/or modify it under the terms of the GNU General
6	! Public License as published by the Free Software Foundation, either version 3 of the License, or
7	! (at your option) any later version.
8	!
9	! PALM is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the
10	! implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General
11	! Public License for more details.
12	!
13	! You should have received a copy of the GNU General Public License along with PALM. If not, see
14	! <http://www.gnu.org/licenses/>.
15	!
16	! Copyright 1997-2020 Leibniz Universitaet Hannover
17	!--------------------------------------------------------------------------------------------------!
18	!
19	!
20	! Current revisions:
21	! -----------------
22	!
23	!
24	! Former revisions:
25	! -----------------
26	! $Id: poisfft_mod.f90 4649 2020-08-25 12:11:17Z maronga $
27	! File re-formatted to follow the PALM coding standard
28	!
29	!
30	! 4429 2020-02-27 15:24:30Z raasch
31	! Statements added to avoid compile errors due to unused dummy arguments in serial mode
32	!
33	! 4366 2020-01-09 08:12:43Z raasch
34	! Modification concerning NEC vectorizatio
35	!
36	! 4360 2020-01-07 11:25:50Z suehring
37	! Corrected "Former revisions" section
38	!
39	! 3690 2019-01-22 22:56:42Z knoop
40	! OpenACC port for SPEC
41	!
42	! Revision 1.1 1997/07/24 11:24:14 raasch
43	! Initial revision
44	!
45	!
46	!--------------------------------------------------------------------------------------------------!
47	! Description:
48	! ------------
49	!> Solves the Poisson equation with a 2D spectral method
50	!> d^2 p / dx^2 + d^2 p / dy^2 + d^2 p / dz^2 = s
51	!>
52	!> Input:
53	!> real ar contains (nnz,nny,nnx) elements of the velocity divergence, starting from (1,nys,nxl)
54	!>
55	!> Output:
56	!> real ar contains the solution for perturbation pressure p
57	!--------------------------------------------------------------------------------------------------!
58	MODULE poisfft_mod
59
60
61	USE fft_xy, &
62	ONLY: fft_init, &
63	fft_y, &
64	fft_y_1d, &
65	fft_y_m, &
66	fft_x, &
67	fft_x_1d, &
68	fft_x_m, &
69	temperton_fft_vec
70
71	USE indices, &
72	ONLY: nnx, &
73	nny, &
74	nx, &
75	nxl, &
76	nxr, &
77	ny, &
78	nys, &
79	nyn, &
80	nz
81
82	USE transpose_indices, &
83	ONLY: nxl_y, &
84	nxl_z, &
85	nxr_y, &
86	nxr_z, &
87	nys_x, &
88	nys_z, &
89	nyn_x, &
90	nyn_z, &
91	nzb_x, &
92	nzb_y, &
93	nzt_x, &
94	nzt_y
95
96	USE tridia_solver, &
97	ONLY: tridia_1dd, &
98	tridia_init, &
99	tridia_substi, &
100	tridia_substi_overlap
101
102	IMPLICIT NONE
103
104	LOGICAL, SAVE :: poisfft_initialized = .FALSE. !<
105
106	PRIVATE
107
108	PUBLIC poisfft, poisfft_init
109
110	INTERFACE poisfft
111	MODULE PROCEDURE poisfft
112	END INTERFACE poisfft
113
114	INTERFACE poisfft_init
115	MODULE PROCEDURE poisfft_init
116	END INTERFACE poisfft_init
117
118
119	CONTAINS
120
121	!--------------------------------------------------------------------------------------------------!
122	! Description:
123	! ------------
124	!> Setup coefficients for FFT and the tridiagonal solver
125	!--------------------------------------------------------------------------------------------------!
126	SUBROUTINE poisfft_init
127
128	IMPLICIT NONE
129
130
131	CALL fft_init
132
133	CALL tridia_init
134
135	poisfft_initialized = .TRUE.
136
137	END SUBROUTINE poisfft_init
138
139
140
141	!--------------------------------------------------------------------------------------------------!
142	! Description:
143	! ------------
144	!> Two-dimensional Fourier Transformation in x- and y-direction.
145	!--------------------------------------------------------------------------------------------------!
146	SUBROUTINE poisfft( ar )
147
148	USE control_parameters, &
149	ONLY: transpose_compute_overlap
150
151	USE cpulog, &
152	ONLY: cpu_log, &
153	cpu_log_nowait, &
154	log_point_s
155
156	USE kinds
157
158	USE pegrid
159
160	IMPLICIT NONE
161
162	INTEGER(iwp) :: ii !<
163	INTEGER(iwp) :: iind !<
164	INTEGER(iwp) :: inew !<
165	INTEGER(iwp) :: jj !<
166	INTEGER(iwp) :: jind !<
167	INTEGER(iwp) :: jnew !<
168	INTEGER(iwp) :: ki !<
169	INTEGER(iwp) :: kk !<
170	INTEGER(iwp) :: knew !<
171	INTEGER(iwp) :: n !<
172	INTEGER(iwp) :: nblk !<
173	INTEGER(iwp) :: nnx_y !<
174	INTEGER(iwp) :: nny_z !<
175	INTEGER(iwp) :: nnz_x !<
176	INTEGER(iwp) :: nxl_y_bound !<
177	INTEGER(iwp) :: nxr_y_bound !<
178
179	INTEGER(iwp), DIMENSION(4) :: isave !<
180
181	REAL(wp), DIMENSION(1:nz,nys:nyn,nxl:nxr) :: ar !<
182	REAL(wp), DIMENSION(nys:nyn,nxl:nxr,1:nz) :: ar_inv !<
183
184	#define __acc_fft_device ( defined( _OPENACC ) && ( defined ( __cuda_fft ) ) )
185	#if __acc_fft_device
186	!$ACC DECLARE CREATE(ar_inv)
187	#endif
188
189	REAL(wp), DIMENSION(:,:,:), ALLOCATABLE :: ar1 !<
190	REAL(wp), DIMENSION(:,:,:), ALLOCATABLE :: f_in !<
191	REAL(wp), DIMENSION(:,:,:), ALLOCATABLE :: f_inv !<
192	REAL(wp), DIMENSION(:,:,:), ALLOCATABLE :: f_out_y !<
193	REAL(wp), DIMENSION(:,:,:), ALLOCATABLE :: f_out_z !<
194
195
196	CALL cpu_log( log_point_s(3), 'poisfft', 'start' )
197
198	IF ( .NOT. poisfft_initialized ) CALL poisfft_init
199
200	#if !__acc_fft_device
201	!$ACC UPDATE HOST(ar)
202	#endif
203
204	#ifndef _OPENACC
205	!
206	!-- Two-dimensional Fourier Transformation in x- and y-direction.
207	IF ( pdims(2) == 1 .AND. pdims(1) > 1 ) THEN
208
209	!
210	!-- 1d-domain-decomposition along x:
211	!-- FFT along y and transposition y --> x
212	CALL ffty_tr_yx( ar, ar )
213
214	!
215	!-- FFT along x, solving the tridiagonal system and backward FFT
216	CALL fftx_tri_fftx( ar )
217
218	!
219	!-- Transposition x --> y and backward FFT along y
220	CALL tr_xy_ffty( ar, ar )
221
222	ELSEIF ( pdims(1) == 1 .AND. pdims(2) > 1 ) THEN
223
224	!
225	!-- 1d-domain-decomposition along y:
226	!-- FFT along x and transposition x --> y
227	CALL fftx_tr_xy( ar, ar )
228
229	!
230	!-- FFT along y, solving the tridiagonal system and backward FFT
231	CALL ffty_tri_ffty( ar )
232
233	!
234	!-- Transposition y --> x and backward FFT along x
235	CALL tr_yx_fftx( ar, ar )
236
237	ELSEIF ( .NOT. transpose_compute_overlap ) THEN
238	#endif
239
240	!
241	!-- 2d-domain-decomposition or no decomposition (1 PE run)
242	!-- Transposition z --> x
243	CALL cpu_log( log_point_s(5), 'transpo forward', 'start' )
244	CALL resort_for_zx( ar, ar_inv )
245	CALL transpose_zx( ar_inv, ar )
246	CALL cpu_log( log_point_s(5), 'transpo forward', 'pause' )
247
248	CALL cpu_log( log_point_s(4), 'fft_x', 'start' )
249	IF ( temperton_fft_vec ) THEN
250	!
251	!-- Vector version outputs a transformed array ar_inv that does not require resorting
252	!-- (which is done for ar further below)
253	CALL fft_x( ar, 'forward', ar_inv=ar_inv)
254	ELSE
255	CALL fft_x( ar, 'forward')
256	ENDIF
257	CALL cpu_log( log_point_s(4), 'fft_x', 'pause' )
258
259	!
260	!-- Transposition x --> y
261	CALL cpu_log( log_point_s(5), 'transpo forward', 'continue' )
262	IF( .NOT. temperton_fft_vec ) CALL resort_for_xy( ar, ar_inv )
263	CALL transpose_xy( ar_inv, ar )
264	CALL cpu_log( log_point_s(5), 'transpo forward', 'pause' )
265
266	CALL cpu_log( log_point_s(7), 'fft_y', 'start' )
267	IF ( temperton_fft_vec ) THEN
268	!
269	!-- Input array ar_inv from fft_x can be directly used here.
270	!-- The output (also in array ar_inv) does not require resorting below.
271	CALL fft_y( ar, 'forward', ar_inv = ar_inv, nxl_y_bound = nxl_y, nxr_y_bound = nxr_y, &
272	nxl_y_l = nxl_y, nxr_y_l = nxr_y )
273	ELSE
274	CALL fft_y( ar, 'forward', ar_tr = ar, nxl_y_bound = nxl_y, nxr_y_bound = nxr_y, &
275	nxl_y_l = nxl_y, nxr_y_l = nxr_y )
276	ENDIF
277	CALL cpu_log( log_point_s(7), 'fft_y', 'pause' )
278
279	!
280	!-- Transposition y --> z
281	CALL cpu_log( log_point_s(5), 'transpo forward', 'continue' )
282	IF ( .NOT. temperton_fft_vec ) CALL resort_for_yz( ar, ar_inv )
283	CALL transpose_yz( ar_inv, ar )
284	CALL cpu_log( log_point_s(5), 'transpo forward', 'stop' )
285
286	!
287	!-- Solve the tridiagonal equation system along z
288	CALL cpu_log( log_point_s(6), 'tridia', 'start' )
289	CALL tridia_substi( ar )
290	CALL cpu_log( log_point_s(6), 'tridia', 'stop' )
291
292	!
293	!-- Inverse Fourier Transformation
294	!-- Transposition z --> y
295	CALL cpu_log( log_point_s(8), 'transpo invers', 'start' )
296	CALL transpose_zy( ar, ar_inv )
297	!
298	!-- The fft_y below (vector branch) can directly process ar_inv (i.e. does not require a
299	!-- resorting)
300	IF ( .NOT. temperton_fft_vec ) CALL resort_for_zy( ar_inv, ar )
301	CALL cpu_log( log_point_s(8), 'transpo invers', 'pause' )
302
303	CALL cpu_log( log_point_s(7), 'fft_y', 'continue' )
304	IF ( temperton_fft_vec ) THEN
305	!
306	!-- Output array ar_inv can be used as input to the below fft_x routine without resorting
307	CALL fft_y( ar, 'backward', ar_inv = ar_inv, nxl_y_bound = nxl_y, nxr_y_bound = nxr_y, &
308	nxl_y_l = nxl_y, nxr_y_l = nxr_y )
309	ELSE
310	CALL fft_y( ar, 'backward', ar_tr = ar, nxl_y_bound = nxl_y, nxr_y_bound = nxr_y, &
311	nxl_y_l = nxl_y, nxr_y_l = nxr_y )
312	ENDIF
313
314	CALL cpu_log( log_point_s(7), 'fft_y', 'stop' )
315
316	!
317	!-- Transposition y --> x
318	CALL cpu_log( log_point_s(8), 'transpo invers', 'continue' )
319	CALL transpose_yx( ar, ar_inv )
320	IF ( .NOT. temperton_fft_vec ) CALL resort_for_yx( ar_inv, ar )
321	CALL cpu_log( log_point_s(8), 'transpo invers', 'pause' )
322
323	CALL cpu_log( log_point_s(4), 'fft_x', 'continue' )
324	IF ( temperton_fft_vec ) THEN
325	CALL fft_x( ar, 'backward', ar_inv=ar_inv )
326	ELSE
327	CALL fft_x( ar, 'backward' )
328	ENDIF
329	CALL cpu_log( log_point_s(4), 'fft_x', 'stop' )
330
331	!
332	!-- Transposition x --> z
333	CALL cpu_log( log_point_s(8), 'transpo invers', 'continue' )
334	CALL transpose_xz( ar, ar_inv )
335	CALL resort_for_xz( ar_inv, ar )
336	CALL cpu_log( log_point_s(8), 'transpo invers', 'stop' )
337
338	#ifndef _OPENACC
339	ELSE
340
341	!
342	!-- 2d-domain-decomposition or no decomposition (1 PE run) with overlapping transposition / fft
343	!-- cputime logging must not use barriers, which would prevent overlapping
344	ALLOCATE( f_out_y(0:ny,nxl_y:nxr_y,nzb_y:nzt_y), &
345	f_out_z(0:nx,nys_x:nyn_x,nzb_x:nzt_x) )
346	!
347	!-- Transposition z --> x + subsequent fft along x
348	ALLOCATE( f_inv(nys:nyn,nxl:nxr,1:nz) )
349	CALL resort_for_zx( ar, f_inv )
350	!
351	!-- Save original indices and gridpoint counter
352	isave(1) = nz
353	isave(2) = nzb_x
354	isave(3) = nzt_x
355	isave(4) = sendrecvcount_zx
356	!
357	!-- Set new indices for transformation
358	nblk = nz / pdims(1)
359	nz = pdims(1)
360	nnz_x = 1
361	nzb_x = 1 + myidx * nnz_x
362	nzt_x = ( myidx + 1 ) * nnz_x
363	sendrecvcount_zx = nnx * nny * nnz_x
364
365	ALLOCATE( ar1(0:nx,nys_x:nyn_x,nzb_x:nzt_x) )
366	ALLOCATE( f_in(nys:nyn,nxl:nxr,1:nz) )
367
368	DO kk = 1, nblk
369
370	IF ( kk == 1 ) THEN
371	CALL cpu_log( log_point_s(5), 'transpo forward', 'start', cpu_log_nowait )
372	ELSE
373	CALL cpu_log( log_point_s(5), 'transpo forward', 'continue', cpu_log_nowait )
374	ENDIF
375
376	DO knew = 1, nz
377	ki = kk + nblk * ( knew - 1 )
378	f_in(:,:,knew) = f_inv(:,:,ki)
379	ENDDO
380
381	CALL transpose_zx( f_in, ar1(:,:,:))
382	CALL cpu_log( log_point_s(5), 'transpo forward', 'pause' )
383
384	IF ( kk == 1 ) THEN
385	CALL cpu_log( log_point_s(4), 'fft_x', 'start', cpu_log_nowait )
386	ELSE
387	CALL cpu_log( log_point_s(4), 'fft_x', 'continue', cpu_log_nowait )
388	ENDIF
389
390	n = isave(2) + kk - 1
391	CALL fft_x( ar1(:,:,:), 'forward', ar_2d = f_out_z(:,:,n) )
392	CALL cpu_log( log_point_s(4), 'fft_x', 'pause' )
393
394	ENDDO
395	!
396	!-- Restore original indices/counters
397	nz = isave(1)
398	nzb_x = isave(2)
399	nzt_x = isave(3)
400	sendrecvcount_zx = isave(4)
401
402	DEALLOCATE( ar1, f_in, f_inv )
403
404	!
405	!-- Transposition x --> y + subsequent fft along y
406	ALLOCATE( f_inv(nys_x:nyn_x,nzb_x:nzt_x,0:nx) )
407	CALL resort_for_xy( f_out_z, f_inv )
408	!
409	!-- Save original indices and gridpoint counter
410	isave(1) = nx
411	isave(2) = nxl_y
412	isave(3) = nxr_y
413	isave(4) = sendrecvcount_xy
414	!
415	!-- Set new indices for transformation
416	nblk = ( ( nx+1 ) / pdims(2) ) - 1
417	nx = pdims(2)
418	nnx_y = 1
419	nxl_y = myidy * nnx_y
420	nxr_y = ( myidy + 1 ) * nnx_y - 1
421	sendrecvcount_xy = nnx_y * ( nyn_x-nys_x+1 ) * ( nzt_x-nzb_x+1 )
422
423	ALLOCATE( ar1(0:ny,nxl_y:nxr_y,nzb_y:nzt_y) )
424	ALLOCATE( f_in(nys_x:nyn_x,nzb_x:nzt_x,0:nx) )
425
426	DO ii = 0, nblk
427
428	CALL cpu_log( log_point_s(5), 'transpo forward', 'continue', cpu_log_nowait )
429
430	DO inew = 0, nx-1
431	iind = ii + ( nblk + 1 ) * inew
432	f_in(:,:,inew) = f_inv(:,:,iind)
433	ENDDO
434
435	CALL transpose_xy( f_in, ar1(:,:,:) )
436
437	CALL cpu_log( log_point_s(5), 'transpo forward', 'pause' )
438
439	IF ( ii == 1 ) THEN
440	CALL cpu_log( log_point_s(7), 'fft_y', 'start', cpu_log_nowait )
441	ELSE
442	CALL cpu_log( log_point_s(7), 'fft_y', 'continue', cpu_log_nowait )
443	ENDIF
444
445	nxl_y_bound = isave(2)
446	nxr_y_bound = isave(3)
447	n = isave(2) + ii
448	CALL fft_y( ar1(:,:,:), 'forward', ar_tr = f_out_y, nxl_y_bound = nxl_y_bound, &
449	nxr_y_bound = nxr_y_bound, nxl_y_l = n, nxr_y_l = n )
450
451	CALL cpu_log( log_point_s(7), 'fft_y', 'pause' )
452
453	ENDDO
454	!
455	!-- Restore original indices/counters
456	nx = isave(1)
457	nxl_y = isave(2)
458	nxr_y = isave(3)
459	sendrecvcount_xy = isave(4)
460
461	DEALLOCATE( ar1, f_in, f_inv )
462
463	!
464	!-- Transposition y --> z + subsequent tridia + resort for z --> y
465	ALLOCATE( f_inv(nxl_y:nxr_y,nzb_y:nzt_y,0:ny) )
466	CALL resort_for_yz( f_out_y, f_inv )
467	!
468	!-- Save original indices and gridpoint counter
469	isave(1) = ny
470	isave(2) = nys_z
471	isave(3) = nyn_z
472	isave(4) = sendrecvcount_yz
473	!
474	!-- Set new indices for transformation
475	nblk = ( ( ny+1 ) / pdims(1) ) - 1
476	ny = pdims(1)
477	nny_z = 1
478	nys_z = myidx * nny_z
479	nyn_z = ( myidx + 1 ) * nny_z - 1
480	sendrecvcount_yz = ( nxr_y-nxl_y+1 ) * nny_z * ( nzt_y-nzb_y+1 )
481
482	ALLOCATE( ar1(nxl_z:nxr_z,nys_z:nyn_z,1:nz) )
483	ALLOCATE( f_in(nxl_y:nxr_y,nzb_y:nzt_y,0:ny) )
484
485	DO jj = 0, nblk
486	!
487	!-- Forward Fourier Transformation
488	!-- Transposition y --> z
489	CALL cpu_log( log_point_s(5), 'transpo forward', 'continue', cpu_log_nowait )
490
491	DO jnew = 0, ny-1
492	jind = jj + ( nblk + 1 ) * jnew
493	f_in(:,:,jnew) = f_inv(:,:,jind)
494	ENDDO
495
496	CALL transpose_yz( f_in, ar1(:,:,:) )
497
498	IF ( jj == nblk ) THEN
499	CALL cpu_log( log_point_s(5), 'transpo forward', 'stop' )
500	ELSE
501	CALL cpu_log( log_point_s(5), 'transpo forward', 'pause' )
502	ENDIF
503
504	!
505	!-- Solve the tridiagonal equation system along z
506	CALL cpu_log( log_point_s(6), 'tridia', 'start', cpu_log_nowait )
507
508	n = isave(2) + jj
509	CALL tridia_substi_overlap( ar1(:,:,:), n )
510
511	CALL cpu_log( log_point_s(6), 'tridia', 'stop' )
512
513	!
514	!-- Inverse Fourier Transformation
515	!-- Transposition z --> y
516	!-- Only one thread should call MPI routines, therefore forward and backward tranpose are in
517	!-- the same section
518	IF ( jj == 0 ) THEN
519	CALL cpu_log( log_point_s(8), 'transpo invers', 'start', cpu_log_nowait )
520	ELSE
521	CALL cpu_log( log_point_s(8), 'transpo invers', 'continue', cpu_log_nowait )
522	ENDIF
523
524	CALL transpose_zy( ar1(:,:,:), f_in )
525
526	DO jnew = 0, ny-1
527	jind = jj + ( nblk + 1 ) * jnew
528	f_inv(:,:,jind) = f_in(:,:,jnew)
529	ENDDO
530
531	CALL cpu_log( log_point_s(8), 'transpo invers', 'pause' )
532
533	ENDDO
534	!
535	!-- Restore original indices/counters
536	ny = isave(1)
537	nys_z = isave(2)
538	nyn_z = isave(3)
539	sendrecvcount_yz = isave(4)
540
541	CALL resort_for_zy( f_inv, f_out_y )
542
543	DEALLOCATE( ar1, f_in, f_inv )
544
545	!
546	!-- fft along y backward + subsequent transposition y --> x
547	ALLOCATE( f_inv(nys_x:nyn_x,nzb_x:nzt_x,0:nx) )
548	!
549	!-- Save original indices and gridpoint counter
550	isave(1) = nx
551	isave(2) = nxl_y
552	isave(3) = nxr_y
553	isave(4) = sendrecvcount_xy
554	!
555	!-- Set new indices for transformation
556	nblk = ( ( nx+1 ) / pdims(2) ) - 1
557	nx = pdims(2)
558	nnx_y = 1
559	nxl_y = myidy * nnx_y
560	nxr_y = ( myidy + 1 ) * nnx_y - 1
561	sendrecvcount_xy = nnx_y * ( nyn_x-nys_x+1 ) * ( nzt_x-nzb_x+1 )
562
563	ALLOCATE( ar1(0:ny,nxl_y:nxr_y,nzb_y:nzt_y) )
564	ALLOCATE( f_in(nys_x:nyn_x,nzb_x:nzt_x,0:nx) )
565
566	DO ii = 0, nblk
567
568	CALL cpu_log( log_point_s(7), 'fft_y', 'continue', cpu_log_nowait )
569
570	n = isave(2) + ii
571	nxl_y_bound = isave(2)
572	nxr_y_bound = isave(3)
573
574	CALL fft_y( ar1(:,:,:), 'backward', ar_tr = f_out_y, nxl_y_bound = nxl_y_bound, &
575	nxr_y_bound = nxr_y_bound, nxl_y_l = n, nxr_y_l = n )
576
577	IF ( ii == nblk ) THEN
578	CALL cpu_log( log_point_s(7), 'fft_y', 'stop' )
579	ELSE
580	CALL cpu_log( log_point_s(7), 'fft_y', 'pause' )
581	ENDIF
582
583	CALL cpu_log( log_point_s(8), 'transpo invers', 'continue', cpu_log_nowait )
584
585	CALL transpose_yx( ar1(:,:,:), f_in )
586
587	DO inew = 0, nx-1
588	iind = ii + (nblk+1) * inew
589	f_inv(:,:,iind) = f_in(:,:,inew)
590	ENDDO
591
592	CALL cpu_log( log_point_s(8), 'transpo invers', 'pause' )
593
594	ENDDO
595	!
596	!-- Restore original indices/counters
597	nx = isave(1)
598	nxl_y = isave(2)
599	nxr_y = isave(3)
600	sendrecvcount_xy = isave(4)
601
602	CALL resort_for_yx( f_inv, f_out_z )
603
604	DEALLOCATE( ar1, f_in, f_inv )
605
606	!
607	!-- fft along x backward + subsequent final transposition x --> z
608	ALLOCATE( f_inv(nys:nyn,nxl:nxr,1:nz) )
609	!
610	!-- Save original indices and gridpoint counter
611	isave(1) = nz
612	isave(2) = nzb_x
613	isave(3) = nzt_x
614	isave(4) = sendrecvcount_zx
615	!
616	!-- Set new indices for transformation
617	nblk = nz / pdims(1)
618	nz = pdims(1)
619	nnz_x = 1
620	nzb_x = 1 + myidx * nnz_x
621	nzt_x = ( myidx + 1 ) * nnz_x
622	sendrecvcount_zx = nnx * nny * nnz_x
623
624	ALLOCATE( ar1(0:nx,nys_x:nyn_x,nzb_x:nzt_x) )
625	ALLOCATE( f_in(nys:nyn,nxl:nxr,1:nz) )
626
627	DO kk = 1, nblk
628
629	CALL cpu_log( log_point_s(4), 'fft_x', 'continue', cpu_log_nowait )
630
631	n = isave(2) + kk - 1
632	CALL fft_x( ar1(:,:,:), 'backward', f_out_z(:,:,n) )
633
634	IF ( kk == nblk ) THEN
635	CALL cpu_log( log_point_s(4), 'fft_x', 'stop' )
636	ELSE
637	CALL cpu_log( log_point_s(4), 'fft_x', 'pause' )
638	ENDIF
639
640	CALL cpu_log( log_point_s(8), 'transpo invers', 'continue', cpu_log_nowait )
641
642	CALL transpose_xz( ar1(:,:,:), f_in )
643
644	DO knew = 1, nz
645	ki = kk + nblk * ( knew - 1 )
646	f_inv(:,:,ki) = f_in(:,:,knew)
647	ENDDO
648
649	IF ( kk == nblk ) THEN
650	CALL cpu_log( log_point_s(8), 'transpo invers', 'stop' )
651	ELSE
652	CALL cpu_log( log_point_s(8), 'transpo invers', 'pause' )
653	ENDIF
654
655	ENDDO
656	!
657	!-- Restore original indices/counters
658	nz = isave(1)
659	nzb_x = isave(2)
660	nzt_x = isave(3)
661	sendrecvcount_zx = isave(4)
662
663	CALL resort_for_xz( f_inv, ar )
664
665	DEALLOCATE( ar1, f_in, f_inv )
666
667	ENDIF
668	#endif
669
670	#if !__acc_fft_device
671	!$ACC UPDATE DEVICE(ar)
672	#endif
673
674	CALL cpu_log( log_point_s(3), 'poisfft', 'stop' )
675
676	END SUBROUTINE poisfft
677
678
679	!--------------------------------------------------------------------------------------------------!
680	! Description:
681	! ------------
682	!> Fourier-transformation along y with subsequent transposition y --> x for a 1d-decomposition
683	!> along x.
684	!>
685	!> @attention The performance of this routine is much faster on the NEC-SX6, if the first index of
686	!> work_ffty_vec is odd. Otherwise memory bank conflicts may occur (especially if the
687	!> index is a multiple of 128). That's why work_ffty_vec is dimensioned as 0:ny+1.
688	!> Of course, this will not work if users are using an odd number of gridpoints along y.
689	!--------------------------------------------------------------------------------------------------!
690	SUBROUTINE ffty_tr_yx( f_in, f_out )
691
692	USE control_parameters, &
693	ONLY: loop_optimization
694
695	USE cpulog, &
696	ONLY: cpu_log, &
697	log_point_s
698
699	USE kinds
700
701	USE pegrid
702
703	IMPLICIT NONE
704
705	INTEGER(iwp) :: i !<
706	INTEGER(iwp) :: iend !<
707	INTEGER(iwp) :: iouter !<
708	INTEGER(iwp) :: ir !<
709	INTEGER(iwp) :: j !<
710	INTEGER(iwp) :: k !<
711
712	INTEGER(iwp), PARAMETER :: stridex = 4 !<
713
714	REAL(wp), DIMENSION(1:nz,0:ny,nxl:nxr) :: f_in !<
715	REAL(wp), DIMENSION(nnx,1:nz,nys_x:nyn_x,pdims(1)) :: f_out !<
716	REAL(wp), DIMENSION(nxl:nxr,1:nz,0:ny) :: work !<
717
718	REAL(wp), DIMENSION(:,:), ALLOCATABLE :: work_ffty !<
719	REAL(wp), DIMENSION(:,:,:), ALLOCATABLE :: work_ffty_vec !<
720
721	!
722	!-- Carry out the FFT along y, where all data are present due to the 1d-decomposition along x.
723	!-- Resort the data in a way that x becomes the first index.
724	CALL cpu_log( log_point_s(7), 'fft_y_1d', 'start' )
725
726	IF ( loop_optimization == 'vector' ) THEN
727
728	ALLOCATE( work_ffty_vec(0:ny+1,1:nz,nxl:nxr) )
729	!
730	!-- Code optimized for vector processors
731	!$OMP PARALLEL PRIVATE ( i, j, k )
732	!$OMP DO
733	DO i = nxl, nxr
734
735	DO j = 0, ny
736	DO k = 1, nz
737	work_ffty_vec(j,k,i) = f_in(k,j,i)
738	ENDDO
739	ENDDO
740
741	CALL fft_y_m( work_ffty_vec(:,:,i), ny+1, 'forward' )
742
743	ENDDO
744
745	!$OMP DO
746	DO k = 1, nz
747	DO j = 0, ny
748	DO i = nxl, nxr
749	work(i,k,j) = work_ffty_vec(j,k,i)
750	ENDDO
751	ENDDO
752	ENDDO
753	!$OMP END PARALLEL
754
755	DEALLOCATE( work_ffty_vec )
756
757	ELSE
758	!
759	!-- Cache optimized code.
760	ALLOCATE( work_ffty(0:ny,stridex) )
761	!
762	!-- The i-(x-)direction is split into a strided outer loop and an inner loop for better cache
763	!-- performance
764	!$OMP PARALLEL PRIVATE (i,iend,iouter,ir,j,k,work_ffty)
765	!$OMP DO
766	DO iouter = nxl, nxr, stridex
767
768	iend = MIN( iouter+stridex-1, nxr ) ! Upper bound for inner i loop
769
770	DO k = 1, nz
771
772	DO i = iouter, iend
773
774	ir = i-iouter+1 ! Counter within a stride
775	DO j = 0, ny
776	work_ffty(j,ir) = f_in(k,j,i)
777	ENDDO
778	!
779	!-- FFT along y
780	CALL fft_y_1d( work_ffty(:,ir), 'forward' )
781
782	ENDDO
783
784	!
785	!-- Resort
786	DO j = 0, ny
787	DO i = iouter, iend
788	work(i,k,j) = work_ffty(j,i-iouter+1)
789	ENDDO
790	ENDDO
791
792	ENDDO
793
794	ENDDO
795	!$OMP END PARALLEL
796
797	DEALLOCATE( work_ffty )
798
799	ENDIF
800
801	CALL cpu_log( log_point_s(7), 'fft_y_1d', 'pause' )
802
803	!
804	!-- Transpose array
805	#if defined( __parallel )
806	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
807	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
808	CALL MPI_ALLTOALL( work(nxl,1,0), sendrecvcount_xy, MPI_REAL, f_out(1,1,nys_x,1), &
809	sendrecvcount_xy, MPI_REAL, comm1dx, ierr )
810	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
811	#else
812	!
813	!-- Next line required to avoid compile error about unused dummy argument in serial mode
814	i = SIZE( f_out )
815	#endif
816
817	END SUBROUTINE ffty_tr_yx
818
819
820	!--------------------------------------------------------------------------------------------------!
821	! Description:
822	! ------------
823	!> Transposition x --> y with a subsequent backward Fourier transformation for a 1d-decomposition
824	!> along x
825	!--------------------------------------------------------------------------------------------------!
826	SUBROUTINE tr_xy_ffty( f_in, f_out )
827
828	USE control_parameters, &
829	ONLY: loop_optimization
830
831	USE cpulog, &
832	ONLY: cpu_log, &
833	log_point_s
834
835	USE kinds
836
837	USE pegrid
838
839	IMPLICIT NONE
840
841	INTEGER(iwp) :: i !<
842	INTEGER(iwp) :: iend !<
843	INTEGER(iwp) :: iouter !<
844	INTEGER(iwp) :: ir !<
845	INTEGER(iwp) :: j !<
846	INTEGER(iwp) :: k !<
847
848	INTEGER(iwp), PARAMETER :: stridex = 4 !<
849
850	REAL(wp), DIMENSION(nnx,1:nz,nys_x:nyn_x,pdims(1)) :: f_in !<
851	REAL(wp), DIMENSION(1:nz,0:ny,nxl:nxr) :: f_out !<
852	REAL(wp), DIMENSION(nxl:nxr,1:nz,0:ny) :: work !<
853
854	REAL(wp), DIMENSION(:,:), ALLOCATABLE :: work_ffty !<
855	REAL(wp), DIMENSION(:,:,:), ALLOCATABLE :: work_ffty_vec !<
856
857	!
858	!-- Transpose array
859	#if defined( __parallel )
860	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
861	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
862	CALL MPI_ALLTOALL( f_in(1,1,nys_x,1), sendrecvcount_xy, MPI_REAL, work(nxl,1,0), &
863	sendrecvcount_xy, MPI_REAL, comm1dx, ierr )
864	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
865	#else
866	!
867	!-- Next line required to avoid compile error about unused dummy argument in serial mode
868	i = SIZE( f_in )
869	#endif
870
871	!
872	!-- Resort the data in a way that y becomes the first index and carry out the backward fft along y.
873	CALL cpu_log( log_point_s(7), 'fft_y_1d', 'continue' )
874
875	IF ( loop_optimization == 'vector' ) THEN
876
877	ALLOCATE( work_ffty_vec(0:ny+1,1:nz,nxl:nxr) )
878	!
879	!-- Code optimized for vector processors
880	!$OMP PARALLEL PRIVATE ( i, j, k )
881	!$OMP DO
882	DO k = 1, nz
883	DO j = 0, ny
884	DO i = nxl, nxr
885	work_ffty_vec(j,k,i) = work(i,k,j)
886	ENDDO
887	ENDDO
888	ENDDO
889
890	!$OMP DO
891	DO i = nxl, nxr
892
893	CALL fft_y_m( work_ffty_vec(:,:,i), ny+1, 'backward' )
894
895	DO j = 0, ny
896	DO k = 1, nz
897	f_out(k,j,i) = work_ffty_vec(j,k,i)
898	ENDDO
899	ENDDO
900
901	ENDDO
902	!$OMP END PARALLEL
903
904	DEALLOCATE( work_ffty_vec )
905
906	ELSE
907	!
908	!-- Cache optimized code.
909	ALLOCATE( work_ffty(0:ny,stridex) )
910	!
911	!-- The i-(x-)direction is split into a strided outer loop and an inner loop for better cache
912	!-- performance
913	!$OMP PARALLEL PRIVATE ( i, iend, iouter, ir, j, k, work_ffty )
914	!$OMP DO
915	DO iouter = nxl, nxr, stridex
916
917	iend = MIN( iouter+stridex-1, nxr ) ! Upper bound for inner i loop
918
919	DO k = 1, nz
920	!
921	!-- Resort
922	DO j = 0, ny
923	DO i = iouter, iend
924	work_ffty(j,i-iouter+1) = work(i,k,j)
925	ENDDO
926	ENDDO
927
928	DO i = iouter, iend
929
930	!
931	!-- FFT along y
932	ir = i-iouter+1 ! Counter within a stride
933	CALL fft_y_1d( work_ffty(:,ir), 'backward' )
934
935	DO j = 0, ny
936	f_out(k,j,i) = work_ffty(j,ir)
937	ENDDO
938	ENDDO
939
940	ENDDO
941
942	ENDDO
943	!$OMP END PARALLEL
944
945	DEALLOCATE( work_ffty )
946
947	ENDIF
948
949	CALL cpu_log( log_point_s(7), 'fft_y_1d', 'stop' )
950
951	END SUBROUTINE tr_xy_ffty
952
953
954	!--------------------------------------------------------------------------------------------------!
955	! Description:
956	! ------------
957	!> FFT along x, solution of the tridiagonal system and backward FFT for a 1d-decomposition along x
958	!>
959	!> @warning This subroutine may still not work for hybrid parallelization with OpenMP (for possible
960	!> necessary changes see the original routine poisfft_hybrid, developed by Klaus Ketelsen,
961	!> May 2002)
962	!--------------------------------------------------------------------------------------------------!
963	SUBROUTINE fftx_tri_fftx( ar )
964
965	USE control_parameters, &
966	ONLY: loop_optimization
967
968	USE cpulog, &
969	ONLY: cpu_log, &
970	log_point_s
971
972	USE grid_variables, &
973	ONLY: ddx2, &
974	ddy2
975
976	USE kinds
977
978	USE pegrid
979
980	IMPLICIT NONE
981
982	INTEGER(iwp) :: i !<
983	INTEGER(iwp) :: j !<
984	INTEGER(iwp) :: k !<
985	INTEGER(iwp) :: m !<
986	INTEGER(iwp) :: n !<
987	! INTEGER(iwp) :: omp_get_thread_num !<
988	INTEGER(iwp) :: tn !<
989
990	REAL(wp), DIMENSION(0:nx) :: work_fftx !<
991	REAL(wp), DIMENSION(0:nx,1:nz) :: work_trix !<
992	REAL(wp), DIMENSION(nnx,1:nz,nys_x:nyn_x,pdims(1)) :: ar !<
993	REAL(wp), DIMENSION(:,:,:,:), ALLOCATABLE :: tri !<
994
995
996	CALL cpu_log( log_point_s(33), 'fft_x_1d + tridia', 'start' )
997
998	ALLOCATE( tri(5,0:nx,0:nz-1,0:threads_per_task-1) )
999
1000	tn = 0 ! Default thread number in case of one thread
1001	!$OMP PARALLEL DO PRIVATE ( i, j, k, m, n, tn, work_fftx, work_trix )
1002	DO j = nys_x, nyn_x
1003
1004	!$ tn = omp_get_thread_num()
1005
1006	IF ( loop_optimization == 'vector' ) THEN
1007	!
1008	!-- Code optimized for vector processors
1009	DO k = 1, nz
1010
1011	m = 0
1012	DO n = 1, pdims(1)
1013	DO i = 1, nnx
1014	work_trix(m,k) = ar(i,k,j,n)
1015	m = m + 1
1016	ENDDO
1017	ENDDO
1018
1019	ENDDO
1020
1021	CALL fft_x_m( work_trix, 'forward' )
1022
1023	ELSE
1024	!
1025	!-- Cache optimized code
1026	DO k = 1, nz
1027
1028	m = 0
1029	DO n = 1, pdims(1)
1030	DO i = 1, nnx
1031	work_fftx(m) = ar(i,k,j,n)
1032	m = m + 1
1033	ENDDO
1034	ENDDO
1035
1036	CALL fft_x_1d( work_fftx, 'forward' )
1037
1038	DO i = 0, nx
1039	work_trix(i,k) = work_fftx(i)
1040	ENDDO
1041
1042	ENDDO
1043
1044	ENDIF
1045
1046	!
1047	!-- Solve the linear equation system
1048	CALL tridia_1dd( ddx2, ddy2, nx, ny, j, work_trix, tri(:,:,:,tn) )
1049
1050	IF ( loop_optimization == 'vector' ) THEN
1051	!
1052	!-- Code optimized for vector processors
1053	CALL fft_x_m( work_trix, 'backward' )
1054
1055	DO k = 1, nz
1056
1057	m = 0
1058	DO n = 1, pdims(1)
1059	DO i = 1, nnx
1060	ar(i,k,j,n) = work_trix(m,k)
1061	m = m + 1
1062	ENDDO
1063	ENDDO
1064
1065	ENDDO
1066
1067	ELSE
1068	!
1069	!-- Cache optimized code
1070	DO k = 1, nz
1071
1072	DO i = 0, nx
1073	work_fftx(i) = work_trix(i,k)
1074	ENDDO
1075
1076	CALL fft_x_1d( work_fftx, 'backward' )
1077
1078	m = 0
1079	DO n = 1, pdims(1)
1080	DO i = 1, nnx
1081	ar(i,k,j,n) = work_fftx(m)
1082	m = m + 1
1083	ENDDO
1084	ENDDO
1085
1086	ENDDO
1087
1088	ENDIF
1089
1090	ENDDO
1091
1092	DEALLOCATE( tri )
1093
1094	CALL cpu_log( log_point_s(33), 'fft_x_1d + tridia', 'stop' )
1095
1096	END SUBROUTINE fftx_tri_fftx
1097
1098
1099	!--------------------------------------------------------------------------------------------------!
1100	! Description:
1101	! ------------
1102	!> Fourier-transformation along x with subsequent transposition x --> y for a 1d-decomposition
1103	!> along y.
1104	!>
1105	!> @attention NEC-branch of this routine may significantly profit from further optimizations. So
1106	!> far, performance is much worse than for routine ffty_tr_yx (more than three times
1107	!> slower).
1108	!--------------------------------------------------------------------------------------------------!
1109	SUBROUTINE fftx_tr_xy( f_in, f_out )
1110
1111
1112	USE control_parameters, &
1113	ONLY: loop_optimization
1114
1115	USE cpulog, &
1116	ONLY: cpu_log, &
1117	log_point_s
1118
1119	USE kinds
1120
1121	USE pegrid
1122
1123	IMPLICIT NONE
1124
1125	INTEGER(iwp) :: i !<
1126	INTEGER(iwp) :: j !<
1127	INTEGER(iwp) :: k !<
1128
1129	REAL(wp), DIMENSION(0:nx,1:nz,nys:nyn) :: work_fftx !<
1130	REAL(wp), DIMENSION(1:nz,nys:nyn,0:nx) :: f_in !<
1131	REAL(wp), DIMENSION(nny,1:nz,nxl_y:nxr_y,pdims(2)) :: f_out !<
1132	REAL(wp), DIMENSION(nys:nyn,1:nz,0:nx) :: work !<
1133
1134	!
1135	!-- Carry out the FFT along x, where all data are present due to the 1d-decomposition along y.
1136	!-- Resort the data in a way that y becomes the first index.
1137	CALL cpu_log( log_point_s(4), 'fft_x_1d', 'start' )
1138
1139	IF ( loop_optimization == 'vector' ) THEN
1140	!
1141	!-- Code for vector processors
1142	!$OMP PARALLEL PRIVATE ( i, j, k )
1143	!$OMP DO
1144	DO i = 0, nx
1145
1146	DO j = nys, nyn
1147	DO k = 1, nz
1148	work_fftx(i,k,j) = f_in(k,j,i)
1149	ENDDO
1150	ENDDO
1151
1152	ENDDO
1153
1154	!$OMP DO
1155	DO j = nys, nyn
1156
1157	CALL fft_x_m( work_fftx(:,:,j), 'forward' )
1158
1159	DO k = 1, nz
1160	DO i = 0, nx
1161	work(j,k,i) = work_fftx(i,k,j)
1162	ENDDO
1163	ENDDO
1164
1165	ENDDO
1166	!$OMP END PARALLEL
1167
1168	ELSE
1169
1170	!
1171	!-- Cache optimized code (there might still be a potential for better optimization).
1172	!$OMP PARALLEL PRIVATE (i,j,k)
1173	!$OMP DO
1174	DO i = 0, nx
1175
1176	DO j = nys, nyn
1177	DO k = 1, nz
1178	work_fftx(i,k,j) = f_in(k,j,i)
1179	ENDDO
1180	ENDDO
1181
1182	ENDDO
1183
1184	!$OMP DO
1185	DO j = nys, nyn
1186	DO k = 1, nz
1187
1188	CALL fft_x_1d( work_fftx(0:nx,k,j), 'forward' )
1189
1190	DO i = 0, nx
1191	work(j,k,i) = work_fftx(i,k,j)
1192	ENDDO
1193	ENDDO
1194
1195	ENDDO
1196	!$OMP END PARALLEL
1197
1198	ENDIF
1199	CALL cpu_log( log_point_s(4), 'fft_x_1d', 'pause' )
1200
1201	!
1202	!-- Transpose array
1203	#if defined( __parallel )
1204	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
1205	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
1206	CALL MPI_ALLTOALL( work(nys,1,0), sendrecvcount_xy, MPI_REAL, f_out(1,1,nxl_y,1), &
1207	sendrecvcount_xy, MPI_REAL, comm1dy, ierr )
1208	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
1209	#else
1210	!
1211	!-- Next line required to avoid compile error about unused dummy argument in serial mode
1212	i = SIZE( f_out )
1213	#endif
1214
1215	END SUBROUTINE fftx_tr_xy
1216
1217
1218	!--------------------------------------------------------------------------------------------------!
1219	! Description:
1220	! ------------
1221	!> Transposition y --> x with a subsequent backward Fourier transformation for a 1d-decomposition
1222	!> along x.
1223	!--------------------------------------------------------------------------------------------------!
1224	SUBROUTINE tr_yx_fftx( f_in, f_out )
1225
1226
1227	USE control_parameters, &
1228	ONLY: loop_optimization
1229
1230	USE cpulog, &
1231	ONLY: cpu_log, &
1232	log_point_s
1233
1234	USE kinds
1235
1236	USE pegrid
1237
1238	IMPLICIT NONE
1239
1240	INTEGER(iwp) :: i !<
1241	INTEGER(iwp) :: j !<
1242	INTEGER(iwp) :: k !<
1243
1244	REAL(wp), DIMENSION(nny,1:nz,nxl_y:nxr_y,pdims(2)) :: f_in !<
1245	REAL(wp), DIMENSION(1:nz,nys:nyn,0:nx) :: f_out !<
1246	REAL(wp), DIMENSION(nys:nyn,1:nz,0:nx) :: work !<
1247	REAL(wp), DIMENSION(0:nx,1:nz,nys:nyn) :: work_fftx !<
1248
1249
1250	!
1251	!-- Transpose array
1252	#if defined( __parallel )
1253	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
1254	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
1255	CALL MPI_ALLTOALL( f_in(1,1,nxl_y,1), sendrecvcount_xy, MPI_REAL, work(nys,1,0), &
1256	sendrecvcount_xy, MPI_REAL, comm1dy, ierr )
1257	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
1258	#else
1259	!
1260	!-- Next line required to avoid compile error about unused dummy argument in serial mode
1261	i = SIZE( f_in )
1262	#endif
1263
1264	!
1265	!-- Carry out the FFT along x, where all data are present due to the 1d-decomposition along y.
1266	!-- Resort the data in a way that y becomes the first index.
1267	CALL cpu_log( log_point_s(4), 'fft_x_1d', 'continue' )
1268
1269	IF ( loop_optimization == 'vector' ) THEN
1270	!
1271	!-- Code optimized for vector processors
1272	!$OMP PARALLEL PRIVATE ( i, j, k )
1273	!$OMP DO
1274	DO j = nys, nyn
1275
1276	DO k = 1, nz
1277	DO i = 0, nx
1278	work_fftx(i,k,j) = work(j,k,i)
1279	ENDDO
1280	ENDDO
1281
1282	CALL fft_x_m( work_fftx(:,:,j), 'backward' )
1283
1284	ENDDO
1285
1286	!$OMP DO
1287	DO i = 0, nx
1288	DO j = nys, nyn
1289	DO k = 1, nz
1290	f_out(k,j,i) = work_fftx(i,k,j)
1291	ENDDO
1292	ENDDO
1293	ENDDO
1294	!$OMP END PARALLEL
1295
1296	ELSE
1297
1298	!
1299	!-- Cache optimized code (there might be still a potential for better optimization).
1300	!$OMP PARALLEL PRIVATE (i,j,k)
1301	!$OMP DO
1302	DO j = nys, nyn
1303	DO k = 1, nz
1304
1305	DO i = 0, nx
1306	work_fftx(i,k,j) = work(j,k,i)
1307	ENDDO
1308
1309	CALL fft_x_1d( work_fftx(0:nx,k,j), 'backward' )
1310
1311	ENDDO
1312	ENDDO
1313
1314	!$OMP DO
1315	DO i = 0, nx
1316	DO j = nys, nyn
1317	DO k = 1, nz
1318	f_out(k,j,i) = work_fftx(i,k,j)
1319	ENDDO
1320	ENDDO
1321	ENDDO
1322	!$OMP END PARALLEL
1323
1324	ENDIF
1325	CALL cpu_log( log_point_s(4), 'fft_x_1d', 'stop' )
1326
1327	END SUBROUTINE tr_yx_fftx
1328
1329
1330	!--------------------------------------------------------------------------------------------------!
1331	! Description:
1332	! ------------
1333	!> FFT along y, solution of the tridiagonal system and backward FFT for a 1d-decomposition along y.
1334	!>
1335	!> @warning This subroutine may still not work for hybrid parallelization with OpenMP (for possible
1336	!> necessary changes see the original routine poisfft_hybrid, developed by Klaus Ketelsen,
1337	!> May 2002)
1338	!--------------------------------------------------------------------------------------------------!
1339	SUBROUTINE ffty_tri_ffty( ar )
1340
1341
1342	USE control_parameters, &
1343	ONLY: loop_optimization
1344
1345	USE cpulog, &
1346	ONLY: cpu_log, &
1347	log_point_s
1348
1349	USE grid_variables, &
1350	ONLY: ddx2, &
1351	ddy2
1352
1353	USE kinds
1354
1355	USE pegrid
1356
1357	IMPLICIT NONE
1358
1359	INTEGER(iwp) :: i !<
1360	INTEGER(iwp) :: j !<
1361	INTEGER(iwp) :: k !<
1362	INTEGER(iwp) :: m !<
1363	INTEGER(iwp) :: n !<
1364	!$ INTEGER(iwp) :: omp_get_thread_num !<
1365	INTEGER(iwp) :: tn !<
1366
1367	REAL(wp), DIMENSION(0:ny) :: work_ffty !<
1368	REAL(wp), DIMENSION(0:ny,1:nz) :: work_triy !<
1369	REAL(wp), DIMENSION(nny,1:nz,nxl_y:nxr_y,pdims(2)) :: ar !<
1370	REAL(wp), DIMENSION(:,:,:,:), ALLOCATABLE :: tri !<
1371
1372
1373	CALL cpu_log( log_point_s(39), 'fft_y_1d + tridia', 'start' )
1374
1375	ALLOCATE( tri(5,0:ny,0:nz-1,0:threads_per_task-1) )
1376
1377	tn = 0 ! Default thread number in case of one thread
1378	!$OMP PARALLEL DO PRIVATE ( i, j, k, m, n, tn, work_ffty, work_triy )
1379	DO i = nxl_y, nxr_y
1380
1381	!$ tn = omp_get_thread_num()
1382
1383	IF ( loop_optimization == 'vector' ) THEN
1384	!
1385	!-- Code optimized for vector processors
1386	DO k = 1, nz
1387
1388	m = 0
1389	DO n = 1, pdims(2)
1390	DO j = 1, nny
1391	work_triy(m,k) = ar(j,k,i,n)
1392	m = m + 1
1393	ENDDO
1394	ENDDO
1395
1396	ENDDO
1397
1398	CALL fft_y_m( work_triy, ny, 'forward' )
1399
1400	ELSE
1401	!
1402	!-- Cache optimized code
1403	DO k = 1, nz
1404
1405	m = 0
1406	DO n = 1, pdims(2)
1407	DO j = 1, nny
1408	work_ffty(m) = ar(j,k,i,n)
1409	m = m + 1
1410	ENDDO
1411	ENDDO
1412
1413	CALL fft_y_1d( work_ffty, 'forward' )
1414
1415	DO j = 0, ny
1416	work_triy(j,k) = work_ffty(j)
1417	ENDDO
1418
1419	ENDDO
1420
1421	ENDIF
1422
1423	!
1424	!-- Solve the linear equation system
1425	CALL tridia_1dd( ddy2, ddx2, ny, nx, i, work_triy, tri(:,:,:,tn) )
1426
1427	IF ( loop_optimization == 'vector' ) THEN
1428	!
1429	!-- Code optimized for vector processors
1430	CALL fft_y_m( work_triy, ny, 'backward' )
1431
1432	DO k = 1, nz
1433
1434	m = 0
1435	DO n = 1, pdims(2)
1436	DO j = 1, nny
1437	ar(j,k,i,n) = work_triy(m,k)
1438	m = m + 1
1439	ENDDO
1440	ENDDO
1441
1442	ENDDO
1443
1444	ELSE
1445	!
1446	!-- Cache optimized code
1447	DO k = 1, nz
1448
1449	DO j = 0, ny
1450	work_ffty(j) = work_triy(j,k)
1451	ENDDO
1452
1453	CALL fft_y_1d( work_ffty, 'backward' )
1454
1455	m = 0
1456	DO n = 1, pdims(2)
1457	DO j = 1, nny
1458	ar(j,k,i,n) = work_ffty(m)
1459	m = m + 1
1460	ENDDO
1461	ENDDO
1462
1463	ENDDO
1464
1465	ENDIF
1466
1467	ENDDO
1468
1469	DEALLOCATE( tri )
1470
1471	CALL cpu_log( log_point_s(39), 'fft_y_1d + tridia', 'stop' )
1472
1473	END SUBROUTINE ffty_tri_ffty
1474
1475	END MODULE poisfft_mod

Note: See TracBrowser for help on using the repository browser.

Download in other formats:

| Impressum | ©Leibniz Universität Hannover |