Home

Context Navigation

source: palm/trunk/SOURCE/poisfft_mod.f90 @ 4366

Last change on this file since 4366 was 4366, checked in by raasch, 5 years ago
code vectorization for NEC Aurora: vectorized version of Temperton FFT, vectorization of Newtor iteration for calculating the Obukhov length
Property svn:keywords set to `Id`
File size: 43.3 KB

Line
1	!> @file poisfft_mod.f90
2	!------------------------------------------------------------------------------!
3	! This file is part of the PALM model system.
4	!
5	! PALM is free software: you can redistribute it and/or modify it under the
6	! terms of the GNU General Public License as published by the Free Software
7	! Foundation, either version 3 of the License, or (at your option) any later
8	! version.
9	!
10	! PALM is distributed in the hope that it will be useful, but WITHOUT ANY
11	! WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
12	! A PARTICULAR PURPOSE. See the GNU General Public License for more details.
13	!
14	! You should have received a copy of the GNU General Public License along with
15	! PALM. If not, see <http://www.gnu.org/licenses/>.
16	!
17	! Copyright 1997-2020 Leibniz Universitaet Hannover
18	!------------------------------------------------------------------------------!
19	!
20	! Current revisions:
21	! -----------------
22	!
23	!
24	! Former revisions:
25	! -----------------
26	! $Id: poisfft_mod.f90 4366 2020-01-09 08:12:43Z raasch $
27	! modification concerning NEC vectorizatio
28	!
29	! 4360 2020-01-07 11:25:50Z suehring
30	! Corrected "Former revisions" section
31	!
32	! 3690 2019-01-22 22:56:42Z knoop
33	! OpenACC port for SPEC
34	!
35	! Revision 1.1 1997/07/24 11:24:14 raasch
36	! Initial revision
37	!
38	!
39	! Description:
40	! ------------
41	!> Solves the Poisson equation with a 2D spectral method
42	!> d^2 p / dx^2 + d^2 p / dy^2 + d^2 p / dz^2 = s
43	!>
44	!> Input:
45	!> real ar contains (nnz,nny,nnx) elements of the velocity divergence,
46	!> starting from (1,nys,nxl)
47	!>
48	!> Output:
49	!> real ar contains the solution for perturbation pressure p
50	!------------------------------------------------------------------------------!
51	MODULE poisfft_mod
52
53
54	USE fft_xy, &
55	ONLY: fft_init, fft_y, fft_y_1d, fft_y_m, fft_x, fft_x_1d, fft_x_m, &
56	temperton_fft_vec
57
58	USE indices, &
59	ONLY: nnx, nny, nx, nxl, nxr, ny, nys, nyn, nz
60
61	USE transpose_indices, &
62	ONLY: nxl_y, nxl_z, nxr_y, nxr_z, nys_x, nys_z, nyn_x, nyn_z, nzb_x, &
63	nzb_y, nzt_x, nzt_y
64
65	USE tridia_solver, &
66	ONLY: tridia_1dd, tridia_init, tridia_substi, tridia_substi_overlap
67
68	IMPLICIT NONE
69
70	LOGICAL, SAVE :: poisfft_initialized = .FALSE.
71
72	PRIVATE
73
74	PUBLIC poisfft, poisfft_init
75
76	INTERFACE poisfft
77	MODULE PROCEDURE poisfft
78	END INTERFACE poisfft
79
80	INTERFACE poisfft_init
81	MODULE PROCEDURE poisfft_init
82	END INTERFACE poisfft_init
83
84
85	CONTAINS
86
87	!------------------------------------------------------------------------------!
88	! Description:
89	! ------------
90	!> Setup coefficients for FFT and the tridiagonal solver
91	!------------------------------------------------------------------------------!
92	SUBROUTINE poisfft_init
93
94	IMPLICIT NONE
95
96
97	CALL fft_init
98
99	CALL tridia_init
100
101	poisfft_initialized = .TRUE.
102
103	END SUBROUTINE poisfft_init
104
105
106
107	!------------------------------------------------------------------------------!
108	! Description:
109	! ------------
110	!> Two-dimensional Fourier Transformation in x- and y-direction.
111	!------------------------------------------------------------------------------!
112	SUBROUTINE poisfft( ar )
113
114	USE control_parameters, &
115	ONLY: transpose_compute_overlap
116
117	USE cpulog, &
118	ONLY: cpu_log, cpu_log_nowait, log_point_s
119
120	USE kinds
121
122	USE pegrid
123
124	IMPLICIT NONE
125
126	INTEGER(iwp) :: ii !<
127	INTEGER(iwp) :: iind !<
128	INTEGER(iwp) :: inew !<
129	INTEGER(iwp) :: jj !<
130	INTEGER(iwp) :: jind !<
131	INTEGER(iwp) :: jnew !<
132	INTEGER(iwp) :: ki !<
133	INTEGER(iwp) :: kk !<
134	INTEGER(iwp) :: knew !<
135	INTEGER(iwp) :: n !<
136	INTEGER(iwp) :: nblk !<
137	INTEGER(iwp) :: nnx_y !<
138	INTEGER(iwp) :: nny_z !<
139	INTEGER(iwp) :: nnz_x !<
140	INTEGER(iwp) :: nxl_y_bound !<
141	INTEGER(iwp) :: nxr_y_bound !<
142
143	INTEGER(iwp), DIMENSION(4) :: isave !<
144
145	REAL(wp), DIMENSION(1:nz,nys:nyn,nxl:nxr) :: ar !<
146	REAL(wp), DIMENSION(nys:nyn,nxl:nxr,1:nz) :: ar_inv !<
147
148	#define __acc_fft_device ( defined( _OPENACC ) && ( defined ( __cuda_fft ) ) )
149	#if __acc_fft_device
150	!$ACC DECLARE CREATE(ar_inv)
151	#endif
152
153	REAL(wp), DIMENSION(:,:,:), ALLOCATABLE :: ar1 !<
154	REAL(wp), DIMENSION(:,:,:), ALLOCATABLE :: f_in !<
155	REAL(wp), DIMENSION(:,:,:), ALLOCATABLE :: f_inv !<
156	REAL(wp), DIMENSION(:,:,:), ALLOCATABLE :: f_out_y !<
157	REAL(wp), DIMENSION(:,:,:), ALLOCATABLE :: f_out_z !<
158
159
160	CALL cpu_log( log_point_s(3), 'poisfft', 'start' )
161
162	IF ( .NOT. poisfft_initialized ) CALL poisfft_init
163
164	#if !__acc_fft_device
165	!$ACC UPDATE HOST(ar)
166	#endif
167
168	#ifndef _OPENACC
169	!
170	!-- Two-dimensional Fourier Transformation in x- and y-direction.
171	IF ( pdims(2) == 1 .AND. pdims(1) > 1 ) THEN
172
173	!
174	!-- 1d-domain-decomposition along x:
175	!-- FFT along y and transposition y --> x
176	CALL ffty_tr_yx( ar, ar )
177
178	!
179	!-- FFT along x, solving the tridiagonal system and backward FFT
180	CALL fftx_tri_fftx( ar )
181
182	!
183	!-- Transposition x --> y and backward FFT along y
184	CALL tr_xy_ffty( ar, ar )
185
186	ELSEIF ( pdims(1) == 1 .AND. pdims(2) > 1 ) THEN
187
188	!
189	!-- 1d-domain-decomposition along y:
190	!-- FFT along x and transposition x --> y
191	CALL fftx_tr_xy( ar, ar )
192
193	!
194	!-- FFT along y, solving the tridiagonal system and backward FFT
195	CALL ffty_tri_ffty( ar )
196
197	!
198	!-- Transposition y --> x and backward FFT along x
199	CALL tr_yx_fftx( ar, ar )
200
201	ELSEIF ( .NOT. transpose_compute_overlap ) THEN
202	#endif
203
204	!
205	!-- 2d-domain-decomposition or no decomposition (1 PE run)
206	!-- Transposition z --> x
207	CALL cpu_log( log_point_s(5), 'transpo forward', 'start' )
208	CALL resort_for_zx( ar, ar_inv )
209	CALL transpose_zx( ar_inv, ar )
210	CALL cpu_log( log_point_s(5), 'transpo forward', 'pause' )
211
212	CALL cpu_log( log_point_s(4), 'fft_x', 'start' )
213	IF ( temperton_fft_vec ) THEN
214	!
215	!-- Vector version outputs a transformed array ar_inv that does not require resorting
216	!-- (which is done for ar further below)
217	CALL fft_x( ar, 'forward', ar_inv=ar_inv)
218	ELSE
219	CALL fft_x( ar, 'forward')
220	ENDIF
221	CALL cpu_log( log_point_s(4), 'fft_x', 'pause' )
222
223	!
224	!-- Transposition x --> y
225	CALL cpu_log( log_point_s(5), 'transpo forward', 'continue' )
226	IF( .NOT. temperton_fft_vec ) CALL resort_for_xy( ar, ar_inv )
227	CALL transpose_xy( ar_inv, ar )
228	CALL cpu_log( log_point_s(5), 'transpo forward', 'pause' )
229
230	CALL cpu_log( log_point_s(7), 'fft_y', 'start' )
231	IF ( temperton_fft_vec ) THEN
232	!
233	!-- Input array ar_inv from fft_x can be directly used here.
234	!-- The output (also in array ar_inv) does not require resorting below.
235	CALL fft_y( ar, 'forward', ar_inv = ar_inv, nxl_y_bound = nxl_y, nxr_y_bound = nxr_y, &
236	nxl_y_l = nxl_y, nxr_y_l = nxr_y )
237	ELSE
238	CALL fft_y( ar, 'forward', ar_tr = ar, nxl_y_bound = nxl_y, nxr_y_bound = nxr_y, &
239	nxl_y_l = nxl_y, nxr_y_l = nxr_y )
240	ENDIF
241	CALL cpu_log( log_point_s(7), 'fft_y', 'pause' )
242
243	!
244	!-- Transposition y --> z
245	CALL cpu_log( log_point_s(5), 'transpo forward', 'continue' )
246	IF ( .NOT. temperton_fft_vec ) CALL resort_for_yz( ar, ar_inv )
247	CALL transpose_yz( ar_inv, ar )
248	CALL cpu_log( log_point_s(5), 'transpo forward', 'stop' )
249
250	!
251	!-- Solve the tridiagonal equation system along z
252	CALL cpu_log( log_point_s(6), 'tridia', 'start' )
253	CALL tridia_substi( ar )
254	CALL cpu_log( log_point_s(6), 'tridia', 'stop' )
255
256	!
257	!-- Inverse Fourier Transformation
258	!-- Transposition z --> y
259	CALL cpu_log( log_point_s(8), 'transpo invers', 'start' )
260	CALL transpose_zy( ar, ar_inv )
261	!
262	!-- The fft_y below (vector branch) can directly process ar_inv (i.e. does not require a
263	!-- resorting)
264	IF ( .NOT. temperton_fft_vec ) CALL resort_for_zy( ar_inv, ar )
265	CALL cpu_log( log_point_s(8), 'transpo invers', 'pause' )
266
267	CALL cpu_log( log_point_s(7), 'fft_y', 'continue' )
268	IF ( temperton_fft_vec ) THEN
269	!
270	!-- Output array ar_inv can be used as input to the below fft_x routine without resorting
271	CALL fft_y( ar, 'backward', ar_inv = ar_inv, nxl_y_bound = nxl_y, nxr_y_bound = nxr_y,&
272	nxl_y_l = nxl_y, nxr_y_l = nxr_y )
273	ELSE
274	CALL fft_y( ar, 'backward', ar_tr = ar, nxl_y_bound = nxl_y, nxr_y_bound = nxr_y, &
275	nxl_y_l = nxl_y, nxr_y_l = nxr_y )
276	ENDIF
277
278	CALL cpu_log( log_point_s(7), 'fft_y', 'stop' )
279
280	!
281	!-- Transposition y --> x
282	CALL cpu_log( log_point_s(8), 'transpo invers', 'continue' )
283	CALL transpose_yx( ar, ar_inv )
284	IF ( .NOT. temperton_fft_vec ) CALL resort_for_yx( ar_inv, ar )
285	CALL cpu_log( log_point_s(8), 'transpo invers', 'pause' )
286
287	CALL cpu_log( log_point_s(4), 'fft_x', 'continue' )
288	IF ( temperton_fft_vec ) THEN
289	CALL fft_x( ar, 'backward', ar_inv=ar_inv )
290	ELSE
291	CALL fft_x( ar, 'backward' )
292	ENDIF
293	CALL cpu_log( log_point_s(4), 'fft_x', 'stop' )
294
295	!
296	!-- Transposition x --> z
297	CALL cpu_log( log_point_s(8), 'transpo invers', 'continue' )
298	CALL transpose_xz( ar, ar_inv )
299	CALL resort_for_xz( ar_inv, ar )
300	CALL cpu_log( log_point_s(8), 'transpo invers', 'stop' )
301
302	#ifndef _OPENACC
303	ELSE
304
305	!
306	!-- 2d-domain-decomposition or no decomposition (1 PE run) with
307	!-- overlapping transposition / fft
308	!-- cputime logging must not use barriers, which would prevent overlapping
309	ALLOCATE( f_out_y(0:ny,nxl_y:nxr_y,nzb_y:nzt_y), &
310	f_out_z(0:nx,nys_x:nyn_x,nzb_x:nzt_x) )
311	!
312	!-- Transposition z --> x + subsequent fft along x
313	ALLOCATE( f_inv(nys:nyn,nxl:nxr,1:nz) )
314	CALL resort_for_zx( ar, f_inv )
315	!
316	!-- Save original indices and gridpoint counter
317	isave(1) = nz
318	isave(2) = nzb_x
319	isave(3) = nzt_x
320	isave(4) = sendrecvcount_zx
321	!
322	!-- Set new indices for transformation
323	nblk = nz / pdims(1)
324	nz = pdims(1)
325	nnz_x = 1
326	nzb_x = 1 + myidx * nnz_x
327	nzt_x = ( myidx + 1 ) * nnz_x
328	sendrecvcount_zx = nnx * nny * nnz_x
329
330	ALLOCATE( ar1(0:nx,nys_x:nyn_x,nzb_x:nzt_x) )
331	ALLOCATE( f_in(nys:nyn,nxl:nxr,1:nz) )
332
333	DO kk = 1, nblk
334
335	IF ( kk == 1 ) THEN
336	CALL cpu_log( log_point_s(5), 'transpo forward', 'start', cpu_log_nowait )
337	ELSE
338	CALL cpu_log( log_point_s(5), 'transpo forward', 'continue', cpu_log_nowait )
339	ENDIF
340
341	DO knew = 1, nz
342	ki = kk + nblk * ( knew - 1 )
343	f_in(:,:,knew) = f_inv(:,:,ki)
344	ENDDO
345
346	CALL transpose_zx( f_in, ar1(:,:,:))
347	CALL cpu_log( log_point_s(5), 'transpo forward', 'pause' )
348
349	IF ( kk == 1 ) THEN
350	CALL cpu_log( log_point_s(4), 'fft_x', 'start', cpu_log_nowait )
351	ELSE
352	CALL cpu_log( log_point_s(4), 'fft_x', 'continue', cpu_log_nowait )
353	ENDIF
354
355	n = isave(2) + kk - 1
356	CALL fft_x( ar1(:,:,:), 'forward', ar_2d = f_out_z(:,:,n))
357	CALL cpu_log( log_point_s(4), 'fft_x', 'pause' )
358
359	ENDDO
360	!
361	!-- Restore original indices/counters
362	nz = isave(1)
363	nzb_x = isave(2)
364	nzt_x = isave(3)
365	sendrecvcount_zx = isave(4)
366
367	DEALLOCATE( ar1, f_in, f_inv )
368
369	!
370	!-- Transposition x --> y + subsequent fft along y
371	ALLOCATE( f_inv(nys_x:nyn_x,nzb_x:nzt_x,0:nx) )
372	CALL resort_for_xy( f_out_z, f_inv )
373	!
374	!-- Save original indices and gridpoint counter
375	isave(1) = nx
376	isave(2) = nxl_y
377	isave(3) = nxr_y
378	isave(4) = sendrecvcount_xy
379	!
380	!-- Set new indices for transformation
381	nblk = ( ( nx+1 ) / pdims(2) ) - 1
382	nx = pdims(2)
383	nnx_y = 1
384	nxl_y = myidy * nnx_y
385	nxr_y = ( myidy + 1 ) * nnx_y - 1
386	sendrecvcount_xy = nnx_y * ( nyn_x-nys_x+1 ) * ( nzt_x-nzb_x+1 )
387
388	ALLOCATE( ar1(0:ny,nxl_y:nxr_y,nzb_y:nzt_y) )
389	ALLOCATE( f_in(nys_x:nyn_x,nzb_x:nzt_x,0:nx) )
390
391	DO ii = 0, nblk
392
393	CALL cpu_log( log_point_s(5), 'transpo forward', 'continue', cpu_log_nowait )
394
395	DO inew = 0, nx-1
396	iind = ii + ( nblk + 1 ) * inew
397	f_in(:,:,inew) = f_inv(:,:,iind)
398	ENDDO
399
400	CALL transpose_xy( f_in, ar1(:,:,:) )
401
402	CALL cpu_log( log_point_s(5), 'transpo forward', 'pause' )
403
404	IF ( ii == 1 ) THEN
405	CALL cpu_log( log_point_s(7), 'fft_y', 'start', cpu_log_nowait )
406	ELSE
407	CALL cpu_log( log_point_s(7), 'fft_y', 'continue', cpu_log_nowait )
408	ENDIF
409
410	nxl_y_bound = isave(2)
411	nxr_y_bound = isave(3)
412	n = isave(2) + ii
413	CALL fft_y( ar1(:,:,:), 'forward', ar_tr = f_out_y, &
414	nxl_y_bound = nxl_y_bound, nxr_y_bound = nxr_y_bound, &
415	nxl_y_l = n, nxr_y_l = n )
416
417	CALL cpu_log( log_point_s(7), 'fft_y', 'pause' )
418
419	ENDDO
420	!
421	!-- Restore original indices/counters
422	nx = isave(1)
423	nxl_y = isave(2)
424	nxr_y = isave(3)
425	sendrecvcount_xy = isave(4)
426
427	DEALLOCATE( ar1, f_in, f_inv )
428
429	!
430	!-- Transposition y --> z + subsequent tridia + resort for z --> y
431	ALLOCATE( f_inv(nxl_y:nxr_y,nzb_y:nzt_y,0:ny) )
432	CALL resort_for_yz( f_out_y, f_inv )
433	!
434	!-- Save original indices and gridpoint counter
435	isave(1) = ny
436	isave(2) = nys_z
437	isave(3) = nyn_z
438	isave(4) = sendrecvcount_yz
439	!
440	!-- Set new indices for transformation
441	nblk = ( ( ny+1 ) / pdims(1) ) - 1
442	ny = pdims(1)
443	nny_z = 1
444	nys_z = myidx * nny_z
445	nyn_z = ( myidx + 1 ) * nny_z - 1
446	sendrecvcount_yz = ( nxr_y-nxl_y+1 ) * nny_z * ( nzt_y-nzb_y+1 )
447
448	ALLOCATE( ar1(nxl_z:nxr_z,nys_z:nyn_z,1:nz) )
449	ALLOCATE( f_in(nxl_y:nxr_y,nzb_y:nzt_y,0:ny) )
450
451	DO jj = 0, nblk
452	!
453	!-- Forward Fourier Transformation
454	!-- Transposition y --> z
455	CALL cpu_log( log_point_s(5), 'transpo forward', 'continue', cpu_log_nowait )
456
457	DO jnew = 0, ny-1
458	jind = jj + ( nblk + 1 ) * jnew
459	f_in(:,:,jnew) = f_inv(:,:,jind)
460	ENDDO
461
462	CALL transpose_yz( f_in, ar1(:,:,:) )
463
464	IF ( jj == nblk ) THEN
465	CALL cpu_log( log_point_s(5), 'transpo forward', 'stop' )
466	ELSE
467	CALL cpu_log( log_point_s(5), 'transpo forward', 'pause' )
468	ENDIF
469
470	!
471	!-- Solve the tridiagonal equation system along z
472	CALL cpu_log( log_point_s(6), 'tridia', 'start', cpu_log_nowait )
473
474	n = isave(2) + jj
475	CALL tridia_substi_overlap( ar1(:,:,:), n )
476
477	CALL cpu_log( log_point_s(6), 'tridia', 'stop' )
478
479	!
480	!-- Inverse Fourier Transformation
481	!-- Transposition z --> y
482	!-- Only one thread should call MPI routines, therefore forward and
483	!-- backward tranpose are in the same section
484	IF ( jj == 0 ) THEN
485	CALL cpu_log( log_point_s(8), 'transpo invers', 'start', cpu_log_nowait )
486	ELSE
487	CALL cpu_log( log_point_s(8), 'transpo invers', 'continue', cpu_log_nowait )
488	ENDIF
489
490	CALL transpose_zy( ar1(:,:,:), f_in )
491
492	DO jnew = 0, ny-1
493	jind = jj + ( nblk + 1 ) * jnew
494	f_inv(:,:,jind) = f_in(:,:,jnew)
495	ENDDO
496
497	CALL cpu_log( log_point_s(8), 'transpo invers', 'pause' )
498
499	ENDDO
500	!
501	!-- Restore original indices/counters
502	ny = isave(1)
503	nys_z = isave(2)
504	nyn_z = isave(3)
505	sendrecvcount_yz = isave(4)
506
507	CALL resort_for_zy( f_inv, f_out_y )
508
509	DEALLOCATE( ar1, f_in, f_inv )
510
511	!
512	!-- fft along y backward + subsequent transposition y --> x
513	ALLOCATE( f_inv(nys_x:nyn_x,nzb_x:nzt_x,0:nx) )
514	!
515	!-- Save original indices and gridpoint counter
516	isave(1) = nx
517	isave(2) = nxl_y
518	isave(3) = nxr_y
519	isave(4) = sendrecvcount_xy
520	!
521	!-- Set new indices for transformation
522	nblk = (( nx+1 ) / pdims(2) ) - 1
523	nx = pdims(2)
524	nnx_y = 1
525	nxl_y = myidy * nnx_y
526	nxr_y = ( myidy + 1 ) * nnx_y - 1
527	sendrecvcount_xy = nnx_y * ( nyn_x-nys_x+1 ) * ( nzt_x-nzb_x+1 )
528
529	ALLOCATE( ar1(0:ny,nxl_y:nxr_y,nzb_y:nzt_y) )
530	ALLOCATE( f_in(nys_x:nyn_x,nzb_x:nzt_x,0:nx) )
531
532	DO ii = 0, nblk
533
534	CALL cpu_log( log_point_s(7), 'fft_y', 'continue', cpu_log_nowait )
535
536	n = isave(2) + ii
537	nxl_y_bound = isave(2)
538	nxr_y_bound = isave(3)
539
540	CALL fft_y( ar1(:,:,:), 'backward', ar_tr = f_out_y, &
541	nxl_y_bound = nxl_y_bound, nxr_y_bound = nxr_y_bound, &
542	nxl_y_l = n, nxr_y_l = n )
543
544	IF ( ii == nblk ) THEN
545	CALL cpu_log( log_point_s(7), 'fft_y', 'stop' )
546	ELSE
547	CALL cpu_log( log_point_s(7), 'fft_y', 'pause' )
548	ENDIF
549
550	CALL cpu_log( log_point_s(8), 'transpo invers', 'continue', cpu_log_nowait )
551
552	CALL transpose_yx( ar1(:,:,:), f_in )
553
554	DO inew = 0, nx-1
555	iind = ii + (nblk+1) * inew
556	f_inv(:,:,iind) = f_in(:,:,inew)
557	ENDDO
558
559	CALL cpu_log( log_point_s(8), 'transpo invers', 'pause' )
560
561	ENDDO
562	!
563	!-- Restore original indices/counters
564	nx = isave(1)
565	nxl_y = isave(2)
566	nxr_y = isave(3)
567	sendrecvcount_xy = isave(4)
568
569	CALL resort_for_yx( f_inv, f_out_z )
570
571	DEALLOCATE( ar1, f_in, f_inv )
572
573	!
574	!-- fft along x backward + subsequent final transposition x --> z
575	ALLOCATE( f_inv(nys:nyn,nxl:nxr,1:nz) )
576	!
577	!-- Save original indices and gridpoint counter
578	isave(1) = nz
579	isave(2) = nzb_x
580	isave(3) = nzt_x
581	isave(4) = sendrecvcount_zx
582	!
583	!-- Set new indices for transformation
584	nblk = nz / pdims(1)
585	nz = pdims(1)
586	nnz_x = 1
587	nzb_x = 1 + myidx * nnz_x
588	nzt_x = ( myidx + 1 ) * nnz_x
589	sendrecvcount_zx = nnx * nny * nnz_x
590
591	ALLOCATE( ar1(0:nx,nys_x:nyn_x,nzb_x:nzt_x) )
592	ALLOCATE( f_in(nys:nyn,nxl:nxr,1:nz) )
593
594	DO kk = 1, nblk
595
596	CALL cpu_log( log_point_s(4), 'fft_x', 'continue', cpu_log_nowait )
597
598	n = isave(2) + kk - 1
599	CALL fft_x( ar1(:,:,:), 'backward', f_out_z(:,:,n))
600
601	IF ( kk == nblk ) THEN
602	CALL cpu_log( log_point_s(4), 'fft_x', 'stop' )
603	ELSE
604	CALL cpu_log( log_point_s(4), 'fft_x', 'pause' )
605	ENDIF
606
607	CALL cpu_log( log_point_s(8), 'transpo invers', 'continue', cpu_log_nowait )
608
609	CALL transpose_xz( ar1(:,:,:), f_in )
610
611	DO knew = 1, nz
612	ki = kk + nblk * (knew-1)
613	f_inv(:,:,ki) = f_in(:,:,knew)
614	ENDDO
615
616	IF ( kk == nblk ) THEN
617	CALL cpu_log( log_point_s(8), 'transpo invers', 'stop' )
618	ELSE
619	CALL cpu_log( log_point_s(8), 'transpo invers', 'pause' )
620	ENDIF
621
622	ENDDO
623	!
624	!-- Restore original indices/counters
625	nz = isave(1)
626	nzb_x = isave(2)
627	nzt_x = isave(3)
628	sendrecvcount_zx = isave(4)
629
630	CALL resort_for_xz( f_inv, ar )
631
632	DEALLOCATE( ar1, f_in, f_inv )
633
634	ENDIF
635	#endif
636
637	#if !__acc_fft_device
638	!$ACC UPDATE DEVICE(ar)
639	#endif
640
641	CALL cpu_log( log_point_s(3), 'poisfft', 'stop' )
642
643	END SUBROUTINE poisfft
644
645
646	!------------------------------------------------------------------------------!
647	! Description:
648	! ------------
649	!> Fourier-transformation along y with subsequent transposition y --> x for
650	!> a 1d-decomposition along x.
651	!>
652	!> @attention The performance of this routine is much faster on the NEC-SX6,
653	!> if the first index of work_ffty_vec is odd. Otherwise
654	!> memory bank conflicts may occur (especially if the index is a
655	!> multiple of 128). That's why work_ffty_vec is dimensioned as
656	!> 0:ny+1.
657	!> Of course, this will not work if users are using an odd number
658	!> of gridpoints along y.
659	!------------------------------------------------------------------------------!
660	SUBROUTINE ffty_tr_yx( f_in, f_out )
661
662	USE control_parameters, &
663	ONLY: loop_optimization
664
665	USE cpulog, &
666	ONLY: cpu_log, log_point_s
667
668	USE kinds
669
670	USE pegrid
671
672	IMPLICIT NONE
673
674	INTEGER(iwp) :: i !<
675	INTEGER(iwp) :: iend !<
676	INTEGER(iwp) :: iouter !<
677	INTEGER(iwp) :: ir !<
678	INTEGER(iwp) :: j !<
679	INTEGER(iwp) :: k !<
680
681	INTEGER(iwp), PARAMETER :: stridex = 4 !<
682
683	REAL(wp), DIMENSION(1:nz,0:ny,nxl:nxr) :: f_in !<
684	REAL(wp), DIMENSION(nnx,1:nz,nys_x:nyn_x,pdims(1)) :: f_out !<
685	REAL(wp), DIMENSION(nxl:nxr,1:nz,0:ny) :: work !<
686
687	REAL(wp), DIMENSION(:,:), ALLOCATABLE :: work_ffty !<
688	REAL(wp), DIMENSION(:,:,:), ALLOCATABLE :: work_ffty_vec !<
689
690	!
691	!-- Carry out the FFT along y, where all data are present due to the
692	!-- 1d-decomposition along x. Resort the data in a way that x becomes
693	!-- the first index.
694	CALL cpu_log( log_point_s(7), 'fft_y_1d', 'start' )
695
696	IF ( loop_optimization == 'vector' ) THEN
697
698	ALLOCATE( work_ffty_vec(0:ny+1,1:nz,nxl:nxr) )
699	!
700	!-- Code optimized for vector processors
701	!$OMP PARALLEL PRIVATE ( i, j, k )
702	!$OMP DO
703	DO i = nxl, nxr
704
705	DO j = 0, ny
706	DO k = 1, nz
707	work_ffty_vec(j,k,i) = f_in(k,j,i)
708	ENDDO
709	ENDDO
710
711	CALL fft_y_m( work_ffty_vec(:,:,i), ny+1, 'forward' )
712
713	ENDDO
714
715	!$OMP DO
716	DO k = 1, nz
717	DO j = 0, ny
718	DO i = nxl, nxr
719	work(i,k,j) = work_ffty_vec(j,k,i)
720	ENDDO
721	ENDDO
722	ENDDO
723	!$OMP END PARALLEL
724
725	DEALLOCATE( work_ffty_vec )
726
727	ELSE
728	!
729	!-- Cache optimized code.
730	ALLOCATE( work_ffty(0:ny,stridex) )
731	!
732	!-- The i-(x-)direction is split into a strided outer loop and an inner
733	!-- loop for better cache performance
734	!$OMP PARALLEL PRIVATE (i,iend,iouter,ir,j,k,work_ffty)
735	!$OMP DO
736	DO iouter = nxl, nxr, stridex
737
738	iend = MIN( iouter+stridex-1, nxr ) ! Upper bound for inner i loop
739
740	DO k = 1, nz
741
742	DO i = iouter, iend
743
744	ir = i-iouter+1 ! counter within a stride
745	DO j = 0, ny
746	work_ffty(j,ir) = f_in(k,j,i)
747	ENDDO
748	!
749	!-- FFT along y
750	CALL fft_y_1d( work_ffty(:,ir), 'forward' )
751
752	ENDDO
753
754	!
755	!-- Resort
756	DO j = 0, ny
757	DO i = iouter, iend
758	work(i,k,j) = work_ffty(j,i-iouter+1)
759	ENDDO
760	ENDDO
761
762	ENDDO
763
764	ENDDO
765	!$OMP END PARALLEL
766
767	DEALLOCATE( work_ffty )
768
769	ENDIF
770
771	CALL cpu_log( log_point_s(7), 'fft_y_1d', 'pause' )
772
773	!
774	!-- Transpose array
775	#if defined( __parallel )
776	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
777	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
778	CALL MPI_ALLTOALL( work(nxl,1,0), sendrecvcount_xy, MPI_REAL, &
779	f_out(1,1,nys_x,1), sendrecvcount_xy, MPI_REAL, &
780	comm1dx, ierr )
781	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
782	#endif
783
784	END SUBROUTINE ffty_tr_yx
785
786
787	!------------------------------------------------------------------------------!
788	! Description:
789	! ------------
790	!> Transposition x --> y with a subsequent backward Fourier transformation for
791	!> a 1d-decomposition along x
792	!------------------------------------------------------------------------------!
793	SUBROUTINE tr_xy_ffty( f_in, f_out )
794
795	USE control_parameters, &
796	ONLY: loop_optimization
797
798	USE cpulog, &
799	ONLY: cpu_log, log_point_s
800
801	USE kinds
802
803	USE pegrid
804
805	IMPLICIT NONE
806
807	INTEGER(iwp) :: i !<
808	INTEGER(iwp) :: iend !<
809	INTEGER(iwp) :: iouter !<
810	INTEGER(iwp) :: ir !<
811	INTEGER(iwp) :: j !<
812	INTEGER(iwp) :: k !<
813
814	INTEGER(iwp), PARAMETER :: stridex = 4 !<
815
816	REAL(wp), DIMENSION(nnx,1:nz,nys_x:nyn_x,pdims(1)) :: f_in !<
817	REAL(wp), DIMENSION(1:nz,0:ny,nxl:nxr) :: f_out !<
818	REAL(wp), DIMENSION(nxl:nxr,1:nz,0:ny) :: work !<
819
820	REAL(wp), DIMENSION(:,:), ALLOCATABLE :: work_ffty !<
821	REAL(wp), DIMENSION(:,:,:), ALLOCATABLE :: work_ffty_vec !<
822
823	!
824	!-- Transpose array
825	#if defined( __parallel )
826	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
827	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
828	CALL MPI_ALLTOALL( f_in(1,1,nys_x,1), sendrecvcount_xy, MPI_REAL, &
829	work(nxl,1,0), sendrecvcount_xy, MPI_REAL, &
830	comm1dx, ierr )
831	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
832	#endif
833
834	!
835	!-- Resort the data in a way that y becomes the first index and carry out the
836	!-- backward fft along y.
837	CALL cpu_log( log_point_s(7), 'fft_y_1d', 'continue' )
838
839	IF ( loop_optimization == 'vector' ) THEN
840
841	ALLOCATE( work_ffty_vec(0:ny+1,1:nz,nxl:nxr) )
842	!
843	!-- Code optimized for vector processors
844	!$OMP PARALLEL PRIVATE ( i, j, k )
845	!$OMP DO
846	DO k = 1, nz
847	DO j = 0, ny
848	DO i = nxl, nxr
849	work_ffty_vec(j,k,i) = work(i,k,j)
850	ENDDO
851	ENDDO
852	ENDDO
853
854	!$OMP DO
855	DO i = nxl, nxr
856
857	CALL fft_y_m( work_ffty_vec(:,:,i), ny+1, 'backward' )
858
859	DO j = 0, ny
860	DO k = 1, nz
861	f_out(k,j,i) = work_ffty_vec(j,k,i)
862	ENDDO
863	ENDDO
864
865	ENDDO
866	!$OMP END PARALLEL
867
868	DEALLOCATE( work_ffty_vec )
869
870	ELSE
871	!
872	!-- Cache optimized code.
873	ALLOCATE( work_ffty(0:ny,stridex) )
874	!
875	!-- The i-(x-)direction is split into a strided outer loop and an inner
876	!-- loop for better cache performance
877	!$OMP PARALLEL PRIVATE ( i, iend, iouter, ir, j, k, work_ffty )
878	!$OMP DO
879	DO iouter = nxl, nxr, stridex
880
881	iend = MIN( iouter+stridex-1, nxr ) ! Upper bound for inner i loop
882
883	DO k = 1, nz
884	!
885	!-- Resort
886	DO j = 0, ny
887	DO i = iouter, iend
888	work_ffty(j,i-iouter+1) = work(i,k,j)
889	ENDDO
890	ENDDO
891
892	DO i = iouter, iend
893
894	!
895	!-- FFT along y
896	ir = i-iouter+1 ! counter within a stride
897	CALL fft_y_1d( work_ffty(:,ir), 'backward' )
898
899	DO j = 0, ny
900	f_out(k,j,i) = work_ffty(j,ir)
901	ENDDO
902	ENDDO
903
904	ENDDO
905
906	ENDDO
907	!$OMP END PARALLEL
908
909	DEALLOCATE( work_ffty )
910
911	ENDIF
912
913	CALL cpu_log( log_point_s(7), 'fft_y_1d', 'stop' )
914
915	END SUBROUTINE tr_xy_ffty
916
917
918	!------------------------------------------------------------------------------!
919	! Description:
920	! ------------
921	!> FFT along x, solution of the tridiagonal system and backward FFT for
922	!> a 1d-decomposition along x
923	!>
924	!> @warning this subroutine may still not work for hybrid parallelization
925	!> with OpenMP (for possible necessary changes see the original
926	!> routine poisfft_hybrid, developed by Klaus Ketelsen, May 2002)
927	!------------------------------------------------------------------------------!
928	SUBROUTINE fftx_tri_fftx( ar )
929
930	USE control_parameters, &
931	ONLY: loop_optimization
932
933	USE cpulog, &
934	ONLY: cpu_log, log_point_s
935
936	USE grid_variables, &
937	ONLY: ddx2, ddy2
938
939	USE kinds
940
941	USE pegrid
942
943	IMPLICIT NONE
944
945	INTEGER(iwp) :: i !<
946	INTEGER(iwp) :: j !<
947	INTEGER(iwp) :: k !<
948	INTEGER(iwp) :: m !<
949	INTEGER(iwp) :: n !<
950	!$ INTEGER(iwp) :: omp_get_thread_num !<
951	INTEGER(iwp) :: tn !<
952
953	REAL(wp), DIMENSION(0:nx) :: work_fftx !<
954	REAL(wp), DIMENSION(0:nx,1:nz) :: work_trix !<
955	REAL(wp), DIMENSION(nnx,1:nz,nys_x:nyn_x,pdims(1)) :: ar !<
956	REAL(wp), DIMENSION(:,:,:,:), ALLOCATABLE :: tri !<
957
958
959	CALL cpu_log( log_point_s(33), 'fft_x_1d + tridia', 'start' )
960
961	ALLOCATE( tri(5,0:nx,0:nz-1,0:threads_per_task-1) )
962
963	tn = 0 ! Default thread number in case of one thread
964	!$OMP PARALLEL DO PRIVATE ( i, j, k, m, n, tn, work_fftx, work_trix )
965	DO j = nys_x, nyn_x
966
967	!$ tn = omp_get_thread_num()
968
969	IF ( loop_optimization == 'vector' ) THEN
970	!
971	!-- Code optimized for vector processors
972	DO k = 1, nz
973
974	m = 0
975	DO n = 1, pdims(1)
976	DO i = 1, nnx
977	work_trix(m,k) = ar(i,k,j,n)
978	m = m + 1
979	ENDDO
980	ENDDO
981
982	ENDDO
983
984	CALL fft_x_m( work_trix, 'forward' )
985
986	ELSE
987	!
988	!-- Cache optimized code
989	DO k = 1, nz
990
991	m = 0
992	DO n = 1, pdims(1)
993	DO i = 1, nnx
994	work_fftx(m) = ar(i,k,j,n)
995	m = m + 1
996	ENDDO
997	ENDDO
998
999	CALL fft_x_1d( work_fftx, 'forward' )
1000
1001	DO i = 0, nx
1002	work_trix(i,k) = work_fftx(i)
1003	ENDDO
1004
1005	ENDDO
1006
1007	ENDIF
1008
1009	!
1010	!-- Solve the linear equation system
1011	CALL tridia_1dd( ddx2, ddy2, nx, ny, j, work_trix, tri(:,:,:,tn) )
1012
1013	IF ( loop_optimization == 'vector' ) THEN
1014	!
1015	!-- Code optimized for vector processors
1016	CALL fft_x_m( work_trix, 'backward' )
1017
1018	DO k = 1, nz
1019
1020	m = 0
1021	DO n = 1, pdims(1)
1022	DO i = 1, nnx
1023	ar(i,k,j,n) = work_trix(m,k)
1024	m = m + 1
1025	ENDDO
1026	ENDDO
1027
1028	ENDDO
1029
1030	ELSE
1031	!
1032	!-- Cache optimized code
1033	DO k = 1, nz
1034
1035	DO i = 0, nx
1036	work_fftx(i) = work_trix(i,k)
1037	ENDDO
1038
1039	CALL fft_x_1d( work_fftx, 'backward' )
1040
1041	m = 0
1042	DO n = 1, pdims(1)
1043	DO i = 1, nnx
1044	ar(i,k,j,n) = work_fftx(m)
1045	m = m + 1
1046	ENDDO
1047	ENDDO
1048
1049	ENDDO
1050
1051	ENDIF
1052
1053	ENDDO
1054
1055	DEALLOCATE( tri )
1056
1057	CALL cpu_log( log_point_s(33), 'fft_x_1d + tridia', 'stop' )
1058
1059	END SUBROUTINE fftx_tri_fftx
1060
1061
1062	!------------------------------------------------------------------------------!
1063	! Description:
1064	! ------------
1065	!> Fourier-transformation along x with subsequent transposition x --> y for
1066	!> a 1d-decomposition along y.
1067	!>
1068	!> @attention NEC-branch of this routine may significantly profit from
1069	!> further optimizations. So far, performance is much worse than
1070	!> for routine ffty_tr_yx (more than three times slower).
1071	!------------------------------------------------------------------------------!
1072	SUBROUTINE fftx_tr_xy( f_in, f_out )
1073
1074
1075	USE control_parameters, &
1076	ONLY: loop_optimization
1077
1078	USE cpulog, &
1079	ONLY: cpu_log, log_point_s
1080
1081	USE kinds
1082
1083	USE pegrid
1084
1085	IMPLICIT NONE
1086
1087	INTEGER(iwp) :: i !<
1088	INTEGER(iwp) :: j !<
1089	INTEGER(iwp) :: k !<
1090
1091	REAL(wp), DIMENSION(0:nx,1:nz,nys:nyn) :: work_fftx !<
1092	REAL(wp), DIMENSION(1:nz,nys:nyn,0:nx) :: f_in !<
1093	REAL(wp), DIMENSION(nny,1:nz,nxl_y:nxr_y,pdims(2)) :: f_out !<
1094	REAL(wp), DIMENSION(nys:nyn,1:nz,0:nx) :: work !<
1095
1096	!
1097	!-- Carry out the FFT along x, where all data are present due to the
1098	!-- 1d-decomposition along y. Resort the data in a way that y becomes
1099	!-- the first index.
1100	CALL cpu_log( log_point_s(4), 'fft_x_1d', 'start' )
1101
1102	IF ( loop_optimization == 'vector' ) THEN
1103	!
1104	!-- Code for vector processors
1105	!$OMP PARALLEL PRIVATE ( i, j, k )
1106	!$OMP DO
1107	DO i = 0, nx
1108
1109	DO j = nys, nyn
1110	DO k = 1, nz
1111	work_fftx(i,k,j) = f_in(k,j,i)
1112	ENDDO
1113	ENDDO
1114
1115	ENDDO
1116
1117	!$OMP DO
1118	DO j = nys, nyn
1119
1120	CALL fft_x_m( work_fftx(:,:,j), 'forward' )
1121
1122	DO k = 1, nz
1123	DO i = 0, nx
1124	work(j,k,i) = work_fftx(i,k,j)
1125	ENDDO
1126	ENDDO
1127
1128	ENDDO
1129	!$OMP END PARALLEL
1130
1131	ELSE
1132
1133	!
1134	!-- Cache optimized code (there might be still a potential for better
1135	!-- optimization).
1136	!$OMP PARALLEL PRIVATE (i,j,k)
1137	!$OMP DO
1138	DO i = 0, nx
1139
1140	DO j = nys, nyn
1141	DO k = 1, nz
1142	work_fftx(i,k,j) = f_in(k,j,i)
1143	ENDDO
1144	ENDDO
1145
1146	ENDDO
1147
1148	!$OMP DO
1149	DO j = nys, nyn
1150	DO k = 1, nz
1151
1152	CALL fft_x_1d( work_fftx(0:nx,k,j), 'forward' )
1153
1154	DO i = 0, nx
1155	work(j,k,i) = work_fftx(i,k,j)
1156	ENDDO
1157	ENDDO
1158
1159	ENDDO
1160	!$OMP END PARALLEL
1161
1162	ENDIF
1163	CALL cpu_log( log_point_s(4), 'fft_x_1d', 'pause' )
1164
1165	!
1166	!-- Transpose array
1167	#if defined( __parallel )
1168	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
1169	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
1170	CALL MPI_ALLTOALL( work(nys,1,0), sendrecvcount_xy, MPI_REAL, &
1171	f_out(1,1,nxl_y,1), sendrecvcount_xy, MPI_REAL, &
1172	comm1dy, ierr )
1173	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
1174	#endif
1175
1176	END SUBROUTINE fftx_tr_xy
1177
1178
1179	!------------------------------------------------------------------------------!
1180	! Description:
1181	! ------------
1182	!> Transposition y --> x with a subsequent backward Fourier transformation for
1183	!> a 1d-decomposition along x.
1184	!------------------------------------------------------------------------------!
1185	SUBROUTINE tr_yx_fftx( f_in, f_out )
1186
1187
1188	USE control_parameters, &
1189	ONLY: loop_optimization
1190
1191	USE cpulog, &
1192	ONLY: cpu_log, log_point_s
1193
1194	USE kinds
1195
1196	USE pegrid
1197
1198	IMPLICIT NONE
1199
1200	INTEGER(iwp) :: i !<
1201	INTEGER(iwp) :: j !<
1202	INTEGER(iwp) :: k !<
1203
1204	REAL(wp), DIMENSION(0:nx,1:nz,nys:nyn) :: work_fftx !<
1205	REAL(wp), DIMENSION(nny,1:nz,nxl_y:nxr_y,pdims(2)) :: f_in !<
1206	REAL(wp), DIMENSION(1:nz,nys:nyn,0:nx) :: f_out !<
1207	REAL(wp), DIMENSION(nys:nyn,1:nz,0:nx) :: work !<
1208
1209	!
1210	!-- Transpose array
1211	#if defined( __parallel )
1212	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
1213	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
1214	CALL MPI_ALLTOALL( f_in(1,1,nxl_y,1), sendrecvcount_xy, MPI_REAL, &
1215	work(nys,1,0), sendrecvcount_xy, MPI_REAL, &
1216	comm1dy, ierr )
1217	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
1218	#endif
1219
1220	!
1221	!-- Carry out the FFT along x, where all data are present due to the
1222	!-- 1d-decomposition along y. Resort the data in a way that y becomes
1223	!-- the first index.
1224	CALL cpu_log( log_point_s(4), 'fft_x_1d', 'continue' )
1225
1226	IF ( loop_optimization == 'vector' ) THEN
1227	!
1228	!-- Code optimized for vector processors
1229	!$OMP PARALLEL PRIVATE ( i, j, k )
1230	!$OMP DO
1231	DO j = nys, nyn
1232
1233	DO k = 1, nz
1234	DO i = 0, nx
1235	work_fftx(i,k,j) = work(j,k,i)
1236	ENDDO
1237	ENDDO
1238
1239	CALL fft_x_m( work_fftx(:,:,j), 'backward' )
1240
1241	ENDDO
1242
1243	!$OMP DO
1244	DO i = 0, nx
1245	DO j = nys, nyn
1246	DO k = 1, nz
1247	f_out(k,j,i) = work_fftx(i,k,j)
1248	ENDDO
1249	ENDDO
1250	ENDDO
1251	!$OMP END PARALLEL
1252
1253	ELSE
1254
1255	!
1256	!-- Cache optimized code (there might be still a potential for better
1257	!-- optimization).
1258	!$OMP PARALLEL PRIVATE (i,j,k)
1259	!$OMP DO
1260	DO j = nys, nyn
1261	DO k = 1, nz
1262
1263	DO i = 0, nx
1264	work_fftx(i,k,j) = work(j,k,i)
1265	ENDDO
1266
1267	CALL fft_x_1d( work_fftx(0:nx,k,j), 'backward' )
1268
1269	ENDDO
1270	ENDDO
1271
1272	!$OMP DO
1273	DO i = 0, nx
1274	DO j = nys, nyn
1275	DO k = 1, nz
1276	f_out(k,j,i) = work_fftx(i,k,j)
1277	ENDDO
1278	ENDDO
1279	ENDDO
1280	!$OMP END PARALLEL
1281
1282	ENDIF
1283	CALL cpu_log( log_point_s(4), 'fft_x_1d', 'stop' )
1284
1285	END SUBROUTINE tr_yx_fftx
1286
1287
1288	!------------------------------------------------------------------------------!
1289	! Description:
1290	! ------------
1291	!> FFT along y, solution of the tridiagonal system and backward FFT for
1292	!> a 1d-decomposition along y.
1293	!>
1294	!> @warning this subroutine may still not work for hybrid parallelization
1295	!> with OpenMP (for possible necessary changes see the original
1296	!> routine poisfft_hybrid, developed by Klaus Ketelsen, May 2002)
1297	!------------------------------------------------------------------------------!
1298	SUBROUTINE ffty_tri_ffty( ar )
1299
1300
1301	USE control_parameters, &
1302	ONLY: loop_optimization
1303
1304	USE cpulog, &
1305	ONLY: cpu_log, log_point_s
1306
1307	USE grid_variables, &
1308	ONLY: ddx2, ddy2
1309
1310	USE kinds
1311
1312	USE pegrid
1313
1314	IMPLICIT NONE
1315
1316	INTEGER(iwp) :: i !<
1317	INTEGER(iwp) :: j !<
1318	INTEGER(iwp) :: k !<
1319	INTEGER(iwp) :: m !<
1320	INTEGER(iwp) :: n !<
1321	!$ INTEGER(iwp) :: omp_get_thread_num !<
1322	INTEGER(iwp) :: tn !<
1323
1324	REAL(wp), DIMENSION(0:ny) :: work_ffty !<
1325	REAL(wp), DIMENSION(0:ny,1:nz) :: work_triy !<
1326	REAL(wp), DIMENSION(nny,1:nz,nxl_y:nxr_y,pdims(2)) :: ar !<
1327	REAL(wp), DIMENSION(:,:,:,:), ALLOCATABLE :: tri !<
1328
1329
1330	CALL cpu_log( log_point_s(39), 'fft_y_1d + tridia', 'start' )
1331
1332	ALLOCATE( tri(5,0:ny,0:nz-1,0:threads_per_task-1) )
1333
1334	tn = 0 ! Default thread number in case of one thread
1335	!$OMP PARALLEL DO PRIVATE ( i, j, k, m, n, tn, work_ffty, work_triy )
1336	DO i = nxl_y, nxr_y
1337
1338	!$ tn = omp_get_thread_num()
1339
1340	IF ( loop_optimization == 'vector' ) THEN
1341	!
1342	!-- Code optimized for vector processors
1343	DO k = 1, nz
1344
1345	m = 0
1346	DO n = 1, pdims(2)
1347	DO j = 1, nny
1348	work_triy(m,k) = ar(j,k,i,n)
1349	m = m + 1
1350	ENDDO
1351	ENDDO
1352
1353	ENDDO
1354
1355	CALL fft_y_m( work_triy, ny, 'forward' )
1356
1357	ELSE
1358	!
1359	!-- Cache optimized code
1360	DO k = 1, nz
1361
1362	m = 0
1363	DO n = 1, pdims(2)
1364	DO j = 1, nny
1365	work_ffty(m) = ar(j,k,i,n)
1366	m = m + 1
1367	ENDDO
1368	ENDDO
1369
1370	CALL fft_y_1d( work_ffty, 'forward' )
1371
1372	DO j = 0, ny
1373	work_triy(j,k) = work_ffty(j)
1374	ENDDO
1375
1376	ENDDO
1377
1378	ENDIF
1379
1380	!
1381	!-- Solve the linear equation system
1382	CALL tridia_1dd( ddy2, ddx2, ny, nx, i, work_triy, tri(:,:,:,tn) )
1383
1384	IF ( loop_optimization == 'vector' ) THEN
1385	!
1386	!-- Code optimized for vector processors
1387	CALL fft_y_m( work_triy, ny, 'backward' )
1388
1389	DO k = 1, nz
1390
1391	m = 0
1392	DO n = 1, pdims(2)
1393	DO j = 1, nny
1394	ar(j,k,i,n) = work_triy(m,k)
1395	m = m + 1
1396	ENDDO
1397	ENDDO
1398
1399	ENDDO
1400
1401	ELSE
1402	!
1403	!-- Cache optimized code
1404	DO k = 1, nz
1405
1406	DO j = 0, ny
1407	work_ffty(j) = work_triy(j,k)
1408	ENDDO
1409
1410	CALL fft_y_1d( work_ffty, 'backward' )
1411
1412	m = 0
1413	DO n = 1, pdims(2)
1414	DO j = 1, nny
1415	ar(j,k,i,n) = work_ffty(m)
1416	m = m + 1
1417	ENDDO
1418	ENDDO
1419
1420	ENDDO
1421
1422	ENDIF
1423
1424	ENDDO
1425
1426	DEALLOCATE( tri )
1427
1428	CALL cpu_log( log_point_s(39), 'fft_y_1d + tridia', 'stop' )
1429
1430	END SUBROUTINE ffty_tri_ffty
1431
1432	END MODULE poisfft_mod

Note: See TracBrowser for help on using the repository browser.

Download in other formats:

| Impressum | ©Leibniz Universität Hannover |