Home

Context Navigation

source: palm/trunk/SOURCE/poisfft_mod.f90 @ 3913

Last change on this file since 3913 was 3690, checked in by knoop, 6 years ago
Enabled OpenACC usage without using the cudaFFT library. Added respective palmtest build configuration and testcase.
Property svn:keywords set to `Id`
File size: 45.9 KB

Line
1	!> @file poisfft_mod.f90
2	!------------------------------------------------------------------------------!
3	! This file is part of the PALM model system.
4	!
5	! PALM is free software: you can redistribute it and/or modify it under the
6	! terms of the GNU General Public License as published by the Free Software
7	! Foundation, either version 3 of the License, or (at your option) any later
8	! version.
9	!
10	! PALM is distributed in the hope that it will be useful, but WITHOUT ANY
11	! WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
12	! A PARTICULAR PURPOSE. See the GNU General Public License for more details.
13	!
14	! You should have received a copy of the GNU General Public License along with
15	! PALM. If not, see <http://www.gnu.org/licenses/>.
16	!
17	! Copyright 1997-2019 Leibniz Universitaet Hannover
18	!------------------------------------------------------------------------------!
19	!
20	! Current revisions:
21	! -----------------
22	!
23	!
24	! Former revisions:
25	! -----------------
26	! $Id: poisfft_mod.f90 3690 2019-01-22 22:56:42Z gronemeier $
27	! OpenACC port for SPEC
28	!
29	! 3241 2018-09-12 15:02:00Z raasch
30	! unused variables removed,
31	! declarations of omp_get_thread_num now as omp-directive
32	!
33	! 2718 2018-01-02 08:49:38Z maronga
34	! Corrected "Former revisions" section
35	!
36	! 2696 2017-12-14 17:12:51Z kanani
37	! Change in file header (GPL part)
38	!
39	! 2300 2017-06-29 13:31:14Z raasch
40	! settings depending on host variable removed or replaced by loop_optimization
41	!
42	! 2119 2017-01-17 16:51:50Z raasch
43	!
44	! 2118 2017-01-17 16:38:49Z raasch
45	! OpenACC directives and related code removed
46	!
47	! 2000 2016-08-20 18:09:15Z knoop
48	! Forced header and separation lines into 80 columns
49	!
50	! 1850 2016-04-08 13:29:27Z maronga
51	! Module renamed
52	!
53	!
54	! 1804 2016-04-05 16:30:18Z maronga
55	! Removed code for parameter file check (__check)
56	!
57	! 1682 2015-10-07 23:56:08Z knoop
58	! Code annotations made doxygen readable
59	!
60	! 1482 2014-10-18 12:34:45Z raasch
61	! use 2d-decomposition, if accelerator boards are used
62	!
63	! 1406 2014-05-16 13:47:01Z raasch
64	! bugfix for pgi 14.4: declare create moved after array declaration
65	!
66	! 1320 2014-03-20 08:40:49Z raasch
67	! ONLY-attribute added to USE-statements,
68	! kind-parameters added to all INTEGER and REAL declaration statements,
69	! kinds are defined in new module kinds,
70	! old module precision_kind is removed,
71	! revision history before 2012 removed,
72	! comment fields (!:) to be used for variable explanations added to
73	! all variable declaration statements
74	!
75	! 1318 2014-03-17 13:35:16Z raasch
76	! module interfaces removed
77	!
78	! 1306 2014-03-13 14:30:59Z raasch
79	! openmp sections removed from the overlap branch,
80	! second argument removed from parameter list
81	!
82	! 1216 2013-08-26 09:31:42Z raasch
83	! resorting of arrays moved to separate routines resort_for_...,
84	! one argument, used as temporary work array, removed from all transpose
85	! routines
86	! overlapping fft / transposition implemented
87	!
88	! 1212 2013-08-15 08:46:27Z raasch
89	! tridia routines moved to seperate module tridia_solver
90	!
91	! 1208 2013-08-13 06:41:49Z raasch
92	! acc-update clauses added for "ar" so that ffts other than cufft can also be
93	! used (although they are not ported and will give a poor performance)
94	!
95	! 1111 2013-03-08 23:54:10Z raasch
96	! further openACC porting of non-parallel (MPI) branch:
97	! tridiagonal routines split into extermal subroutines (instead using CONTAINS),
98	! no distinction between parallel/non-parallel in poisfft and tridia any more,
99	! tridia routines moved to end of file because of probable bug in PGI compiler 12.5
100	! (otherwise "invalid device function" is indicated during runtime),
101	! optimization of tridia routines: constant elements and coefficients of tri are
102	! stored in seperate arrays ddzuw and tric, last dimension of tri reduced from 5
103	! to 2,
104	! poisfft_init is now called internally from poisfft, maketri is called from
105	! poisfft_init,
106	! ibc_p_b = 2 removed
107	!
108	! 1106 2013-03-04 05:31:38Z raasch
109	! routines fftx, ffty, fftxp, fftyp removed, calls replaced by fft_x, fft_y,
110	! in the 1D-decomposition routines fft_x, ffty are replaced by fft_x_1d,
111	! fft_y_1d
112	!
113	! 1103 2013-02-20 02:15:53Z raasch
114	! tri, ar, and ar1 arguments in tridia-routines (2d) are removed because they
115	! sometimes cause segmentation faults with intel 12.1 compiler
116	!
117	! 1092 2013-02-02 11:24:22Z raasch
118	! unused variables removed
119	!
120	! 1036 2012-10-22 13:43:42Z raasch
121	! code put under GPL (PALM 3.9)
122	!
123	! 2012-09-21 07:03:55Z raasch
124	! FLOAT type conversion replaced by REAL
125	!
126	! 1003 2012-09-14 14:35:53Z raasch
127	! indices nxa, nya, etc. replaced by nx, ny, etc.
128	!
129	! 940 2012-07-09 14:31:00Z raasch
130	! special handling of tri-array as an argument in tridia_1dd routines switched
131	! off because it caused segmentation faults with intel 12.1 compiler
132	!
133	! 877 2012-04-03 11:21:44Z suehring
134	! Bugfix: Avoid divisions by zero in case of using a 'neumann' bc for the
135	! pressure at the top of the model domain.
136	!
137	! 809 2012-01-30 13:32:58Z maronga
138	! Bugfix: replaced .AND. and .NOT. with && and ! in the preprocessor directives
139	!
140	! 807 2012-01-25 11:53:51Z maronga
141	! New cpp directive "__check" implemented which is used by check_namelist_files
142	! (most of the code is unneeded by check_namelist_files).
143	!
144	! Revision 1.1 1997/07/24 11:24:14 raasch
145	! Initial revision
146	!
147	!
148	! Description:
149	! ------------
150	!> Solves the Poisson equation with a 2D spectral method
151	!> d^2 p / dx^2 + d^2 p / dy^2 + d^2 p / dz^2 = s
152	!>
153	!> Input:
154	!> real ar contains (nnz,nny,nnx) elements of the velocity divergence,
155	!> starting from (1,nys,nxl)
156	!>
157	!> Output:
158	!> real ar contains the solution for perturbation pressure p
159	!------------------------------------------------------------------------------!
160	MODULE poisfft_mod
161
162
163	USE fft_xy, &
164	ONLY: fft_init, fft_y, fft_y_1d, fft_y_m, fft_x, fft_x_1d, fft_x_m
165
166	USE indices, &
167	ONLY: nnx, nny, nx, nxl, nxr, ny, nys, nyn, nz
168
169	USE transpose_indices, &
170	ONLY: nxl_y, nxl_z, nxr_y, nxr_z, nys_x, nys_z, nyn_x, nyn_z, nzb_x, &
171	nzb_y, nzt_x, nzt_y
172
173	USE tridia_solver, &
174	ONLY: tridia_1dd, tridia_init, tridia_substi, tridia_substi_overlap
175
176	IMPLICIT NONE
177
178	LOGICAL, SAVE :: poisfft_initialized = .FALSE.
179
180	PRIVATE
181
182	PUBLIC poisfft, poisfft_init
183
184	INTERFACE poisfft
185	MODULE PROCEDURE poisfft
186	END INTERFACE poisfft
187
188	INTERFACE poisfft_init
189	MODULE PROCEDURE poisfft_init
190	END INTERFACE poisfft_init
191
192
193	CONTAINS
194
195	!------------------------------------------------------------------------------!
196	! Description:
197	! ------------
198	!> Setup coefficients for FFT and the tridiagonal solver
199	!------------------------------------------------------------------------------!
200	SUBROUTINE poisfft_init
201
202	IMPLICIT NONE
203
204
205	CALL fft_init
206
207	CALL tridia_init
208
209	poisfft_initialized = .TRUE.
210
211	END SUBROUTINE poisfft_init
212
213
214
215	!------------------------------------------------------------------------------!
216	! Description:
217	! ------------
218	!> Two-dimensional Fourier Transformation in x- and y-direction.
219	!------------------------------------------------------------------------------!
220	SUBROUTINE poisfft( ar )
221
222	USE control_parameters, &
223	ONLY: transpose_compute_overlap
224
225	USE cpulog, &
226	ONLY: cpu_log, cpu_log_nowait, log_point_s
227
228	USE kinds
229
230	USE pegrid
231
232	IMPLICIT NONE
233
234	INTEGER(iwp) :: ii !<
235	INTEGER(iwp) :: iind !<
236	INTEGER(iwp) :: inew !<
237	INTEGER(iwp) :: jj !<
238	INTEGER(iwp) :: jind !<
239	INTEGER(iwp) :: jnew !<
240	INTEGER(iwp) :: ki !<
241	INTEGER(iwp) :: kk !<
242	INTEGER(iwp) :: knew !<
243	INTEGER(iwp) :: n !<
244	INTEGER(iwp) :: nblk !<
245	INTEGER(iwp) :: nnx_y !<
246	INTEGER(iwp) :: nny_z !<
247	INTEGER(iwp) :: nnz_x !<
248	INTEGER(iwp) :: nxl_y_bound !<
249	INTEGER(iwp) :: nxr_y_bound !<
250
251	INTEGER(iwp), DIMENSION(4) :: isave !<
252
253	REAL(wp), DIMENSION(1:nz,nys:nyn,nxl:nxr) :: ar !<
254	REAL(wp), DIMENSION(nys:nyn,nxl:nxr,1:nz) :: ar_inv !<
255
256	#define __acc_fft_device ( defined( _OPENACC ) && ( defined ( __cuda_fft ) ) )
257	#if __acc_fft_device
258	!$ACC DECLARE CREATE(ar_inv)
259	#endif
260
261	REAL(wp), DIMENSION(:,:,:), ALLOCATABLE :: ar1 !<
262	REAL(wp), DIMENSION(:,:,:), ALLOCATABLE :: f_in !<
263	REAL(wp), DIMENSION(:,:,:), ALLOCATABLE :: f_inv !<
264	REAL(wp), DIMENSION(:,:,:), ALLOCATABLE :: f_out_y !<
265	REAL(wp), DIMENSION(:,:,:), ALLOCATABLE :: f_out_z !<
266
267
268	CALL cpu_log( log_point_s(3), 'poisfft', 'start' )
269
270	IF ( .NOT. poisfft_initialized ) CALL poisfft_init
271
272	#if !__acc_fft_device
273	!$ACC UPDATE HOST(ar)
274	#endif
275
276	#ifndef _OPENACC
277	!
278	!-- Two-dimensional Fourier Transformation in x- and y-direction.
279	IF ( pdims(2) == 1 .AND. pdims(1) > 1 ) THEN
280
281	!
282	!-- 1d-domain-decomposition along x:
283	!-- FFT along y and transposition y --> x
284	CALL ffty_tr_yx( ar, ar )
285
286	!
287	!-- FFT along x, solving the tridiagonal system and backward FFT
288	CALL fftx_tri_fftx( ar )
289
290	!
291	!-- Transposition x --> y and backward FFT along y
292	CALL tr_xy_ffty( ar, ar )
293
294	ELSEIF ( pdims(1) == 1 .AND. pdims(2) > 1 ) THEN
295
296	!
297	!-- 1d-domain-decomposition along y:
298	!-- FFT along x and transposition x --> y
299	CALL fftx_tr_xy( ar, ar )
300
301	!
302	!-- FFT along y, solving the tridiagonal system and backward FFT
303	CALL ffty_tri_ffty( ar )
304
305	!
306	!-- Transposition y --> x and backward FFT along x
307	CALL tr_yx_fftx( ar, ar )
308
309	ELSEIF ( .NOT. transpose_compute_overlap ) THEN
310	#endif
311
312	!
313	!-- 2d-domain-decomposition or no decomposition (1 PE run)
314	!-- Transposition z --> x
315	CALL cpu_log( log_point_s(5), 'transpo forward', 'start' )
316	CALL resort_for_zx( ar, ar_inv )
317	CALL transpose_zx( ar_inv, ar )
318	CALL cpu_log( log_point_s(5), 'transpo forward', 'pause' )
319
320	CALL cpu_log( log_point_s(4), 'fft_x', 'start' )
321	CALL fft_x( ar, 'forward' )
322	CALL cpu_log( log_point_s(4), 'fft_x', 'pause' )
323
324	!
325	!-- Transposition x --> y
326	CALL cpu_log( log_point_s(5), 'transpo forward', 'continue' )
327	CALL resort_for_xy( ar, ar_inv )
328	CALL transpose_xy( ar_inv, ar )
329	CALL cpu_log( log_point_s(5), 'transpo forward', 'pause' )
330
331	CALL cpu_log( log_point_s(7), 'fft_y', 'start' )
332	CALL fft_y( ar, 'forward', ar_tr = ar, &
333	nxl_y_bound = nxl_y, nxr_y_bound = nxr_y, &
334	nxl_y_l = nxl_y, nxr_y_l = nxr_y )
335	CALL cpu_log( log_point_s(7), 'fft_y', 'pause' )
336
337	!
338	!-- Transposition y --> z
339	CALL cpu_log( log_point_s(5), 'transpo forward', 'continue' )
340	CALL resort_for_yz( ar, ar_inv )
341	CALL transpose_yz( ar_inv, ar )
342	CALL cpu_log( log_point_s(5), 'transpo forward', 'stop' )
343
344	!
345	!-- Solve the tridiagonal equation system along z
346	CALL cpu_log( log_point_s(6), 'tridia', 'start' )
347	CALL tridia_substi( ar )
348	CALL cpu_log( log_point_s(6), 'tridia', 'stop' )
349
350	!
351	!-- Inverse Fourier Transformation
352	!-- Transposition z --> y
353	CALL cpu_log( log_point_s(8), 'transpo invers', 'start' )
354	CALL transpose_zy( ar, ar_inv )
355	CALL resort_for_zy( ar_inv, ar )
356	CALL cpu_log( log_point_s(8), 'transpo invers', 'pause' )
357
358	CALL cpu_log( log_point_s(7), 'fft_y', 'continue' )
359	CALL fft_y( ar, 'backward', ar_tr = ar, &
360	nxl_y_bound = nxl_y, nxr_y_bound = nxr_y, &
361	nxl_y_l = nxl_y, nxr_y_l = nxr_y )
362	CALL cpu_log( log_point_s(7), 'fft_y', 'stop' )
363
364	!
365	!-- Transposition y --> x
366	CALL cpu_log( log_point_s(8), 'transpo invers', 'continue' )
367	CALL transpose_yx( ar, ar_inv )
368	CALL resort_for_yx( ar_inv, ar )
369	CALL cpu_log( log_point_s(8), 'transpo invers', 'pause' )
370
371	CALL cpu_log( log_point_s(4), 'fft_x', 'continue' )
372	CALL fft_x( ar, 'backward' )
373	CALL cpu_log( log_point_s(4), 'fft_x', 'stop' )
374
375	!
376	!-- Transposition x --> z
377	CALL cpu_log( log_point_s(8), 'transpo invers', 'continue' )
378	CALL transpose_xz( ar, ar_inv )
379	CALL resort_for_xz( ar_inv, ar )
380	CALL cpu_log( log_point_s(8), 'transpo invers', 'stop' )
381
382	#ifndef _OPENACC
383	ELSE
384
385	!
386	!-- 2d-domain-decomposition or no decomposition (1 PE run) with
387	!-- overlapping transposition / fft
388	!-- cputime logging must not use barriers, which would prevent overlapping
389	ALLOCATE( f_out_y(0:ny,nxl_y:nxr_y,nzb_y:nzt_y), &
390	f_out_z(0:nx,nys_x:nyn_x,nzb_x:nzt_x) )
391	!
392	!-- Transposition z --> x + subsequent fft along x
393	ALLOCATE( f_inv(nys:nyn,nxl:nxr,1:nz) )
394	CALL resort_for_zx( ar, f_inv )
395	!
396	!-- Save original indices and gridpoint counter
397	isave(1) = nz
398	isave(2) = nzb_x
399	isave(3) = nzt_x
400	isave(4) = sendrecvcount_zx
401	!
402	!-- Set new indices for transformation
403	nblk = nz / pdims(1)
404	nz = pdims(1)
405	nnz_x = 1
406	nzb_x = 1 + myidx * nnz_x
407	nzt_x = ( myidx + 1 ) * nnz_x
408	sendrecvcount_zx = nnx * nny * nnz_x
409
410	ALLOCATE( ar1(0:nx,nys_x:nyn_x,nzb_x:nzt_x) )
411	ALLOCATE( f_in(nys:nyn,nxl:nxr,1:nz) )
412
413	DO kk = 1, nblk
414
415	IF ( kk == 1 ) THEN
416	CALL cpu_log( log_point_s(5), 'transpo forward', 'start', cpu_log_nowait )
417	ELSE
418	CALL cpu_log( log_point_s(5), 'transpo forward', 'continue', cpu_log_nowait )
419	ENDIF
420
421	DO knew = 1, nz
422	ki = kk + nblk * ( knew - 1 )
423	f_in(:,:,knew) = f_inv(:,:,ki)
424	ENDDO
425
426	CALL transpose_zx( f_in, ar1(:,:,:))
427	CALL cpu_log( log_point_s(5), 'transpo forward', 'pause' )
428
429	IF ( kk == 1 ) THEN
430	CALL cpu_log( log_point_s(4), 'fft_x', 'start', cpu_log_nowait )
431	ELSE
432	CALL cpu_log( log_point_s(4), 'fft_x', 'continue', cpu_log_nowait )
433	ENDIF
434
435	n = isave(2) + kk - 1
436	CALL fft_x( ar1(:,:,:), 'forward', ar_2d = f_out_z(:,:,n))
437	CALL cpu_log( log_point_s(4), 'fft_x', 'pause' )
438
439	ENDDO
440	!
441	!-- Restore original indices/counters
442	nz = isave(1)
443	nzb_x = isave(2)
444	nzt_x = isave(3)
445	sendrecvcount_zx = isave(4)
446
447	DEALLOCATE( ar1, f_in, f_inv )
448
449	!
450	!-- Transposition x --> y + subsequent fft along y
451	ALLOCATE( f_inv(nys_x:nyn_x,nzb_x:nzt_x,0:nx) )
452	CALL resort_for_xy( f_out_z, f_inv )
453	!
454	!-- Save original indices and gridpoint counter
455	isave(1) = nx
456	isave(2) = nxl_y
457	isave(3) = nxr_y
458	isave(4) = sendrecvcount_xy
459	!
460	!-- Set new indices for transformation
461	nblk = ( ( nx+1 ) / pdims(2) ) - 1
462	nx = pdims(2)
463	nnx_y = 1
464	nxl_y = myidy * nnx_y
465	nxr_y = ( myidy + 1 ) * nnx_y - 1
466	sendrecvcount_xy = nnx_y * ( nyn_x-nys_x+1 ) * ( nzt_x-nzb_x+1 )
467
468	ALLOCATE( ar1(0:ny,nxl_y:nxr_y,nzb_y:nzt_y) )
469	ALLOCATE( f_in(nys_x:nyn_x,nzb_x:nzt_x,0:nx) )
470
471	DO ii = 0, nblk
472
473	CALL cpu_log( log_point_s(5), 'transpo forward', 'continue', cpu_log_nowait )
474
475	DO inew = 0, nx-1
476	iind = ii + ( nblk + 1 ) * inew
477	f_in(:,:,inew) = f_inv(:,:,iind)
478	ENDDO
479
480	CALL transpose_xy( f_in, ar1(:,:,:) )
481
482	CALL cpu_log( log_point_s(5), 'transpo forward', 'pause' )
483
484	IF ( ii == 1 ) THEN
485	CALL cpu_log( log_point_s(7), 'fft_y', 'start', cpu_log_nowait )
486	ELSE
487	CALL cpu_log( log_point_s(7), 'fft_y', 'continue', cpu_log_nowait )
488	ENDIF
489
490	nxl_y_bound = isave(2)
491	nxr_y_bound = isave(3)
492	n = isave(2) + ii
493	CALL fft_y( ar1(:,:,:), 'forward', ar_tr = f_out_y, &
494	nxl_y_bound = nxl_y_bound, nxr_y_bound = nxr_y_bound, &
495	nxl_y_l = n, nxr_y_l = n )
496
497	CALL cpu_log( log_point_s(7), 'fft_y', 'pause' )
498
499	ENDDO
500	!
501	!-- Restore original indices/counters
502	nx = isave(1)
503	nxl_y = isave(2)
504	nxr_y = isave(3)
505	sendrecvcount_xy = isave(4)
506
507	DEALLOCATE( ar1, f_in, f_inv )
508
509	!
510	!-- Transposition y --> z + subsequent tridia + resort for z --> y
511	ALLOCATE( f_inv(nxl_y:nxr_y,nzb_y:nzt_y,0:ny) )
512	CALL resort_for_yz( f_out_y, f_inv )
513	!
514	!-- Save original indices and gridpoint counter
515	isave(1) = ny
516	isave(2) = nys_z
517	isave(3) = nyn_z
518	isave(4) = sendrecvcount_yz
519	!
520	!-- Set new indices for transformation
521	nblk = ( ( ny+1 ) / pdims(1) ) - 1
522	ny = pdims(1)
523	nny_z = 1
524	nys_z = myidx * nny_z
525	nyn_z = ( myidx + 1 ) * nny_z - 1
526	sendrecvcount_yz = ( nxr_y-nxl_y+1 ) * nny_z * ( nzt_y-nzb_y+1 )
527
528	ALLOCATE( ar1(nxl_z:nxr_z,nys_z:nyn_z,1:nz) )
529	ALLOCATE( f_in(nxl_y:nxr_y,nzb_y:nzt_y,0:ny) )
530
531	DO jj = 0, nblk
532	!
533	!-- Forward Fourier Transformation
534	!-- Transposition y --> z
535	CALL cpu_log( log_point_s(5), 'transpo forward', 'continue', cpu_log_nowait )
536
537	DO jnew = 0, ny-1
538	jind = jj + ( nblk + 1 ) * jnew
539	f_in(:,:,jnew) = f_inv(:,:,jind)
540	ENDDO
541
542	CALL transpose_yz( f_in, ar1(:,:,:) )
543
544	IF ( jj == nblk ) THEN
545	CALL cpu_log( log_point_s(5), 'transpo forward', 'stop' )
546	ELSE
547	CALL cpu_log( log_point_s(5), 'transpo forward', 'pause' )
548	ENDIF
549
550	!
551	!-- Solve the tridiagonal equation system along z
552	CALL cpu_log( log_point_s(6), 'tridia', 'start', cpu_log_nowait )
553
554	n = isave(2) + jj
555	CALL tridia_substi_overlap( ar1(:,:,:), n )
556
557	CALL cpu_log( log_point_s(6), 'tridia', 'stop' )
558
559	!
560	!-- Inverse Fourier Transformation
561	!-- Transposition z --> y
562	!-- Only one thread should call MPI routines, therefore forward and
563	!-- backward tranpose are in the same section
564	IF ( jj == 0 ) THEN
565	CALL cpu_log( log_point_s(8), 'transpo invers', 'start', cpu_log_nowait )
566	ELSE
567	CALL cpu_log( log_point_s(8), 'transpo invers', 'continue', cpu_log_nowait )
568	ENDIF
569
570	CALL transpose_zy( ar1(:,:,:), f_in )
571
572	DO jnew = 0, ny-1
573	jind = jj + ( nblk + 1 ) * jnew
574	f_inv(:,:,jind) = f_in(:,:,jnew)
575	ENDDO
576
577	CALL cpu_log( log_point_s(8), 'transpo invers', 'pause' )
578
579	ENDDO
580	!
581	!-- Restore original indices/counters
582	ny = isave(1)
583	nys_z = isave(2)
584	nyn_z = isave(3)
585	sendrecvcount_yz = isave(4)
586
587	CALL resort_for_zy( f_inv, f_out_y )
588
589	DEALLOCATE( ar1, f_in, f_inv )
590
591	!
592	!-- fft along y backward + subsequent transposition y --> x
593	ALLOCATE( f_inv(nys_x:nyn_x,nzb_x:nzt_x,0:nx) )
594	!
595	!-- Save original indices and gridpoint counter
596	isave(1) = nx
597	isave(2) = nxl_y
598	isave(3) = nxr_y
599	isave(4) = sendrecvcount_xy
600	!
601	!-- Set new indices for transformation
602	nblk = (( nx+1 ) / pdims(2) ) - 1
603	nx = pdims(2)
604	nnx_y = 1
605	nxl_y = myidy * nnx_y
606	nxr_y = ( myidy + 1 ) * nnx_y - 1
607	sendrecvcount_xy = nnx_y * ( nyn_x-nys_x+1 ) * ( nzt_x-nzb_x+1 )
608
609	ALLOCATE( ar1(0:ny,nxl_y:nxr_y,nzb_y:nzt_y) )
610	ALLOCATE( f_in(nys_x:nyn_x,nzb_x:nzt_x,0:nx) )
611
612	DO ii = 0, nblk
613
614	CALL cpu_log( log_point_s(7), 'fft_y', 'continue', cpu_log_nowait )
615
616	n = isave(2) + ii
617	nxl_y_bound = isave(2)
618	nxr_y_bound = isave(3)
619
620	CALL fft_y( ar1(:,:,:), 'backward', ar_tr = f_out_y, &
621	nxl_y_bound = nxl_y_bound, nxr_y_bound = nxr_y_bound, &
622	nxl_y_l = n, nxr_y_l = n )
623
624	IF ( ii == nblk ) THEN
625	CALL cpu_log( log_point_s(7), 'fft_y', 'stop' )
626	ELSE
627	CALL cpu_log( log_point_s(7), 'fft_y', 'pause' )
628	ENDIF
629
630	CALL cpu_log( log_point_s(8), 'transpo invers', 'continue', cpu_log_nowait )
631
632	CALL transpose_yx( ar1(:,:,:), f_in )
633
634	DO inew = 0, nx-1
635	iind = ii + (nblk+1) * inew
636	f_inv(:,:,iind) = f_in(:,:,inew)
637	ENDDO
638
639	CALL cpu_log( log_point_s(8), 'transpo invers', 'pause' )
640
641	ENDDO
642	!
643	!-- Restore original indices/counters
644	nx = isave(1)
645	nxl_y = isave(2)
646	nxr_y = isave(3)
647	sendrecvcount_xy = isave(4)
648
649	CALL resort_for_yx( f_inv, f_out_z )
650
651	DEALLOCATE( ar1, f_in, f_inv )
652
653	!
654	!-- fft along x backward + subsequent final transposition x --> z
655	ALLOCATE( f_inv(nys:nyn,nxl:nxr,1:nz) )
656	!
657	!-- Save original indices and gridpoint counter
658	isave(1) = nz
659	isave(2) = nzb_x
660	isave(3) = nzt_x
661	isave(4) = sendrecvcount_zx
662	!
663	!-- Set new indices for transformation
664	nblk = nz / pdims(1)
665	nz = pdims(1)
666	nnz_x = 1
667	nzb_x = 1 + myidx * nnz_x
668	nzt_x = ( myidx + 1 ) * nnz_x
669	sendrecvcount_zx = nnx * nny * nnz_x
670
671	ALLOCATE( ar1(0:nx,nys_x:nyn_x,nzb_x:nzt_x) )
672	ALLOCATE( f_in(nys:nyn,nxl:nxr,1:nz) )
673
674	DO kk = 1, nblk
675
676	CALL cpu_log( log_point_s(4), 'fft_x', 'continue', cpu_log_nowait )
677
678	n = isave(2) + kk - 1
679	CALL fft_x( ar1(:,:,:), 'backward', f_out_z(:,:,n))
680
681	IF ( kk == nblk ) THEN
682	CALL cpu_log( log_point_s(4), 'fft_x', 'stop' )
683	ELSE
684	CALL cpu_log( log_point_s(4), 'fft_x', 'pause' )
685	ENDIF
686
687	CALL cpu_log( log_point_s(8), 'transpo invers', 'continue', cpu_log_nowait )
688
689	CALL transpose_xz( ar1(:,:,:), f_in )
690
691	DO knew = 1, nz
692	ki = kk + nblk * (knew-1)
693	f_inv(:,:,ki) = f_in(:,:,knew)
694	ENDDO
695
696	IF ( kk == nblk ) THEN
697	CALL cpu_log( log_point_s(8), 'transpo invers', 'stop' )
698	ELSE
699	CALL cpu_log( log_point_s(8), 'transpo invers', 'pause' )
700	ENDIF
701
702	ENDDO
703	!
704	!-- Restore original indices/counters
705	nz = isave(1)
706	nzb_x = isave(2)
707	nzt_x = isave(3)
708	sendrecvcount_zx = isave(4)
709
710	CALL resort_for_xz( f_inv, ar )
711
712	DEALLOCATE( ar1, f_in, f_inv )
713
714	ENDIF
715	#endif
716
717	#if !__acc_fft_device
718	!$ACC UPDATE DEVICE(ar)
719	#endif
720
721	CALL cpu_log( log_point_s(3), 'poisfft', 'stop' )
722
723	END SUBROUTINE poisfft
724
725
726	!------------------------------------------------------------------------------!
727	! Description:
728	! ------------
729	!> Fourier-transformation along y with subsequent transposition y --> x for
730	!> a 1d-decomposition along x.
731	!>
732	!> @attention The performance of this routine is much faster on the NEC-SX6,
733	!> if the first index of work_ffty_vec is odd. Otherwise
734	!> memory bank conflicts may occur (especially if the index is a
735	!> multiple of 128). That's why work_ffty_vec is dimensioned as
736	!> 0:ny+1.
737	!> Of course, this will not work if users are using an odd number
738	!> of gridpoints along y.
739	!------------------------------------------------------------------------------!
740	SUBROUTINE ffty_tr_yx( f_in, f_out )
741
742	USE control_parameters, &
743	ONLY: loop_optimization
744
745	USE cpulog, &
746	ONLY: cpu_log, log_point_s
747
748	USE kinds
749
750	USE pegrid
751
752	IMPLICIT NONE
753
754	INTEGER(iwp) :: i !<
755	INTEGER(iwp) :: iend !<
756	INTEGER(iwp) :: iouter !<
757	INTEGER(iwp) :: ir !<
758	INTEGER(iwp) :: j !<
759	INTEGER(iwp) :: k !<
760
761	INTEGER(iwp), PARAMETER :: stridex = 4 !<
762
763	REAL(wp), DIMENSION(1:nz,0:ny,nxl:nxr) :: f_in !<
764	REAL(wp), DIMENSION(nnx,1:nz,nys_x:nyn_x,pdims(1)) :: f_out !<
765	REAL(wp), DIMENSION(nxl:nxr,1:nz,0:ny) :: work !<
766
767	REAL(wp), DIMENSION(:,:), ALLOCATABLE :: work_ffty !<
768	REAL(wp), DIMENSION(:,:,:), ALLOCATABLE :: work_ffty_vec !<
769
770	!
771	!-- Carry out the FFT along y, where all data are present due to the
772	!-- 1d-decomposition along x. Resort the data in a way that x becomes
773	!-- the first index.
774	CALL cpu_log( log_point_s(7), 'fft_y_1d', 'start' )
775
776	IF ( loop_optimization == 'vector' ) THEN
777
778	ALLOCATE( work_ffty_vec(0:ny+1,1:nz,nxl:nxr) )
779	!
780	!-- Code optimized for vector processors
781	!$OMP PARALLEL PRIVATE ( i, j, k )
782	!$OMP DO
783	DO i = nxl, nxr
784
785	DO j = 0, ny
786	DO k = 1, nz
787	work_ffty_vec(j,k,i) = f_in(k,j,i)
788	ENDDO
789	ENDDO
790
791	CALL fft_y_m( work_ffty_vec(:,:,i), ny+1, 'forward' )
792
793	ENDDO
794
795	!$OMP DO
796	DO k = 1, nz
797	DO j = 0, ny
798	DO i = nxl, nxr
799	work(i,k,j) = work_ffty_vec(j,k,i)
800	ENDDO
801	ENDDO
802	ENDDO
803	!$OMP END PARALLEL
804
805	DEALLOCATE( work_ffty_vec )
806
807	ELSE
808	!
809	!-- Cache optimized code.
810	ALLOCATE( work_ffty(0:ny,stridex) )
811	!
812	!-- The i-(x-)direction is split into a strided outer loop and an inner
813	!-- loop for better cache performance
814	!$OMP PARALLEL PRIVATE (i,iend,iouter,ir,j,k,work_ffty)
815	!$OMP DO
816	DO iouter = nxl, nxr, stridex
817
818	iend = MIN( iouter+stridex-1, nxr ) ! Upper bound for inner i loop
819
820	DO k = 1, nz
821
822	DO i = iouter, iend
823
824	ir = i-iouter+1 ! counter within a stride
825	DO j = 0, ny
826	work_ffty(j,ir) = f_in(k,j,i)
827	ENDDO
828	!
829	!-- FFT along y
830	CALL fft_y_1d( work_ffty(:,ir), 'forward' )
831
832	ENDDO
833
834	!
835	!-- Resort
836	DO j = 0, ny
837	DO i = iouter, iend
838	work(i,k,j) = work_ffty(j,i-iouter+1)
839	ENDDO
840	ENDDO
841
842	ENDDO
843
844	ENDDO
845	!$OMP END PARALLEL
846
847	DEALLOCATE( work_ffty )
848
849	ENDIF
850
851	CALL cpu_log( log_point_s(7), 'fft_y_1d', 'pause' )
852
853	!
854	!-- Transpose array
855	#if defined( __parallel )
856	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
857	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
858	CALL MPI_ALLTOALL( work(nxl,1,0), sendrecvcount_xy, MPI_REAL, &
859	f_out(1,1,nys_x,1), sendrecvcount_xy, MPI_REAL, &
860	comm1dx, ierr )
861	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
862	#endif
863
864	END SUBROUTINE ffty_tr_yx
865
866
867	!------------------------------------------------------------------------------!
868	! Description:
869	! ------------
870	!> Transposition x --> y with a subsequent backward Fourier transformation for
871	!> a 1d-decomposition along x
872	!------------------------------------------------------------------------------!
873	SUBROUTINE tr_xy_ffty( f_in, f_out )
874
875	USE control_parameters, &
876	ONLY: loop_optimization
877
878	USE cpulog, &
879	ONLY: cpu_log, log_point_s
880
881	USE kinds
882
883	USE pegrid
884
885	IMPLICIT NONE
886
887	INTEGER(iwp) :: i !<
888	INTEGER(iwp) :: iend !<
889	INTEGER(iwp) :: iouter !<
890	INTEGER(iwp) :: ir !<
891	INTEGER(iwp) :: j !<
892	INTEGER(iwp) :: k !<
893
894	INTEGER(iwp), PARAMETER :: stridex = 4 !<
895
896	REAL(wp), DIMENSION(nnx,1:nz,nys_x:nyn_x,pdims(1)) :: f_in !<
897	REAL(wp), DIMENSION(1:nz,0:ny,nxl:nxr) :: f_out !<
898	REAL(wp), DIMENSION(nxl:nxr,1:nz,0:ny) :: work !<
899
900	REAL(wp), DIMENSION(:,:), ALLOCATABLE :: work_ffty !<
901	REAL(wp), DIMENSION(:,:,:), ALLOCATABLE :: work_ffty_vec !<
902
903	!
904	!-- Transpose array
905	#if defined( __parallel )
906	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
907	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
908	CALL MPI_ALLTOALL( f_in(1,1,nys_x,1), sendrecvcount_xy, MPI_REAL, &
909	work(nxl,1,0), sendrecvcount_xy, MPI_REAL, &
910	comm1dx, ierr )
911	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
912	#endif
913
914	!
915	!-- Resort the data in a way that y becomes the first index and carry out the
916	!-- backward fft along y.
917	CALL cpu_log( log_point_s(7), 'fft_y_1d', 'continue' )
918
919	IF ( loop_optimization == 'vector' ) THEN
920
921	ALLOCATE( work_ffty_vec(0:ny+1,1:nz,nxl:nxr) )
922	!
923	!-- Code optimized for vector processors
924	!$OMP PARALLEL PRIVATE ( i, j, k )
925	!$OMP DO
926	DO k = 1, nz
927	DO j = 0, ny
928	DO i = nxl, nxr
929	work_ffty_vec(j,k,i) = work(i,k,j)
930	ENDDO
931	ENDDO
932	ENDDO
933
934	!$OMP DO
935	DO i = nxl, nxr
936
937	CALL fft_y_m( work_ffty_vec(:,:,i), ny+1, 'backward' )
938
939	DO j = 0, ny
940	DO k = 1, nz
941	f_out(k,j,i) = work_ffty_vec(j,k,i)
942	ENDDO
943	ENDDO
944
945	ENDDO
946	!$OMP END PARALLEL
947
948	DEALLOCATE( work_ffty_vec )
949
950	ELSE
951	!
952	!-- Cache optimized code.
953	ALLOCATE( work_ffty(0:ny,stridex) )
954	!
955	!-- The i-(x-)direction is split into a strided outer loop and an inner
956	!-- loop for better cache performance
957	!$OMP PARALLEL PRIVATE ( i, iend, iouter, ir, j, k, work_ffty )
958	!$OMP DO
959	DO iouter = nxl, nxr, stridex
960
961	iend = MIN( iouter+stridex-1, nxr ) ! Upper bound for inner i loop
962
963	DO k = 1, nz
964	!
965	!-- Resort
966	DO j = 0, ny
967	DO i = iouter, iend
968	work_ffty(j,i-iouter+1) = work(i,k,j)
969	ENDDO
970	ENDDO
971
972	DO i = iouter, iend
973
974	!
975	!-- FFT along y
976	ir = i-iouter+1 ! counter within a stride
977	CALL fft_y_1d( work_ffty(:,ir), 'backward' )
978
979	DO j = 0, ny
980	f_out(k,j,i) = work_ffty(j,ir)
981	ENDDO
982	ENDDO
983
984	ENDDO
985
986	ENDDO
987	!$OMP END PARALLEL
988
989	DEALLOCATE( work_ffty )
990
991	ENDIF
992
993	CALL cpu_log( log_point_s(7), 'fft_y_1d', 'stop' )
994
995	END SUBROUTINE tr_xy_ffty
996
997
998	!------------------------------------------------------------------------------!
999	! Description:
1000	! ------------
1001	!> FFT along x, solution of the tridiagonal system and backward FFT for
1002	!> a 1d-decomposition along x
1003	!>
1004	!> @warning this subroutine may still not work for hybrid parallelization
1005	!> with OpenMP (for possible necessary changes see the original
1006	!> routine poisfft_hybrid, developed by Klaus Ketelsen, May 2002)
1007	!------------------------------------------------------------------------------!
1008	SUBROUTINE fftx_tri_fftx( ar )
1009
1010	USE control_parameters, &
1011	ONLY: loop_optimization
1012
1013	USE cpulog, &
1014	ONLY: cpu_log, log_point_s
1015
1016	USE grid_variables, &
1017	ONLY: ddx2, ddy2
1018
1019	USE kinds
1020
1021	USE pegrid
1022
1023	IMPLICIT NONE
1024
1025	INTEGER(iwp) :: i !<
1026	INTEGER(iwp) :: j !<
1027	INTEGER(iwp) :: k !<
1028	INTEGER(iwp) :: m !<
1029	INTEGER(iwp) :: n !<
1030	!$ INTEGER(iwp) :: omp_get_thread_num !<
1031	INTEGER(iwp) :: tn !<
1032
1033	REAL(wp), DIMENSION(0:nx) :: work_fftx !<
1034	REAL(wp), DIMENSION(0:nx,1:nz) :: work_trix !<
1035	REAL(wp), DIMENSION(nnx,1:nz,nys_x:nyn_x,pdims(1)) :: ar !<
1036	REAL(wp), DIMENSION(:,:,:,:), ALLOCATABLE :: tri !<
1037
1038
1039	CALL cpu_log( log_point_s(33), 'fft_x_1d + tridia', 'start' )
1040
1041	ALLOCATE( tri(5,0:nx,0:nz-1,0:threads_per_task-1) )
1042
1043	tn = 0 ! Default thread number in case of one thread
1044	!$OMP PARALLEL DO PRIVATE ( i, j, k, m, n, tn, work_fftx, work_trix )
1045	DO j = nys_x, nyn_x
1046
1047	!$ tn = omp_get_thread_num()
1048
1049	IF ( loop_optimization == 'vector' ) THEN
1050	!
1051	!-- Code optimized for vector processors
1052	DO k = 1, nz
1053
1054	m = 0
1055	DO n = 1, pdims(1)
1056	DO i = 1, nnx
1057	work_trix(m,k) = ar(i,k,j,n)
1058	m = m + 1
1059	ENDDO
1060	ENDDO
1061
1062	ENDDO
1063
1064	CALL fft_x_m( work_trix, 'forward' )
1065
1066	ELSE
1067	!
1068	!-- Cache optimized code
1069	DO k = 1, nz
1070
1071	m = 0
1072	DO n = 1, pdims(1)
1073	DO i = 1, nnx
1074	work_fftx(m) = ar(i,k,j,n)
1075	m = m + 1
1076	ENDDO
1077	ENDDO
1078
1079	CALL fft_x_1d( work_fftx, 'forward' )
1080
1081	DO i = 0, nx
1082	work_trix(i,k) = work_fftx(i)
1083	ENDDO
1084
1085	ENDDO
1086
1087	ENDIF
1088
1089	!
1090	!-- Solve the linear equation system
1091	CALL tridia_1dd( ddx2, ddy2, nx, ny, j, work_trix, tri(:,:,:,tn) )
1092
1093	IF ( loop_optimization == 'vector' ) THEN
1094	!
1095	!-- Code optimized for vector processors
1096	CALL fft_x_m( work_trix, 'backward' )
1097
1098	DO k = 1, nz
1099
1100	m = 0
1101	DO n = 1, pdims(1)
1102	DO i = 1, nnx
1103	ar(i,k,j,n) = work_trix(m,k)
1104	m = m + 1
1105	ENDDO
1106	ENDDO
1107
1108	ENDDO
1109
1110	ELSE
1111	!
1112	!-- Cache optimized code
1113	DO k = 1, nz
1114
1115	DO i = 0, nx
1116	work_fftx(i) = work_trix(i,k)
1117	ENDDO
1118
1119	CALL fft_x_1d( work_fftx, 'backward' )
1120
1121	m = 0
1122	DO n = 1, pdims(1)
1123	DO i = 1, nnx
1124	ar(i,k,j,n) = work_fftx(m)
1125	m = m + 1
1126	ENDDO
1127	ENDDO
1128
1129	ENDDO
1130
1131	ENDIF
1132
1133	ENDDO
1134
1135	DEALLOCATE( tri )
1136
1137	CALL cpu_log( log_point_s(33), 'fft_x_1d + tridia', 'stop' )
1138
1139	END SUBROUTINE fftx_tri_fftx
1140
1141
1142	!------------------------------------------------------------------------------!
1143	! Description:
1144	! ------------
1145	!> Fourier-transformation along x with subsequent transposition x --> y for
1146	!> a 1d-decomposition along y.
1147	!>
1148	!> @attention NEC-branch of this routine may significantly profit from
1149	!> further optimizations. So far, performance is much worse than
1150	!> for routine ffty_tr_yx (more than three times slower).
1151	!------------------------------------------------------------------------------!
1152	SUBROUTINE fftx_tr_xy( f_in, f_out )
1153
1154
1155	USE control_parameters, &
1156	ONLY: loop_optimization
1157
1158	USE cpulog, &
1159	ONLY: cpu_log, log_point_s
1160
1161	USE kinds
1162
1163	USE pegrid
1164
1165	IMPLICIT NONE
1166
1167	INTEGER(iwp) :: i !<
1168	INTEGER(iwp) :: j !<
1169	INTEGER(iwp) :: k !<
1170
1171	REAL(wp), DIMENSION(0:nx,1:nz,nys:nyn) :: work_fftx !<
1172	REAL(wp), DIMENSION(1:nz,nys:nyn,0:nx) :: f_in !<
1173	REAL(wp), DIMENSION(nny,1:nz,nxl_y:nxr_y,pdims(2)) :: f_out !<
1174	REAL(wp), DIMENSION(nys:nyn,1:nz,0:nx) :: work !<
1175
1176	!
1177	!-- Carry out the FFT along x, where all data are present due to the
1178	!-- 1d-decomposition along y. Resort the data in a way that y becomes
1179	!-- the first index.
1180	CALL cpu_log( log_point_s(4), 'fft_x_1d', 'start' )
1181
1182	IF ( loop_optimization == 'vector' ) THEN
1183	!
1184	!-- Code for vector processors
1185	!$OMP PARALLEL PRIVATE ( i, j, k )
1186	!$OMP DO
1187	DO i = 0, nx
1188
1189	DO j = nys, nyn
1190	DO k = 1, nz
1191	work_fftx(i,k,j) = f_in(k,j,i)
1192	ENDDO
1193	ENDDO
1194
1195	ENDDO
1196
1197	!$OMP DO
1198	DO j = nys, nyn
1199
1200	CALL fft_x_m( work_fftx(:,:,j), 'forward' )
1201
1202	DO k = 1, nz
1203	DO i = 0, nx
1204	work(j,k,i) = work_fftx(i,k,j)
1205	ENDDO
1206	ENDDO
1207
1208	ENDDO
1209	!$OMP END PARALLEL
1210
1211	ELSE
1212
1213	!
1214	!-- Cache optimized code (there might be still a potential for better
1215	!-- optimization).
1216	!$OMP PARALLEL PRIVATE (i,j,k)
1217	!$OMP DO
1218	DO i = 0, nx
1219
1220	DO j = nys, nyn
1221	DO k = 1, nz
1222	work_fftx(i,k,j) = f_in(k,j,i)
1223	ENDDO
1224	ENDDO
1225
1226	ENDDO
1227
1228	!$OMP DO
1229	DO j = nys, nyn
1230	DO k = 1, nz
1231
1232	CALL fft_x_1d( work_fftx(0:nx,k,j), 'forward' )
1233
1234	DO i = 0, nx
1235	work(j,k,i) = work_fftx(i,k,j)
1236	ENDDO
1237	ENDDO
1238
1239	ENDDO
1240	!$OMP END PARALLEL
1241
1242	ENDIF
1243	CALL cpu_log( log_point_s(4), 'fft_x_1d', 'pause' )
1244
1245	!
1246	!-- Transpose array
1247	#if defined( __parallel )
1248	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
1249	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
1250	CALL MPI_ALLTOALL( work(nys,1,0), sendrecvcount_xy, MPI_REAL, &
1251	f_out(1,1,nxl_y,1), sendrecvcount_xy, MPI_REAL, &
1252	comm1dy, ierr )
1253	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
1254	#endif
1255
1256	END SUBROUTINE fftx_tr_xy
1257
1258
1259	!------------------------------------------------------------------------------!
1260	! Description:
1261	! ------------
1262	!> Transposition y --> x with a subsequent backward Fourier transformation for
1263	!> a 1d-decomposition along x.
1264	!------------------------------------------------------------------------------!
1265	SUBROUTINE tr_yx_fftx( f_in, f_out )
1266
1267
1268	USE control_parameters, &
1269	ONLY: loop_optimization
1270
1271	USE cpulog, &
1272	ONLY: cpu_log, log_point_s
1273
1274	USE kinds
1275
1276	USE pegrid
1277
1278	IMPLICIT NONE
1279
1280	INTEGER(iwp) :: i !<
1281	INTEGER(iwp) :: j !<
1282	INTEGER(iwp) :: k !<
1283
1284	REAL(wp), DIMENSION(0:nx,1:nz,nys:nyn) :: work_fftx !<
1285	REAL(wp), DIMENSION(nny,1:nz,nxl_y:nxr_y,pdims(2)) :: f_in !<
1286	REAL(wp), DIMENSION(1:nz,nys:nyn,0:nx) :: f_out !<
1287	REAL(wp), DIMENSION(nys:nyn,1:nz,0:nx) :: work !<
1288
1289	!
1290	!-- Transpose array
1291	#if defined( __parallel )
1292	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
1293	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
1294	CALL MPI_ALLTOALL( f_in(1,1,nxl_y,1), sendrecvcount_xy, MPI_REAL, &
1295	work(nys,1,0), sendrecvcount_xy, MPI_REAL, &
1296	comm1dy, ierr )
1297	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
1298	#endif
1299
1300	!
1301	!-- Carry out the FFT along x, where all data are present due to the
1302	!-- 1d-decomposition along y. Resort the data in a way that y becomes
1303	!-- the first index.
1304	CALL cpu_log( log_point_s(4), 'fft_x_1d', 'continue' )
1305
1306	IF ( loop_optimization == 'vector' ) THEN
1307	!
1308	!-- Code optimized for vector processors
1309	!$OMP PARALLEL PRIVATE ( i, j, k )
1310	!$OMP DO
1311	DO j = nys, nyn
1312
1313	DO k = 1, nz
1314	DO i = 0, nx
1315	work_fftx(i,k,j) = work(j,k,i)
1316	ENDDO
1317	ENDDO
1318
1319	CALL fft_x_m( work_fftx(:,:,j), 'backward' )
1320
1321	ENDDO
1322
1323	!$OMP DO
1324	DO i = 0, nx
1325	DO j = nys, nyn
1326	DO k = 1, nz
1327	f_out(k,j,i) = work_fftx(i,k,j)
1328	ENDDO
1329	ENDDO
1330	ENDDO
1331	!$OMP END PARALLEL
1332
1333	ELSE
1334
1335	!
1336	!-- Cache optimized code (there might be still a potential for better
1337	!-- optimization).
1338	!$OMP PARALLEL PRIVATE (i,j,k)
1339	!$OMP DO
1340	DO j = nys, nyn
1341	DO k = 1, nz
1342
1343	DO i = 0, nx
1344	work_fftx(i,k,j) = work(j,k,i)
1345	ENDDO
1346
1347	CALL fft_x_1d( work_fftx(0:nx,k,j), 'backward' )
1348
1349	ENDDO
1350	ENDDO
1351
1352	!$OMP DO
1353	DO i = 0, nx
1354	DO j = nys, nyn
1355	DO k = 1, nz
1356	f_out(k,j,i) = work_fftx(i,k,j)
1357	ENDDO
1358	ENDDO
1359	ENDDO
1360	!$OMP END PARALLEL
1361
1362	ENDIF
1363	CALL cpu_log( log_point_s(4), 'fft_x_1d', 'stop' )
1364
1365	END SUBROUTINE tr_yx_fftx
1366
1367
1368	!------------------------------------------------------------------------------!
1369	! Description:
1370	! ------------
1371	!> FFT along y, solution of the tridiagonal system and backward FFT for
1372	!> a 1d-decomposition along y.
1373	!>
1374	!> @warning this subroutine may still not work for hybrid parallelization
1375	!> with OpenMP (for possible necessary changes see the original
1376	!> routine poisfft_hybrid, developed by Klaus Ketelsen, May 2002)
1377	!------------------------------------------------------------------------------!
1378	SUBROUTINE ffty_tri_ffty( ar )
1379
1380
1381	USE control_parameters, &
1382	ONLY: loop_optimization
1383
1384	USE cpulog, &
1385	ONLY: cpu_log, log_point_s
1386
1387	USE grid_variables, &
1388	ONLY: ddx2, ddy2
1389
1390	USE kinds
1391
1392	USE pegrid
1393
1394	IMPLICIT NONE
1395
1396	INTEGER(iwp) :: i !<
1397	INTEGER(iwp) :: j !<
1398	INTEGER(iwp) :: k !<
1399	INTEGER(iwp) :: m !<
1400	INTEGER(iwp) :: n !<
1401	!$ INTEGER(iwp) :: omp_get_thread_num !<
1402	INTEGER(iwp) :: tn !<
1403
1404	REAL(wp), DIMENSION(0:ny) :: work_ffty !<
1405	REAL(wp), DIMENSION(0:ny,1:nz) :: work_triy !<
1406	REAL(wp), DIMENSION(nny,1:nz,nxl_y:nxr_y,pdims(2)) :: ar !<
1407	REAL(wp), DIMENSION(:,:,:,:), ALLOCATABLE :: tri !<
1408
1409
1410	CALL cpu_log( log_point_s(39), 'fft_y_1d + tridia', 'start' )
1411
1412	ALLOCATE( tri(5,0:ny,0:nz-1,0:threads_per_task-1) )
1413
1414	tn = 0 ! Default thread number in case of one thread
1415	!$OMP PARALLEL DO PRIVATE ( i, j, k, m, n, tn, work_ffty, work_triy )
1416	DO i = nxl_y, nxr_y
1417
1418	!$ tn = omp_get_thread_num()
1419
1420	IF ( loop_optimization == 'vector' ) THEN
1421	!
1422	!-- Code optimized for vector processors
1423	DO k = 1, nz
1424
1425	m = 0
1426	DO n = 1, pdims(2)
1427	DO j = 1, nny
1428	work_triy(m,k) = ar(j,k,i,n)
1429	m = m + 1
1430	ENDDO
1431	ENDDO
1432
1433	ENDDO
1434
1435	CALL fft_y_m( work_triy, ny, 'forward' )
1436
1437	ELSE
1438	!
1439	!-- Cache optimized code
1440	DO k = 1, nz
1441
1442	m = 0
1443	DO n = 1, pdims(2)
1444	DO j = 1, nny
1445	work_ffty(m) = ar(j,k,i,n)
1446	m = m + 1
1447	ENDDO
1448	ENDDO
1449
1450	CALL fft_y_1d( work_ffty, 'forward' )
1451
1452	DO j = 0, ny
1453	work_triy(j,k) = work_ffty(j)
1454	ENDDO
1455
1456	ENDDO
1457
1458	ENDIF
1459
1460	!
1461	!-- Solve the linear equation system
1462	CALL tridia_1dd( ddy2, ddx2, ny, nx, i, work_triy, tri(:,:,:,tn) )
1463
1464	IF ( loop_optimization == 'vector' ) THEN
1465	!
1466	!-- Code optimized for vector processors
1467	CALL fft_y_m( work_triy, ny, 'backward' )
1468
1469	DO k = 1, nz
1470
1471	m = 0
1472	DO n = 1, pdims(2)
1473	DO j = 1, nny
1474	ar(j,k,i,n) = work_triy(m,k)
1475	m = m + 1
1476	ENDDO
1477	ENDDO
1478
1479	ENDDO
1480
1481	ELSE
1482	!
1483	!-- Cache optimized code
1484	DO k = 1, nz
1485
1486	DO j = 0, ny
1487	work_ffty(j) = work_triy(j,k)
1488	ENDDO
1489
1490	CALL fft_y_1d( work_ffty, 'backward' )
1491
1492	m = 0
1493	DO n = 1, pdims(2)
1494	DO j = 1, nny
1495	ar(j,k,i,n) = work_ffty(m)
1496	m = m + 1
1497	ENDDO
1498	ENDDO
1499
1500	ENDDO
1501
1502	ENDIF
1503
1504	ENDDO
1505
1506	DEALLOCATE( tri )
1507
1508	CALL cpu_log( log_point_s(39), 'fft_y_1d + tridia', 'stop' )
1509
1510	END SUBROUTINE ffty_tri_ffty
1511
1512	END MODULE poisfft_mod

Note: See TracBrowser for help on using the repository browser.

Download in other formats:

| Impressum | ©Leibniz Universität Hannover |