Home

Context Navigation

source: palm/trunk/SOURCE/poisfft_mod.f90 @ 3352

Last change on this file since 3352 was 3241, checked in by raasch, 6 years ago
various changes to avoid compiler warnings (mainly removal of unused variables)
Property svn:keywords set to `Id`
File size: 45.5 KB

Line
1	!> @file poisfft_mod.f90
2	!------------------------------------------------------------------------------!
3	! This file is part of the PALM model system.
4	!
5	! PALM is free software: you can redistribute it and/or modify it under the
6	! terms of the GNU General Public License as published by the Free Software
7	! Foundation, either version 3 of the License, or (at your option) any later
8	! version.
9	!
10	! PALM is distributed in the hope that it will be useful, but WITHOUT ANY
11	! WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
12	! A PARTICULAR PURPOSE. See the GNU General Public License for more details.
13	!
14	! You should have received a copy of the GNU General Public License along with
15	! PALM. If not, see <http://www.gnu.org/licenses/>.
16	!
17	! Copyright 1997-2018 Leibniz Universitaet Hannover
18	!------------------------------------------------------------------------------!
19	!
20	! Current revisions:
21	! -----------------
22	!
23	!
24	! Former revisions:
25	! -----------------
26	! $Id: poisfft_mod.f90 3241 2018-09-12 15:02:00Z knoop $
27	! unused variables removed,
28	! declarations of omp_get_thread_num now as omp-directive
29	!
30	! 2718 2018-01-02 08:49:38Z maronga
31	! Corrected "Former revisions" section
32	!
33	! 2696 2017-12-14 17:12:51Z kanani
34	! Change in file header (GPL part)
35	!
36	! 2300 2017-06-29 13:31:14Z raasch
37	! settings depending on host variable removed or replaced by loop_optimization
38	!
39	! 2119 2017-01-17 16:51:50Z raasch
40	!
41	! 2118 2017-01-17 16:38:49Z raasch
42	! OpenACC directives and related code removed
43	!
44	! 2000 2016-08-20 18:09:15Z knoop
45	! Forced header and separation lines into 80 columns
46	!
47	! 1850 2016-04-08 13:29:27Z maronga
48	! Module renamed
49	!
50	!
51	! 1804 2016-04-05 16:30:18Z maronga
52	! Removed code for parameter file check (__check)
53	!
54	! 1682 2015-10-07 23:56:08Z knoop
55	! Code annotations made doxygen readable
56	!
57	! 1482 2014-10-18 12:34:45Z raasch
58	! use 2d-decomposition, if accelerator boards are used
59	!
60	! 1406 2014-05-16 13:47:01Z raasch
61	! bugfix for pgi 14.4: declare create moved after array declaration
62	!
63	! 1320 2014-03-20 08:40:49Z raasch
64	! ONLY-attribute added to USE-statements,
65	! kind-parameters added to all INTEGER and REAL declaration statements,
66	! kinds are defined in new module kinds,
67	! old module precision_kind is removed,
68	! revision history before 2012 removed,
69	! comment fields (!:) to be used for variable explanations added to
70	! all variable declaration statements
71	!
72	! 1318 2014-03-17 13:35:16Z raasch
73	! module interfaces removed
74	!
75	! 1306 2014-03-13 14:30:59Z raasch
76	! openmp sections removed from the overlap branch,
77	! second argument removed from parameter list
78	!
79	! 1216 2013-08-26 09:31:42Z raasch
80	! resorting of arrays moved to separate routines resort_for_...,
81	! one argument, used as temporary work array, removed from all transpose
82	! routines
83	! overlapping fft / transposition implemented
84	!
85	! 1212 2013-08-15 08:46:27Z raasch
86	! tridia routines moved to seperate module tridia_solver
87	!
88	! 1208 2013-08-13 06:41:49Z raasch
89	! acc-update clauses added for "ar" so that ffts other than cufft can also be
90	! used (although they are not ported and will give a poor performance)
91	!
92	! 1111 2013-03-08 23:54:10Z raasch
93	! further openACC porting of non-parallel (MPI) branch:
94	! tridiagonal routines split into extermal subroutines (instead using CONTAINS),
95	! no distinction between parallel/non-parallel in poisfft and tridia any more,
96	! tridia routines moved to end of file because of probable bug in PGI compiler 12.5
97	! (otherwise "invalid device function" is indicated during runtime),
98	! optimization of tridia routines: constant elements and coefficients of tri are
99	! stored in seperate arrays ddzuw and tric, last dimension of tri reduced from 5
100	! to 2,
101	! poisfft_init is now called internally from poisfft, maketri is called from
102	! poisfft_init,
103	! ibc_p_b = 2 removed
104	!
105	! 1106 2013-03-04 05:31:38Z raasch
106	! routines fftx, ffty, fftxp, fftyp removed, calls replaced by fft_x, fft_y,
107	! in the 1D-decomposition routines fft_x, ffty are replaced by fft_x_1d,
108	! fft_y_1d
109	!
110	! 1103 2013-02-20 02:15:53Z raasch
111	! tri, ar, and ar1 arguments in tridia-routines (2d) are removed because they
112	! sometimes cause segmentation faults with intel 12.1 compiler
113	!
114	! 1092 2013-02-02 11:24:22Z raasch
115	! unused variables removed
116	!
117	! 1036 2012-10-22 13:43:42Z raasch
118	! code put under GPL (PALM 3.9)
119	!
120	! 2012-09-21 07:03:55Z raasch
121	! FLOAT type conversion replaced by REAL
122	!
123	! 1003 2012-09-14 14:35:53Z raasch
124	! indices nxa, nya, etc. replaced by nx, ny, etc.
125	!
126	! 940 2012-07-09 14:31:00Z raasch
127	! special handling of tri-array as an argument in tridia_1dd routines switched
128	! off because it caused segmentation faults with intel 12.1 compiler
129	!
130	! 877 2012-04-03 11:21:44Z suehring
131	! Bugfix: Avoid divisions by zero in case of using a 'neumann' bc for the
132	! pressure at the top of the model domain.
133	!
134	! 809 2012-01-30 13:32:58Z maronga
135	! Bugfix: replaced .AND. and .NOT. with && and ! in the preprocessor directives
136	!
137	! 807 2012-01-25 11:53:51Z maronga
138	! New cpp directive "__check" implemented which is used by check_namelist_files
139	! (most of the code is unneeded by check_namelist_files).
140	!
141	! Revision 1.1 1997/07/24 11:24:14 raasch
142	! Initial revision
143	!
144	!
145	! Description:
146	! ------------
147	!> Solves the Poisson equation with a 2D spectral method
148	!> d^2 p / dx^2 + d^2 p / dy^2 + d^2 p / dz^2 = s
149	!>
150	!> Input:
151	!> real ar contains (nnz,nny,nnx) elements of the velocity divergence,
152	!> starting from (1,nys,nxl)
153	!>
154	!> Output:
155	!> real ar contains the solution for perturbation pressure p
156	!------------------------------------------------------------------------------!
157	MODULE poisfft_mod
158
159
160	USE fft_xy, &
161	ONLY: fft_init, fft_y, fft_y_1d, fft_y_m, fft_x, fft_x_1d, fft_x_m
162
163	USE indices, &
164	ONLY: nnx, nny, nx, nxl, nxr, ny, nys, nyn, nz
165
166	USE transpose_indices, &
167	ONLY: nxl_y, nxl_z, nxr_y, nxr_z, nys_x, nys_z, nyn_x, nyn_z, nzb_x, &
168	nzb_y, nzt_x, nzt_y
169
170	USE tridia_solver, &
171	ONLY: tridia_1dd, tridia_init, tridia_substi, tridia_substi_overlap
172
173	IMPLICIT NONE
174
175	LOGICAL, SAVE :: poisfft_initialized = .FALSE.
176
177	PRIVATE
178
179	PUBLIC poisfft, poisfft_init
180
181	INTERFACE poisfft
182	MODULE PROCEDURE poisfft
183	END INTERFACE poisfft
184
185	INTERFACE poisfft_init
186	MODULE PROCEDURE poisfft_init
187	END INTERFACE poisfft_init
188
189
190	CONTAINS
191
192	!------------------------------------------------------------------------------!
193	! Description:
194	! ------------
195	!> Setup coefficients for FFT and the tridiagonal solver
196	!------------------------------------------------------------------------------!
197	SUBROUTINE poisfft_init
198
199	IMPLICIT NONE
200
201
202	CALL fft_init
203
204	CALL tridia_init
205
206	poisfft_initialized = .TRUE.
207
208	END SUBROUTINE poisfft_init
209
210
211
212	!------------------------------------------------------------------------------!
213	! Description:
214	! ------------
215	!> Two-dimensional Fourier Transformation in x- and y-direction.
216	!------------------------------------------------------------------------------!
217	SUBROUTINE poisfft( ar )
218
219	USE control_parameters, &
220	ONLY: transpose_compute_overlap
221
222	USE cpulog, &
223	ONLY: cpu_log, cpu_log_nowait, log_point_s
224
225	USE kinds
226
227	USE pegrid
228
229	IMPLICIT NONE
230
231	INTEGER(iwp) :: ii !<
232	INTEGER(iwp) :: iind !<
233	INTEGER(iwp) :: inew !<
234	INTEGER(iwp) :: jj !<
235	INTEGER(iwp) :: jind !<
236	INTEGER(iwp) :: jnew !<
237	INTEGER(iwp) :: ki !<
238	INTEGER(iwp) :: kk !<
239	INTEGER(iwp) :: knew !<
240	INTEGER(iwp) :: n !<
241	INTEGER(iwp) :: nblk !<
242	INTEGER(iwp) :: nnx_y !<
243	INTEGER(iwp) :: nny_z !<
244	INTEGER(iwp) :: nnz_x !<
245	INTEGER(iwp) :: nxl_y_bound !<
246	INTEGER(iwp) :: nxr_y_bound !<
247
248	INTEGER(iwp), DIMENSION(4) :: isave !<
249
250	REAL(wp), DIMENSION(1:nz,nys:nyn,nxl:nxr) :: ar !<
251	REAL(wp), DIMENSION(nys:nyn,nxl:nxr,1:nz) :: ar_inv !<
252
253	REAL(wp), DIMENSION(:,:,:), ALLOCATABLE :: ar1 !<
254	REAL(wp), DIMENSION(:,:,:), ALLOCATABLE :: f_in !<
255	REAL(wp), DIMENSION(:,:,:), ALLOCATABLE :: f_inv !<
256	REAL(wp), DIMENSION(:,:,:), ALLOCATABLE :: f_out_y !<
257	REAL(wp), DIMENSION(:,:,:), ALLOCATABLE :: f_out_z !<
258
259
260	CALL cpu_log( log_point_s(3), 'poisfft', 'start' )
261
262	IF ( .NOT. poisfft_initialized ) CALL poisfft_init
263
264	!
265	!-- Two-dimensional Fourier Transformation in x- and y-direction.
266	IF ( pdims(2) == 1 .AND. pdims(1) > 1 ) THEN
267
268	!
269	!-- 1d-domain-decomposition along x:
270	!-- FFT along y and transposition y --> x
271	CALL ffty_tr_yx( ar, ar )
272
273	!
274	!-- FFT along x, solving the tridiagonal system and backward FFT
275	CALL fftx_tri_fftx( ar )
276
277	!
278	!-- Transposition x --> y and backward FFT along y
279	CALL tr_xy_ffty( ar, ar )
280
281	ELSEIF ( pdims(1) == 1 .AND. pdims(2) > 1 ) THEN
282
283	!
284	!-- 1d-domain-decomposition along y:
285	!-- FFT along x and transposition x --> y
286	CALL fftx_tr_xy( ar, ar )
287
288	!
289	!-- FFT along y, solving the tridiagonal system and backward FFT
290	CALL ffty_tri_ffty( ar )
291
292	!
293	!-- Transposition y --> x and backward FFT along x
294	CALL tr_yx_fftx( ar, ar )
295
296	ELSEIF ( .NOT. transpose_compute_overlap ) THEN
297
298	!
299	!-- 2d-domain-decomposition or no decomposition (1 PE run)
300	!-- Transposition z --> x
301	CALL cpu_log( log_point_s(5), 'transpo forward', 'start' )
302	CALL resort_for_zx( ar, ar_inv )
303	CALL transpose_zx( ar_inv, ar )
304	CALL cpu_log( log_point_s(5), 'transpo forward', 'pause' )
305
306	CALL cpu_log( log_point_s(4), 'fft_x', 'start' )
307	CALL fft_x( ar, 'forward' )
308	CALL cpu_log( log_point_s(4), 'fft_x', 'pause' )
309
310	!
311	!-- Transposition x --> y
312	CALL cpu_log( log_point_s(5), 'transpo forward', 'continue' )
313	CALL resort_for_xy( ar, ar_inv )
314	CALL transpose_xy( ar_inv, ar )
315	CALL cpu_log( log_point_s(5), 'transpo forward', 'pause' )
316
317	CALL cpu_log( log_point_s(7), 'fft_y', 'start' )
318	CALL fft_y( ar, 'forward', ar_tr = ar, &
319	nxl_y_bound = nxl_y, nxr_y_bound = nxr_y, &
320	nxl_y_l = nxl_y, nxr_y_l = nxr_y )
321	CALL cpu_log( log_point_s(7), 'fft_y', 'pause' )
322
323	!
324	!-- Transposition y --> z
325	CALL cpu_log( log_point_s(5), 'transpo forward', 'continue' )
326	CALL resort_for_yz( ar, ar_inv )
327	CALL transpose_yz( ar_inv, ar )
328	CALL cpu_log( log_point_s(5), 'transpo forward', 'stop' )
329
330	!
331	!-- Solve the tridiagonal equation system along z
332	CALL cpu_log( log_point_s(6), 'tridia', 'start' )
333	CALL tridia_substi( ar )
334	CALL cpu_log( log_point_s(6), 'tridia', 'stop' )
335
336	!
337	!-- Inverse Fourier Transformation
338	!-- Transposition z --> y
339	CALL cpu_log( log_point_s(8), 'transpo invers', 'start' )
340	CALL transpose_zy( ar, ar_inv )
341	CALL resort_for_zy( ar_inv, ar )
342	CALL cpu_log( log_point_s(8), 'transpo invers', 'pause' )
343
344	CALL cpu_log( log_point_s(7), 'fft_y', 'continue' )
345	CALL fft_y( ar, 'backward', ar_tr = ar, &
346	nxl_y_bound = nxl_y, nxr_y_bound = nxr_y, &
347	nxl_y_l = nxl_y, nxr_y_l = nxr_y )
348	CALL cpu_log( log_point_s(7), 'fft_y', 'stop' )
349
350	!
351	!-- Transposition y --> x
352	CALL cpu_log( log_point_s(8), 'transpo invers', 'continue' )
353	CALL transpose_yx( ar, ar_inv )
354	CALL resort_for_yx( ar_inv, ar )
355	CALL cpu_log( log_point_s(8), 'transpo invers', 'pause' )
356
357	CALL cpu_log( log_point_s(4), 'fft_x', 'continue' )
358	CALL fft_x( ar, 'backward' )
359	CALL cpu_log( log_point_s(4), 'fft_x', 'stop' )
360
361	!
362	!-- Transposition x --> z
363	CALL cpu_log( log_point_s(8), 'transpo invers', 'continue' )
364	CALL transpose_xz( ar, ar_inv )
365	CALL resort_for_xz( ar_inv, ar )
366	CALL cpu_log( log_point_s(8), 'transpo invers', 'stop' )
367
368	ELSE
369
370	!
371	!-- 2d-domain-decomposition or no decomposition (1 PE run) with
372	!-- overlapping transposition / fft
373	!-- cputime logging must not use barriers, which would prevent overlapping
374	ALLOCATE( f_out_y(0:ny,nxl_y:nxr_y,nzb_y:nzt_y), &
375	f_out_z(0:nx,nys_x:nyn_x,nzb_x:nzt_x) )
376	!
377	!-- Transposition z --> x + subsequent fft along x
378	ALLOCATE( f_inv(nys:nyn,nxl:nxr,1:nz) )
379	CALL resort_for_zx( ar, f_inv )
380	!
381	!-- Save original indices and gridpoint counter
382	isave(1) = nz
383	isave(2) = nzb_x
384	isave(3) = nzt_x
385	isave(4) = sendrecvcount_zx
386	!
387	!-- Set new indices for transformation
388	nblk = nz / pdims(1)
389	nz = pdims(1)
390	nnz_x = 1
391	nzb_x = 1 + myidx * nnz_x
392	nzt_x = ( myidx + 1 ) * nnz_x
393	sendrecvcount_zx = nnx * nny * nnz_x
394
395	ALLOCATE( ar1(0:nx,nys_x:nyn_x,nzb_x:nzt_x) )
396	ALLOCATE( f_in(nys:nyn,nxl:nxr,1:nz) )
397
398	DO kk = 1, nblk
399
400	IF ( kk == 1 ) THEN
401	CALL cpu_log( log_point_s(5), 'transpo forward', 'start', cpu_log_nowait )
402	ELSE
403	CALL cpu_log( log_point_s(5), 'transpo forward', 'continue', cpu_log_nowait )
404	ENDIF
405
406	DO knew = 1, nz
407	ki = kk + nblk * ( knew - 1 )
408	f_in(:,:,knew) = f_inv(:,:,ki)
409	ENDDO
410
411	CALL transpose_zx( f_in, ar1(:,:,:))
412	CALL cpu_log( log_point_s(5), 'transpo forward', 'pause' )
413
414	IF ( kk == 1 ) THEN
415	CALL cpu_log( log_point_s(4), 'fft_x', 'start', cpu_log_nowait )
416	ELSE
417	CALL cpu_log( log_point_s(4), 'fft_x', 'continue', cpu_log_nowait )
418	ENDIF
419
420	n = isave(2) + kk - 1
421	CALL fft_x( ar1(:,:,:), 'forward', ar_2d = f_out_z(:,:,n))
422	CALL cpu_log( log_point_s(4), 'fft_x', 'pause' )
423
424	ENDDO
425	!
426	!-- Restore original indices/counters
427	nz = isave(1)
428	nzb_x = isave(2)
429	nzt_x = isave(3)
430	sendrecvcount_zx = isave(4)
431
432	DEALLOCATE( ar1, f_in, f_inv )
433
434	!
435	!-- Transposition x --> y + subsequent fft along y
436	ALLOCATE( f_inv(nys_x:nyn_x,nzb_x:nzt_x,0:nx) )
437	CALL resort_for_xy( f_out_z, f_inv )
438	!
439	!-- Save original indices and gridpoint counter
440	isave(1) = nx
441	isave(2) = nxl_y
442	isave(3) = nxr_y
443	isave(4) = sendrecvcount_xy
444	!
445	!-- Set new indices for transformation
446	nblk = ( ( nx+1 ) / pdims(2) ) - 1
447	nx = pdims(2)
448	nnx_y = 1
449	nxl_y = myidy * nnx_y
450	nxr_y = ( myidy + 1 ) * nnx_y - 1
451	sendrecvcount_xy = nnx_y * ( nyn_x-nys_x+1 ) * ( nzt_x-nzb_x+1 )
452
453	ALLOCATE( ar1(0:ny,nxl_y:nxr_y,nzb_y:nzt_y) )
454	ALLOCATE( f_in(nys_x:nyn_x,nzb_x:nzt_x,0:nx) )
455
456	DO ii = 0, nblk
457
458	CALL cpu_log( log_point_s(5), 'transpo forward', 'continue', cpu_log_nowait )
459
460	DO inew = 0, nx-1
461	iind = ii + ( nblk + 1 ) * inew
462	f_in(:,:,inew) = f_inv(:,:,iind)
463	ENDDO
464
465	CALL transpose_xy( f_in, ar1(:,:,:) )
466
467	CALL cpu_log( log_point_s(5), 'transpo forward', 'pause' )
468
469	IF ( ii == 1 ) THEN
470	CALL cpu_log( log_point_s(7), 'fft_y', 'start', cpu_log_nowait )
471	ELSE
472	CALL cpu_log( log_point_s(7), 'fft_y', 'continue', cpu_log_nowait )
473	ENDIF
474
475	nxl_y_bound = isave(2)
476	nxr_y_bound = isave(3)
477	n = isave(2) + ii
478	CALL fft_y( ar1(:,:,:), 'forward', ar_tr = f_out_y, &
479	nxl_y_bound = nxl_y_bound, nxr_y_bound = nxr_y_bound, &
480	nxl_y_l = n, nxr_y_l = n )
481
482	CALL cpu_log( log_point_s(7), 'fft_y', 'pause' )
483
484	ENDDO
485	!
486	!-- Restore original indices/counters
487	nx = isave(1)
488	nxl_y = isave(2)
489	nxr_y = isave(3)
490	sendrecvcount_xy = isave(4)
491
492	DEALLOCATE( ar1, f_in, f_inv )
493
494	!
495	!-- Transposition y --> z + subsequent tridia + resort for z --> y
496	ALLOCATE( f_inv(nxl_y:nxr_y,nzb_y:nzt_y,0:ny) )
497	CALL resort_for_yz( f_out_y, f_inv )
498	!
499	!-- Save original indices and gridpoint counter
500	isave(1) = ny
501	isave(2) = nys_z
502	isave(3) = nyn_z
503	isave(4) = sendrecvcount_yz
504	!
505	!-- Set new indices for transformation
506	nblk = ( ( ny+1 ) / pdims(1) ) - 1
507	ny = pdims(1)
508	nny_z = 1
509	nys_z = myidx * nny_z
510	nyn_z = ( myidx + 1 ) * nny_z - 1
511	sendrecvcount_yz = ( nxr_y-nxl_y+1 ) * nny_z * ( nzt_y-nzb_y+1 )
512
513	ALLOCATE( ar1(nxl_z:nxr_z,nys_z:nyn_z,1:nz) )
514	ALLOCATE( f_in(nxl_y:nxr_y,nzb_y:nzt_y,0:ny) )
515
516	DO jj = 0, nblk
517	!
518	!-- Forward Fourier Transformation
519	!-- Transposition y --> z
520	CALL cpu_log( log_point_s(5), 'transpo forward', 'continue', cpu_log_nowait )
521
522	DO jnew = 0, ny-1
523	jind = jj + ( nblk + 1 ) * jnew
524	f_in(:,:,jnew) = f_inv(:,:,jind)
525	ENDDO
526
527	CALL transpose_yz( f_in, ar1(:,:,:) )
528
529	IF ( jj == nblk ) THEN
530	CALL cpu_log( log_point_s(5), 'transpo forward', 'stop' )
531	ELSE
532	CALL cpu_log( log_point_s(5), 'transpo forward', 'pause' )
533	ENDIF
534
535	!
536	!-- Solve the tridiagonal equation system along z
537	CALL cpu_log( log_point_s(6), 'tridia', 'start', cpu_log_nowait )
538
539	n = isave(2) + jj
540	CALL tridia_substi_overlap( ar1(:,:,:), n )
541
542	CALL cpu_log( log_point_s(6), 'tridia', 'stop' )
543
544	!
545	!-- Inverse Fourier Transformation
546	!-- Transposition z --> y
547	!-- Only one thread should call MPI routines, therefore forward and
548	!-- backward tranpose are in the same section
549	IF ( jj == 0 ) THEN
550	CALL cpu_log( log_point_s(8), 'transpo invers', 'start', cpu_log_nowait )
551	ELSE
552	CALL cpu_log( log_point_s(8), 'transpo invers', 'continue', cpu_log_nowait )
553	ENDIF
554
555	CALL transpose_zy( ar1(:,:,:), f_in )
556
557	DO jnew = 0, ny-1
558	jind = jj + ( nblk + 1 ) * jnew
559	f_inv(:,:,jind) = f_in(:,:,jnew)
560	ENDDO
561
562	CALL cpu_log( log_point_s(8), 'transpo invers', 'pause' )
563
564	ENDDO
565	!
566	!-- Restore original indices/counters
567	ny = isave(1)
568	nys_z = isave(2)
569	nyn_z = isave(3)
570	sendrecvcount_yz = isave(4)
571
572	CALL resort_for_zy( f_inv, f_out_y )
573
574	DEALLOCATE( ar1, f_in, f_inv )
575
576	!
577	!-- fft along y backward + subsequent transposition y --> x
578	ALLOCATE( f_inv(nys_x:nyn_x,nzb_x:nzt_x,0:nx) )
579	!
580	!-- Save original indices and gridpoint counter
581	isave(1) = nx
582	isave(2) = nxl_y
583	isave(3) = nxr_y
584	isave(4) = sendrecvcount_xy
585	!
586	!-- Set new indices for transformation
587	nblk = (( nx+1 ) / pdims(2) ) - 1
588	nx = pdims(2)
589	nnx_y = 1
590	nxl_y = myidy * nnx_y
591	nxr_y = ( myidy + 1 ) * nnx_y - 1
592	sendrecvcount_xy = nnx_y * ( nyn_x-nys_x+1 ) * ( nzt_x-nzb_x+1 )
593
594	ALLOCATE( ar1(0:ny,nxl_y:nxr_y,nzb_y:nzt_y) )
595	ALLOCATE( f_in(nys_x:nyn_x,nzb_x:nzt_x,0:nx) )
596
597	DO ii = 0, nblk
598
599	CALL cpu_log( log_point_s(7), 'fft_y', 'continue', cpu_log_nowait )
600
601	n = isave(2) + ii
602	nxl_y_bound = isave(2)
603	nxr_y_bound = isave(3)
604
605	CALL fft_y( ar1(:,:,:), 'backward', ar_tr = f_out_y, &
606	nxl_y_bound = nxl_y_bound, nxr_y_bound = nxr_y_bound, &
607	nxl_y_l = n, nxr_y_l = n )
608
609	IF ( ii == nblk ) THEN
610	CALL cpu_log( log_point_s(7), 'fft_y', 'stop' )
611	ELSE
612	CALL cpu_log( log_point_s(7), 'fft_y', 'pause' )
613	ENDIF
614
615	CALL cpu_log( log_point_s(8), 'transpo invers', 'continue', cpu_log_nowait )
616
617	CALL transpose_yx( ar1(:,:,:), f_in )
618
619	DO inew = 0, nx-1
620	iind = ii + (nblk+1) * inew
621	f_inv(:,:,iind) = f_in(:,:,inew)
622	ENDDO
623
624	CALL cpu_log( log_point_s(8), 'transpo invers', 'pause' )
625
626	ENDDO
627	!
628	!-- Restore original indices/counters
629	nx = isave(1)
630	nxl_y = isave(2)
631	nxr_y = isave(3)
632	sendrecvcount_xy = isave(4)
633
634	CALL resort_for_yx( f_inv, f_out_z )
635
636	DEALLOCATE( ar1, f_in, f_inv )
637
638	!
639	!-- fft along x backward + subsequent final transposition x --> z
640	ALLOCATE( f_inv(nys:nyn,nxl:nxr,1:nz) )
641	!
642	!-- Save original indices and gridpoint counter
643	isave(1) = nz
644	isave(2) = nzb_x
645	isave(3) = nzt_x
646	isave(4) = sendrecvcount_zx
647	!
648	!-- Set new indices for transformation
649	nblk = nz / pdims(1)
650	nz = pdims(1)
651	nnz_x = 1
652	nzb_x = 1 + myidx * nnz_x
653	nzt_x = ( myidx + 1 ) * nnz_x
654	sendrecvcount_zx = nnx * nny * nnz_x
655
656	ALLOCATE( ar1(0:nx,nys_x:nyn_x,nzb_x:nzt_x) )
657	ALLOCATE( f_in(nys:nyn,nxl:nxr,1:nz) )
658
659	DO kk = 1, nblk
660
661	CALL cpu_log( log_point_s(4), 'fft_x', 'continue', cpu_log_nowait )
662
663	n = isave(2) + kk - 1
664	CALL fft_x( ar1(:,:,:), 'backward', f_out_z(:,:,n))
665
666	IF ( kk == nblk ) THEN
667	CALL cpu_log( log_point_s(4), 'fft_x', 'stop' )
668	ELSE
669	CALL cpu_log( log_point_s(4), 'fft_x', 'pause' )
670	ENDIF
671
672	CALL cpu_log( log_point_s(8), 'transpo invers', 'continue', cpu_log_nowait )
673
674	CALL transpose_xz( ar1(:,:,:), f_in )
675
676	DO knew = 1, nz
677	ki = kk + nblk * (knew-1)
678	f_inv(:,:,ki) = f_in(:,:,knew)
679	ENDDO
680
681	IF ( kk == nblk ) THEN
682	CALL cpu_log( log_point_s(8), 'transpo invers', 'stop' )
683	ELSE
684	CALL cpu_log( log_point_s(8), 'transpo invers', 'pause' )
685	ENDIF
686
687	ENDDO
688	!
689	!-- Restore original indices/counters
690	nz = isave(1)
691	nzb_x = isave(2)
692	nzt_x = isave(3)
693	sendrecvcount_zx = isave(4)
694
695	CALL resort_for_xz( f_inv, ar )
696
697	DEALLOCATE( ar1, f_in, f_inv )
698
699	ENDIF
700
701	CALL cpu_log( log_point_s(3), 'poisfft', 'stop' )
702
703	END SUBROUTINE poisfft
704
705
706	!------------------------------------------------------------------------------!
707	! Description:
708	! ------------
709	!> Fourier-transformation along y with subsequent transposition y --> x for
710	!> a 1d-decomposition along x.
711	!>
712	!> @attention The performance of this routine is much faster on the NEC-SX6,
713	!> if the first index of work_ffty_vec is odd. Otherwise
714	!> memory bank conflicts may occur (especially if the index is a
715	!> multiple of 128). That's why work_ffty_vec is dimensioned as
716	!> 0:ny+1.
717	!> Of course, this will not work if users are using an odd number
718	!> of gridpoints along y.
719	!------------------------------------------------------------------------------!
720	SUBROUTINE ffty_tr_yx( f_in, f_out )
721
722	USE control_parameters, &
723	ONLY: loop_optimization
724
725	USE cpulog, &
726	ONLY: cpu_log, log_point_s
727
728	USE kinds
729
730	USE pegrid
731
732	IMPLICIT NONE
733
734	INTEGER(iwp) :: i !<
735	INTEGER(iwp) :: iend !<
736	INTEGER(iwp) :: iouter !<
737	INTEGER(iwp) :: ir !<
738	INTEGER(iwp) :: j !<
739	INTEGER(iwp) :: k !<
740
741	INTEGER(iwp), PARAMETER :: stridex = 4 !<
742
743	REAL(wp), DIMENSION(1:nz,0:ny,nxl:nxr) :: f_in !<
744	REAL(wp), DIMENSION(nnx,1:nz,nys_x:nyn_x,pdims(1)) :: f_out !<
745	REAL(wp), DIMENSION(nxl:nxr,1:nz,0:ny) :: work !<
746
747	REAL(wp), DIMENSION(:,:), ALLOCATABLE :: work_ffty !<
748	REAL(wp), DIMENSION(:,:,:), ALLOCATABLE :: work_ffty_vec !<
749
750	!
751	!-- Carry out the FFT along y, where all data are present due to the
752	!-- 1d-decomposition along x. Resort the data in a way that x becomes
753	!-- the first index.
754	CALL cpu_log( log_point_s(7), 'fft_y_1d', 'start' )
755
756	IF ( loop_optimization == 'vector' ) THEN
757
758	ALLOCATE( work_ffty_vec(0:ny+1,1:nz,nxl:nxr) )
759	!
760	!-- Code optimized for vector processors
761	!$OMP PARALLEL PRIVATE ( i, j, k )
762	!$OMP DO
763	DO i = nxl, nxr
764
765	DO j = 0, ny
766	DO k = 1, nz
767	work_ffty_vec(j,k,i) = f_in(k,j,i)
768	ENDDO
769	ENDDO
770
771	CALL fft_y_m( work_ffty_vec(:,:,i), ny+1, 'forward' )
772
773	ENDDO
774
775	!$OMP DO
776	DO k = 1, nz
777	DO j = 0, ny
778	DO i = nxl, nxr
779	work(i,k,j) = work_ffty_vec(j,k,i)
780	ENDDO
781	ENDDO
782	ENDDO
783	!$OMP END PARALLEL
784
785	DEALLOCATE( work_ffty_vec )
786
787	ELSE
788	!
789	!-- Cache optimized code.
790	ALLOCATE( work_ffty(0:ny,stridex) )
791	!
792	!-- The i-(x-)direction is split into a strided outer loop and an inner
793	!-- loop for better cache performance
794	!$OMP PARALLEL PRIVATE (i,iend,iouter,ir,j,k,work_ffty)
795	!$OMP DO
796	DO iouter = nxl, nxr, stridex
797
798	iend = MIN( iouter+stridex-1, nxr ) ! Upper bound for inner i loop
799
800	DO k = 1, nz
801
802	DO i = iouter, iend
803
804	ir = i-iouter+1 ! counter within a stride
805	DO j = 0, ny
806	work_ffty(j,ir) = f_in(k,j,i)
807	ENDDO
808	!
809	!-- FFT along y
810	CALL fft_y_1d( work_ffty(:,ir), 'forward' )
811
812	ENDDO
813
814	!
815	!-- Resort
816	DO j = 0, ny
817	DO i = iouter, iend
818	work(i,k,j) = work_ffty(j,i-iouter+1)
819	ENDDO
820	ENDDO
821
822	ENDDO
823
824	ENDDO
825	!$OMP END PARALLEL
826
827	DEALLOCATE( work_ffty )
828
829	ENDIF
830
831	CALL cpu_log( log_point_s(7), 'fft_y_1d', 'pause' )
832
833	!
834	!-- Transpose array
835	#if defined( __parallel )
836	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
837	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
838	CALL MPI_ALLTOALL( work(nxl,1,0), sendrecvcount_xy, MPI_REAL, &
839	f_out(1,1,nys_x,1), sendrecvcount_xy, MPI_REAL, &
840	comm1dx, ierr )
841	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
842	#endif
843
844	END SUBROUTINE ffty_tr_yx
845
846
847	!------------------------------------------------------------------------------!
848	! Description:
849	! ------------
850	!> Transposition x --> y with a subsequent backward Fourier transformation for
851	!> a 1d-decomposition along x
852	!------------------------------------------------------------------------------!
853	SUBROUTINE tr_xy_ffty( f_in, f_out )
854
855	USE control_parameters, &
856	ONLY: loop_optimization
857
858	USE cpulog, &
859	ONLY: cpu_log, log_point_s
860
861	USE kinds
862
863	USE pegrid
864
865	IMPLICIT NONE
866
867	INTEGER(iwp) :: i !<
868	INTEGER(iwp) :: iend !<
869	INTEGER(iwp) :: iouter !<
870	INTEGER(iwp) :: ir !<
871	INTEGER(iwp) :: j !<
872	INTEGER(iwp) :: k !<
873
874	INTEGER(iwp), PARAMETER :: stridex = 4 !<
875
876	REAL(wp), DIMENSION(nnx,1:nz,nys_x:nyn_x,pdims(1)) :: f_in !<
877	REAL(wp), DIMENSION(1:nz,0:ny,nxl:nxr) :: f_out !<
878	REAL(wp), DIMENSION(nxl:nxr,1:nz,0:ny) :: work !<
879
880	REAL(wp), DIMENSION(:,:), ALLOCATABLE :: work_ffty !<
881	REAL(wp), DIMENSION(:,:,:), ALLOCATABLE :: work_ffty_vec !<
882
883	!
884	!-- Transpose array
885	#if defined( __parallel )
886	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
887	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
888	CALL MPI_ALLTOALL( f_in(1,1,nys_x,1), sendrecvcount_xy, MPI_REAL, &
889	work(nxl,1,0), sendrecvcount_xy, MPI_REAL, &
890	comm1dx, ierr )
891	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
892	#endif
893
894	!
895	!-- Resort the data in a way that y becomes the first index and carry out the
896	!-- backward fft along y.
897	CALL cpu_log( log_point_s(7), 'fft_y_1d', 'continue' )
898
899	IF ( loop_optimization == 'vector' ) THEN
900
901	ALLOCATE( work_ffty_vec(0:ny+1,1:nz,nxl:nxr) )
902	!
903	!-- Code optimized for vector processors
904	!$OMP PARALLEL PRIVATE ( i, j, k )
905	!$OMP DO
906	DO k = 1, nz
907	DO j = 0, ny
908	DO i = nxl, nxr
909	work_ffty_vec(j,k,i) = work(i,k,j)
910	ENDDO
911	ENDDO
912	ENDDO
913
914	!$OMP DO
915	DO i = nxl, nxr
916
917	CALL fft_y_m( work_ffty_vec(:,:,i), ny+1, 'backward' )
918
919	DO j = 0, ny
920	DO k = 1, nz
921	f_out(k,j,i) = work_ffty_vec(j,k,i)
922	ENDDO
923	ENDDO
924
925	ENDDO
926	!$OMP END PARALLEL
927
928	DEALLOCATE( work_ffty_vec )
929
930	ELSE
931	!
932	!-- Cache optimized code.
933	ALLOCATE( work_ffty(0:ny,stridex) )
934	!
935	!-- The i-(x-)direction is split into a strided outer loop and an inner
936	!-- loop for better cache performance
937	!$OMP PARALLEL PRIVATE ( i, iend, iouter, ir, j, k, work_ffty )
938	!$OMP DO
939	DO iouter = nxl, nxr, stridex
940
941	iend = MIN( iouter+stridex-1, nxr ) ! Upper bound for inner i loop
942
943	DO k = 1, nz
944	!
945	!-- Resort
946	DO j = 0, ny
947	DO i = iouter, iend
948	work_ffty(j,i-iouter+1) = work(i,k,j)
949	ENDDO
950	ENDDO
951
952	DO i = iouter, iend
953
954	!
955	!-- FFT along y
956	ir = i-iouter+1 ! counter within a stride
957	CALL fft_y_1d( work_ffty(:,ir), 'backward' )
958
959	DO j = 0, ny
960	f_out(k,j,i) = work_ffty(j,ir)
961	ENDDO
962	ENDDO
963
964	ENDDO
965
966	ENDDO
967	!$OMP END PARALLEL
968
969	DEALLOCATE( work_ffty )
970
971	ENDIF
972
973	CALL cpu_log( log_point_s(7), 'fft_y_1d', 'stop' )
974
975	END SUBROUTINE tr_xy_ffty
976
977
978	!------------------------------------------------------------------------------!
979	! Description:
980	! ------------
981	!> FFT along x, solution of the tridiagonal system and backward FFT for
982	!> a 1d-decomposition along x
983	!>
984	!> @warning this subroutine may still not work for hybrid parallelization
985	!> with OpenMP (for possible necessary changes see the original
986	!> routine poisfft_hybrid, developed by Klaus Ketelsen, May 2002)
987	!------------------------------------------------------------------------------!
988	SUBROUTINE fftx_tri_fftx( ar )
989
990	USE control_parameters, &
991	ONLY: loop_optimization
992
993	USE cpulog, &
994	ONLY: cpu_log, log_point_s
995
996	USE grid_variables, &
997	ONLY: ddx2, ddy2
998
999	USE kinds
1000
1001	USE pegrid
1002
1003	IMPLICIT NONE
1004
1005	INTEGER(iwp) :: i !<
1006	INTEGER(iwp) :: j !<
1007	INTEGER(iwp) :: k !<
1008	INTEGER(iwp) :: m !<
1009	INTEGER(iwp) :: n !<
1010	!$ INTEGER(iwp) :: omp_get_thread_num !<
1011	INTEGER(iwp) :: tn !<
1012
1013	REAL(wp), DIMENSION(0:nx) :: work_fftx !<
1014	REAL(wp), DIMENSION(0:nx,1:nz) :: work_trix !<
1015	REAL(wp), DIMENSION(nnx,1:nz,nys_x:nyn_x,pdims(1)) :: ar !<
1016	REAL(wp), DIMENSION(:,:,:,:), ALLOCATABLE :: tri !<
1017
1018
1019	CALL cpu_log( log_point_s(33), 'fft_x_1d + tridia', 'start' )
1020
1021	ALLOCATE( tri(5,0:nx,0:nz-1,0:threads_per_task-1) )
1022
1023	tn = 0 ! Default thread number in case of one thread
1024	!$OMP PARALLEL DO PRIVATE ( i, j, k, m, n, tn, work_fftx, work_trix )
1025	DO j = nys_x, nyn_x
1026
1027	!$ tn = omp_get_thread_num()
1028
1029	IF ( loop_optimization == 'vector' ) THEN
1030	!
1031	!-- Code optimized for vector processors
1032	DO k = 1, nz
1033
1034	m = 0
1035	DO n = 1, pdims(1)
1036	DO i = 1, nnx
1037	work_trix(m,k) = ar(i,k,j,n)
1038	m = m + 1
1039	ENDDO
1040	ENDDO
1041
1042	ENDDO
1043
1044	CALL fft_x_m( work_trix, 'forward' )
1045
1046	ELSE
1047	!
1048	!-- Cache optimized code
1049	DO k = 1, nz
1050
1051	m = 0
1052	DO n = 1, pdims(1)
1053	DO i = 1, nnx
1054	work_fftx(m) = ar(i,k,j,n)
1055	m = m + 1
1056	ENDDO
1057	ENDDO
1058
1059	CALL fft_x_1d( work_fftx, 'forward' )
1060
1061	DO i = 0, nx
1062	work_trix(i,k) = work_fftx(i)
1063	ENDDO
1064
1065	ENDDO
1066
1067	ENDIF
1068
1069	!
1070	!-- Solve the linear equation system
1071	CALL tridia_1dd( ddx2, ddy2, nx, ny, j, work_trix, tri(:,:,:,tn) )
1072
1073	IF ( loop_optimization == 'vector' ) THEN
1074	!
1075	!-- Code optimized for vector processors
1076	CALL fft_x_m( work_trix, 'backward' )
1077
1078	DO k = 1, nz
1079
1080	m = 0
1081	DO n = 1, pdims(1)
1082	DO i = 1, nnx
1083	ar(i,k,j,n) = work_trix(m,k)
1084	m = m + 1
1085	ENDDO
1086	ENDDO
1087
1088	ENDDO
1089
1090	ELSE
1091	!
1092	!-- Cache optimized code
1093	DO k = 1, nz
1094
1095	DO i = 0, nx
1096	work_fftx(i) = work_trix(i,k)
1097	ENDDO
1098
1099	CALL fft_x_1d( work_fftx, 'backward' )
1100
1101	m = 0
1102	DO n = 1, pdims(1)
1103	DO i = 1, nnx
1104	ar(i,k,j,n) = work_fftx(m)
1105	m = m + 1
1106	ENDDO
1107	ENDDO
1108
1109	ENDDO
1110
1111	ENDIF
1112
1113	ENDDO
1114
1115	DEALLOCATE( tri )
1116
1117	CALL cpu_log( log_point_s(33), 'fft_x_1d + tridia', 'stop' )
1118
1119	END SUBROUTINE fftx_tri_fftx
1120
1121
1122	!------------------------------------------------------------------------------!
1123	! Description:
1124	! ------------
1125	!> Fourier-transformation along x with subsequent transposition x --> y for
1126	!> a 1d-decomposition along y.
1127	!>
1128	!> @attention NEC-branch of this routine may significantly profit from
1129	!> further optimizations. So far, performance is much worse than
1130	!> for routine ffty_tr_yx (more than three times slower).
1131	!------------------------------------------------------------------------------!
1132	SUBROUTINE fftx_tr_xy( f_in, f_out )
1133
1134
1135	USE control_parameters, &
1136	ONLY: loop_optimization
1137
1138	USE cpulog, &
1139	ONLY: cpu_log, log_point_s
1140
1141	USE kinds
1142
1143	USE pegrid
1144
1145	IMPLICIT NONE
1146
1147	INTEGER(iwp) :: i !<
1148	INTEGER(iwp) :: j !<
1149	INTEGER(iwp) :: k !<
1150
1151	REAL(wp), DIMENSION(0:nx,1:nz,nys:nyn) :: work_fftx !<
1152	REAL(wp), DIMENSION(1:nz,nys:nyn,0:nx) :: f_in !<
1153	REAL(wp), DIMENSION(nny,1:nz,nxl_y:nxr_y,pdims(2)) :: f_out !<
1154	REAL(wp), DIMENSION(nys:nyn,1:nz,0:nx) :: work !<
1155
1156	!
1157	!-- Carry out the FFT along x, where all data are present due to the
1158	!-- 1d-decomposition along y. Resort the data in a way that y becomes
1159	!-- the first index.
1160	CALL cpu_log( log_point_s(4), 'fft_x_1d', 'start' )
1161
1162	IF ( loop_optimization == 'vector' ) THEN
1163	!
1164	!-- Code for vector processors
1165	!$OMP PARALLEL PRIVATE ( i, j, k )
1166	!$OMP DO
1167	DO i = 0, nx
1168
1169	DO j = nys, nyn
1170	DO k = 1, nz
1171	work_fftx(i,k,j) = f_in(k,j,i)
1172	ENDDO
1173	ENDDO
1174
1175	ENDDO
1176
1177	!$OMP DO
1178	DO j = nys, nyn
1179
1180	CALL fft_x_m( work_fftx(:,:,j), 'forward' )
1181
1182	DO k = 1, nz
1183	DO i = 0, nx
1184	work(j,k,i) = work_fftx(i,k,j)
1185	ENDDO
1186	ENDDO
1187
1188	ENDDO
1189	!$OMP END PARALLEL
1190
1191	ELSE
1192
1193	!
1194	!-- Cache optimized code (there might be still a potential for better
1195	!-- optimization).
1196	!$OMP PARALLEL PRIVATE (i,j,k)
1197	!$OMP DO
1198	DO i = 0, nx
1199
1200	DO j = nys, nyn
1201	DO k = 1, nz
1202	work_fftx(i,k,j) = f_in(k,j,i)
1203	ENDDO
1204	ENDDO
1205
1206	ENDDO
1207
1208	!$OMP DO
1209	DO j = nys, nyn
1210	DO k = 1, nz
1211
1212	CALL fft_x_1d( work_fftx(0:nx,k,j), 'forward' )
1213
1214	DO i = 0, nx
1215	work(j,k,i) = work_fftx(i,k,j)
1216	ENDDO
1217	ENDDO
1218
1219	ENDDO
1220	!$OMP END PARALLEL
1221
1222	ENDIF
1223	CALL cpu_log( log_point_s(4), 'fft_x_1d', 'pause' )
1224
1225	!
1226	!-- Transpose array
1227	#if defined( __parallel )
1228	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
1229	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
1230	CALL MPI_ALLTOALL( work(nys,1,0), sendrecvcount_xy, MPI_REAL, &
1231	f_out(1,1,nxl_y,1), sendrecvcount_xy, MPI_REAL, &
1232	comm1dy, ierr )
1233	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
1234	#endif
1235
1236	END SUBROUTINE fftx_tr_xy
1237
1238
1239	!------------------------------------------------------------------------------!
1240	! Description:
1241	! ------------
1242	!> Transposition y --> x with a subsequent backward Fourier transformation for
1243	!> a 1d-decomposition along x.
1244	!------------------------------------------------------------------------------!
1245	SUBROUTINE tr_yx_fftx( f_in, f_out )
1246
1247
1248	USE control_parameters, &
1249	ONLY: loop_optimization
1250
1251	USE cpulog, &
1252	ONLY: cpu_log, log_point_s
1253
1254	USE kinds
1255
1256	USE pegrid
1257
1258	IMPLICIT NONE
1259
1260	INTEGER(iwp) :: i !<
1261	INTEGER(iwp) :: j !<
1262	INTEGER(iwp) :: k !<
1263
1264	REAL(wp), DIMENSION(0:nx,1:nz,nys:nyn) :: work_fftx !<
1265	REAL(wp), DIMENSION(nny,1:nz,nxl_y:nxr_y,pdims(2)) :: f_in !<
1266	REAL(wp), DIMENSION(1:nz,nys:nyn,0:nx) :: f_out !<
1267	REAL(wp), DIMENSION(nys:nyn,1:nz,0:nx) :: work !<
1268
1269	!
1270	!-- Transpose array
1271	#if defined( __parallel )
1272	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
1273	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
1274	CALL MPI_ALLTOALL( f_in(1,1,nxl_y,1), sendrecvcount_xy, MPI_REAL, &
1275	work(nys,1,0), sendrecvcount_xy, MPI_REAL, &
1276	comm1dy, ierr )
1277	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
1278	#endif
1279
1280	!
1281	!-- Carry out the FFT along x, where all data are present due to the
1282	!-- 1d-decomposition along y. Resort the data in a way that y becomes
1283	!-- the first index.
1284	CALL cpu_log( log_point_s(4), 'fft_x_1d', 'continue' )
1285
1286	IF ( loop_optimization == 'vector' ) THEN
1287	!
1288	!-- Code optimized for vector processors
1289	!$OMP PARALLEL PRIVATE ( i, j, k )
1290	!$OMP DO
1291	DO j = nys, nyn
1292
1293	DO k = 1, nz
1294	DO i = 0, nx
1295	work_fftx(i,k,j) = work(j,k,i)
1296	ENDDO
1297	ENDDO
1298
1299	CALL fft_x_m( work_fftx(:,:,j), 'backward' )
1300
1301	ENDDO
1302
1303	!$OMP DO
1304	DO i = 0, nx
1305	DO j = nys, nyn
1306	DO k = 1, nz
1307	f_out(k,j,i) = work_fftx(i,k,j)
1308	ENDDO
1309	ENDDO
1310	ENDDO
1311	!$OMP END PARALLEL
1312
1313	ELSE
1314
1315	!
1316	!-- Cache optimized code (there might be still a potential for better
1317	!-- optimization).
1318	!$OMP PARALLEL PRIVATE (i,j,k)
1319	!$OMP DO
1320	DO j = nys, nyn
1321	DO k = 1, nz
1322
1323	DO i = 0, nx
1324	work_fftx(i,k,j) = work(j,k,i)
1325	ENDDO
1326
1327	CALL fft_x_1d( work_fftx(0:nx,k,j), 'backward' )
1328
1329	ENDDO
1330	ENDDO
1331
1332	!$OMP DO
1333	DO i = 0, nx
1334	DO j = nys, nyn
1335	DO k = 1, nz
1336	f_out(k,j,i) = work_fftx(i,k,j)
1337	ENDDO
1338	ENDDO
1339	ENDDO
1340	!$OMP END PARALLEL
1341
1342	ENDIF
1343	CALL cpu_log( log_point_s(4), 'fft_x_1d', 'stop' )
1344
1345	END SUBROUTINE tr_yx_fftx
1346
1347
1348	!------------------------------------------------------------------------------!
1349	! Description:
1350	! ------------
1351	!> FFT along y, solution of the tridiagonal system and backward FFT for
1352	!> a 1d-decomposition along y.
1353	!>
1354	!> @warning this subroutine may still not work for hybrid parallelization
1355	!> with OpenMP (for possible necessary changes see the original
1356	!> routine poisfft_hybrid, developed by Klaus Ketelsen, May 2002)
1357	!------------------------------------------------------------------------------!
1358	SUBROUTINE ffty_tri_ffty( ar )
1359
1360
1361	USE control_parameters, &
1362	ONLY: loop_optimization
1363
1364	USE cpulog, &
1365	ONLY: cpu_log, log_point_s
1366
1367	USE grid_variables, &
1368	ONLY: ddx2, ddy2
1369
1370	USE kinds
1371
1372	USE pegrid
1373
1374	IMPLICIT NONE
1375
1376	INTEGER(iwp) :: i !<
1377	INTEGER(iwp) :: j !<
1378	INTEGER(iwp) :: k !<
1379	INTEGER(iwp) :: m !<
1380	INTEGER(iwp) :: n !<
1381	!$ INTEGER(iwp) :: omp_get_thread_num !<
1382	INTEGER(iwp) :: tn !<
1383
1384	REAL(wp), DIMENSION(0:ny) :: work_ffty !<
1385	REAL(wp), DIMENSION(0:ny,1:nz) :: work_triy !<
1386	REAL(wp), DIMENSION(nny,1:nz,nxl_y:nxr_y,pdims(2)) :: ar !<
1387	REAL(wp), DIMENSION(:,:,:,:), ALLOCATABLE :: tri !<
1388
1389
1390	CALL cpu_log( log_point_s(39), 'fft_y_1d + tridia', 'start' )
1391
1392	ALLOCATE( tri(5,0:ny,0:nz-1,0:threads_per_task-1) )
1393
1394	tn = 0 ! Default thread number in case of one thread
1395	!$OMP PARALLEL DO PRIVATE ( i, j, k, m, n, tn, work_ffty, work_triy )
1396	DO i = nxl_y, nxr_y
1397
1398	!$ tn = omp_get_thread_num()
1399
1400	IF ( loop_optimization == 'vector' ) THEN
1401	!
1402	!-- Code optimized for vector processors
1403	DO k = 1, nz
1404
1405	m = 0
1406	DO n = 1, pdims(2)
1407	DO j = 1, nny
1408	work_triy(m,k) = ar(j,k,i,n)
1409	m = m + 1
1410	ENDDO
1411	ENDDO
1412
1413	ENDDO
1414
1415	CALL fft_y_m( work_triy, ny, 'forward' )
1416
1417	ELSE
1418	!
1419	!-- Cache optimized code
1420	DO k = 1, nz
1421
1422	m = 0
1423	DO n = 1, pdims(2)
1424	DO j = 1, nny
1425	work_ffty(m) = ar(j,k,i,n)
1426	m = m + 1
1427	ENDDO
1428	ENDDO
1429
1430	CALL fft_y_1d( work_ffty, 'forward' )
1431
1432	DO j = 0, ny
1433	work_triy(j,k) = work_ffty(j)
1434	ENDDO
1435
1436	ENDDO
1437
1438	ENDIF
1439
1440	!
1441	!-- Solve the linear equation system
1442	CALL tridia_1dd( ddy2, ddx2, ny, nx, i, work_triy, tri(:,:,:,tn) )
1443
1444	IF ( loop_optimization == 'vector' ) THEN
1445	!
1446	!-- Code optimized for vector processors
1447	CALL fft_y_m( work_triy, ny, 'backward' )
1448
1449	DO k = 1, nz
1450
1451	m = 0
1452	DO n = 1, pdims(2)
1453	DO j = 1, nny
1454	ar(j,k,i,n) = work_triy(m,k)
1455	m = m + 1
1456	ENDDO
1457	ENDDO
1458
1459	ENDDO
1460
1461	ELSE
1462	!
1463	!-- Cache optimized code
1464	DO k = 1, nz
1465
1466	DO j = 0, ny
1467	work_ffty(j) = work_triy(j,k)
1468	ENDDO
1469
1470	CALL fft_y_1d( work_ffty, 'backward' )
1471
1472	m = 0
1473	DO n = 1, pdims(2)
1474	DO j = 1, nny
1475	ar(j,k,i,n) = work_ffty(m)
1476	m = m + 1
1477	ENDDO
1478	ENDDO
1479
1480	ENDDO
1481
1482	ENDIF
1483
1484	ENDDO
1485
1486	DEALLOCATE( tri )
1487
1488	CALL cpu_log( log_point_s(39), 'fft_y_1d + tridia', 'stop' )
1489
1490	END SUBROUTINE ffty_tri_ffty
1491
1492	END MODULE poisfft_mod

Note: See TracBrowser for help on using the repository browser.

Download in other formats:

| Impressum | ©Leibniz Universität Hannover |