Home

Context Navigation

source: palm/trunk/SOURCE/poisfft_mod.f90 @ 2699

Last change on this file since 2699 was 2696, checked in by kanani, 7 years ago
Merge of branch palm4u into trunk
Property svn:keywords set to `Id`
File size: 45.4 KB

Line
1	!> @file poisfft_mod.f90
2	!------------------------------------------------------------------------------!
3	! This file is part of the PALM model system.
4	!
5	! PALM is free software: you can redistribute it and/or modify it under the
6	! terms of the GNU General Public License as published by the Free Software
7	! Foundation, either version 3 of the License, or (at your option) any later
8	! version.
9	!
10	! PALM is distributed in the hope that it will be useful, but WITHOUT ANY
11	! WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
12	! A PARTICULAR PURPOSE. See the GNU General Public License for more details.
13	!
14	! You should have received a copy of the GNU General Public License along with
15	! PALM. If not, see <http://www.gnu.org/licenses/>.
16	!
17	! Copyright 1997-2017 Leibniz Universitaet Hannover
18	!------------------------------------------------------------------------------!
19	!
20	! Current revisions:
21	! -----------------
22	!
23	!
24	! Former revisions:
25	! -----------------
26	! $Id: poisfft_mod.f90 2696 2017-12-14 17:12:51Z kanani $
27	! settings depending on host variable removed or replaced by loop_optimization
28	!
29	! 2119 2017-01-17 16:51:50Z raasch
30	!
31	! 2118 2017-01-17 16:38:49Z raasch
32	! OpenACC directives and related code removed
33	!
34	! 2000 2016-08-20 18:09:15Z knoop
35	! Forced header and separation lines into 80 columns
36	!
37	! 1850 2016-04-08 13:29:27Z maronga
38	! Module renamed
39	!
40	!
41	! 1804 2016-04-05 16:30:18Z maronga
42	! Removed code for parameter file check (__check)
43	!
44	! 1682 2015-10-07 23:56:08Z knoop
45	! Code annotations made doxygen readable
46	!
47	! 1482 2014-10-18 12:34:45Z raasch
48	! use 2d-decomposition, if accelerator boards are used
49	!
50	! 1406 2014-05-16 13:47:01Z raasch
51	! bugfix for pgi 14.4: declare create moved after array declaration
52	!
53	! 1320 2014-03-20 08:40:49Z raasch
54	! ONLY-attribute added to USE-statements,
55	! kind-parameters added to all INTEGER and REAL declaration statements,
56	! kinds are defined in new module kinds,
57	! old module precision_kind is removed,
58	! revision history before 2012 removed,
59	! comment fields (!:) to be used for variable explanations added to
60	! all variable declaration statements
61	!
62	! 1318 2014-03-17 13:35:16Z raasch
63	! module interfaces removed
64	!
65	! 1306 2014-03-13 14:30:59Z raasch
66	! openmp sections removed from the overlap branch,
67	! second argument removed from parameter list
68	!
69	! 1216 2013-08-26 09:31:42Z raasch
70	! resorting of arrays moved to separate routines resort_for_...,
71	! one argument, used as temporary work array, removed from all transpose
72	! routines
73	! overlapping fft / transposition implemented
74	!
75	! 1212 2013-08-15 08:46:27Z raasch
76	! tridia routines moved to seperate module tridia_solver
77	!
78	! 1208 2013-08-13 06:41:49Z raasch
79	! acc-update clauses added for "ar" so that ffts other than cufft can also be
80	! used (although they are not ported and will give a poor performance)
81	!
82	! 1111 2013-03-08 23:54:10Z raasch
83	! further openACC porting of non-parallel (MPI) branch:
84	! tridiagonal routines split into extermal subroutines (instead using CONTAINS),
85	! no distinction between parallel/non-parallel in poisfft and tridia any more,
86	! tridia routines moved to end of file because of probable bug in PGI compiler 12.5
87	! (otherwise "invalid device function" is indicated during runtime),
88	! optimization of tridia routines: constant elements and coefficients of tri are
89	! stored in seperate arrays ddzuw and tric, last dimension of tri reduced from 5
90	! to 2,
91	! poisfft_init is now called internally from poisfft, maketri is called from
92	! poisfft_init,
93	! ibc_p_b = 2 removed
94	!
95	! 1106 2013-03-04 05:31:38Z raasch
96	! routines fftx, ffty, fftxp, fftyp removed, calls replaced by fft_x, fft_y,
97	! in the 1D-decomposition routines fft_x, ffty are replaced by fft_x_1d,
98	! fft_y_1d
99	!
100	! 1103 2013-02-20 02:15:53Z raasch
101	! tri, ar, and ar1 arguments in tridia-routines (2d) are removed because they
102	! sometimes cause segmentation faults with intel 12.1 compiler
103	!
104	! 1092 2013-02-02 11:24:22Z raasch
105	! unused variables removed
106	!
107	! 1036 2012-10-22 13:43:42Z raasch
108	! code put under GPL (PALM 3.9)
109	!
110	! 2012-09-21 07:03:55Z raasch
111	! FLOAT type conversion replaced by REAL
112	!
113	! 1003 2012-09-14 14:35:53Z raasch
114	! indices nxa, nya, etc. replaced by nx, ny, etc.
115	!
116	! 940 2012-07-09 14:31:00Z raasch
117	! special handling of tri-array as an argument in tridia_1dd routines switched
118	! off because it caused segmentation faults with intel 12.1 compiler
119	!
120	! 877 2012-04-03 11:21:44Z suehring
121	! Bugfix: Avoid divisions by zero in case of using a 'neumann' bc for the
122	! pressure at the top of the model domain.
123	!
124	! 809 2012-01-30 13:32:58Z maronga
125	! Bugfix: replaced .AND. and .NOT. with && and ! in the preprocessor directives
126	!
127	! 807 2012-01-25 11:53:51Z maronga
128	! New cpp directive "__check" implemented which is used by check_namelist_files
129	! (most of the code is unneeded by check_namelist_files).
130	!
131	! Revision 1.1 1997/07/24 11:24:14 raasch
132	! Initial revision
133	!
134	!
135	! Description:
136	! ------------
137	!> Solves the Poisson equation with a 2D spectral method
138	!> d^2 p / dx^2 + d^2 p / dy^2 + d^2 p / dz^2 = s
139	!>
140	!> Input:
141	!> real ar contains (nnz,nny,nnx) elements of the velocity divergence,
142	!> starting from (1,nys,nxl)
143	!>
144	!> Output:
145	!> real ar contains the solution for perturbation pressure p
146	!------------------------------------------------------------------------------!
147	MODULE poisfft_mod
148
149
150	USE fft_xy, &
151	ONLY: fft_init, fft_y, fft_y_1d, fft_y_m, fft_x, fft_x_1d, fft_x_m
152
153	USE indices, &
154	ONLY: nnx, nny, nx, nxl, nxr, ny, nys, nyn, nz
155
156	USE transpose_indices, &
157	ONLY: nxl_y, nxl_z, nxr_y, nxr_z, nys_x, nys_z, nyn_x, nyn_z, nzb_x, &
158	nzb_y, nzt_x, nzt_y
159
160	USE tridia_solver, &
161	ONLY: tridia_1dd, tridia_init, tridia_substi, tridia_substi_overlap
162
163	IMPLICIT NONE
164
165	LOGICAL, SAVE :: poisfft_initialized = .FALSE.
166
167	PRIVATE
168
169	PUBLIC poisfft, poisfft_init
170
171	INTERFACE poisfft
172	MODULE PROCEDURE poisfft
173	END INTERFACE poisfft
174
175	INTERFACE poisfft_init
176	MODULE PROCEDURE poisfft_init
177	END INTERFACE poisfft_init
178
179
180	CONTAINS
181
182	!------------------------------------------------------------------------------!
183	! Description:
184	! ------------
185	!> @todo Missing subroutine description.
186	!------------------------------------------------------------------------------!
187	SUBROUTINE poisfft_init
188
189	USE arrays_3d, &
190	ONLY: ddzu_pres, ddzw
191
192	USE kinds
193
194	IMPLICIT NONE
195
196	INTEGER(iwp) :: k !<
197
198
199	CALL fft_init
200
201	CALL tridia_init
202
203	poisfft_initialized = .TRUE.
204
205	END SUBROUTINE poisfft_init
206
207
208
209	!------------------------------------------------------------------------------!
210	! Description:
211	! ------------
212	!> Two-dimensional Fourier Transformation in x- and y-direction.
213	!------------------------------------------------------------------------------!
214	SUBROUTINE poisfft( ar )
215
216	USE control_parameters, &
217	ONLY: fft_method, transpose_compute_overlap
218
219	USE cpulog, &
220	ONLY: cpu_log, cpu_log_nowait, log_point_s
221
222	USE kinds
223
224	USE pegrid
225
226	IMPLICIT NONE
227
228	INTEGER(iwp) :: ii !<
229	INTEGER(iwp) :: iind !<
230	INTEGER(iwp) :: inew !<
231	INTEGER(iwp) :: jj !<
232	INTEGER(iwp) :: jind !<
233	INTEGER(iwp) :: jnew !<
234	INTEGER(iwp) :: ki !<
235	INTEGER(iwp) :: kk !<
236	INTEGER(iwp) :: knew !<
237	INTEGER(iwp) :: n !<
238	INTEGER(iwp) :: nblk !<
239	INTEGER(iwp) :: nnx_y !<
240	INTEGER(iwp) :: nny_z !<
241	INTEGER(iwp) :: nnz_t !<
242	INTEGER(iwp) :: nnz_x !<
243	INTEGER(iwp) :: nxl_y_bound !<
244	INTEGER(iwp) :: nxr_y_bound !<
245
246	INTEGER(iwp), DIMENSION(4) :: isave !<
247
248	REAL(wp), DIMENSION(1:nz,nys:nyn,nxl:nxr) :: ar !<
249	REAL(wp), DIMENSION(nys:nyn,nxl:nxr,1:nz) :: ar_inv !<
250
251	REAL(wp), DIMENSION(:,:,:), ALLOCATABLE :: ar1 !<
252	REAL(wp), DIMENSION(:,:,:), ALLOCATABLE :: f_in !<
253	REAL(wp), DIMENSION(:,:,:), ALLOCATABLE :: f_inv !<
254	REAL(wp), DIMENSION(:,:,:), ALLOCATABLE :: f_out_y !<
255	REAL(wp), DIMENSION(:,:,:), ALLOCATABLE :: f_out_z !<
256
257
258	CALL cpu_log( log_point_s(3), 'poisfft', 'start' )
259
260	IF ( .NOT. poisfft_initialized ) CALL poisfft_init
261
262	!
263	!-- Two-dimensional Fourier Transformation in x- and y-direction.
264	IF ( pdims(2) == 1 .AND. pdims(1) > 1 ) THEN
265
266	!
267	!-- 1d-domain-decomposition along x:
268	!-- FFT along y and transposition y --> x
269	CALL ffty_tr_yx( ar, ar )
270
271	!
272	!-- FFT along x, solving the tridiagonal system and backward FFT
273	CALL fftx_tri_fftx( ar )
274
275	!
276	!-- Transposition x --> y and backward FFT along y
277	CALL tr_xy_ffty( ar, ar )
278
279	ELSEIF ( pdims(1) == 1 .AND. pdims(2) > 1 ) THEN
280
281	!
282	!-- 1d-domain-decomposition along y:
283	!-- FFT along x and transposition x --> y
284	CALL fftx_tr_xy( ar, ar )
285
286	!
287	!-- FFT along y, solving the tridiagonal system and backward FFT
288	CALL ffty_tri_ffty( ar )
289
290	!
291	!-- Transposition y --> x and backward FFT along x
292	CALL tr_yx_fftx( ar, ar )
293
294	ELSEIF ( .NOT. transpose_compute_overlap ) THEN
295
296	!
297	!-- 2d-domain-decomposition or no decomposition (1 PE run)
298	!-- Transposition z --> x
299	CALL cpu_log( log_point_s(5), 'transpo forward', 'start' )
300	CALL resort_for_zx( ar, ar_inv )
301	CALL transpose_zx( ar_inv, ar )
302	CALL cpu_log( log_point_s(5), 'transpo forward', 'pause' )
303
304	CALL cpu_log( log_point_s(4), 'fft_x', 'start' )
305	CALL fft_x( ar, 'forward' )
306	CALL cpu_log( log_point_s(4), 'fft_x', 'pause' )
307
308	!
309	!-- Transposition x --> y
310	CALL cpu_log( log_point_s(5), 'transpo forward', 'continue' )
311	CALL resort_for_xy( ar, ar_inv )
312	CALL transpose_xy( ar_inv, ar )
313	CALL cpu_log( log_point_s(5), 'transpo forward', 'pause' )
314
315	CALL cpu_log( log_point_s(7), 'fft_y', 'start' )
316	CALL fft_y( ar, 'forward', ar_tr = ar, &
317	nxl_y_bound = nxl_y, nxr_y_bound = nxr_y, &
318	nxl_y_l = nxl_y, nxr_y_l = nxr_y )
319	CALL cpu_log( log_point_s(7), 'fft_y', 'pause' )
320
321	!
322	!-- Transposition y --> z
323	CALL cpu_log( log_point_s(5), 'transpo forward', 'continue' )
324	CALL resort_for_yz( ar, ar_inv )
325	CALL transpose_yz( ar_inv, ar )
326	CALL cpu_log( log_point_s(5), 'transpo forward', 'stop' )
327
328	!
329	!-- Solve the tridiagonal equation system along z
330	CALL cpu_log( log_point_s(6), 'tridia', 'start' )
331	CALL tridia_substi( ar )
332	CALL cpu_log( log_point_s(6), 'tridia', 'stop' )
333
334	!
335	!-- Inverse Fourier Transformation
336	!-- Transposition z --> y
337	CALL cpu_log( log_point_s(8), 'transpo invers', 'start' )
338	CALL transpose_zy( ar, ar_inv )
339	CALL resort_for_zy( ar_inv, ar )
340	CALL cpu_log( log_point_s(8), 'transpo invers', 'pause' )
341
342	CALL cpu_log( log_point_s(7), 'fft_y', 'continue' )
343	CALL fft_y( ar, 'backward', ar_tr = ar, &
344	nxl_y_bound = nxl_y, nxr_y_bound = nxr_y, &
345	nxl_y_l = nxl_y, nxr_y_l = nxr_y )
346	CALL cpu_log( log_point_s(7), 'fft_y', 'stop' )
347
348	!
349	!-- Transposition y --> x
350	CALL cpu_log( log_point_s(8), 'transpo invers', 'continue' )
351	CALL transpose_yx( ar, ar_inv )
352	CALL resort_for_yx( ar_inv, ar )
353	CALL cpu_log( log_point_s(8), 'transpo invers', 'pause' )
354
355	CALL cpu_log( log_point_s(4), 'fft_x', 'continue' )
356	CALL fft_x( ar, 'backward' )
357	CALL cpu_log( log_point_s(4), 'fft_x', 'stop' )
358
359	!
360	!-- Transposition x --> z
361	CALL cpu_log( log_point_s(8), 'transpo invers', 'continue' )
362	CALL transpose_xz( ar, ar_inv )
363	CALL resort_for_xz( ar_inv, ar )
364	CALL cpu_log( log_point_s(8), 'transpo invers', 'stop' )
365
366	ELSE
367
368	!
369	!-- 2d-domain-decomposition or no decomposition (1 PE run) with
370	!-- overlapping transposition / fft
371	!-- cputime logging must not use barriers, which would prevent overlapping
372	ALLOCATE( f_out_y(0:ny,nxl_y:nxr_y,nzb_y:nzt_y), &
373	f_out_z(0:nx,nys_x:nyn_x,nzb_x:nzt_x) )
374	!
375	!-- Transposition z --> x + subsequent fft along x
376	ALLOCATE( f_inv(nys:nyn,nxl:nxr,1:nz) )
377	CALL resort_for_zx( ar, f_inv )
378	!
379	!-- Save original indices and gridpoint counter
380	isave(1) = nz
381	isave(2) = nzb_x
382	isave(3) = nzt_x
383	isave(4) = sendrecvcount_zx
384	!
385	!-- Set new indices for transformation
386	nblk = nz / pdims(1)
387	nz = pdims(1)
388	nnz_x = 1
389	nzb_x = 1 + myidx * nnz_x
390	nzt_x = ( myidx + 1 ) * nnz_x
391	sendrecvcount_zx = nnx * nny * nnz_x
392
393	ALLOCATE( ar1(0:nx,nys_x:nyn_x,nzb_x:nzt_x) )
394	ALLOCATE( f_in(nys:nyn,nxl:nxr,1:nz) )
395
396	DO kk = 1, nblk
397
398	IF ( kk == 1 ) THEN
399	CALL cpu_log( log_point_s(5), 'transpo forward', 'start', cpu_log_nowait )
400	ELSE
401	CALL cpu_log( log_point_s(5), 'transpo forward', 'continue', cpu_log_nowait )
402	ENDIF
403
404	DO knew = 1, nz
405	ki = kk + nblk * ( knew - 1 )
406	f_in(:,:,knew) = f_inv(:,:,ki)
407	ENDDO
408
409	CALL transpose_zx( f_in, ar1(:,:,:))
410	CALL cpu_log( log_point_s(5), 'transpo forward', 'pause' )
411
412	IF ( kk == 1 ) THEN
413	CALL cpu_log( log_point_s(4), 'fft_x', 'start', cpu_log_nowait )
414	ELSE
415	CALL cpu_log( log_point_s(4), 'fft_x', 'continue', cpu_log_nowait )
416	ENDIF
417
418	n = isave(2) + kk - 1
419	CALL fft_x( ar1(:,:,:), 'forward', ar_2d = f_out_z(:,:,n))
420	CALL cpu_log( log_point_s(4), 'fft_x', 'pause' )
421
422	ENDDO
423	!
424	!-- Restore original indices/counters
425	nz = isave(1)
426	nzb_x = isave(2)
427	nzt_x = isave(3)
428	sendrecvcount_zx = isave(4)
429
430	DEALLOCATE( ar1, f_in, f_inv )
431
432	!
433	!-- Transposition x --> y + subsequent fft along y
434	ALLOCATE( f_inv(nys_x:nyn_x,nzb_x:nzt_x,0:nx) )
435	CALL resort_for_xy( f_out_z, f_inv )
436	!
437	!-- Save original indices and gridpoint counter
438	isave(1) = nx
439	isave(2) = nxl_y
440	isave(3) = nxr_y
441	isave(4) = sendrecvcount_xy
442	!
443	!-- Set new indices for transformation
444	nblk = ( ( nx+1 ) / pdims(2) ) - 1
445	nx = pdims(2)
446	nnx_y = 1
447	nxl_y = myidy * nnx_y
448	nxr_y = ( myidy + 1 ) * nnx_y - 1
449	sendrecvcount_xy = nnx_y * ( nyn_x-nys_x+1 ) * ( nzt_x-nzb_x+1 )
450
451	ALLOCATE( ar1(0:ny,nxl_y:nxr_y,nzb_y:nzt_y) )
452	ALLOCATE( f_in(nys_x:nyn_x,nzb_x:nzt_x,0:nx) )
453
454	DO ii = 0, nblk
455
456	CALL cpu_log( log_point_s(5), 'transpo forward', 'continue', cpu_log_nowait )
457
458	DO inew = 0, nx-1
459	iind = ii + ( nblk + 1 ) * inew
460	f_in(:,:,inew) = f_inv(:,:,iind)
461	ENDDO
462
463	CALL transpose_xy( f_in, ar1(:,:,:) )
464
465	CALL cpu_log( log_point_s(5), 'transpo forward', 'pause' )
466
467	IF ( ii == 1 ) THEN
468	CALL cpu_log( log_point_s(7), 'fft_y', 'start', cpu_log_nowait )
469	ELSE
470	CALL cpu_log( log_point_s(7), 'fft_y', 'continue', cpu_log_nowait )
471	ENDIF
472
473	nxl_y_bound = isave(2)
474	nxr_y_bound = isave(3)
475	n = isave(2) + ii
476	CALL fft_y( ar1(:,:,:), 'forward', ar_tr = f_out_y, &
477	nxl_y_bound = nxl_y_bound, nxr_y_bound = nxr_y_bound, &
478	nxl_y_l = n, nxr_y_l = n )
479
480	CALL cpu_log( log_point_s(7), 'fft_y', 'pause' )
481
482	ENDDO
483	!
484	!-- Restore original indices/counters
485	nx = isave(1)
486	nxl_y = isave(2)
487	nxr_y = isave(3)
488	sendrecvcount_xy = isave(4)
489
490	DEALLOCATE( ar1, f_in, f_inv )
491
492	!
493	!-- Transposition y --> z + subsequent tridia + resort for z --> y
494	ALLOCATE( f_inv(nxl_y:nxr_y,nzb_y:nzt_y,0:ny) )
495	CALL resort_for_yz( f_out_y, f_inv )
496	!
497	!-- Save original indices and gridpoint counter
498	isave(1) = ny
499	isave(2) = nys_z
500	isave(3) = nyn_z
501	isave(4) = sendrecvcount_yz
502	!
503	!-- Set new indices for transformation
504	nblk = ( ( ny+1 ) / pdims(1) ) - 1
505	ny = pdims(1)
506	nny_z = 1
507	nys_z = myidx * nny_z
508	nyn_z = ( myidx + 1 ) * nny_z - 1
509	sendrecvcount_yz = ( nxr_y-nxl_y+1 ) * nny_z * ( nzt_y-nzb_y+1 )
510
511	ALLOCATE( ar1(nxl_z:nxr_z,nys_z:nyn_z,1:nz) )
512	ALLOCATE( f_in(nxl_y:nxr_y,nzb_y:nzt_y,0:ny) )
513
514	DO jj = 0, nblk
515	!
516	!-- Forward Fourier Transformation
517	!-- Transposition y --> z
518	CALL cpu_log( log_point_s(5), 'transpo forward', 'continue', cpu_log_nowait )
519
520	DO jnew = 0, ny-1
521	jind = jj + ( nblk + 1 ) * jnew
522	f_in(:,:,jnew) = f_inv(:,:,jind)
523	ENDDO
524
525	CALL transpose_yz( f_in, ar1(:,:,:) )
526
527	IF ( jj == nblk ) THEN
528	CALL cpu_log( log_point_s(5), 'transpo forward', 'stop' )
529	ELSE
530	CALL cpu_log( log_point_s(5), 'transpo forward', 'pause' )
531	ENDIF
532
533	!
534	!-- Solve the tridiagonal equation system along z
535	CALL cpu_log( log_point_s(6), 'tridia', 'start', cpu_log_nowait )
536
537	n = isave(2) + jj
538	CALL tridia_substi_overlap( ar1(:,:,:), n )
539
540	CALL cpu_log( log_point_s(6), 'tridia', 'stop' )
541
542	!
543	!-- Inverse Fourier Transformation
544	!-- Transposition z --> y
545	!-- Only one thread should call MPI routines, therefore forward and
546	!-- backward tranpose are in the same section
547	IF ( jj == 0 ) THEN
548	CALL cpu_log( log_point_s(8), 'transpo invers', 'start', cpu_log_nowait )
549	ELSE
550	CALL cpu_log( log_point_s(8), 'transpo invers', 'continue', cpu_log_nowait )
551	ENDIF
552
553	CALL transpose_zy( ar1(:,:,:), f_in )
554
555	DO jnew = 0, ny-1
556	jind = jj + ( nblk + 1 ) * jnew
557	f_inv(:,:,jind) = f_in(:,:,jnew)
558	ENDDO
559
560	CALL cpu_log( log_point_s(8), 'transpo invers', 'pause' )
561
562	ENDDO
563	!
564	!-- Restore original indices/counters
565	ny = isave(1)
566	nys_z = isave(2)
567	nyn_z = isave(3)
568	sendrecvcount_yz = isave(4)
569
570	CALL resort_for_zy( f_inv, f_out_y )
571
572	DEALLOCATE( ar1, f_in, f_inv )
573
574	!
575	!-- fft along y backward + subsequent transposition y --> x
576	ALLOCATE( f_inv(nys_x:nyn_x,nzb_x:nzt_x,0:nx) )
577	!
578	!-- Save original indices and gridpoint counter
579	isave(1) = nx
580	isave(2) = nxl_y
581	isave(3) = nxr_y
582	isave(4) = sendrecvcount_xy
583	!
584	!-- Set new indices for transformation
585	nblk = (( nx+1 ) / pdims(2) ) - 1
586	nx = pdims(2)
587	nnx_y = 1
588	nxl_y = myidy * nnx_y
589	nxr_y = ( myidy + 1 ) * nnx_y - 1
590	sendrecvcount_xy = nnx_y * ( nyn_x-nys_x+1 ) * ( nzt_x-nzb_x+1 )
591
592	ALLOCATE( ar1(0:ny,nxl_y:nxr_y,nzb_y:nzt_y) )
593	ALLOCATE( f_in(nys_x:nyn_x,nzb_x:nzt_x,0:nx) )
594
595	DO ii = 0, nblk
596
597	CALL cpu_log( log_point_s(7), 'fft_y', 'continue', cpu_log_nowait )
598
599	n = isave(2) + ii
600	nxl_y_bound = isave(2)
601	nxr_y_bound = isave(3)
602
603	CALL fft_y( ar1(:,:,:), 'backward', ar_tr = f_out_y, &
604	nxl_y_bound = nxl_y_bound, nxr_y_bound = nxr_y_bound, &
605	nxl_y_l = n, nxr_y_l = n )
606
607	IF ( ii == nblk ) THEN
608	CALL cpu_log( log_point_s(7), 'fft_y', 'stop' )
609	ELSE
610	CALL cpu_log( log_point_s(7), 'fft_y', 'pause' )
611	ENDIF
612
613	CALL cpu_log( log_point_s(8), 'transpo invers', 'continue', cpu_log_nowait )
614
615	CALL transpose_yx( ar1(:,:,:), f_in )
616
617	DO inew = 0, nx-1
618	iind = ii + (nblk+1) * inew
619	f_inv(:,:,iind) = f_in(:,:,inew)
620	ENDDO
621
622	CALL cpu_log( log_point_s(8), 'transpo invers', 'pause' )
623
624	ENDDO
625	!
626	!-- Restore original indices/counters
627	nx = isave(1)
628	nxl_y = isave(2)
629	nxr_y = isave(3)
630	sendrecvcount_xy = isave(4)
631
632	CALL resort_for_yx( f_inv, f_out_z )
633
634	DEALLOCATE( ar1, f_in, f_inv )
635
636	!
637	!-- fft along x backward + subsequent final transposition x --> z
638	ALLOCATE( f_inv(nys:nyn,nxl:nxr,1:nz) )
639	!
640	!-- Save original indices and gridpoint counter
641	isave(1) = nz
642	isave(2) = nzb_x
643	isave(3) = nzt_x
644	isave(4) = sendrecvcount_zx
645	!
646	!-- Set new indices for transformation
647	nblk = nz / pdims(1)
648	nz = pdims(1)
649	nnz_x = 1
650	nzb_x = 1 + myidx * nnz_x
651	nzt_x = ( myidx + 1 ) * nnz_x
652	sendrecvcount_zx = nnx * nny * nnz_x
653
654	ALLOCATE( ar1(0:nx,nys_x:nyn_x,nzb_x:nzt_x) )
655	ALLOCATE( f_in(nys:nyn,nxl:nxr,1:nz) )
656
657	DO kk = 1, nblk
658
659	CALL cpu_log( log_point_s(4), 'fft_x', 'continue', cpu_log_nowait )
660
661	n = isave(2) + kk - 1
662	CALL fft_x( ar1(:,:,:), 'backward', f_out_z(:,:,n))
663
664	IF ( kk == nblk ) THEN
665	CALL cpu_log( log_point_s(4), 'fft_x', 'stop' )
666	ELSE
667	CALL cpu_log( log_point_s(4), 'fft_x', 'pause' )
668	ENDIF
669
670	CALL cpu_log( log_point_s(8), 'transpo invers', 'continue', cpu_log_nowait )
671
672	CALL transpose_xz( ar1(:,:,:), f_in )
673
674	DO knew = 1, nz
675	ki = kk + nblk * (knew-1)
676	f_inv(:,:,ki) = f_in(:,:,knew)
677	ENDDO
678
679	IF ( kk == nblk ) THEN
680	CALL cpu_log( log_point_s(8), 'transpo invers', 'stop' )
681	ELSE
682	CALL cpu_log( log_point_s(8), 'transpo invers', 'pause' )
683	ENDIF
684
685	ENDDO
686	!
687	!-- Restore original indices/counters
688	nz = isave(1)
689	nzb_x = isave(2)
690	nzt_x = isave(3)
691	sendrecvcount_zx = isave(4)
692
693	CALL resort_for_xz( f_inv, ar )
694
695	DEALLOCATE( ar1, f_in, f_inv )
696
697	ENDIF
698
699	CALL cpu_log( log_point_s(3), 'poisfft', 'stop' )
700
701	END SUBROUTINE poisfft
702
703
704	!------------------------------------------------------------------------------!
705	! Description:
706	! ------------
707	!> Fourier-transformation along y with subsequent transposition y --> x for
708	!> a 1d-decomposition along x.
709	!>
710	!> @attention The performance of this routine is much faster on the NEC-SX6,
711	!> if the first index of work_ffty_vec is odd. Otherwise
712	!> memory bank conflicts may occur (especially if the index is a
713	!> multiple of 128). That's why work_ffty_vec is dimensioned as
714	!> 0:ny+1.
715	!> Of course, this will not work if users are using an odd number
716	!> of gridpoints along y.
717	!------------------------------------------------------------------------------!
718	SUBROUTINE ffty_tr_yx( f_in, f_out )
719
720	USE control_parameters, &
721	ONLY: loop_optimization
722
723	USE cpulog, &
724	ONLY: cpu_log, log_point_s
725
726	USE kinds
727
728	USE pegrid
729
730	IMPLICIT NONE
731
732	INTEGER(iwp) :: i !<
733	INTEGER(iwp) :: iend !<
734	INTEGER(iwp) :: iouter !<
735	INTEGER(iwp) :: ir !<
736	INTEGER(iwp) :: j !<
737	INTEGER(iwp) :: k !<
738
739	INTEGER(iwp), PARAMETER :: stridex = 4 !<
740
741	REAL(wp), DIMENSION(1:nz,0:ny,nxl:nxr) :: f_in !<
742	REAL(wp), DIMENSION(nnx,1:nz,nys_x:nyn_x,pdims(1)) :: f_out !<
743	REAL(wp), DIMENSION(nxl:nxr,1:nz,0:ny) :: work !<
744
745	REAL(wp), DIMENSION(:,:), ALLOCATABLE :: work_ffty !<
746	REAL(wp), DIMENSION(:,:,:), ALLOCATABLE :: work_ffty_vec !<
747
748	!
749	!-- Carry out the FFT along y, where all data are present due to the
750	!-- 1d-decomposition along x. Resort the data in a way that x becomes
751	!-- the first index.
752	CALL cpu_log( log_point_s(7), 'fft_y_1d', 'start' )
753
754	IF ( loop_optimization == 'vector' ) THEN
755
756	ALLOCATE( work_ffty_vec(0:ny+1,1:nz,nxl:nxr) )
757	!
758	!-- Code optimized for vector processors
759	!$OMP PARALLEL PRIVATE ( i, j, k )
760	!$OMP DO
761	DO i = nxl, nxr
762
763	DO j = 0, ny
764	DO k = 1, nz
765	work_ffty_vec(j,k,i) = f_in(k,j,i)
766	ENDDO
767	ENDDO
768
769	CALL fft_y_m( work_ffty_vec(:,:,i), ny+1, 'forward' )
770
771	ENDDO
772
773	!$OMP DO
774	DO k = 1, nz
775	DO j = 0, ny
776	DO i = nxl, nxr
777	work(i,k,j) = work_ffty_vec(j,k,i)
778	ENDDO
779	ENDDO
780	ENDDO
781	!$OMP END PARALLEL
782
783	DEALLOCATE( work_ffty_vec )
784
785	ELSE
786	!
787	!-- Cache optimized code.
788	ALLOCATE( work_ffty(0:ny,stridex) )
789	!
790	!-- The i-(x-)direction is split into a strided outer loop and an inner
791	!-- loop for better cache performance
792	!$OMP PARALLEL PRIVATE (i,iend,iouter,ir,j,k,work_ffty)
793	!$OMP DO
794	DO iouter = nxl, nxr, stridex
795
796	iend = MIN( iouter+stridex-1, nxr ) ! Upper bound for inner i loop
797
798	DO k = 1, nz
799
800	DO i = iouter, iend
801
802	ir = i-iouter+1 ! counter within a stride
803	DO j = 0, ny
804	work_ffty(j,ir) = f_in(k,j,i)
805	ENDDO
806	!
807	!-- FFT along y
808	CALL fft_y_1d( work_ffty(:,ir), 'forward' )
809
810	ENDDO
811
812	!
813	!-- Resort
814	DO j = 0, ny
815	DO i = iouter, iend
816	work(i,k,j) = work_ffty(j,i-iouter+1)
817	ENDDO
818	ENDDO
819
820	ENDDO
821
822	ENDDO
823	!$OMP END PARALLEL
824
825	DEALLOCATE( work_ffty )
826
827	ENDIF
828
829	CALL cpu_log( log_point_s(7), 'fft_y_1d', 'pause' )
830
831	!
832	!-- Transpose array
833	#if defined( __parallel )
834	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
835	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
836	CALL MPI_ALLTOALL( work(nxl,1,0), sendrecvcount_xy, MPI_REAL, &
837	f_out(1,1,nys_x,1), sendrecvcount_xy, MPI_REAL, &
838	comm1dx, ierr )
839	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
840	#endif
841
842	END SUBROUTINE ffty_tr_yx
843
844
845	!------------------------------------------------------------------------------!
846	! Description:
847	! ------------
848	!> Transposition x --> y with a subsequent backward Fourier transformation for
849	!> a 1d-decomposition along x
850	!------------------------------------------------------------------------------!
851	SUBROUTINE tr_xy_ffty( f_in, f_out )
852
853	USE control_parameters, &
854	ONLY: loop_optimization
855
856	USE cpulog, &
857	ONLY: cpu_log, log_point_s
858
859	USE kinds
860
861	USE pegrid
862
863	IMPLICIT NONE
864
865	INTEGER(iwp) :: i !<
866	INTEGER(iwp) :: iend !<
867	INTEGER(iwp) :: iouter !<
868	INTEGER(iwp) :: ir !<
869	INTEGER(iwp) :: j !<
870	INTEGER(iwp) :: k !<
871
872	INTEGER(iwp), PARAMETER :: stridex = 4 !<
873
874	REAL(wp), DIMENSION(nnx,1:nz,nys_x:nyn_x,pdims(1)) :: f_in !<
875	REAL(wp), DIMENSION(1:nz,0:ny,nxl:nxr) :: f_out !<
876	REAL(wp), DIMENSION(nxl:nxr,1:nz,0:ny) :: work !<
877
878	REAL(wp), DIMENSION(:,:), ALLOCATABLE :: work_ffty !<
879	REAL(wp), DIMENSION(:,:,:), ALLOCATABLE :: work_ffty_vec !<
880
881	!
882	!-- Transpose array
883	#if defined( __parallel )
884	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
885	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
886	CALL MPI_ALLTOALL( f_in(1,1,nys_x,1), sendrecvcount_xy, MPI_REAL, &
887	work(nxl,1,0), sendrecvcount_xy, MPI_REAL, &
888	comm1dx, ierr )
889	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
890	#endif
891
892	!
893	!-- Resort the data in a way that y becomes the first index and carry out the
894	!-- backward fft along y.
895	CALL cpu_log( log_point_s(7), 'fft_y_1d', 'continue' )
896
897	IF ( loop_optimization == 'vector' ) THEN
898
899	ALLOCATE( work_ffty_vec(0:ny+1,1:nz,nxl:nxr) )
900	!
901	!-- Code optimized for vector processors
902	!$OMP PARALLEL PRIVATE ( i, j, k )
903	!$OMP DO
904	DO k = 1, nz
905	DO j = 0, ny
906	DO i = nxl, nxr
907	work_ffty_vec(j,k,i) = work(i,k,j)
908	ENDDO
909	ENDDO
910	ENDDO
911
912	!$OMP DO
913	DO i = nxl, nxr
914
915	CALL fft_y_m( work_ffty_vec(:,:,i), ny+1, 'backward' )
916
917	DO j = 0, ny
918	DO k = 1, nz
919	f_out(k,j,i) = work_ffty_vec(j,k,i)
920	ENDDO
921	ENDDO
922
923	ENDDO
924	!$OMP END PARALLEL
925
926	DEALLOCATE( work_ffty_vec )
927
928	ELSE
929	!
930	!-- Cache optimized code.
931	ALLOCATE( work_ffty(0:ny,stridex) )
932	!
933	!-- The i-(x-)direction is split into a strided outer loop and an inner
934	!-- loop for better cache performance
935	!$OMP PARALLEL PRIVATE ( i, iend, iouter, ir, j, k, work_ffty )
936	!$OMP DO
937	DO iouter = nxl, nxr, stridex
938
939	iend = MIN( iouter+stridex-1, nxr ) ! Upper bound for inner i loop
940
941	DO k = 1, nz
942	!
943	!-- Resort
944	DO j = 0, ny
945	DO i = iouter, iend
946	work_ffty(j,i-iouter+1) = work(i,k,j)
947	ENDDO
948	ENDDO
949
950	DO i = iouter, iend
951
952	!
953	!-- FFT along y
954	ir = i-iouter+1 ! counter within a stride
955	CALL fft_y_1d( work_ffty(:,ir), 'backward' )
956
957	DO j = 0, ny
958	f_out(k,j,i) = work_ffty(j,ir)
959	ENDDO
960	ENDDO
961
962	ENDDO
963
964	ENDDO
965	!$OMP END PARALLEL
966
967	DEALLOCATE( work_ffty )
968
969	ENDIF
970
971	CALL cpu_log( log_point_s(7), 'fft_y_1d', 'stop' )
972
973	END SUBROUTINE tr_xy_ffty
974
975
976	!------------------------------------------------------------------------------!
977	! Description:
978	! ------------
979	!> FFT along x, solution of the tridiagonal system and backward FFT for
980	!> a 1d-decomposition along x
981	!>
982	!> @warning this subroutine may still not work for hybrid parallelization
983	!> with OpenMP (for possible necessary changes see the original
984	!> routine poisfft_hybrid, developed by Klaus Ketelsen, May 2002)
985	!------------------------------------------------------------------------------!
986	SUBROUTINE fftx_tri_fftx( ar )
987
988	USE control_parameters, &
989	ONLY: loop_optimization
990
991	USE cpulog, &
992	ONLY: cpu_log, log_point_s
993
994	USE grid_variables, &
995	ONLY: ddx2, ddy2
996
997	USE kinds
998
999	USE pegrid
1000
1001	IMPLICIT NONE
1002
1003	INTEGER(iwp) :: i !<
1004	INTEGER(iwp) :: j !<
1005	INTEGER(iwp) :: k !<
1006	INTEGER(iwp) :: m !<
1007	INTEGER(iwp) :: n !<
1008	INTEGER(iwp) :: omp_get_thread_num !<
1009	INTEGER(iwp) :: tn !<
1010
1011	REAL(wp), DIMENSION(0:nx) :: work_fftx !<
1012	REAL(wp), DIMENSION(0:nx,1:nz) :: work_trix !<
1013	REAL(wp), DIMENSION(nnx,1:nz,nys_x:nyn_x,pdims(1)) :: ar !<
1014	REAL(wp), DIMENSION(:,:,:,:), ALLOCATABLE :: tri !<
1015
1016
1017	CALL cpu_log( log_point_s(33), 'fft_x_1d + tridia', 'start' )
1018
1019	ALLOCATE( tri(5,0:nx,0:nz-1,0:threads_per_task-1) )
1020
1021	tn = 0 ! Default thread number in case of one thread
1022	!$OMP PARALLEL DO PRIVATE ( i, j, k, m, n, tn, work_fftx, work_trix )
1023	DO j = nys_x, nyn_x
1024
1025	!$ tn = omp_get_thread_num()
1026
1027	IF ( loop_optimization == 'vector' ) THEN
1028	!
1029	!-- Code optimized for vector processors
1030	DO k = 1, nz
1031
1032	m = 0
1033	DO n = 1, pdims(1)
1034	DO i = 1, nnx
1035	work_trix(m,k) = ar(i,k,j,n)
1036	m = m + 1
1037	ENDDO
1038	ENDDO
1039
1040	ENDDO
1041
1042	CALL fft_x_m( work_trix, 'forward' )
1043
1044	ELSE
1045	!
1046	!-- Cache optimized code
1047	DO k = 1, nz
1048
1049	m = 0
1050	DO n = 1, pdims(1)
1051	DO i = 1, nnx
1052	work_fftx(m) = ar(i,k,j,n)
1053	m = m + 1
1054	ENDDO
1055	ENDDO
1056
1057	CALL fft_x_1d( work_fftx, 'forward' )
1058
1059	DO i = 0, nx
1060	work_trix(i,k) = work_fftx(i)
1061	ENDDO
1062
1063	ENDDO
1064
1065	ENDIF
1066
1067	!
1068	!-- Solve the linear equation system
1069	CALL tridia_1dd( ddx2, ddy2, nx, ny, j, work_trix, tri(:,:,:,tn) )
1070
1071	IF ( loop_optimization == 'vector' ) THEN
1072	!
1073	!-- Code optimized for vector processors
1074	CALL fft_x_m( work_trix, 'backward' )
1075
1076	DO k = 1, nz
1077
1078	m = 0
1079	DO n = 1, pdims(1)
1080	DO i = 1, nnx
1081	ar(i,k,j,n) = work_trix(m,k)
1082	m = m + 1
1083	ENDDO
1084	ENDDO
1085
1086	ENDDO
1087
1088	ELSE
1089	!
1090	!-- Cache optimized code
1091	DO k = 1, nz
1092
1093	DO i = 0, nx
1094	work_fftx(i) = work_trix(i,k)
1095	ENDDO
1096
1097	CALL fft_x_1d( work_fftx, 'backward' )
1098
1099	m = 0
1100	DO n = 1, pdims(1)
1101	DO i = 1, nnx
1102	ar(i,k,j,n) = work_fftx(m)
1103	m = m + 1
1104	ENDDO
1105	ENDDO
1106
1107	ENDDO
1108
1109	ENDIF
1110
1111	ENDDO
1112
1113	DEALLOCATE( tri )
1114
1115	CALL cpu_log( log_point_s(33), 'fft_x_1d + tridia', 'stop' )
1116
1117	END SUBROUTINE fftx_tri_fftx
1118
1119
1120	!------------------------------------------------------------------------------!
1121	! Description:
1122	! ------------
1123	!> Fourier-transformation along x with subsequent transposition x --> y for
1124	!> a 1d-decomposition along y.
1125	!>
1126	!> @attention NEC-branch of this routine may significantly profit from
1127	!> further optimizations. So far, performance is much worse than
1128	!> for routine ffty_tr_yx (more than three times slower).
1129	!------------------------------------------------------------------------------!
1130	SUBROUTINE fftx_tr_xy( f_in, f_out )
1131
1132
1133	USE control_parameters, &
1134	ONLY: loop_optimization
1135
1136	USE cpulog, &
1137	ONLY: cpu_log, log_point_s
1138
1139	USE kinds
1140
1141	USE pegrid
1142
1143	IMPLICIT NONE
1144
1145	INTEGER(iwp) :: i !<
1146	INTEGER(iwp) :: j !<
1147	INTEGER(iwp) :: k !<
1148
1149	REAL(wp), DIMENSION(0:nx,1:nz,nys:nyn) :: work_fftx !<
1150	REAL(wp), DIMENSION(1:nz,nys:nyn,0:nx) :: f_in !<
1151	REAL(wp), DIMENSION(nny,1:nz,nxl_y:nxr_y,pdims(2)) :: f_out !<
1152	REAL(wp), DIMENSION(nys:nyn,1:nz,0:nx) :: work !<
1153
1154	!
1155	!-- Carry out the FFT along x, where all data are present due to the
1156	!-- 1d-decomposition along y. Resort the data in a way that y becomes
1157	!-- the first index.
1158	CALL cpu_log( log_point_s(4), 'fft_x_1d', 'start' )
1159
1160	IF ( loop_optimization == 'vector' ) THEN
1161	!
1162	!-- Code for vector processors
1163	!$OMP PARALLEL PRIVATE ( i, j, k )
1164	!$OMP DO
1165	DO i = 0, nx
1166
1167	DO j = nys, nyn
1168	DO k = 1, nz
1169	work_fftx(i,k,j) = f_in(k,j,i)
1170	ENDDO
1171	ENDDO
1172
1173	ENDDO
1174
1175	!$OMP DO
1176	DO j = nys, nyn
1177
1178	CALL fft_x_m( work_fftx(:,:,j), 'forward' )
1179
1180	DO k = 1, nz
1181	DO i = 0, nx
1182	work(j,k,i) = work_fftx(i,k,j)
1183	ENDDO
1184	ENDDO
1185
1186	ENDDO
1187	!$OMP END PARALLEL
1188
1189	ELSE
1190
1191	!
1192	!-- Cache optimized code (there might be still a potential for better
1193	!-- optimization).
1194	!$OMP PARALLEL PRIVATE (i,j,k)
1195	!$OMP DO
1196	DO i = 0, nx
1197
1198	DO j = nys, nyn
1199	DO k = 1, nz
1200	work_fftx(i,k,j) = f_in(k,j,i)
1201	ENDDO
1202	ENDDO
1203
1204	ENDDO
1205
1206	!$OMP DO
1207	DO j = nys, nyn
1208	DO k = 1, nz
1209
1210	CALL fft_x_1d( work_fftx(0:nx,k,j), 'forward' )
1211
1212	DO i = 0, nx
1213	work(j,k,i) = work_fftx(i,k,j)
1214	ENDDO
1215	ENDDO
1216
1217	ENDDO
1218	!$OMP END PARALLEL
1219
1220	ENDIF
1221	CALL cpu_log( log_point_s(4), 'fft_x_1d', 'pause' )
1222
1223	!
1224	!-- Transpose array
1225	#if defined( __parallel )
1226	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
1227	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
1228	CALL MPI_ALLTOALL( work(nys,1,0), sendrecvcount_xy, MPI_REAL, &
1229	f_out(1,1,nxl_y,1), sendrecvcount_xy, MPI_REAL, &
1230	comm1dy, ierr )
1231	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
1232	#endif
1233
1234	END SUBROUTINE fftx_tr_xy
1235
1236
1237	!------------------------------------------------------------------------------!
1238	! Description:
1239	! ------------
1240	!> Transposition y --> x with a subsequent backward Fourier transformation for
1241	!> a 1d-decomposition along x.
1242	!------------------------------------------------------------------------------!
1243	SUBROUTINE tr_yx_fftx( f_in, f_out )
1244
1245
1246	USE control_parameters, &
1247	ONLY: loop_optimization
1248
1249	USE cpulog, &
1250	ONLY: cpu_log, log_point_s
1251
1252	USE kinds
1253
1254	USE pegrid
1255
1256	IMPLICIT NONE
1257
1258	INTEGER(iwp) :: i !<
1259	INTEGER(iwp) :: j !<
1260	INTEGER(iwp) :: k !<
1261
1262	REAL(wp), DIMENSION(0:nx,1:nz,nys:nyn) :: work_fftx !<
1263	REAL(wp), DIMENSION(nny,1:nz,nxl_y:nxr_y,pdims(2)) :: f_in !<
1264	REAL(wp), DIMENSION(1:nz,nys:nyn,0:nx) :: f_out !<
1265	REAL(wp), DIMENSION(nys:nyn,1:nz,0:nx) :: work !<
1266
1267	!
1268	!-- Transpose array
1269	#if defined( __parallel )
1270	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
1271	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
1272	CALL MPI_ALLTOALL( f_in(1,1,nxl_y,1), sendrecvcount_xy, MPI_REAL, &
1273	work(nys,1,0), sendrecvcount_xy, MPI_REAL, &
1274	comm1dy, ierr )
1275	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
1276	#endif
1277
1278	!
1279	!-- Carry out the FFT along x, where all data are present due to the
1280	!-- 1d-decomposition along y. Resort the data in a way that y becomes
1281	!-- the first index.
1282	CALL cpu_log( log_point_s(4), 'fft_x_1d', 'continue' )
1283
1284	IF ( loop_optimization == 'vector' ) THEN
1285	!
1286	!-- Code optimized for vector processors
1287	!$OMP PARALLEL PRIVATE ( i, j, k )
1288	!$OMP DO
1289	DO j = nys, nyn
1290
1291	DO k = 1, nz
1292	DO i = 0, nx
1293	work_fftx(i,k,j) = work(j,k,i)
1294	ENDDO
1295	ENDDO
1296
1297	CALL fft_x_m( work_fftx(:,:,j), 'backward' )
1298
1299	ENDDO
1300
1301	!$OMP DO
1302	DO i = 0, nx
1303	DO j = nys, nyn
1304	DO k = 1, nz
1305	f_out(k,j,i) = work_fftx(i,k,j)
1306	ENDDO
1307	ENDDO
1308	ENDDO
1309	!$OMP END PARALLEL
1310
1311	ELSE
1312
1313	!
1314	!-- Cache optimized code (there might be still a potential for better
1315	!-- optimization).
1316	!$OMP PARALLEL PRIVATE (i,j,k)
1317	!$OMP DO
1318	DO j = nys, nyn
1319	DO k = 1, nz
1320
1321	DO i = 0, nx
1322	work_fftx(i,k,j) = work(j,k,i)
1323	ENDDO
1324
1325	CALL fft_x_1d( work_fftx(0:nx,k,j), 'backward' )
1326
1327	ENDDO
1328	ENDDO
1329
1330	!$OMP DO
1331	DO i = 0, nx
1332	DO j = nys, nyn
1333	DO k = 1, nz
1334	f_out(k,j,i) = work_fftx(i,k,j)
1335	ENDDO
1336	ENDDO
1337	ENDDO
1338	!$OMP END PARALLEL
1339
1340	ENDIF
1341	CALL cpu_log( log_point_s(4), 'fft_x_1d', 'stop' )
1342
1343	END SUBROUTINE tr_yx_fftx
1344
1345
1346	!------------------------------------------------------------------------------!
1347	! Description:
1348	! ------------
1349	!> FFT along y, solution of the tridiagonal system and backward FFT for
1350	!> a 1d-decomposition along y.
1351	!>
1352	!> @warning this subroutine may still not work for hybrid parallelization
1353	!> with OpenMP (for possible necessary changes see the original
1354	!> routine poisfft_hybrid, developed by Klaus Ketelsen, May 2002)
1355	!------------------------------------------------------------------------------!
1356	SUBROUTINE ffty_tri_ffty( ar )
1357
1358
1359	USE control_parameters, &
1360	ONLY: loop_optimization
1361
1362	USE cpulog, &
1363	ONLY: cpu_log, log_point_s
1364
1365	USE grid_variables, &
1366	ONLY: ddx2, ddy2
1367
1368	USE kinds
1369
1370	USE pegrid
1371
1372	IMPLICIT NONE
1373
1374	INTEGER(iwp) :: i !<
1375	INTEGER(iwp) :: j !<
1376	INTEGER(iwp) :: k !<
1377	INTEGER(iwp) :: m !<
1378	INTEGER(iwp) :: n !<
1379	INTEGER(iwp) :: omp_get_thread_num !<
1380	INTEGER(iwp) :: tn !<
1381
1382	REAL(wp), DIMENSION(0:ny) :: work_ffty !<
1383	REAL(wp), DIMENSION(0:ny,1:nz) :: work_triy !<
1384	REAL(wp), DIMENSION(nny,1:nz,nxl_y:nxr_y,pdims(2)) :: ar !<
1385	REAL(wp), DIMENSION(:,:,:,:), ALLOCATABLE :: tri !<
1386
1387
1388	CALL cpu_log( log_point_s(39), 'fft_y_1d + tridia', 'start' )
1389
1390	ALLOCATE( tri(5,0:ny,0:nz-1,0:threads_per_task-1) )
1391
1392	tn = 0 ! Default thread number in case of one thread
1393	!$OMP PARALLEL DO PRIVATE ( i, j, k, m, n, tn, work_ffty, work_triy )
1394	DO i = nxl_y, nxr_y
1395
1396	!$ tn = omp_get_thread_num()
1397
1398	IF ( loop_optimization == 'vector' ) THEN
1399	!
1400	!-- Code optimized for vector processors
1401	DO k = 1, nz
1402
1403	m = 0
1404	DO n = 1, pdims(2)
1405	DO j = 1, nny
1406	work_triy(m,k) = ar(j,k,i,n)
1407	m = m + 1
1408	ENDDO
1409	ENDDO
1410
1411	ENDDO
1412
1413	CALL fft_y_m( work_triy, ny, 'forward' )
1414
1415	ELSE
1416	!
1417	!-- Cache optimized code
1418	DO k = 1, nz
1419
1420	m = 0
1421	DO n = 1, pdims(2)
1422	DO j = 1, nny
1423	work_ffty(m) = ar(j,k,i,n)
1424	m = m + 1
1425	ENDDO
1426	ENDDO
1427
1428	CALL fft_y_1d( work_ffty, 'forward' )
1429
1430	DO j = 0, ny
1431	work_triy(j,k) = work_ffty(j)
1432	ENDDO
1433
1434	ENDDO
1435
1436	ENDIF
1437
1438	!
1439	!-- Solve the linear equation system
1440	CALL tridia_1dd( ddy2, ddx2, ny, nx, i, work_triy, tri(:,:,:,tn) )
1441
1442	IF ( loop_optimization == 'vector' ) THEN
1443	!
1444	!-- Code optimized for vector processors
1445	CALL fft_y_m( work_triy, ny, 'backward' )
1446
1447	DO k = 1, nz
1448
1449	m = 0
1450	DO n = 1, pdims(2)
1451	DO j = 1, nny
1452	ar(j,k,i,n) = work_triy(m,k)
1453	m = m + 1
1454	ENDDO
1455	ENDDO
1456
1457	ENDDO
1458
1459	ELSE
1460	!
1461	!-- Cache optimized code
1462	DO k = 1, nz
1463
1464	DO j = 0, ny
1465	work_ffty(j) = work_triy(j,k)
1466	ENDDO
1467
1468	CALL fft_y_1d( work_ffty, 'backward' )
1469
1470	m = 0
1471	DO n = 1, pdims(2)
1472	DO j = 1, nny
1473	ar(j,k,i,n) = work_ffty(m)
1474	m = m + 1
1475	ENDDO
1476	ENDDO
1477
1478	ENDDO
1479
1480	ENDIF
1481
1482	ENDDO
1483
1484	DEALLOCATE( tri )
1485
1486	CALL cpu_log( log_point_s(39), 'fft_y_1d + tridia', 'stop' )
1487
1488	END SUBROUTINE ffty_tri_ffty
1489
1490	END MODULE poisfft_mod

Note: See TracBrowser for help on using the repository browser.

Download in other formats:

| Impressum | ©Leibniz Universität Hannover |