Home

Context Navigation

source: palm/trunk/SOURCE/poisfft_mod.f90 @ 3193

Last change on this file since 3193 was 2718, checked in by maronga, 7 years ago
deleting of deprecated files; headers updated where needed
Property svn:keywords set to `Id`
File size: 45.6 KB

Line
1	!> @file poisfft_mod.f90
2	!------------------------------------------------------------------------------!
3	! This file is part of the PALM model system.
4	!
5	! PALM is free software: you can redistribute it and/or modify it under the
6	! terms of the GNU General Public License as published by the Free Software
7	! Foundation, either version 3 of the License, or (at your option) any later
8	! version.
9	!
10	! PALM is distributed in the hope that it will be useful, but WITHOUT ANY
11	! WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
12	! A PARTICULAR PURPOSE. See the GNU General Public License for more details.
13	!
14	! You should have received a copy of the GNU General Public License along with
15	! PALM. If not, see <http://www.gnu.org/licenses/>.
16	!
17	! Copyright 1997-2018 Leibniz Universitaet Hannover
18	!------------------------------------------------------------------------------!
19	!
20	! Current revisions:
21	! -----------------
22	!
23	!
24	! Former revisions:
25	! -----------------
26	! $Id: poisfft_mod.f90 2718 2018-01-02 08:49:38Z witha $
27	! Corrected "Former revisions" section
28	!
29	! 2696 2017-12-14 17:12:51Z kanani
30	! Change in file header (GPL part)
31	!
32	! 2300 2017-06-29 13:31:14Z raasch
33	! settings depending on host variable removed or replaced by loop_optimization
34	!
35	! 2119 2017-01-17 16:51:50Z raasch
36	!
37	! 2118 2017-01-17 16:38:49Z raasch
38	! OpenACC directives and related code removed
39	!
40	! 2000 2016-08-20 18:09:15Z knoop
41	! Forced header and separation lines into 80 columns
42	!
43	! 1850 2016-04-08 13:29:27Z maronga
44	! Module renamed
45	!
46	!
47	! 1804 2016-04-05 16:30:18Z maronga
48	! Removed code for parameter file check (__check)
49	!
50	! 1682 2015-10-07 23:56:08Z knoop
51	! Code annotations made doxygen readable
52	!
53	! 1482 2014-10-18 12:34:45Z raasch
54	! use 2d-decomposition, if accelerator boards are used
55	!
56	! 1406 2014-05-16 13:47:01Z raasch
57	! bugfix for pgi 14.4: declare create moved after array declaration
58	!
59	! 1320 2014-03-20 08:40:49Z raasch
60	! ONLY-attribute added to USE-statements,
61	! kind-parameters added to all INTEGER and REAL declaration statements,
62	! kinds are defined in new module kinds,
63	! old module precision_kind is removed,
64	! revision history before 2012 removed,
65	! comment fields (!:) to be used for variable explanations added to
66	! all variable declaration statements
67	!
68	! 1318 2014-03-17 13:35:16Z raasch
69	! module interfaces removed
70	!
71	! 1306 2014-03-13 14:30:59Z raasch
72	! openmp sections removed from the overlap branch,
73	! second argument removed from parameter list
74	!
75	! 1216 2013-08-26 09:31:42Z raasch
76	! resorting of arrays moved to separate routines resort_for_...,
77	! one argument, used as temporary work array, removed from all transpose
78	! routines
79	! overlapping fft / transposition implemented
80	!
81	! 1212 2013-08-15 08:46:27Z raasch
82	! tridia routines moved to seperate module tridia_solver
83	!
84	! 1208 2013-08-13 06:41:49Z raasch
85	! acc-update clauses added for "ar" so that ffts other than cufft can also be
86	! used (although they are not ported and will give a poor performance)
87	!
88	! 1111 2013-03-08 23:54:10Z raasch
89	! further openACC porting of non-parallel (MPI) branch:
90	! tridiagonal routines split into extermal subroutines (instead using CONTAINS),
91	! no distinction between parallel/non-parallel in poisfft and tridia any more,
92	! tridia routines moved to end of file because of probable bug in PGI compiler 12.5
93	! (otherwise "invalid device function" is indicated during runtime),
94	! optimization of tridia routines: constant elements and coefficients of tri are
95	! stored in seperate arrays ddzuw and tric, last dimension of tri reduced from 5
96	! to 2,
97	! poisfft_init is now called internally from poisfft, maketri is called from
98	! poisfft_init,
99	! ibc_p_b = 2 removed
100	!
101	! 1106 2013-03-04 05:31:38Z raasch
102	! routines fftx, ffty, fftxp, fftyp removed, calls replaced by fft_x, fft_y,
103	! in the 1D-decomposition routines fft_x, ffty are replaced by fft_x_1d,
104	! fft_y_1d
105	!
106	! 1103 2013-02-20 02:15:53Z raasch
107	! tri, ar, and ar1 arguments in tridia-routines (2d) are removed because they
108	! sometimes cause segmentation faults with intel 12.1 compiler
109	!
110	! 1092 2013-02-02 11:24:22Z raasch
111	! unused variables removed
112	!
113	! 1036 2012-10-22 13:43:42Z raasch
114	! code put under GPL (PALM 3.9)
115	!
116	! 2012-09-21 07:03:55Z raasch
117	! FLOAT type conversion replaced by REAL
118	!
119	! 1003 2012-09-14 14:35:53Z raasch
120	! indices nxa, nya, etc. replaced by nx, ny, etc.
121	!
122	! 940 2012-07-09 14:31:00Z raasch
123	! special handling of tri-array as an argument in tridia_1dd routines switched
124	! off because it caused segmentation faults with intel 12.1 compiler
125	!
126	! 877 2012-04-03 11:21:44Z suehring
127	! Bugfix: Avoid divisions by zero in case of using a 'neumann' bc for the
128	! pressure at the top of the model domain.
129	!
130	! 809 2012-01-30 13:32:58Z maronga
131	! Bugfix: replaced .AND. and .NOT. with && and ! in the preprocessor directives
132	!
133	! 807 2012-01-25 11:53:51Z maronga
134	! New cpp directive "__check" implemented which is used by check_namelist_files
135	! (most of the code is unneeded by check_namelist_files).
136	!
137	! Revision 1.1 1997/07/24 11:24:14 raasch
138	! Initial revision
139	!
140	!
141	! Description:
142	! ------------
143	!> Solves the Poisson equation with a 2D spectral method
144	!> d^2 p / dx^2 + d^2 p / dy^2 + d^2 p / dz^2 = s
145	!>
146	!> Input:
147	!> real ar contains (nnz,nny,nnx) elements of the velocity divergence,
148	!> starting from (1,nys,nxl)
149	!>
150	!> Output:
151	!> real ar contains the solution for perturbation pressure p
152	!------------------------------------------------------------------------------!
153	MODULE poisfft_mod
154
155
156	USE fft_xy, &
157	ONLY: fft_init, fft_y, fft_y_1d, fft_y_m, fft_x, fft_x_1d, fft_x_m
158
159	USE indices, &
160	ONLY: nnx, nny, nx, nxl, nxr, ny, nys, nyn, nz
161
162	USE transpose_indices, &
163	ONLY: nxl_y, nxl_z, nxr_y, nxr_z, nys_x, nys_z, nyn_x, nyn_z, nzb_x, &
164	nzb_y, nzt_x, nzt_y
165
166	USE tridia_solver, &
167	ONLY: tridia_1dd, tridia_init, tridia_substi, tridia_substi_overlap
168
169	IMPLICIT NONE
170
171	LOGICAL, SAVE :: poisfft_initialized = .FALSE.
172
173	PRIVATE
174
175	PUBLIC poisfft, poisfft_init
176
177	INTERFACE poisfft
178	MODULE PROCEDURE poisfft
179	END INTERFACE poisfft
180
181	INTERFACE poisfft_init
182	MODULE PROCEDURE poisfft_init
183	END INTERFACE poisfft_init
184
185
186	CONTAINS
187
188	!------------------------------------------------------------------------------!
189	! Description:
190	! ------------
191	!> @todo Missing subroutine description.
192	!------------------------------------------------------------------------------!
193	SUBROUTINE poisfft_init
194
195	USE arrays_3d, &
196	ONLY: ddzu_pres, ddzw
197
198	USE kinds
199
200	IMPLICIT NONE
201
202	INTEGER(iwp) :: k !<
203
204
205	CALL fft_init
206
207	CALL tridia_init
208
209	poisfft_initialized = .TRUE.
210
211	END SUBROUTINE poisfft_init
212
213
214
215	!------------------------------------------------------------------------------!
216	! Description:
217	! ------------
218	!> Two-dimensional Fourier Transformation in x- and y-direction.
219	!------------------------------------------------------------------------------!
220	SUBROUTINE poisfft( ar )
221
222	USE control_parameters, &
223	ONLY: fft_method, transpose_compute_overlap
224
225	USE cpulog, &
226	ONLY: cpu_log, cpu_log_nowait, log_point_s
227
228	USE kinds
229
230	USE pegrid
231
232	IMPLICIT NONE
233
234	INTEGER(iwp) :: ii !<
235	INTEGER(iwp) :: iind !<
236	INTEGER(iwp) :: inew !<
237	INTEGER(iwp) :: jj !<
238	INTEGER(iwp) :: jind !<
239	INTEGER(iwp) :: jnew !<
240	INTEGER(iwp) :: ki !<
241	INTEGER(iwp) :: kk !<
242	INTEGER(iwp) :: knew !<
243	INTEGER(iwp) :: n !<
244	INTEGER(iwp) :: nblk !<
245	INTEGER(iwp) :: nnx_y !<
246	INTEGER(iwp) :: nny_z !<
247	INTEGER(iwp) :: nnz_t !<
248	INTEGER(iwp) :: nnz_x !<
249	INTEGER(iwp) :: nxl_y_bound !<
250	INTEGER(iwp) :: nxr_y_bound !<
251
252	INTEGER(iwp), DIMENSION(4) :: isave !<
253
254	REAL(wp), DIMENSION(1:nz,nys:nyn,nxl:nxr) :: ar !<
255	REAL(wp), DIMENSION(nys:nyn,nxl:nxr,1:nz) :: ar_inv !<
256
257	REAL(wp), DIMENSION(:,:,:), ALLOCATABLE :: ar1 !<
258	REAL(wp), DIMENSION(:,:,:), ALLOCATABLE :: f_in !<
259	REAL(wp), DIMENSION(:,:,:), ALLOCATABLE :: f_inv !<
260	REAL(wp), DIMENSION(:,:,:), ALLOCATABLE :: f_out_y !<
261	REAL(wp), DIMENSION(:,:,:), ALLOCATABLE :: f_out_z !<
262
263
264	CALL cpu_log( log_point_s(3), 'poisfft', 'start' )
265
266	IF ( .NOT. poisfft_initialized ) CALL poisfft_init
267
268	!
269	!-- Two-dimensional Fourier Transformation in x- and y-direction.
270	IF ( pdims(2) == 1 .AND. pdims(1) > 1 ) THEN
271
272	!
273	!-- 1d-domain-decomposition along x:
274	!-- FFT along y and transposition y --> x
275	CALL ffty_tr_yx( ar, ar )
276
277	!
278	!-- FFT along x, solving the tridiagonal system and backward FFT
279	CALL fftx_tri_fftx( ar )
280
281	!
282	!-- Transposition x --> y and backward FFT along y
283	CALL tr_xy_ffty( ar, ar )
284
285	ELSEIF ( pdims(1) == 1 .AND. pdims(2) > 1 ) THEN
286
287	!
288	!-- 1d-domain-decomposition along y:
289	!-- FFT along x and transposition x --> y
290	CALL fftx_tr_xy( ar, ar )
291
292	!
293	!-- FFT along y, solving the tridiagonal system and backward FFT
294	CALL ffty_tri_ffty( ar )
295
296	!
297	!-- Transposition y --> x and backward FFT along x
298	CALL tr_yx_fftx( ar, ar )
299
300	ELSEIF ( .NOT. transpose_compute_overlap ) THEN
301
302	!
303	!-- 2d-domain-decomposition or no decomposition (1 PE run)
304	!-- Transposition z --> x
305	CALL cpu_log( log_point_s(5), 'transpo forward', 'start' )
306	CALL resort_for_zx( ar, ar_inv )
307	CALL transpose_zx( ar_inv, ar )
308	CALL cpu_log( log_point_s(5), 'transpo forward', 'pause' )
309
310	CALL cpu_log( log_point_s(4), 'fft_x', 'start' )
311	CALL fft_x( ar, 'forward' )
312	CALL cpu_log( log_point_s(4), 'fft_x', 'pause' )
313
314	!
315	!-- Transposition x --> y
316	CALL cpu_log( log_point_s(5), 'transpo forward', 'continue' )
317	CALL resort_for_xy( ar, ar_inv )
318	CALL transpose_xy( ar_inv, ar )
319	CALL cpu_log( log_point_s(5), 'transpo forward', 'pause' )
320
321	CALL cpu_log( log_point_s(7), 'fft_y', 'start' )
322	CALL fft_y( ar, 'forward', ar_tr = ar, &
323	nxl_y_bound = nxl_y, nxr_y_bound = nxr_y, &
324	nxl_y_l = nxl_y, nxr_y_l = nxr_y )
325	CALL cpu_log( log_point_s(7), 'fft_y', 'pause' )
326
327	!
328	!-- Transposition y --> z
329	CALL cpu_log( log_point_s(5), 'transpo forward', 'continue' )
330	CALL resort_for_yz( ar, ar_inv )
331	CALL transpose_yz( ar_inv, ar )
332	CALL cpu_log( log_point_s(5), 'transpo forward', 'stop' )
333
334	!
335	!-- Solve the tridiagonal equation system along z
336	CALL cpu_log( log_point_s(6), 'tridia', 'start' )
337	CALL tridia_substi( ar )
338	CALL cpu_log( log_point_s(6), 'tridia', 'stop' )
339
340	!
341	!-- Inverse Fourier Transformation
342	!-- Transposition z --> y
343	CALL cpu_log( log_point_s(8), 'transpo invers', 'start' )
344	CALL transpose_zy( ar, ar_inv )
345	CALL resort_for_zy( ar_inv, ar )
346	CALL cpu_log( log_point_s(8), 'transpo invers', 'pause' )
347
348	CALL cpu_log( log_point_s(7), 'fft_y', 'continue' )
349	CALL fft_y( ar, 'backward', ar_tr = ar, &
350	nxl_y_bound = nxl_y, nxr_y_bound = nxr_y, &
351	nxl_y_l = nxl_y, nxr_y_l = nxr_y )
352	CALL cpu_log( log_point_s(7), 'fft_y', 'stop' )
353
354	!
355	!-- Transposition y --> x
356	CALL cpu_log( log_point_s(8), 'transpo invers', 'continue' )
357	CALL transpose_yx( ar, ar_inv )
358	CALL resort_for_yx( ar_inv, ar )
359	CALL cpu_log( log_point_s(8), 'transpo invers', 'pause' )
360
361	CALL cpu_log( log_point_s(4), 'fft_x', 'continue' )
362	CALL fft_x( ar, 'backward' )
363	CALL cpu_log( log_point_s(4), 'fft_x', 'stop' )
364
365	!
366	!-- Transposition x --> z
367	CALL cpu_log( log_point_s(8), 'transpo invers', 'continue' )
368	CALL transpose_xz( ar, ar_inv )
369	CALL resort_for_xz( ar_inv, ar )
370	CALL cpu_log( log_point_s(8), 'transpo invers', 'stop' )
371
372	ELSE
373
374	!
375	!-- 2d-domain-decomposition or no decomposition (1 PE run) with
376	!-- overlapping transposition / fft
377	!-- cputime logging must not use barriers, which would prevent overlapping
378	ALLOCATE( f_out_y(0:ny,nxl_y:nxr_y,nzb_y:nzt_y), &
379	f_out_z(0:nx,nys_x:nyn_x,nzb_x:nzt_x) )
380	!
381	!-- Transposition z --> x + subsequent fft along x
382	ALLOCATE( f_inv(nys:nyn,nxl:nxr,1:nz) )
383	CALL resort_for_zx( ar, f_inv )
384	!
385	!-- Save original indices and gridpoint counter
386	isave(1) = nz
387	isave(2) = nzb_x
388	isave(3) = nzt_x
389	isave(4) = sendrecvcount_zx
390	!
391	!-- Set new indices for transformation
392	nblk = nz / pdims(1)
393	nz = pdims(1)
394	nnz_x = 1
395	nzb_x = 1 + myidx * nnz_x
396	nzt_x = ( myidx + 1 ) * nnz_x
397	sendrecvcount_zx = nnx * nny * nnz_x
398
399	ALLOCATE( ar1(0:nx,nys_x:nyn_x,nzb_x:nzt_x) )
400	ALLOCATE( f_in(nys:nyn,nxl:nxr,1:nz) )
401
402	DO kk = 1, nblk
403
404	IF ( kk == 1 ) THEN
405	CALL cpu_log( log_point_s(5), 'transpo forward', 'start', cpu_log_nowait )
406	ELSE
407	CALL cpu_log( log_point_s(5), 'transpo forward', 'continue', cpu_log_nowait )
408	ENDIF
409
410	DO knew = 1, nz
411	ki = kk + nblk * ( knew - 1 )
412	f_in(:,:,knew) = f_inv(:,:,ki)
413	ENDDO
414
415	CALL transpose_zx( f_in, ar1(:,:,:))
416	CALL cpu_log( log_point_s(5), 'transpo forward', 'pause' )
417
418	IF ( kk == 1 ) THEN
419	CALL cpu_log( log_point_s(4), 'fft_x', 'start', cpu_log_nowait )
420	ELSE
421	CALL cpu_log( log_point_s(4), 'fft_x', 'continue', cpu_log_nowait )
422	ENDIF
423
424	n = isave(2) + kk - 1
425	CALL fft_x( ar1(:,:,:), 'forward', ar_2d = f_out_z(:,:,n))
426	CALL cpu_log( log_point_s(4), 'fft_x', 'pause' )
427
428	ENDDO
429	!
430	!-- Restore original indices/counters
431	nz = isave(1)
432	nzb_x = isave(2)
433	nzt_x = isave(3)
434	sendrecvcount_zx = isave(4)
435
436	DEALLOCATE( ar1, f_in, f_inv )
437
438	!
439	!-- Transposition x --> y + subsequent fft along y
440	ALLOCATE( f_inv(nys_x:nyn_x,nzb_x:nzt_x,0:nx) )
441	CALL resort_for_xy( f_out_z, f_inv )
442	!
443	!-- Save original indices and gridpoint counter
444	isave(1) = nx
445	isave(2) = nxl_y
446	isave(3) = nxr_y
447	isave(4) = sendrecvcount_xy
448	!
449	!-- Set new indices for transformation
450	nblk = ( ( nx+1 ) / pdims(2) ) - 1
451	nx = pdims(2)
452	nnx_y = 1
453	nxl_y = myidy * nnx_y
454	nxr_y = ( myidy + 1 ) * nnx_y - 1
455	sendrecvcount_xy = nnx_y * ( nyn_x-nys_x+1 ) * ( nzt_x-nzb_x+1 )
456
457	ALLOCATE( ar1(0:ny,nxl_y:nxr_y,nzb_y:nzt_y) )
458	ALLOCATE( f_in(nys_x:nyn_x,nzb_x:nzt_x,0:nx) )
459
460	DO ii = 0, nblk
461
462	CALL cpu_log( log_point_s(5), 'transpo forward', 'continue', cpu_log_nowait )
463
464	DO inew = 0, nx-1
465	iind = ii + ( nblk + 1 ) * inew
466	f_in(:,:,inew) = f_inv(:,:,iind)
467	ENDDO
468
469	CALL transpose_xy( f_in, ar1(:,:,:) )
470
471	CALL cpu_log( log_point_s(5), 'transpo forward', 'pause' )
472
473	IF ( ii == 1 ) THEN
474	CALL cpu_log( log_point_s(7), 'fft_y', 'start', cpu_log_nowait )
475	ELSE
476	CALL cpu_log( log_point_s(7), 'fft_y', 'continue', cpu_log_nowait )
477	ENDIF
478
479	nxl_y_bound = isave(2)
480	nxr_y_bound = isave(3)
481	n = isave(2) + ii
482	CALL fft_y( ar1(:,:,:), 'forward', ar_tr = f_out_y, &
483	nxl_y_bound = nxl_y_bound, nxr_y_bound = nxr_y_bound, &
484	nxl_y_l = n, nxr_y_l = n )
485
486	CALL cpu_log( log_point_s(7), 'fft_y', 'pause' )
487
488	ENDDO
489	!
490	!-- Restore original indices/counters
491	nx = isave(1)
492	nxl_y = isave(2)
493	nxr_y = isave(3)
494	sendrecvcount_xy = isave(4)
495
496	DEALLOCATE( ar1, f_in, f_inv )
497
498	!
499	!-- Transposition y --> z + subsequent tridia + resort for z --> y
500	ALLOCATE( f_inv(nxl_y:nxr_y,nzb_y:nzt_y,0:ny) )
501	CALL resort_for_yz( f_out_y, f_inv )
502	!
503	!-- Save original indices and gridpoint counter
504	isave(1) = ny
505	isave(2) = nys_z
506	isave(3) = nyn_z
507	isave(4) = sendrecvcount_yz
508	!
509	!-- Set new indices for transformation
510	nblk = ( ( ny+1 ) / pdims(1) ) - 1
511	ny = pdims(1)
512	nny_z = 1
513	nys_z = myidx * nny_z
514	nyn_z = ( myidx + 1 ) * nny_z - 1
515	sendrecvcount_yz = ( nxr_y-nxl_y+1 ) * nny_z * ( nzt_y-nzb_y+1 )
516
517	ALLOCATE( ar1(nxl_z:nxr_z,nys_z:nyn_z,1:nz) )
518	ALLOCATE( f_in(nxl_y:nxr_y,nzb_y:nzt_y,0:ny) )
519
520	DO jj = 0, nblk
521	!
522	!-- Forward Fourier Transformation
523	!-- Transposition y --> z
524	CALL cpu_log( log_point_s(5), 'transpo forward', 'continue', cpu_log_nowait )
525
526	DO jnew = 0, ny-1
527	jind = jj + ( nblk + 1 ) * jnew
528	f_in(:,:,jnew) = f_inv(:,:,jind)
529	ENDDO
530
531	CALL transpose_yz( f_in, ar1(:,:,:) )
532
533	IF ( jj == nblk ) THEN
534	CALL cpu_log( log_point_s(5), 'transpo forward', 'stop' )
535	ELSE
536	CALL cpu_log( log_point_s(5), 'transpo forward', 'pause' )
537	ENDIF
538
539	!
540	!-- Solve the tridiagonal equation system along z
541	CALL cpu_log( log_point_s(6), 'tridia', 'start', cpu_log_nowait )
542
543	n = isave(2) + jj
544	CALL tridia_substi_overlap( ar1(:,:,:), n )
545
546	CALL cpu_log( log_point_s(6), 'tridia', 'stop' )
547
548	!
549	!-- Inverse Fourier Transformation
550	!-- Transposition z --> y
551	!-- Only one thread should call MPI routines, therefore forward and
552	!-- backward tranpose are in the same section
553	IF ( jj == 0 ) THEN
554	CALL cpu_log( log_point_s(8), 'transpo invers', 'start', cpu_log_nowait )
555	ELSE
556	CALL cpu_log( log_point_s(8), 'transpo invers', 'continue', cpu_log_nowait )
557	ENDIF
558
559	CALL transpose_zy( ar1(:,:,:), f_in )
560
561	DO jnew = 0, ny-1
562	jind = jj + ( nblk + 1 ) * jnew
563	f_inv(:,:,jind) = f_in(:,:,jnew)
564	ENDDO
565
566	CALL cpu_log( log_point_s(8), 'transpo invers', 'pause' )
567
568	ENDDO
569	!
570	!-- Restore original indices/counters
571	ny = isave(1)
572	nys_z = isave(2)
573	nyn_z = isave(3)
574	sendrecvcount_yz = isave(4)
575
576	CALL resort_for_zy( f_inv, f_out_y )
577
578	DEALLOCATE( ar1, f_in, f_inv )
579
580	!
581	!-- fft along y backward + subsequent transposition y --> x
582	ALLOCATE( f_inv(nys_x:nyn_x,nzb_x:nzt_x,0:nx) )
583	!
584	!-- Save original indices and gridpoint counter
585	isave(1) = nx
586	isave(2) = nxl_y
587	isave(3) = nxr_y
588	isave(4) = sendrecvcount_xy
589	!
590	!-- Set new indices for transformation
591	nblk = (( nx+1 ) / pdims(2) ) - 1
592	nx = pdims(2)
593	nnx_y = 1
594	nxl_y = myidy * nnx_y
595	nxr_y = ( myidy + 1 ) * nnx_y - 1
596	sendrecvcount_xy = nnx_y * ( nyn_x-nys_x+1 ) * ( nzt_x-nzb_x+1 )
597
598	ALLOCATE( ar1(0:ny,nxl_y:nxr_y,nzb_y:nzt_y) )
599	ALLOCATE( f_in(nys_x:nyn_x,nzb_x:nzt_x,0:nx) )
600
601	DO ii = 0, nblk
602
603	CALL cpu_log( log_point_s(7), 'fft_y', 'continue', cpu_log_nowait )
604
605	n = isave(2) + ii
606	nxl_y_bound = isave(2)
607	nxr_y_bound = isave(3)
608
609	CALL fft_y( ar1(:,:,:), 'backward', ar_tr = f_out_y, &
610	nxl_y_bound = nxl_y_bound, nxr_y_bound = nxr_y_bound, &
611	nxl_y_l = n, nxr_y_l = n )
612
613	IF ( ii == nblk ) THEN
614	CALL cpu_log( log_point_s(7), 'fft_y', 'stop' )
615	ELSE
616	CALL cpu_log( log_point_s(7), 'fft_y', 'pause' )
617	ENDIF
618
619	CALL cpu_log( log_point_s(8), 'transpo invers', 'continue', cpu_log_nowait )
620
621	CALL transpose_yx( ar1(:,:,:), f_in )
622
623	DO inew = 0, nx-1
624	iind = ii + (nblk+1) * inew
625	f_inv(:,:,iind) = f_in(:,:,inew)
626	ENDDO
627
628	CALL cpu_log( log_point_s(8), 'transpo invers', 'pause' )
629
630	ENDDO
631	!
632	!-- Restore original indices/counters
633	nx = isave(1)
634	nxl_y = isave(2)
635	nxr_y = isave(3)
636	sendrecvcount_xy = isave(4)
637
638	CALL resort_for_yx( f_inv, f_out_z )
639
640	DEALLOCATE( ar1, f_in, f_inv )
641
642	!
643	!-- fft along x backward + subsequent final transposition x --> z
644	ALLOCATE( f_inv(nys:nyn,nxl:nxr,1:nz) )
645	!
646	!-- Save original indices and gridpoint counter
647	isave(1) = nz
648	isave(2) = nzb_x
649	isave(3) = nzt_x
650	isave(4) = sendrecvcount_zx
651	!
652	!-- Set new indices for transformation
653	nblk = nz / pdims(1)
654	nz = pdims(1)
655	nnz_x = 1
656	nzb_x = 1 + myidx * nnz_x
657	nzt_x = ( myidx + 1 ) * nnz_x
658	sendrecvcount_zx = nnx * nny * nnz_x
659
660	ALLOCATE( ar1(0:nx,nys_x:nyn_x,nzb_x:nzt_x) )
661	ALLOCATE( f_in(nys:nyn,nxl:nxr,1:nz) )
662
663	DO kk = 1, nblk
664
665	CALL cpu_log( log_point_s(4), 'fft_x', 'continue', cpu_log_nowait )
666
667	n = isave(2) + kk - 1
668	CALL fft_x( ar1(:,:,:), 'backward', f_out_z(:,:,n))
669
670	IF ( kk == nblk ) THEN
671	CALL cpu_log( log_point_s(4), 'fft_x', 'stop' )
672	ELSE
673	CALL cpu_log( log_point_s(4), 'fft_x', 'pause' )
674	ENDIF
675
676	CALL cpu_log( log_point_s(8), 'transpo invers', 'continue', cpu_log_nowait )
677
678	CALL transpose_xz( ar1(:,:,:), f_in )
679
680	DO knew = 1, nz
681	ki = kk + nblk * (knew-1)
682	f_inv(:,:,ki) = f_in(:,:,knew)
683	ENDDO
684
685	IF ( kk == nblk ) THEN
686	CALL cpu_log( log_point_s(8), 'transpo invers', 'stop' )
687	ELSE
688	CALL cpu_log( log_point_s(8), 'transpo invers', 'pause' )
689	ENDIF
690
691	ENDDO
692	!
693	!-- Restore original indices/counters
694	nz = isave(1)
695	nzb_x = isave(2)
696	nzt_x = isave(3)
697	sendrecvcount_zx = isave(4)
698
699	CALL resort_for_xz( f_inv, ar )
700
701	DEALLOCATE( ar1, f_in, f_inv )
702
703	ENDIF
704
705	CALL cpu_log( log_point_s(3), 'poisfft', 'stop' )
706
707	END SUBROUTINE poisfft
708
709
710	!------------------------------------------------------------------------------!
711	! Description:
712	! ------------
713	!> Fourier-transformation along y with subsequent transposition y --> x for
714	!> a 1d-decomposition along x.
715	!>
716	!> @attention The performance of this routine is much faster on the NEC-SX6,
717	!> if the first index of work_ffty_vec is odd. Otherwise
718	!> memory bank conflicts may occur (especially if the index is a
719	!> multiple of 128). That's why work_ffty_vec is dimensioned as
720	!> 0:ny+1.
721	!> Of course, this will not work if users are using an odd number
722	!> of gridpoints along y.
723	!------------------------------------------------------------------------------!
724	SUBROUTINE ffty_tr_yx( f_in, f_out )
725
726	USE control_parameters, &
727	ONLY: loop_optimization
728
729	USE cpulog, &
730	ONLY: cpu_log, log_point_s
731
732	USE kinds
733
734	USE pegrid
735
736	IMPLICIT NONE
737
738	INTEGER(iwp) :: i !<
739	INTEGER(iwp) :: iend !<
740	INTEGER(iwp) :: iouter !<
741	INTEGER(iwp) :: ir !<
742	INTEGER(iwp) :: j !<
743	INTEGER(iwp) :: k !<
744
745	INTEGER(iwp), PARAMETER :: stridex = 4 !<
746
747	REAL(wp), DIMENSION(1:nz,0:ny,nxl:nxr) :: f_in !<
748	REAL(wp), DIMENSION(nnx,1:nz,nys_x:nyn_x,pdims(1)) :: f_out !<
749	REAL(wp), DIMENSION(nxl:nxr,1:nz,0:ny) :: work !<
750
751	REAL(wp), DIMENSION(:,:), ALLOCATABLE :: work_ffty !<
752	REAL(wp), DIMENSION(:,:,:), ALLOCATABLE :: work_ffty_vec !<
753
754	!
755	!-- Carry out the FFT along y, where all data are present due to the
756	!-- 1d-decomposition along x. Resort the data in a way that x becomes
757	!-- the first index.
758	CALL cpu_log( log_point_s(7), 'fft_y_1d', 'start' )
759
760	IF ( loop_optimization == 'vector' ) THEN
761
762	ALLOCATE( work_ffty_vec(0:ny+1,1:nz,nxl:nxr) )
763	!
764	!-- Code optimized for vector processors
765	!$OMP PARALLEL PRIVATE ( i, j, k )
766	!$OMP DO
767	DO i = nxl, nxr
768
769	DO j = 0, ny
770	DO k = 1, nz
771	work_ffty_vec(j,k,i) = f_in(k,j,i)
772	ENDDO
773	ENDDO
774
775	CALL fft_y_m( work_ffty_vec(:,:,i), ny+1, 'forward' )
776
777	ENDDO
778
779	!$OMP DO
780	DO k = 1, nz
781	DO j = 0, ny
782	DO i = nxl, nxr
783	work(i,k,j) = work_ffty_vec(j,k,i)
784	ENDDO
785	ENDDO
786	ENDDO
787	!$OMP END PARALLEL
788
789	DEALLOCATE( work_ffty_vec )
790
791	ELSE
792	!
793	!-- Cache optimized code.
794	ALLOCATE( work_ffty(0:ny,stridex) )
795	!
796	!-- The i-(x-)direction is split into a strided outer loop and an inner
797	!-- loop for better cache performance
798	!$OMP PARALLEL PRIVATE (i,iend,iouter,ir,j,k,work_ffty)
799	!$OMP DO
800	DO iouter = nxl, nxr, stridex
801
802	iend = MIN( iouter+stridex-1, nxr ) ! Upper bound for inner i loop
803
804	DO k = 1, nz
805
806	DO i = iouter, iend
807
808	ir = i-iouter+1 ! counter within a stride
809	DO j = 0, ny
810	work_ffty(j,ir) = f_in(k,j,i)
811	ENDDO
812	!
813	!-- FFT along y
814	CALL fft_y_1d( work_ffty(:,ir), 'forward' )
815
816	ENDDO
817
818	!
819	!-- Resort
820	DO j = 0, ny
821	DO i = iouter, iend
822	work(i,k,j) = work_ffty(j,i-iouter+1)
823	ENDDO
824	ENDDO
825
826	ENDDO
827
828	ENDDO
829	!$OMP END PARALLEL
830
831	DEALLOCATE( work_ffty )
832
833	ENDIF
834
835	CALL cpu_log( log_point_s(7), 'fft_y_1d', 'pause' )
836
837	!
838	!-- Transpose array
839	#if defined( __parallel )
840	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
841	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
842	CALL MPI_ALLTOALL( work(nxl,1,0), sendrecvcount_xy, MPI_REAL, &
843	f_out(1,1,nys_x,1), sendrecvcount_xy, MPI_REAL, &
844	comm1dx, ierr )
845	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
846	#endif
847
848	END SUBROUTINE ffty_tr_yx
849
850
851	!------------------------------------------------------------------------------!
852	! Description:
853	! ------------
854	!> Transposition x --> y with a subsequent backward Fourier transformation for
855	!> a 1d-decomposition along x
856	!------------------------------------------------------------------------------!
857	SUBROUTINE tr_xy_ffty( f_in, f_out )
858
859	USE control_parameters, &
860	ONLY: loop_optimization
861
862	USE cpulog, &
863	ONLY: cpu_log, log_point_s
864
865	USE kinds
866
867	USE pegrid
868
869	IMPLICIT NONE
870
871	INTEGER(iwp) :: i !<
872	INTEGER(iwp) :: iend !<
873	INTEGER(iwp) :: iouter !<
874	INTEGER(iwp) :: ir !<
875	INTEGER(iwp) :: j !<
876	INTEGER(iwp) :: k !<
877
878	INTEGER(iwp), PARAMETER :: stridex = 4 !<
879
880	REAL(wp), DIMENSION(nnx,1:nz,nys_x:nyn_x,pdims(1)) :: f_in !<
881	REAL(wp), DIMENSION(1:nz,0:ny,nxl:nxr) :: f_out !<
882	REAL(wp), DIMENSION(nxl:nxr,1:nz,0:ny) :: work !<
883
884	REAL(wp), DIMENSION(:,:), ALLOCATABLE :: work_ffty !<
885	REAL(wp), DIMENSION(:,:,:), ALLOCATABLE :: work_ffty_vec !<
886
887	!
888	!-- Transpose array
889	#if defined( __parallel )
890	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
891	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
892	CALL MPI_ALLTOALL( f_in(1,1,nys_x,1), sendrecvcount_xy, MPI_REAL, &
893	work(nxl,1,0), sendrecvcount_xy, MPI_REAL, &
894	comm1dx, ierr )
895	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
896	#endif
897
898	!
899	!-- Resort the data in a way that y becomes the first index and carry out the
900	!-- backward fft along y.
901	CALL cpu_log( log_point_s(7), 'fft_y_1d', 'continue' )
902
903	IF ( loop_optimization == 'vector' ) THEN
904
905	ALLOCATE( work_ffty_vec(0:ny+1,1:nz,nxl:nxr) )
906	!
907	!-- Code optimized for vector processors
908	!$OMP PARALLEL PRIVATE ( i, j, k )
909	!$OMP DO
910	DO k = 1, nz
911	DO j = 0, ny
912	DO i = nxl, nxr
913	work_ffty_vec(j,k,i) = work(i,k,j)
914	ENDDO
915	ENDDO
916	ENDDO
917
918	!$OMP DO
919	DO i = nxl, nxr
920
921	CALL fft_y_m( work_ffty_vec(:,:,i), ny+1, 'backward' )
922
923	DO j = 0, ny
924	DO k = 1, nz
925	f_out(k,j,i) = work_ffty_vec(j,k,i)
926	ENDDO
927	ENDDO
928
929	ENDDO
930	!$OMP END PARALLEL
931
932	DEALLOCATE( work_ffty_vec )
933
934	ELSE
935	!
936	!-- Cache optimized code.
937	ALLOCATE( work_ffty(0:ny,stridex) )
938	!
939	!-- The i-(x-)direction is split into a strided outer loop and an inner
940	!-- loop for better cache performance
941	!$OMP PARALLEL PRIVATE ( i, iend, iouter, ir, j, k, work_ffty )
942	!$OMP DO
943	DO iouter = nxl, nxr, stridex
944
945	iend = MIN( iouter+stridex-1, nxr ) ! Upper bound for inner i loop
946
947	DO k = 1, nz
948	!
949	!-- Resort
950	DO j = 0, ny
951	DO i = iouter, iend
952	work_ffty(j,i-iouter+1) = work(i,k,j)
953	ENDDO
954	ENDDO
955
956	DO i = iouter, iend
957
958	!
959	!-- FFT along y
960	ir = i-iouter+1 ! counter within a stride
961	CALL fft_y_1d( work_ffty(:,ir), 'backward' )
962
963	DO j = 0, ny
964	f_out(k,j,i) = work_ffty(j,ir)
965	ENDDO
966	ENDDO
967
968	ENDDO
969
970	ENDDO
971	!$OMP END PARALLEL
972
973	DEALLOCATE( work_ffty )
974
975	ENDIF
976
977	CALL cpu_log( log_point_s(7), 'fft_y_1d', 'stop' )
978
979	END SUBROUTINE tr_xy_ffty
980
981
982	!------------------------------------------------------------------------------!
983	! Description:
984	! ------------
985	!> FFT along x, solution of the tridiagonal system and backward FFT for
986	!> a 1d-decomposition along x
987	!>
988	!> @warning this subroutine may still not work for hybrid parallelization
989	!> with OpenMP (for possible necessary changes see the original
990	!> routine poisfft_hybrid, developed by Klaus Ketelsen, May 2002)
991	!------------------------------------------------------------------------------!
992	SUBROUTINE fftx_tri_fftx( ar )
993
994	USE control_parameters, &
995	ONLY: loop_optimization
996
997	USE cpulog, &
998	ONLY: cpu_log, log_point_s
999
1000	USE grid_variables, &
1001	ONLY: ddx2, ddy2
1002
1003	USE kinds
1004
1005	USE pegrid
1006
1007	IMPLICIT NONE
1008
1009	INTEGER(iwp) :: i !<
1010	INTEGER(iwp) :: j !<
1011	INTEGER(iwp) :: k !<
1012	INTEGER(iwp) :: m !<
1013	INTEGER(iwp) :: n !<
1014	INTEGER(iwp) :: omp_get_thread_num !<
1015	INTEGER(iwp) :: tn !<
1016
1017	REAL(wp), DIMENSION(0:nx) :: work_fftx !<
1018	REAL(wp), DIMENSION(0:nx,1:nz) :: work_trix !<
1019	REAL(wp), DIMENSION(nnx,1:nz,nys_x:nyn_x,pdims(1)) :: ar !<
1020	REAL(wp), DIMENSION(:,:,:,:), ALLOCATABLE :: tri !<
1021
1022
1023	CALL cpu_log( log_point_s(33), 'fft_x_1d + tridia', 'start' )
1024
1025	ALLOCATE( tri(5,0:nx,0:nz-1,0:threads_per_task-1) )
1026
1027	tn = 0 ! Default thread number in case of one thread
1028	!$OMP PARALLEL DO PRIVATE ( i, j, k, m, n, tn, work_fftx, work_trix )
1029	DO j = nys_x, nyn_x
1030
1031	!$ tn = omp_get_thread_num()
1032
1033	IF ( loop_optimization == 'vector' ) THEN
1034	!
1035	!-- Code optimized for vector processors
1036	DO k = 1, nz
1037
1038	m = 0
1039	DO n = 1, pdims(1)
1040	DO i = 1, nnx
1041	work_trix(m,k) = ar(i,k,j,n)
1042	m = m + 1
1043	ENDDO
1044	ENDDO
1045
1046	ENDDO
1047
1048	CALL fft_x_m( work_trix, 'forward' )
1049
1050	ELSE
1051	!
1052	!-- Cache optimized code
1053	DO k = 1, nz
1054
1055	m = 0
1056	DO n = 1, pdims(1)
1057	DO i = 1, nnx
1058	work_fftx(m) = ar(i,k,j,n)
1059	m = m + 1
1060	ENDDO
1061	ENDDO
1062
1063	CALL fft_x_1d( work_fftx, 'forward' )
1064
1065	DO i = 0, nx
1066	work_trix(i,k) = work_fftx(i)
1067	ENDDO
1068
1069	ENDDO
1070
1071	ENDIF
1072
1073	!
1074	!-- Solve the linear equation system
1075	CALL tridia_1dd( ddx2, ddy2, nx, ny, j, work_trix, tri(:,:,:,tn) )
1076
1077	IF ( loop_optimization == 'vector' ) THEN
1078	!
1079	!-- Code optimized for vector processors
1080	CALL fft_x_m( work_trix, 'backward' )
1081
1082	DO k = 1, nz
1083
1084	m = 0
1085	DO n = 1, pdims(1)
1086	DO i = 1, nnx
1087	ar(i,k,j,n) = work_trix(m,k)
1088	m = m + 1
1089	ENDDO
1090	ENDDO
1091
1092	ENDDO
1093
1094	ELSE
1095	!
1096	!-- Cache optimized code
1097	DO k = 1, nz
1098
1099	DO i = 0, nx
1100	work_fftx(i) = work_trix(i,k)
1101	ENDDO
1102
1103	CALL fft_x_1d( work_fftx, 'backward' )
1104
1105	m = 0
1106	DO n = 1, pdims(1)
1107	DO i = 1, nnx
1108	ar(i,k,j,n) = work_fftx(m)
1109	m = m + 1
1110	ENDDO
1111	ENDDO
1112
1113	ENDDO
1114
1115	ENDIF
1116
1117	ENDDO
1118
1119	DEALLOCATE( tri )
1120
1121	CALL cpu_log( log_point_s(33), 'fft_x_1d + tridia', 'stop' )
1122
1123	END SUBROUTINE fftx_tri_fftx
1124
1125
1126	!------------------------------------------------------------------------------!
1127	! Description:
1128	! ------------
1129	!> Fourier-transformation along x with subsequent transposition x --> y for
1130	!> a 1d-decomposition along y.
1131	!>
1132	!> @attention NEC-branch of this routine may significantly profit from
1133	!> further optimizations. So far, performance is much worse than
1134	!> for routine ffty_tr_yx (more than three times slower).
1135	!------------------------------------------------------------------------------!
1136	SUBROUTINE fftx_tr_xy( f_in, f_out )
1137
1138
1139	USE control_parameters, &
1140	ONLY: loop_optimization
1141
1142	USE cpulog, &
1143	ONLY: cpu_log, log_point_s
1144
1145	USE kinds
1146
1147	USE pegrid
1148
1149	IMPLICIT NONE
1150
1151	INTEGER(iwp) :: i !<
1152	INTEGER(iwp) :: j !<
1153	INTEGER(iwp) :: k !<
1154
1155	REAL(wp), DIMENSION(0:nx,1:nz,nys:nyn) :: work_fftx !<
1156	REAL(wp), DIMENSION(1:nz,nys:nyn,0:nx) :: f_in !<
1157	REAL(wp), DIMENSION(nny,1:nz,nxl_y:nxr_y,pdims(2)) :: f_out !<
1158	REAL(wp), DIMENSION(nys:nyn,1:nz,0:nx) :: work !<
1159
1160	!
1161	!-- Carry out the FFT along x, where all data are present due to the
1162	!-- 1d-decomposition along y. Resort the data in a way that y becomes
1163	!-- the first index.
1164	CALL cpu_log( log_point_s(4), 'fft_x_1d', 'start' )
1165
1166	IF ( loop_optimization == 'vector' ) THEN
1167	!
1168	!-- Code for vector processors
1169	!$OMP PARALLEL PRIVATE ( i, j, k )
1170	!$OMP DO
1171	DO i = 0, nx
1172
1173	DO j = nys, nyn
1174	DO k = 1, nz
1175	work_fftx(i,k,j) = f_in(k,j,i)
1176	ENDDO
1177	ENDDO
1178
1179	ENDDO
1180
1181	!$OMP DO
1182	DO j = nys, nyn
1183
1184	CALL fft_x_m( work_fftx(:,:,j), 'forward' )
1185
1186	DO k = 1, nz
1187	DO i = 0, nx
1188	work(j,k,i) = work_fftx(i,k,j)
1189	ENDDO
1190	ENDDO
1191
1192	ENDDO
1193	!$OMP END PARALLEL
1194
1195	ELSE
1196
1197	!
1198	!-- Cache optimized code (there might be still a potential for better
1199	!-- optimization).
1200	!$OMP PARALLEL PRIVATE (i,j,k)
1201	!$OMP DO
1202	DO i = 0, nx
1203
1204	DO j = nys, nyn
1205	DO k = 1, nz
1206	work_fftx(i,k,j) = f_in(k,j,i)
1207	ENDDO
1208	ENDDO
1209
1210	ENDDO
1211
1212	!$OMP DO
1213	DO j = nys, nyn
1214	DO k = 1, nz
1215
1216	CALL fft_x_1d( work_fftx(0:nx,k,j), 'forward' )
1217
1218	DO i = 0, nx
1219	work(j,k,i) = work_fftx(i,k,j)
1220	ENDDO
1221	ENDDO
1222
1223	ENDDO
1224	!$OMP END PARALLEL
1225
1226	ENDIF
1227	CALL cpu_log( log_point_s(4), 'fft_x_1d', 'pause' )
1228
1229	!
1230	!-- Transpose array
1231	#if defined( __parallel )
1232	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
1233	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
1234	CALL MPI_ALLTOALL( work(nys,1,0), sendrecvcount_xy, MPI_REAL, &
1235	f_out(1,1,nxl_y,1), sendrecvcount_xy, MPI_REAL, &
1236	comm1dy, ierr )
1237	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
1238	#endif
1239
1240	END SUBROUTINE fftx_tr_xy
1241
1242
1243	!------------------------------------------------------------------------------!
1244	! Description:
1245	! ------------
1246	!> Transposition y --> x with a subsequent backward Fourier transformation for
1247	!> a 1d-decomposition along x.
1248	!------------------------------------------------------------------------------!
1249	SUBROUTINE tr_yx_fftx( f_in, f_out )
1250
1251
1252	USE control_parameters, &
1253	ONLY: loop_optimization
1254
1255	USE cpulog, &
1256	ONLY: cpu_log, log_point_s
1257
1258	USE kinds
1259
1260	USE pegrid
1261
1262	IMPLICIT NONE
1263
1264	INTEGER(iwp) :: i !<
1265	INTEGER(iwp) :: j !<
1266	INTEGER(iwp) :: k !<
1267
1268	REAL(wp), DIMENSION(0:nx,1:nz,nys:nyn) :: work_fftx !<
1269	REAL(wp), DIMENSION(nny,1:nz,nxl_y:nxr_y,pdims(2)) :: f_in !<
1270	REAL(wp), DIMENSION(1:nz,nys:nyn,0:nx) :: f_out !<
1271	REAL(wp), DIMENSION(nys:nyn,1:nz,0:nx) :: work !<
1272
1273	!
1274	!-- Transpose array
1275	#if defined( __parallel )
1276	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
1277	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
1278	CALL MPI_ALLTOALL( f_in(1,1,nxl_y,1), sendrecvcount_xy, MPI_REAL, &
1279	work(nys,1,0), sendrecvcount_xy, MPI_REAL, &
1280	comm1dy, ierr )
1281	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
1282	#endif
1283
1284	!
1285	!-- Carry out the FFT along x, where all data are present due to the
1286	!-- 1d-decomposition along y. Resort the data in a way that y becomes
1287	!-- the first index.
1288	CALL cpu_log( log_point_s(4), 'fft_x_1d', 'continue' )
1289
1290	IF ( loop_optimization == 'vector' ) THEN
1291	!
1292	!-- Code optimized for vector processors
1293	!$OMP PARALLEL PRIVATE ( i, j, k )
1294	!$OMP DO
1295	DO j = nys, nyn
1296
1297	DO k = 1, nz
1298	DO i = 0, nx
1299	work_fftx(i,k,j) = work(j,k,i)
1300	ENDDO
1301	ENDDO
1302
1303	CALL fft_x_m( work_fftx(:,:,j), 'backward' )
1304
1305	ENDDO
1306
1307	!$OMP DO
1308	DO i = 0, nx
1309	DO j = nys, nyn
1310	DO k = 1, nz
1311	f_out(k,j,i) = work_fftx(i,k,j)
1312	ENDDO
1313	ENDDO
1314	ENDDO
1315	!$OMP END PARALLEL
1316
1317	ELSE
1318
1319	!
1320	!-- Cache optimized code (there might be still a potential for better
1321	!-- optimization).
1322	!$OMP PARALLEL PRIVATE (i,j,k)
1323	!$OMP DO
1324	DO j = nys, nyn
1325	DO k = 1, nz
1326
1327	DO i = 0, nx
1328	work_fftx(i,k,j) = work(j,k,i)
1329	ENDDO
1330
1331	CALL fft_x_1d( work_fftx(0:nx,k,j), 'backward' )
1332
1333	ENDDO
1334	ENDDO
1335
1336	!$OMP DO
1337	DO i = 0, nx
1338	DO j = nys, nyn
1339	DO k = 1, nz
1340	f_out(k,j,i) = work_fftx(i,k,j)
1341	ENDDO
1342	ENDDO
1343	ENDDO
1344	!$OMP END PARALLEL
1345
1346	ENDIF
1347	CALL cpu_log( log_point_s(4), 'fft_x_1d', 'stop' )
1348
1349	END SUBROUTINE tr_yx_fftx
1350
1351
1352	!------------------------------------------------------------------------------!
1353	! Description:
1354	! ------------
1355	!> FFT along y, solution of the tridiagonal system and backward FFT for
1356	!> a 1d-decomposition along y.
1357	!>
1358	!> @warning this subroutine may still not work for hybrid parallelization
1359	!> with OpenMP (for possible necessary changes see the original
1360	!> routine poisfft_hybrid, developed by Klaus Ketelsen, May 2002)
1361	!------------------------------------------------------------------------------!
1362	SUBROUTINE ffty_tri_ffty( ar )
1363
1364
1365	USE control_parameters, &
1366	ONLY: loop_optimization
1367
1368	USE cpulog, &
1369	ONLY: cpu_log, log_point_s
1370
1371	USE grid_variables, &
1372	ONLY: ddx2, ddy2
1373
1374	USE kinds
1375
1376	USE pegrid
1377
1378	IMPLICIT NONE
1379
1380	INTEGER(iwp) :: i !<
1381	INTEGER(iwp) :: j !<
1382	INTEGER(iwp) :: k !<
1383	INTEGER(iwp) :: m !<
1384	INTEGER(iwp) :: n !<
1385	INTEGER(iwp) :: omp_get_thread_num !<
1386	INTEGER(iwp) :: tn !<
1387
1388	REAL(wp), DIMENSION(0:ny) :: work_ffty !<
1389	REAL(wp), DIMENSION(0:ny,1:nz) :: work_triy !<
1390	REAL(wp), DIMENSION(nny,1:nz,nxl_y:nxr_y,pdims(2)) :: ar !<
1391	REAL(wp), DIMENSION(:,:,:,:), ALLOCATABLE :: tri !<
1392
1393
1394	CALL cpu_log( log_point_s(39), 'fft_y_1d + tridia', 'start' )
1395
1396	ALLOCATE( tri(5,0:ny,0:nz-1,0:threads_per_task-1) )
1397
1398	tn = 0 ! Default thread number in case of one thread
1399	!$OMP PARALLEL DO PRIVATE ( i, j, k, m, n, tn, work_ffty, work_triy )
1400	DO i = nxl_y, nxr_y
1401
1402	!$ tn = omp_get_thread_num()
1403
1404	IF ( loop_optimization == 'vector' ) THEN
1405	!
1406	!-- Code optimized for vector processors
1407	DO k = 1, nz
1408
1409	m = 0
1410	DO n = 1, pdims(2)
1411	DO j = 1, nny
1412	work_triy(m,k) = ar(j,k,i,n)
1413	m = m + 1
1414	ENDDO
1415	ENDDO
1416
1417	ENDDO
1418
1419	CALL fft_y_m( work_triy, ny, 'forward' )
1420
1421	ELSE
1422	!
1423	!-- Cache optimized code
1424	DO k = 1, nz
1425
1426	m = 0
1427	DO n = 1, pdims(2)
1428	DO j = 1, nny
1429	work_ffty(m) = ar(j,k,i,n)
1430	m = m + 1
1431	ENDDO
1432	ENDDO
1433
1434	CALL fft_y_1d( work_ffty, 'forward' )
1435
1436	DO j = 0, ny
1437	work_triy(j,k) = work_ffty(j)
1438	ENDDO
1439
1440	ENDDO
1441
1442	ENDIF
1443
1444	!
1445	!-- Solve the linear equation system
1446	CALL tridia_1dd( ddy2, ddx2, ny, nx, i, work_triy, tri(:,:,:,tn) )
1447
1448	IF ( loop_optimization == 'vector' ) THEN
1449	!
1450	!-- Code optimized for vector processors
1451	CALL fft_y_m( work_triy, ny, 'backward' )
1452
1453	DO k = 1, nz
1454
1455	m = 0
1456	DO n = 1, pdims(2)
1457	DO j = 1, nny
1458	ar(j,k,i,n) = work_triy(m,k)
1459	m = m + 1
1460	ENDDO
1461	ENDDO
1462
1463	ENDDO
1464
1465	ELSE
1466	!
1467	!-- Cache optimized code
1468	DO k = 1, nz
1469
1470	DO j = 0, ny
1471	work_ffty(j) = work_triy(j,k)
1472	ENDDO
1473
1474	CALL fft_y_1d( work_ffty, 'backward' )
1475
1476	m = 0
1477	DO n = 1, pdims(2)
1478	DO j = 1, nny
1479	ar(j,k,i,n) = work_ffty(m)
1480	m = m + 1
1481	ENDDO
1482	ENDDO
1483
1484	ENDDO
1485
1486	ENDIF
1487
1488	ENDDO
1489
1490	DEALLOCATE( tri )
1491
1492	CALL cpu_log( log_point_s(39), 'fft_y_1d + tridia', 'stop' )
1493
1494	END SUBROUTINE ffty_tri_ffty
1495
1496	END MODULE poisfft_mod

Note: See TracBrowser for help on using the repository browser.

Download in other formats:

| Impressum | ©Leibniz Universität Hannover |