Home

Context Navigation

source: palm/trunk/SOURCE/poisfft.f90 @ 1682

Last change on this file since 1682 was 1682, checked in by knoop, 9 years ago
Code annotations made doxygen readable
Property svn:keywords set to `Id`
File size: 45.6 KB

Line
1	!> @file poisfft.f90
2	!--------------------------------------------------------------------------------!
3	! This file is part of PALM.
4	!
5	! PALM is free software: you can redistribute it and/or modify it under the terms
6	! of the GNU General Public License as published by the Free Software Foundation,
7	! either version 3 of the License, or (at your option) any later version.
8	!
9	! PALM is distributed in the hope that it will be useful, but WITHOUT ANY
10	! WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
11	! A PARTICULAR PURPOSE. See the GNU General Public License for more details.
12	!
13	! You should have received a copy of the GNU General Public License along with
14	! PALM. If not, see <http://www.gnu.org/licenses/>.
15	!
16	! Copyright 1997-2014 Leibniz Universitaet Hannover
17	!--------------------------------------------------------------------------------!
18	!
19	! Current revisions:
20	! -----------------
21	! Code annotations made doxygen readable
22	!
23	! Former revisions:
24	! -----------------
25	! $Id: poisfft.f90 1682 2015-10-07 23:56:08Z knoop $
26	!
27	! 1482 2014-10-18 12:34:45Z raasch
28	! use 2d-decomposition, if accelerator boards are used
29	!
30	! 1406 2014-05-16 13:47:01Z raasch
31	! bugfix for pgi 14.4: declare create moved after array declaration
32	!
33	! 1320 2014-03-20 08:40:49Z raasch
34	! ONLY-attribute added to USE-statements,
35	! kind-parameters added to all INTEGER and REAL declaration statements,
36	! kinds are defined in new module kinds,
37	! old module precision_kind is removed,
38	! revision history before 2012 removed,
39	! comment fields (!:) to be used for variable explanations added to
40	! all variable declaration statements
41	!
42	! 1318 2014-03-17 13:35:16Z raasch
43	! module interfaces removed
44	!
45	! 1306 2014-03-13 14:30:59Z raasch
46	! openmp sections removed from the overlap branch,
47	! second argument removed from parameter list
48	!
49	! 1216 2013-08-26 09:31:42Z raasch
50	! resorting of arrays moved to separate routines resort_for_...,
51	! one argument, used as temporary work array, removed from all transpose
52	! routines
53	! overlapping fft / transposition implemented
54	!
55	! 1212 2013-08-15 08:46:27Z raasch
56	! tridia routines moved to seperate module tridia_solver
57	!
58	! 1208 2013-08-13 06:41:49Z raasch
59	! acc-update clauses added for "ar" so that ffts other than cufft can also be
60	! used (although they are not ported and will give a poor performance)
61	!
62	! 1111 2013-03-08 23:54:10Z raasch
63	! further openACC porting of non-parallel (MPI) branch:
64	! tridiagonal routines split into extermal subroutines (instead using CONTAINS),
65	! no distinction between parallel/non-parallel in poisfft and tridia any more,
66	! tridia routines moved to end of file because of probable bug in PGI compiler 12.5
67	! (otherwise "invalid device function" is indicated during runtime),
68	! optimization of tridia routines: constant elements and coefficients of tri are
69	! stored in seperate arrays ddzuw and tric, last dimension of tri reduced from 5
70	! to 2,
71	! poisfft_init is now called internally from poisfft, maketri is called from
72	! poisfft_init,
73	! ibc_p_b = 2 removed
74	!
75	! 1106 2013-03-04 05:31:38Z raasch
76	! routines fftx, ffty, fftxp, fftyp removed, calls replaced by fft_x, fft_y,
77	! in the 1D-decomposition routines fft_x, ffty are replaced by fft_x_1d,
78	! fft_y_1d
79	!
80	! 1103 2013-02-20 02:15:53Z raasch
81	! tri, ar, and ar1 arguments in tridia-routines (2d) are removed because they
82	! sometimes cause segmentation faults with intel 12.1 compiler
83	!
84	! 1092 2013-02-02 11:24:22Z raasch
85	! unused variables removed
86	!
87	! 1036 2012-10-22 13:43:42Z raasch
88	! code put under GPL (PALM 3.9)
89	!
90	! 2012-09-21 07:03:55Z raasch
91	! FLOAT type conversion replaced by REAL
92	!
93	! 1003 2012-09-14 14:35:53Z raasch
94	! indices nxa, nya, etc. replaced by nx, ny, etc.
95	!
96	! 940 2012-07-09 14:31:00Z raasch
97	! special handling of tri-array as an argument in tridia_1dd routines switched
98	! off because it caused segmentation faults with intel 12.1 compiler
99	!
100	! 877 2012-04-03 11:21:44Z suehring
101	! Bugfix: Avoid divisions by zero in case of using a 'neumann' bc for the
102	! pressure at the top of the model domain.
103	!
104	! 809 2012-01-30 13:32:58Z maronga
105	! Bugfix: replaced .AND. and .NOT. with && and ! in the preprocessor directives
106	!
107	! 807 2012-01-25 11:53:51Z maronga
108	! New cpp directive "__check" implemented which is used by check_namelist_files
109	! (most of the code is unneeded by check_namelist_files).
110	!
111	! Revision 1.1 1997/07/24 11:24:14 raasch
112	! Initial revision
113	!
114	!
115	! Description:
116	! ------------
117	!> Solves the Poisson equation with a 2D spectral method
118	!> d^2 p / dx^2 + d^2 p / dy^2 + d^2 p / dz^2 = s
119	!>
120	!> Input:
121	!> real ar contains (nnz,nny,nnx) elements of the velocity divergence,
122	!> starting from (1,nys,nxl)
123	!>
124	!> Output:
125	!> real ar contains the solution for perturbation pressure p
126	!------------------------------------------------------------------------------!
127	MODULE poisfft_mod
128
129
130	USE fft_xy, &
131	ONLY: fft_init, fft_y, fft_y_1d, fft_y_m, fft_x, fft_x_1d, fft_x_m
132
133	USE indices, &
134	ONLY: nnx, nny, nx, nxl, nxr, ny, nys, nyn, nz
135
136	USE transpose_indices, &
137	ONLY: nxl_y, nxl_z, nxr_y, nxr_z, nys_x, nys_z, nyn_x, nyn_z, nzb_x, &
138	nzb_y, nzt_x, nzt_y
139
140	USE tridia_solver, &
141	ONLY: tridia_1dd, tridia_init, tridia_substi, tridia_substi_overlap
142
143	IMPLICIT NONE
144
145	LOGICAL, SAVE :: poisfft_initialized = .FALSE.
146
147	PRIVATE
148
149	#if ! defined ( __check )
150	PUBLIC poisfft, poisfft_init
151
152	INTERFACE poisfft
153	MODULE PROCEDURE poisfft
154	END INTERFACE poisfft
155
156	INTERFACE poisfft_init
157	MODULE PROCEDURE poisfft_init
158	END INTERFACE poisfft_init
159	#else
160	PUBLIC poisfft_init
161
162	INTERFACE poisfft_init
163	MODULE PROCEDURE poisfft_init
164	END INTERFACE poisfft_init
165	#endif
166
167	CONTAINS
168
169	!------------------------------------------------------------------------------!
170	! Description:
171	! ------------
172	!> @todo Missing subroutine description.
173	!------------------------------------------------------------------------------!
174	SUBROUTINE poisfft_init
175
176	USE arrays_3d, &
177	ONLY: ddzu_pres, ddzw
178
179	USE kinds
180
181	IMPLICIT NONE
182
183	INTEGER(iwp) :: k !<
184
185
186	CALL fft_init
187
188	CALL tridia_init
189
190	poisfft_initialized = .TRUE.
191
192	END SUBROUTINE poisfft_init
193
194
195	#if ! defined ( __check )
196	!------------------------------------------------------------------------------!
197	! Description:
198	! ------------
199	!> Two-dimensional Fourier Transformation in x- and y-direction.
200	!------------------------------------------------------------------------------!
201	SUBROUTINE poisfft( ar )
202
203	USE control_parameters, &
204	ONLY: fft_method, transpose_compute_overlap
205
206	USE cpulog, &
207	ONLY: cpu_log, cpu_log_nowait, log_point_s
208
209	USE kinds
210
211	USE pegrid
212
213	IMPLICIT NONE
214
215	INTEGER(iwp) :: ii !<
216	INTEGER(iwp) :: iind !<
217	INTEGER(iwp) :: inew !<
218	INTEGER(iwp) :: jj !<
219	INTEGER(iwp) :: jind !<
220	INTEGER(iwp) :: jnew !<
221	INTEGER(iwp) :: ki !<
222	INTEGER(iwp) :: kk !<
223	INTEGER(iwp) :: knew !<
224	INTEGER(iwp) :: n !<
225	INTEGER(iwp) :: nblk !<
226	INTEGER(iwp) :: nnx_y !<
227	INTEGER(iwp) :: nny_z !<
228	INTEGER(iwp) :: nnz_t !<
229	INTEGER(iwp) :: nnz_x !<
230	INTEGER(iwp) :: nxl_y_bound !<
231	INTEGER(iwp) :: nxr_y_bound !<
232
233	INTEGER(iwp), DIMENSION(4) :: isave !<
234
235	REAL(wp), DIMENSION(1:nz,nys:nyn,nxl:nxr) :: ar !<
236	REAL(wp), DIMENSION(nys:nyn,nxl:nxr,1:nz) :: ar_inv !<
237	!$acc declare create( ar_inv )
238
239	REAL(wp), DIMENSION(:,:,:), ALLOCATABLE :: ar1 !<
240	REAL(wp), DIMENSION(:,:,:), ALLOCATABLE :: f_in !<
241	REAL(wp), DIMENSION(:,:,:), ALLOCATABLE :: f_inv !<
242	REAL(wp), DIMENSION(:,:,:), ALLOCATABLE :: f_out_y !<
243	REAL(wp), DIMENSION(:,:,:), ALLOCATABLE :: f_out_z !<
244
245
246	CALL cpu_log( log_point_s(3), 'poisfft', 'start' )
247
248	IF ( .NOT. poisfft_initialized ) CALL poisfft_init
249
250	!
251	!-- Two-dimensional Fourier Transformation in x- and y-direction.
252	IF ( pdims(2) == 1 .AND. pdims(1) > 1 .AND. num_acc_per_node == 0 ) &
253	THEN
254
255	!
256	!-- 1d-domain-decomposition along x:
257	!-- FFT along y and transposition y --> x
258	CALL ffty_tr_yx( ar, ar )
259
260	!
261	!-- FFT along x, solving the tridiagonal system and backward FFT
262	CALL fftx_tri_fftx( ar )
263
264	!
265	!-- Transposition x --> y and backward FFT along y
266	CALL tr_xy_ffty( ar, ar )
267
268	ELSEIF ( pdims(1) == 1 .AND. pdims(2) > 1 .AND. num_acc_per_node == 0 ) &
269	THEN
270
271	!
272	!-- 1d-domain-decomposition along y:
273	!-- FFT along x and transposition x --> y
274	CALL fftx_tr_xy( ar, ar )
275
276	!
277	!-- FFT along y, solving the tridiagonal system and backward FFT
278	CALL ffty_tri_ffty( ar )
279
280	!
281	!-- Transposition y --> x and backward FFT along x
282	CALL tr_yx_fftx( ar, ar )
283
284	ELSEIF ( .NOT. transpose_compute_overlap ) THEN
285
286	!
287	!-- 2d-domain-decomposition or no decomposition (1 PE run)
288	!-- Transposition z --> x
289	CALL cpu_log( log_point_s(5), 'transpo forward', 'start' )
290	CALL resort_for_zx( ar, ar_inv )
291	CALL transpose_zx( ar_inv, ar )
292	CALL cpu_log( log_point_s(5), 'transpo forward', 'pause' )
293
294	CALL cpu_log( log_point_s(4), 'fft_x', 'start' )
295	IF ( fft_method /= 'system-specific' ) THEN
296	!$acc update host( ar )
297	ENDIF
298	CALL fft_x( ar, 'forward' )
299	IF ( fft_method /= 'system-specific' ) THEN
300	!$acc update device( ar )
301	ENDIF
302	CALL cpu_log( log_point_s(4), 'fft_x', 'pause' )
303
304	!
305	!-- Transposition x --> y
306	CALL cpu_log( log_point_s(5), 'transpo forward', 'continue' )
307	CALL resort_for_xy( ar, ar_inv )
308	CALL transpose_xy( ar_inv, ar )
309	CALL cpu_log( log_point_s(5), 'transpo forward', 'pause' )
310
311	CALL cpu_log( log_point_s(7), 'fft_y', 'start' )
312	IF ( fft_method /= 'system-specific' ) THEN
313	!$acc update host( ar )
314	ENDIF
315	CALL fft_y( ar, 'forward', ar_tr = ar, &
316	nxl_y_bound = nxl_y, nxr_y_bound = nxr_y, &
317	nxl_y_l = nxl_y, nxr_y_l = nxr_y )
318	IF ( fft_method /= 'system-specific' ) THEN
319	!$acc update device( ar )
320	ENDIF
321	CALL cpu_log( log_point_s(7), 'fft_y', 'pause' )
322
323	!
324	!-- Transposition y --> z
325	CALL cpu_log( log_point_s(5), 'transpo forward', 'continue' )
326	CALL resort_for_yz( ar, ar_inv )
327	CALL transpose_yz( ar_inv, ar )
328	CALL cpu_log( log_point_s(5), 'transpo forward', 'stop' )
329
330	!
331	!-- Solve the tridiagonal equation system along z
332	CALL cpu_log( log_point_s(6), 'tridia', 'start' )
333	CALL tridia_substi( ar )
334	CALL cpu_log( log_point_s(6), 'tridia', 'stop' )
335
336	!
337	!-- Inverse Fourier Transformation
338	!-- Transposition z --> y
339	CALL cpu_log( log_point_s(8), 'transpo invers', 'start' )
340	CALL transpose_zy( ar, ar_inv )
341	CALL resort_for_zy( ar_inv, ar )
342	CALL cpu_log( log_point_s(8), 'transpo invers', 'pause' )
343
344	CALL cpu_log( log_point_s(7), 'fft_y', 'continue' )
345	IF ( fft_method /= 'system-specific' ) THEN
346	!$acc update host( ar )
347	ENDIF
348	CALL fft_y( ar, 'backward', ar_tr = ar, &
349	nxl_y_bound = nxl_y, nxr_y_bound = nxr_y, &
350	nxl_y_l = nxl_y, nxr_y_l = nxr_y )
351	IF ( fft_method /= 'system-specific' ) THEN
352	!$acc update device( ar )
353	ENDIF
354	CALL cpu_log( log_point_s(7), 'fft_y', 'stop' )
355
356	!
357	!-- Transposition y --> x
358	CALL cpu_log( log_point_s(8), 'transpo invers', 'continue' )
359	CALL transpose_yx( ar, ar_inv )
360	CALL resort_for_yx( ar_inv, ar )
361	CALL cpu_log( log_point_s(8), 'transpo invers', 'pause' )
362
363	CALL cpu_log( log_point_s(4), 'fft_x', 'continue' )
364	IF ( fft_method /= 'system-specific' ) THEN
365	!$acc update host( ar )
366	ENDIF
367	CALL fft_x( ar, 'backward' )
368	IF ( fft_method /= 'system-specific' ) THEN
369	!$acc update device( ar )
370	ENDIF
371	CALL cpu_log( log_point_s(4), 'fft_x', 'stop' )
372
373	!
374	!-- Transposition x --> z
375	CALL cpu_log( log_point_s(8), 'transpo invers', 'continue' )
376	CALL transpose_xz( ar, ar_inv )
377	CALL resort_for_xz( ar_inv, ar )
378	CALL cpu_log( log_point_s(8), 'transpo invers', 'stop' )
379
380	ELSE
381
382	!
383	!-- 2d-domain-decomposition or no decomposition (1 PE run) with
384	!-- overlapping transposition / fft
385	!-- cputime logging must not use barriers, which would prevent overlapping
386	ALLOCATE( f_out_y(0:ny,nxl_y:nxr_y,nzb_y:nzt_y), &
387	f_out_z(0:nx,nys_x:nyn_x,nzb_x:nzt_x) )
388	!
389	!-- Transposition z --> x + subsequent fft along x
390	ALLOCATE( f_inv(nys:nyn,nxl:nxr,1:nz) )
391	CALL resort_for_zx( ar, f_inv )
392	!
393	!-- Save original indices and gridpoint counter
394	isave(1) = nz
395	isave(2) = nzb_x
396	isave(3) = nzt_x
397	isave(4) = sendrecvcount_zx
398	!
399	!-- Set new indices for transformation
400	nblk = nz / pdims(1)
401	nz = pdims(1)
402	nnz_x = 1
403	nzb_x = 1 + myidx * nnz_x
404	nzt_x = ( myidx + 1 ) * nnz_x
405	sendrecvcount_zx = nnx * nny * nnz_x
406
407	ALLOCATE( ar1(0:nx,nys_x:nyn_x,nzb_x:nzt_x) )
408	ALLOCATE( f_in(nys:nyn,nxl:nxr,1:nz) )
409
410	DO kk = 1, nblk
411
412	IF ( kk == 1 ) THEN
413	CALL cpu_log( log_point_s(5), 'transpo forward', 'start', cpu_log_nowait )
414	ELSE
415	CALL cpu_log( log_point_s(5), 'transpo forward', 'continue', cpu_log_nowait )
416	ENDIF
417
418	DO knew = 1, nz
419	ki = kk + nblk * ( knew - 1 )
420	f_in(:,:,knew) = f_inv(:,:,ki)
421	ENDDO
422
423	CALL transpose_zx( f_in, ar1(:,:,:))
424	CALL cpu_log( log_point_s(5), 'transpo forward', 'pause' )
425
426	IF ( kk == 1 ) THEN
427	CALL cpu_log( log_point_s(4), 'fft_x', 'start', cpu_log_nowait )
428	ELSE
429	CALL cpu_log( log_point_s(4), 'fft_x', 'continue', cpu_log_nowait )
430	ENDIF
431
432	n = isave(2) + kk - 1
433	CALL fft_x( ar1(:,:,:), 'forward', ar_2d = f_out_z(:,:,n))
434	CALL cpu_log( log_point_s(4), 'fft_x', 'pause' )
435
436	ENDDO
437	!
438	!-- Restore original indices/counters
439	nz = isave(1)
440	nzb_x = isave(2)
441	nzt_x = isave(3)
442	sendrecvcount_zx = isave(4)
443
444	DEALLOCATE( ar1, f_in, f_inv )
445
446	!
447	!-- Transposition x --> y + subsequent fft along y
448	ALLOCATE( f_inv(nys_x:nyn_x,nzb_x:nzt_x,0:nx) )
449	CALL resort_for_xy( f_out_z, f_inv )
450	!
451	!-- Save original indices and gridpoint counter
452	isave(1) = nx
453	isave(2) = nxl_y
454	isave(3) = nxr_y
455	isave(4) = sendrecvcount_xy
456	!
457	!-- Set new indices for transformation
458	nblk = ( ( nx+1 ) / pdims(2) ) - 1
459	nx = pdims(2)
460	nnx_y = 1
461	nxl_y = myidy * nnx_y
462	nxr_y = ( myidy + 1 ) * nnx_y - 1
463	sendrecvcount_xy = nnx_y * ( nyn_x-nys_x+1 ) * ( nzt_x-nzb_x+1 )
464
465	ALLOCATE( ar1(0:ny,nxl_y:nxr_y,nzb_y:nzt_y) )
466	ALLOCATE( f_in(nys_x:nyn_x,nzb_x:nzt_x,0:nx) )
467
468	DO ii = 0, nblk
469
470	CALL cpu_log( log_point_s(5), 'transpo forward', 'continue', cpu_log_nowait )
471
472	DO inew = 0, nx-1
473	iind = ii + ( nblk + 1 ) * inew
474	f_in(:,:,inew) = f_inv(:,:,iind)
475	ENDDO
476
477	CALL transpose_xy( f_in, ar1(:,:,:) )
478
479	CALL cpu_log( log_point_s(5), 'transpo forward', 'pause' )
480
481	IF ( ii == 1 ) THEN
482	CALL cpu_log( log_point_s(7), 'fft_y', 'start', cpu_log_nowait )
483	ELSE
484	CALL cpu_log( log_point_s(7), 'fft_y', 'continue', cpu_log_nowait )
485	ENDIF
486
487	nxl_y_bound = isave(2)
488	nxr_y_bound = isave(3)
489	n = isave(2) + ii
490	CALL fft_y( ar1(:,:,:), 'forward', ar_tr = f_out_y, &
491	nxl_y_bound = nxl_y_bound, nxr_y_bound = nxr_y_bound, &
492	nxl_y_l = n, nxr_y_l = n )
493
494	CALL cpu_log( log_point_s(7), 'fft_y', 'pause' )
495
496	ENDDO
497	!
498	!-- Restore original indices/counters
499	nx = isave(1)
500	nxl_y = isave(2)
501	nxr_y = isave(3)
502	sendrecvcount_xy = isave(4)
503
504	DEALLOCATE( ar1, f_in, f_inv )
505
506	!
507	!-- Transposition y --> z + subsequent tridia + resort for z --> y
508	ALLOCATE( f_inv(nxl_y:nxr_y,nzb_y:nzt_y,0:ny) )
509	CALL resort_for_yz( f_out_y, f_inv )
510	!
511	!-- Save original indices and gridpoint counter
512	isave(1) = ny
513	isave(2) = nys_z
514	isave(3) = nyn_z
515	isave(4) = sendrecvcount_yz
516	!
517	!-- Set new indices for transformation
518	nblk = ( ( ny+1 ) / pdims(1) ) - 1
519	ny = pdims(1)
520	nny_z = 1
521	nys_z = myidx * nny_z
522	nyn_z = ( myidx + 1 ) * nny_z - 1
523	sendrecvcount_yz = ( nxr_y-nxl_y+1 ) * nny_z * ( nzt_y-nzb_y+1 )
524
525	ALLOCATE( ar1(nxl_z:nxr_z,nys_z:nyn_z,1:nz) )
526	ALLOCATE( f_in(nxl_y:nxr_y,nzb_y:nzt_y,0:ny) )
527
528	DO jj = 0, nblk
529	!
530	!-- Forward Fourier Transformation
531	!-- Transposition y --> z
532	CALL cpu_log( log_point_s(5), 'transpo forward', 'continue', cpu_log_nowait )
533
534	DO jnew = 0, ny-1
535	jind = jj + ( nblk + 1 ) * jnew
536	f_in(:,:,jnew) = f_inv(:,:,jind)
537	ENDDO
538
539	CALL transpose_yz( f_in, ar1(:,:,:) )
540
541	IF ( jj == nblk ) THEN
542	CALL cpu_log( log_point_s(5), 'transpo forward', 'stop' )
543	ELSE
544	CALL cpu_log( log_point_s(5), 'transpo forward', 'pause' )
545	ENDIF
546
547	!
548	!-- Solve the tridiagonal equation system along z
549	CALL cpu_log( log_point_s(6), 'tridia', 'start', cpu_log_nowait )
550
551	n = isave(2) + jj
552	CALL tridia_substi_overlap( ar1(:,:,:), n )
553
554	CALL cpu_log( log_point_s(6), 'tridia', 'stop' )
555
556	!
557	!-- Inverse Fourier Transformation
558	!-- Transposition z --> y
559	!-- Only one thread should call MPI routines, therefore forward and
560	!-- backward tranpose are in the same section
561	IF ( jj == 0 ) THEN
562	CALL cpu_log( log_point_s(8), 'transpo invers', 'start', cpu_log_nowait )
563	ELSE
564	CALL cpu_log( log_point_s(8), 'transpo invers', 'continue', cpu_log_nowait )
565	ENDIF
566
567	CALL transpose_zy( ar1(:,:,:), f_in )
568
569	DO jnew = 0, ny-1
570	jind = jj + ( nblk + 1 ) * jnew
571	f_inv(:,:,jind) = f_in(:,:,jnew)
572	ENDDO
573
574	CALL cpu_log( log_point_s(8), 'transpo invers', 'pause' )
575
576	ENDDO
577	!
578	!-- Restore original indices/counters
579	ny = isave(1)
580	nys_z = isave(2)
581	nyn_z = isave(3)
582	sendrecvcount_yz = isave(4)
583
584	CALL resort_for_zy( f_inv, f_out_y )
585
586	DEALLOCATE( ar1, f_in, f_inv )
587
588	!
589	!-- fft along y backward + subsequent transposition y --> x
590	ALLOCATE( f_inv(nys_x:nyn_x,nzb_x:nzt_x,0:nx) )
591	!
592	!-- Save original indices and gridpoint counter
593	isave(1) = nx
594	isave(2) = nxl_y
595	isave(3) = nxr_y
596	isave(4) = sendrecvcount_xy
597	!
598	!-- Set new indices for transformation
599	nblk = (( nx+1 ) / pdims(2) ) - 1
600	nx = pdims(2)
601	nnx_y = 1
602	nxl_y = myidy * nnx_y
603	nxr_y = ( myidy + 1 ) * nnx_y - 1
604	sendrecvcount_xy = nnx_y * ( nyn_x-nys_x+1 ) * ( nzt_x-nzb_x+1 )
605
606	ALLOCATE( ar1(0:ny,nxl_y:nxr_y,nzb_y:nzt_y) )
607	ALLOCATE( f_in(nys_x:nyn_x,nzb_x:nzt_x,0:nx) )
608
609	DO ii = 0, nblk
610
611	CALL cpu_log( log_point_s(7), 'fft_y', 'continue', cpu_log_nowait )
612
613	n = isave(2) + ii
614	nxl_y_bound = isave(2)
615	nxr_y_bound = isave(3)
616
617	CALL fft_y( ar1(:,:,:), 'backward', ar_tr = f_out_y, &
618	nxl_y_bound = nxl_y_bound, nxr_y_bound = nxr_y_bound, &
619	nxl_y_l = n, nxr_y_l = n )
620
621	IF ( ii == nblk ) THEN
622	CALL cpu_log( log_point_s(7), 'fft_y', 'stop' )
623	ELSE
624	CALL cpu_log( log_point_s(7), 'fft_y', 'pause' )
625	ENDIF
626
627	CALL cpu_log( log_point_s(8), 'transpo invers', 'continue', cpu_log_nowait )
628
629	CALL transpose_yx( ar1(:,:,:), f_in )
630
631	DO inew = 0, nx-1
632	iind = ii + (nblk+1) * inew
633	f_inv(:,:,iind) = f_in(:,:,inew)
634	ENDDO
635
636	CALL cpu_log( log_point_s(8), 'transpo invers', 'pause' )
637
638	ENDDO
639	!
640	!-- Restore original indices/counters
641	nx = isave(1)
642	nxl_y = isave(2)
643	nxr_y = isave(3)
644	sendrecvcount_xy = isave(4)
645
646	CALL resort_for_yx( f_inv, f_out_z )
647
648	DEALLOCATE( ar1, f_in, f_inv )
649
650	!
651	!-- fft along x backward + subsequent final transposition x --> z
652	ALLOCATE( f_inv(nys:nyn,nxl:nxr,1:nz) )
653	!
654	!-- Save original indices and gridpoint counter
655	isave(1) = nz
656	isave(2) = nzb_x
657	isave(3) = nzt_x
658	isave(4) = sendrecvcount_zx
659	!
660	!-- Set new indices for transformation
661	nblk = nz / pdims(1)
662	nz = pdims(1)
663	nnz_x = 1
664	nzb_x = 1 + myidx * nnz_x
665	nzt_x = ( myidx + 1 ) * nnz_x
666	sendrecvcount_zx = nnx * nny * nnz_x
667
668	ALLOCATE( ar1(0:nx,nys_x:nyn_x,nzb_x:nzt_x) )
669	ALLOCATE( f_in(nys:nyn,nxl:nxr,1:nz) )
670
671	DO kk = 1, nblk
672
673	CALL cpu_log( log_point_s(4), 'fft_x', 'continue', cpu_log_nowait )
674
675	n = isave(2) + kk - 1
676	CALL fft_x( ar1(:,:,:), 'backward', f_out_z(:,:,n))
677
678	IF ( kk == nblk ) THEN
679	CALL cpu_log( log_point_s(4), 'fft_x', 'stop' )
680	ELSE
681	CALL cpu_log( log_point_s(4), 'fft_x', 'pause' )
682	ENDIF
683
684	CALL cpu_log( log_point_s(8), 'transpo invers', 'continue', cpu_log_nowait )
685
686	CALL transpose_xz( ar1(:,:,:), f_in )
687
688	DO knew = 1, nz
689	ki = kk + nblk * (knew-1)
690	f_inv(:,:,ki) = f_in(:,:,knew)
691	ENDDO
692
693	IF ( kk == nblk ) THEN
694	CALL cpu_log( log_point_s(8), 'transpo invers', 'stop' )
695	ELSE
696	CALL cpu_log( log_point_s(8), 'transpo invers', 'pause' )
697	ENDIF
698
699	ENDDO
700	!
701	!-- Restore original indices/counters
702	nz = isave(1)
703	nzb_x = isave(2)
704	nzt_x = isave(3)
705	sendrecvcount_zx = isave(4)
706
707	CALL resort_for_xz( f_inv, ar )
708
709	DEALLOCATE( ar1, f_in, f_inv )
710
711	ENDIF
712
713	CALL cpu_log( log_point_s(3), 'poisfft', 'stop' )
714
715	END SUBROUTINE poisfft
716
717
718	!------------------------------------------------------------------------------!
719	! Description:
720	! ------------
721	!> Fourier-transformation along y with subsequent transposition y --> x for
722	!> a 1d-decomposition along x.
723	!>
724	!> @attention The performance of this routine is much faster on the NEC-SX6,
725	!> if the first index of work_ffty_vec is odd. Otherwise
726	!> memory bank conflicts may occur (especially if the index is a
727	!> multiple of 128). That's why work_ffty_vec is dimensioned as
728	!> 0:ny+1.
729	!> Of course, this will not work if users are using an odd number
730	!> of gridpoints along y.
731	!------------------------------------------------------------------------------!
732	SUBROUTINE ffty_tr_yx( f_in, f_out )
733
734
735	USE control_parameters, &
736	ONLY: host
737
738	USE cpulog, &
739	ONLY: cpu_log, log_point_s
740
741	USE kinds
742
743	USE pegrid
744
745	IMPLICIT NONE
746
747	INTEGER(iwp) :: i !<
748	INTEGER(iwp) :: iend !<
749	INTEGER(iwp) :: iouter !<
750	INTEGER(iwp) :: ir !<
751	INTEGER(iwp) :: j !<
752	INTEGER(iwp) :: k !<
753
754	INTEGER(iwp), PARAMETER :: stridex = 4 !<
755
756	REAL(wp), DIMENSION(0:ny,stridex) :: work_ffty !<
757	#if defined( __nec )
758	REAL(wp), DIMENSION(0:ny+1,1:nz,nxl:nxr) :: work_ffty_vec !<
759	#endif
760	REAL(wp), DIMENSION(1:nz,0:ny,nxl:nxr) :: f_in !<
761	REAL(wp), DIMENSION(nnx,1:nz,nys_x:nyn_x,pdims(1)) :: f_out !<
762	REAL(wp), DIMENSION(nxl:nxr,1:nz,0:ny) :: work !<
763
764	!
765	!-- Carry out the FFT along y, where all data are present due to the
766	!-- 1d-decomposition along x. Resort the data in a way that x becomes
767	!-- the first index.
768	CALL cpu_log( log_point_s(7), 'fft_y_1d', 'start' )
769
770	IF ( host(1:3) == 'nec' ) THEN
771	#if defined( __nec )
772	!
773	!-- Code optimized for vector processors
774	!$OMP PARALLEL PRIVATE ( i, j, k )
775	!$OMP DO
776	DO i = nxl, nxr
777
778	DO j = 0, ny
779	DO k = 1, nz
780	work_ffty_vec(j,k,i) = f_in(k,j,i)
781	ENDDO
782	ENDDO
783
784	CALL fft_y_m( work_ffty_vec(:,:,i), ny+1, 'forward' )
785
786	ENDDO
787
788	!$OMP DO
789	DO k = 1, nz
790	DO j = 0, ny
791	DO i = nxl, nxr
792	work(i,k,j) = work_ffty_vec(j,k,i)
793	ENDDO
794	ENDDO
795	ENDDO
796	!$OMP END PARALLEL
797	#endif
798
799	ELSE
800
801	!
802	!-- Cache optimized code.
803	!-- The i-(x-)direction is split into a strided outer loop and an inner
804	!-- loop for better cache performance
805	!$OMP PARALLEL PRIVATE (i,iend,iouter,ir,j,k,work_ffty)
806	!$OMP DO
807	DO iouter = nxl, nxr, stridex
808
809	iend = MIN( iouter+stridex-1, nxr ) ! Upper bound for inner i loop
810
811	DO k = 1, nz
812
813	DO i = iouter, iend
814
815	ir = i-iouter+1 ! counter within a stride
816	DO j = 0, ny
817	work_ffty(j,ir) = f_in(k,j,i)
818	ENDDO
819	!
820	!-- FFT along y
821	CALL fft_y_1d( work_ffty(:,ir), 'forward' )
822
823	ENDDO
824
825	!
826	!-- Resort
827	DO j = 0, ny
828	DO i = iouter, iend
829	work(i,k,j) = work_ffty(j,i-iouter+1)
830	ENDDO
831	ENDDO
832
833	ENDDO
834
835	ENDDO
836	!$OMP END PARALLEL
837
838	ENDIF
839	CALL cpu_log( log_point_s(7), 'fft_y_1d', 'pause' )
840
841	!
842	!-- Transpose array
843	#if defined( __parallel )
844	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
845	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
846	CALL MPI_ALLTOALL( work(nxl,1,0), sendrecvcount_xy, MPI_REAL, &
847	f_out(1,1,nys_x,1), sendrecvcount_xy, MPI_REAL, &
848	comm1dx, ierr )
849	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
850	#endif
851
852	END SUBROUTINE ffty_tr_yx
853
854
855	!------------------------------------------------------------------------------!
856	! Description:
857	! ------------
858	!> Transposition x --> y with a subsequent backward Fourier transformation for
859	!> a 1d-decomposition along x
860	!------------------------------------------------------------------------------!
861	SUBROUTINE tr_xy_ffty( f_in, f_out )
862
863
864	USE control_parameters, &
865	ONLY: host
866
867	USE cpulog, &
868	ONLY: cpu_log, log_point_s
869
870	USE kinds
871
872	USE pegrid
873
874	IMPLICIT NONE
875
876	INTEGER(iwp) :: i !<
877	INTEGER(iwp) :: iend !<
878	INTEGER(iwp) :: iouter !<
879	INTEGER(iwp) :: ir !<
880	INTEGER(iwp) :: j !<
881	INTEGER(iwp) :: k !<
882
883	INTEGER(iwp), PARAMETER :: stridex = 4 !<
884
885	REAL(wp), DIMENSION(0:ny,stridex) :: work_ffty !<
886	#if defined( __nec )
887	REAL(wp), DIMENSION(0:ny+1,1:nz,nxl:nxr) :: work_ffty_vec !<
888	#endif
889	REAL(wp), DIMENSION(nnx,1:nz,nys_x:nyn_x,pdims(1)) :: f_in !<
890	REAL(wp), DIMENSION(1:nz,0:ny,nxl:nxr) :: f_out !<
891	REAL(wp), DIMENSION(nxl:nxr,1:nz,0:ny) :: work !<
892
893	!
894	!-- Transpose array
895	#if defined( __parallel )
896	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
897	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
898	CALL MPI_ALLTOALL( f_in(1,1,nys_x,1), sendrecvcount_xy, MPI_REAL, &
899	work(nxl,1,0), sendrecvcount_xy, MPI_REAL, &
900	comm1dx, ierr )
901	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
902	#endif
903
904	!
905	!-- Resort the data in a way that y becomes the first index and carry out the
906	!-- backward fft along y.
907	CALL cpu_log( log_point_s(7), 'fft_y_1d', 'continue' )
908
909	IF ( host(1:3) == 'nec' ) THEN
910	#if defined( __nec )
911	!
912	!-- Code optimized for vector processors
913	!$OMP PARALLEL PRIVATE ( i, j, k )
914	!$OMP DO
915	DO k = 1, nz
916	DO j = 0, ny
917	DO i = nxl, nxr
918	work_ffty_vec(j,k,i) = work(i,k,j)
919	ENDDO
920	ENDDO
921	ENDDO
922
923	!$OMP DO
924	DO i = nxl, nxr
925
926	CALL fft_y_m( work_ffty_vec(:,:,i), ny+1, 'backward' )
927
928	DO j = 0, ny
929	DO k = 1, nz
930	f_out(k,j,i) = work_ffty_vec(j,k,i)
931	ENDDO
932	ENDDO
933
934	ENDDO
935	!$OMP END PARALLEL
936	#endif
937
938	ELSE
939
940	!
941	!-- Cache optimized code.
942	!-- The i-(x-)direction is split into a strided outer loop and an inner
943	!-- loop for better cache performance
944	!$OMP PARALLEL PRIVATE ( i, iend, iouter, ir, j, k, work_ffty )
945	!$OMP DO
946	DO iouter = nxl, nxr, stridex
947
948	iend = MIN( iouter+stridex-1, nxr ) ! Upper bound for inner i loop
949
950	DO k = 1, nz
951	!
952	!-- Resort
953	DO j = 0, ny
954	DO i = iouter, iend
955	work_ffty(j,i-iouter+1) = work(i,k,j)
956	ENDDO
957	ENDDO
958
959	DO i = iouter, iend
960
961	!
962	!-- FFT along y
963	ir = i-iouter+1 ! counter within a stride
964	CALL fft_y_1d( work_ffty(:,ir), 'backward' )
965
966	DO j = 0, ny
967	f_out(k,j,i) = work_ffty(j,ir)
968	ENDDO
969	ENDDO
970
971	ENDDO
972
973	ENDDO
974	!$OMP END PARALLEL
975
976	ENDIF
977
978	CALL cpu_log( log_point_s(7), 'fft_y_1d', 'stop' )
979
980	END SUBROUTINE tr_xy_ffty
981
982
983	!------------------------------------------------------------------------------!
984	! Description:
985	! ------------
986	!> FFT along x, solution of the tridiagonal system and backward FFT for
987	!> a 1d-decomposition along x
988	!>
989	!> @warning this subroutine may still not work for hybrid parallelization
990	!> with OpenMP (for possible necessary changes see the original
991	!> routine poisfft_hybrid, developed by Klaus Ketelsen, May 2002)
992	!------------------------------------------------------------------------------!
993	SUBROUTINE fftx_tri_fftx( ar )
994
995
996	USE control_parameters, &
997	ONLY: host
998
999	USE cpulog, &
1000	ONLY: cpu_log, log_point_s
1001
1002	USE grid_variables, &
1003	ONLY: ddx2, ddy2
1004
1005	USE kinds
1006
1007	USE pegrid
1008
1009	IMPLICIT NONE
1010
1011	INTEGER(iwp) :: i !<
1012	INTEGER(iwp) :: j !<
1013	INTEGER(iwp) :: k !<
1014	INTEGER(iwp) :: m !<
1015	INTEGER(iwp) :: n !<
1016	INTEGER(iwp) :: omp_get_thread_num !<
1017	INTEGER(iwp) :: tn !<
1018
1019	REAL(wp), DIMENSION(0:nx) :: work_fftx !<
1020	REAL(wp), DIMENSION(0:nx,1:nz) :: work_trix !<
1021	REAL(wp), DIMENSION(nnx,1:nz,nys_x:nyn_x,pdims(1)) :: ar !<
1022	REAL(wp), DIMENSION(:,:,:,:), ALLOCATABLE :: tri !<
1023
1024
1025	CALL cpu_log( log_point_s(33), 'fft_x_1d + tridia', 'start' )
1026
1027	ALLOCATE( tri(5,0:nx,0:nz-1,0:threads_per_task-1) )
1028
1029	tn = 0 ! Default thread number in case of one thread
1030	!$OMP PARALLEL DO PRIVATE ( i, j, k, m, n, tn, work_fftx, work_trix )
1031	DO j = nys_x, nyn_x
1032
1033	!$ tn = omp_get_thread_num()
1034
1035	IF ( host(1:3) == 'nec' ) THEN
1036	!
1037	!-- Code optimized for vector processors
1038	DO k = 1, nz
1039
1040	m = 0
1041	DO n = 1, pdims(1)
1042	DO i = 1, nnx
1043	work_trix(m,k) = ar(i,k,j,n)
1044	m = m + 1
1045	ENDDO
1046	ENDDO
1047
1048	ENDDO
1049
1050	CALL fft_x_m( work_trix, 'forward' )
1051
1052	ELSE
1053	!
1054	!-- Cache optimized code
1055	DO k = 1, nz
1056
1057	m = 0
1058	DO n = 1, pdims(1)
1059	DO i = 1, nnx
1060	work_fftx(m) = ar(i,k,j,n)
1061	m = m + 1
1062	ENDDO
1063	ENDDO
1064
1065	CALL fft_x_1d( work_fftx, 'forward' )
1066
1067	DO i = 0, nx
1068	work_trix(i,k) = work_fftx(i)
1069	ENDDO
1070
1071	ENDDO
1072
1073	ENDIF
1074
1075	!
1076	!-- Solve the linear equation system
1077	CALL tridia_1dd( ddx2, ddy2, nx, ny, j, work_trix, tri(:,:,:,tn) )
1078
1079	IF ( host(1:3) == 'nec' ) THEN
1080	!
1081	!-- Code optimized for vector processors
1082	CALL fft_x_m( work_trix, 'backward' )
1083
1084	DO k = 1, nz
1085
1086	m = 0
1087	DO n = 1, pdims(1)
1088	DO i = 1, nnx
1089	ar(i,k,j,n) = work_trix(m,k)
1090	m = m + 1
1091	ENDDO
1092	ENDDO
1093
1094	ENDDO
1095
1096	ELSE
1097	!
1098	!-- Cache optimized code
1099	DO k = 1, nz
1100
1101	DO i = 0, nx
1102	work_fftx(i) = work_trix(i,k)
1103	ENDDO
1104
1105	CALL fft_x_1d( work_fftx, 'backward' )
1106
1107	m = 0
1108	DO n = 1, pdims(1)
1109	DO i = 1, nnx
1110	ar(i,k,j,n) = work_fftx(m)
1111	m = m + 1
1112	ENDDO
1113	ENDDO
1114
1115	ENDDO
1116
1117	ENDIF
1118
1119	ENDDO
1120
1121	DEALLOCATE( tri )
1122
1123	CALL cpu_log( log_point_s(33), 'fft_x_1d + tridia', 'stop' )
1124
1125	END SUBROUTINE fftx_tri_fftx
1126
1127
1128	!------------------------------------------------------------------------------!
1129	! Description:
1130	! ------------
1131	!> Fourier-transformation along x with subsequent transposition x --> y for
1132	!> a 1d-decomposition along y.
1133	!>
1134	!> @attention NEC-branch of this routine may significantly profit from
1135	!> further optimizations. So far, performance is much worse than
1136	!> for routine ffty_tr_yx (more than three times slower).
1137	!------------------------------------------------------------------------------!
1138	SUBROUTINE fftx_tr_xy( f_in, f_out )
1139
1140
1141	USE control_parameters, &
1142	ONLY: host
1143
1144	USE cpulog, &
1145	ONLY: cpu_log, log_point_s
1146
1147	USE kinds
1148
1149	USE pegrid
1150
1151	IMPLICIT NONE
1152
1153	INTEGER(iwp) :: i !<
1154	INTEGER(iwp) :: j !<
1155	INTEGER(iwp) :: k !<
1156
1157	REAL(wp), DIMENSION(0:nx,1:nz,nys:nyn) :: work_fftx !<
1158	REAL(wp), DIMENSION(1:nz,nys:nyn,0:nx) :: f_in !<
1159	REAL(wp), DIMENSION(nny,1:nz,nxl_y:nxr_y,pdims(2)) :: f_out !<
1160	REAL(wp), DIMENSION(nys:nyn,1:nz,0:nx) :: work !<
1161
1162	!
1163	!-- Carry out the FFT along x, where all data are present due to the
1164	!-- 1d-decomposition along y. Resort the data in a way that y becomes
1165	!-- the first index.
1166	CALL cpu_log( log_point_s(4), 'fft_x_1d', 'start' )
1167
1168	IF ( host(1:3) == 'nec' ) THEN
1169	!
1170	!-- Code for vector processors
1171	!$OMP PARALLEL PRIVATE ( i, j, k )
1172	!$OMP DO
1173	DO i = 0, nx
1174
1175	DO j = nys, nyn
1176	DO k = 1, nz
1177	work_fftx(i,k,j) = f_in(k,j,i)
1178	ENDDO
1179	ENDDO
1180
1181	ENDDO
1182
1183	!$OMP DO
1184	DO j = nys, nyn
1185
1186	CALL fft_x_m( work_fftx(:,:,j), 'forward' )
1187
1188	DO k = 1, nz
1189	DO i = 0, nx
1190	work(j,k,i) = work_fftx(i,k,j)
1191	ENDDO
1192	ENDDO
1193
1194	ENDDO
1195	!$OMP END PARALLEL
1196
1197	ELSE
1198
1199	!
1200	!-- Cache optimized code (there might be still a potential for better
1201	!-- optimization).
1202	!$OMP PARALLEL PRIVATE (i,j,k)
1203	!$OMP DO
1204	DO i = 0, nx
1205
1206	DO j = nys, nyn
1207	DO k = 1, nz
1208	work_fftx(i,k,j) = f_in(k,j,i)
1209	ENDDO
1210	ENDDO
1211
1212	ENDDO
1213
1214	!$OMP DO
1215	DO j = nys, nyn
1216	DO k = 1, nz
1217
1218	CALL fft_x_1d( work_fftx(0:nx,k,j), 'forward' )
1219
1220	DO i = 0, nx
1221	work(j,k,i) = work_fftx(i,k,j)
1222	ENDDO
1223	ENDDO
1224
1225	ENDDO
1226	!$OMP END PARALLEL
1227
1228	ENDIF
1229	CALL cpu_log( log_point_s(4), 'fft_x_1d', 'pause' )
1230
1231	!
1232	!-- Transpose array
1233	#if defined( __parallel )
1234	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
1235	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
1236	CALL MPI_ALLTOALL( work(nys,1,0), sendrecvcount_xy, MPI_REAL, &
1237	f_out(1,1,nxl_y,1), sendrecvcount_xy, MPI_REAL, &
1238	comm1dy, ierr )
1239	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
1240	#endif
1241
1242	END SUBROUTINE fftx_tr_xy
1243
1244
1245	!------------------------------------------------------------------------------!
1246	! Description:
1247	! ------------
1248	!> Transposition y --> x with a subsequent backward Fourier transformation for
1249	!> a 1d-decomposition along x.
1250	!------------------------------------------------------------------------------!
1251	SUBROUTINE tr_yx_fftx( f_in, f_out )
1252
1253
1254	USE control_parameters, &
1255	ONLY: host
1256
1257	USE cpulog, &
1258	ONLY: cpu_log, log_point_s
1259
1260	USE kinds
1261
1262	USE pegrid
1263
1264	IMPLICIT NONE
1265
1266	INTEGER(iwp) :: i !<
1267	INTEGER(iwp) :: j !<
1268	INTEGER(iwp) :: k !<
1269
1270	REAL(wp), DIMENSION(0:nx,1:nz,nys:nyn) :: work_fftx !<
1271	REAL(wp), DIMENSION(nny,1:nz,nxl_y:nxr_y,pdims(2)) :: f_in !<
1272	REAL(wp), DIMENSION(1:nz,nys:nyn,0:nx) :: f_out !<
1273	REAL(wp), DIMENSION(nys:nyn,1:nz,0:nx) :: work !<
1274
1275	!
1276	!-- Transpose array
1277	#if defined( __parallel )
1278	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
1279	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
1280	CALL MPI_ALLTOALL( f_in(1,1,nxl_y,1), sendrecvcount_xy, MPI_REAL, &
1281	work(nys,1,0), sendrecvcount_xy, MPI_REAL, &
1282	comm1dy, ierr )
1283	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
1284	#endif
1285
1286	!
1287	!-- Carry out the FFT along x, where all data are present due to the
1288	!-- 1d-decomposition along y. Resort the data in a way that y becomes
1289	!-- the first index.
1290	CALL cpu_log( log_point_s(4), 'fft_x_1d', 'continue' )
1291
1292	IF ( host(1:3) == 'nec' ) THEN
1293	!
1294	!-- Code optimized for vector processors
1295	!$OMP PARALLEL PRIVATE ( i, j, k )
1296	!$OMP DO
1297	DO j = nys, nyn
1298
1299	DO k = 1, nz
1300	DO i = 0, nx
1301	work_fftx(i,k,j) = work(j,k,i)
1302	ENDDO
1303	ENDDO
1304
1305	CALL fft_x_m( work_fftx(:,:,j), 'backward' )
1306
1307	ENDDO
1308
1309	!$OMP DO
1310	DO i = 0, nx
1311	DO j = nys, nyn
1312	DO k = 1, nz
1313	f_out(k,j,i) = work_fftx(i,k,j)
1314	ENDDO
1315	ENDDO
1316	ENDDO
1317	!$OMP END PARALLEL
1318
1319	ELSE
1320
1321	!
1322	!-- Cache optimized code (there might be still a potential for better
1323	!-- optimization).
1324	!$OMP PARALLEL PRIVATE (i,j,k)
1325	!$OMP DO
1326	DO j = nys, nyn
1327	DO k = 1, nz
1328
1329	DO i = 0, nx
1330	work_fftx(i,k,j) = work(j,k,i)
1331	ENDDO
1332
1333	CALL fft_x_1d( work_fftx(0:nx,k,j), 'backward' )
1334
1335	ENDDO
1336	ENDDO
1337
1338	!$OMP DO
1339	DO i = 0, nx
1340	DO j = nys, nyn
1341	DO k = 1, nz
1342	f_out(k,j,i) = work_fftx(i,k,j)
1343	ENDDO
1344	ENDDO
1345	ENDDO
1346	!$OMP END PARALLEL
1347
1348	ENDIF
1349	CALL cpu_log( log_point_s(4), 'fft_x_1d', 'stop' )
1350
1351	END SUBROUTINE tr_yx_fftx
1352
1353
1354	!------------------------------------------------------------------------------!
1355	! Description:
1356	! ------------
1357	!> FFT along y, solution of the tridiagonal system and backward FFT for
1358	!> a 1d-decomposition along y.
1359	!>
1360	!> @warning this subroutine may still not work for hybrid parallelization
1361	!> with OpenMP (for possible necessary changes see the original
1362	!> routine poisfft_hybrid, developed by Klaus Ketelsen, May 2002)
1363	!------------------------------------------------------------------------------!
1364	SUBROUTINE ffty_tri_ffty( ar )
1365
1366
1367	USE control_parameters, &
1368	ONLY: host
1369
1370	USE cpulog, &
1371	ONLY: cpu_log, log_point_s
1372
1373	USE grid_variables, &
1374	ONLY: ddx2, ddy2
1375
1376	USE kinds
1377
1378	USE pegrid
1379
1380	IMPLICIT NONE
1381
1382	INTEGER(iwp) :: i !<
1383	INTEGER(iwp) :: j !<
1384	INTEGER(iwp) :: k !<
1385	INTEGER(iwp) :: m !<
1386	INTEGER(iwp) :: n !<
1387	INTEGER(iwp) :: omp_get_thread_num !<
1388	INTEGER(iwp) :: tn !<
1389
1390	REAL(wp), DIMENSION(0:ny) :: work_ffty !<
1391	REAL(wp), DIMENSION(0:ny,1:nz) :: work_triy !<
1392	REAL(wp), DIMENSION(nny,1:nz,nxl_y:nxr_y,pdims(2)) :: ar !<
1393	REAL(wp), DIMENSION(:,:,:,:), ALLOCATABLE :: tri !<
1394
1395
1396	CALL cpu_log( log_point_s(39), 'fft_y_1d + tridia', 'start' )
1397
1398	ALLOCATE( tri(5,0:ny,0:nz-1,0:threads_per_task-1) )
1399
1400	tn = 0 ! Default thread number in case of one thread
1401	!$OMP PARALLEL DO PRIVATE ( i, j, k, m, n, tn, work_ffty, work_triy )
1402	DO i = nxl_y, nxr_y
1403
1404	!$ tn = omp_get_thread_num()
1405
1406	IF ( host(1:3) == 'nec' ) THEN
1407	!
1408	!-- Code optimized for vector processors
1409	DO k = 1, nz
1410
1411	m = 0
1412	DO n = 1, pdims(2)
1413	DO j = 1, nny
1414	work_triy(m,k) = ar(j,k,i,n)
1415	m = m + 1
1416	ENDDO
1417	ENDDO
1418
1419	ENDDO
1420
1421	CALL fft_y_m( work_triy, ny, 'forward' )
1422
1423	ELSE
1424	!
1425	!-- Cache optimized code
1426	DO k = 1, nz
1427
1428	m = 0
1429	DO n = 1, pdims(2)
1430	DO j = 1, nny
1431	work_ffty(m) = ar(j,k,i,n)
1432	m = m + 1
1433	ENDDO
1434	ENDDO
1435
1436	CALL fft_y_1d( work_ffty, 'forward' )
1437
1438	DO j = 0, ny
1439	work_triy(j,k) = work_ffty(j)
1440	ENDDO
1441
1442	ENDDO
1443
1444	ENDIF
1445
1446	!
1447	!-- Solve the linear equation system
1448	CALL tridia_1dd( ddy2, ddx2, ny, nx, i, work_triy, tri(:,:,:,tn) )
1449
1450	IF ( host(1:3) == 'nec' ) THEN
1451	!
1452	!-- Code optimized for vector processors
1453	CALL fft_y_m( work_triy, ny, 'backward' )
1454
1455	DO k = 1, nz
1456
1457	m = 0
1458	DO n = 1, pdims(2)
1459	DO j = 1, nny
1460	ar(j,k,i,n) = work_triy(m,k)
1461	m = m + 1
1462	ENDDO
1463	ENDDO
1464
1465	ENDDO
1466
1467	ELSE
1468	!
1469	!-- Cache optimized code
1470	DO k = 1, nz
1471
1472	DO j = 0, ny
1473	work_ffty(j) = work_triy(j,k)
1474	ENDDO
1475
1476	CALL fft_y_1d( work_ffty, 'backward' )
1477
1478	m = 0
1479	DO n = 1, pdims(2)
1480	DO j = 1, nny
1481	ar(j,k,i,n) = work_ffty(m)
1482	m = m + 1
1483	ENDDO
1484	ENDDO
1485
1486	ENDDO
1487
1488	ENDIF
1489
1490	ENDDO
1491
1492	DEALLOCATE( tri )
1493
1494	CALL cpu_log( log_point_s(39), 'fft_y_1d + tridia', 'stop' )
1495
1496	END SUBROUTINE ffty_tri_ffty
1497
1498	#endif
1499
1500	END MODULE poisfft_mod

Note: See TracBrowser for help on using the repository browser.

Download in other formats:

| Impressum | ©Leibniz Universität Hannover |