Home

Context Navigation

source: palm/trunk/SOURCE/poisfft.f90 @ 1320

Last change on this file since 1320 was 1320, checked in by raasch, 10 years ago
ONLY-attribute added to USE-statements, kind-parameters added to all INTEGER and REAL declaration statements, kinds are defined in new module kinds, old module precision_kind is removed, revision history before 2012 removed, comment fields (!:) to be used for variable explanations added to all variable declaration statements
Property svn:keywords set to `Id`
File size: 44.7 KB

Line
1	MODULE poisfft_mod
2
3	!--------------------------------------------------------------------------------!
4	! This file is part of PALM.
5	!
6	! PALM is free software: you can redistribute it and/or modify it under the terms
7	! of the GNU General Public License as published by the Free Software Foundation,
8	! either version 3 of the License, or (at your option) any later version.
9	!
10	! PALM is distributed in the hope that it will be useful, but WITHOUT ANY
11	! WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
12	! A PARTICULAR PURPOSE. See the GNU General Public License for more details.
13	!
14	! You should have received a copy of the GNU General Public License along with
15	! PALM. If not, see <http://www.gnu.org/licenses/>.
16	!
17	! Copyright 1997-2014 Leibniz Universitaet Hannover
18	!--------------------------------------------------------------------------------!
19	!
20	! Current revisions:
21	! -----------------
22	! ONLY-attribute added to USE-statements,
23	! kind-parameters added to all INTEGER and REAL declaration statements,
24	! kinds are defined in new module kinds,
25	! old module precision_kind is removed,
26	! revision history before 2012 removed,
27	! comment fields (!:) to be used for variable explanations added to
28	! all variable declaration statements
29	!
30	! Former revisions:
31	! -----------------
32	! $Id: poisfft.f90 1320 2014-03-20 08:40:49Z raasch $
33	!
34	! 1318 2014-03-17 13:35:16Z raasch
35	! module interfaces removed
36	!
37	! 1306 2014-03-13 14:30:59Z raasch
38	! openmp sections removed from the overlap branch,
39	! second argument removed from parameter list
40	!
41	! 1216 2013-08-26 09:31:42Z raasch
42	! resorting of arrays moved to separate routines resort_for_...,
43	! one argument, used as temporary work array, removed from all transpose
44	! routines
45	! overlapping fft / transposition implemented
46	!
47	! 1212 2013-08-15 08:46:27Z raasch
48	! tridia routines moved to seperate module tridia_solver
49	!
50	! 1208 2013-08-13 06:41:49Z raasch
51	! acc-update clauses added for "ar" so that ffts other than cufft can also be
52	! used (although they are not ported and will give a poor performance)
53	!
54	! 1111 2013-03-08 23:54:10Z raasch
55	! further openACC porting of non-parallel (MPI) branch:
56	! tridiagonal routines split into extermal subroutines (instead using CONTAINS),
57	! no distinction between parallel/non-parallel in poisfft and tridia any more,
58	! tridia routines moved to end of file because of probable bug in PGI compiler 12.5
59	! (otherwise "invalid device function" is indicated during runtime),
60	! optimization of tridia routines: constant elements and coefficients of tri are
61	! stored in seperate arrays ddzuw and tric, last dimension of tri reduced from 5
62	! to 2,
63	! poisfft_init is now called internally from poisfft, maketri is called from
64	! poisfft_init,
65	! ibc_p_b = 2 removed
66	!
67	! 1106 2013-03-04 05:31:38Z raasch
68	! routines fftx, ffty, fftxp, fftyp removed, calls replaced by fft_x, fft_y,
69	! in the 1D-decomposition routines fft_x, ffty are replaced by fft_x_1d,
70	! fft_y_1d
71	!
72	! 1103 2013-02-20 02:15:53Z raasch
73	! tri, ar, and ar1 arguments in tridia-routines (2d) are removed because they
74	! sometimes cause segmentation faults with intel 12.1 compiler
75	!
76	! 1092 2013-02-02 11:24:22Z raasch
77	! unused variables removed
78	!
79	! 1036 2012-10-22 13:43:42Z raasch
80	! code put under GPL (PALM 3.9)
81	!
82	! 2012-09-21 07:03:55Z raasch
83	! FLOAT type conversion replaced by REAL
84	!
85	! 1003 2012-09-14 14:35:53Z raasch
86	! indices nxa, nya, etc. replaced by nx, ny, etc.
87	!
88	! 940 2012-07-09 14:31:00Z raasch
89	! special handling of tri-array as an argument in tridia_1dd routines switched
90	! off because it caused segmentation faults with intel 12.1 compiler
91	!
92	! 877 2012-04-03 11:21:44Z suehring
93	! Bugfix: Avoid divisions by zero in case of using a 'neumann' bc for the
94	! pressure at the top of the model domain.
95	!
96	! 809 2012-01-30 13:32:58Z maronga
97	! Bugfix: replaced .AND. and .NOT. with && and ! in the preprocessor directives
98	!
99	! 807 2012-01-25 11:53:51Z maronga
100	! New cpp directive "__check" implemented which is used by check_namelist_files
101	! (most of the code is unneeded by check_namelist_files).
102	!
103	! Revision 1.1 1997/07/24 11:24:14 raasch
104	! Initial revision
105	!
106	!
107	! Description:
108	! ------------
109	! Original version by Stephan Siano (pois3d), as of July 23, 1996
110	! Adapted for 2D-domain-decomposition by Siegfried Raasch, July 3, 1997
111	!
112	! Solves the Poisson equation with a 2D spectral method
113	! d^2 p / dx^2 + d^2 p / dy^2 + d^2 p / dz^2 = s
114	!
115	! Input:
116	! real ar contains (nnz,nny,nnx) elements of the velocity divergence,
117	! starting from (1,nys,nxl)
118	!
119	! Output:
120	! real ar contains the solution for perturbation pressure p
121	!------------------------------------------------------------------------------!
122
123	USE fft_xy, &
124	ONLY: fft_init, fft_y, fft_y_1d, fft_y_m, fft_x, fft_x_1d, fft_x_m
125
126	USE indices, &
127	ONLY: nnx, nny, nx, nxl, nxr, ny, nys, nyn, nz
128
129	USE transpose_indices, &
130	ONLY: nxl_y, nxl_z, nxr_y, nxr_z, nys_x, nys_z, nyn_x, nyn_z, nzb_x, &
131	nzb_y, nzt_x, nzt_y
132
133	USE tridia_solver, &
134	ONLY: tridia_1dd, tridia_init, tridia_substi, tridia_substi_overlap
135
136	IMPLICIT NONE
137
138	LOGICAL, SAVE :: poisfft_initialized = .FALSE.
139
140	PRIVATE
141
142	#if ! defined ( __check )
143	PUBLIC poisfft, poisfft_init
144
145	INTERFACE poisfft
146	MODULE PROCEDURE poisfft
147	END INTERFACE poisfft
148
149	INTERFACE poisfft_init
150	MODULE PROCEDURE poisfft_init
151	END INTERFACE poisfft_init
152	#else
153	PUBLIC poisfft_init
154
155	INTERFACE poisfft_init
156	MODULE PROCEDURE poisfft_init
157	END INTERFACE poisfft_init
158	#endif
159
160	CONTAINS
161
162	SUBROUTINE poisfft_init
163
164	USE arrays_3d, &
165	ONLY: ddzu_pres, ddzw
166
167	USE kinds
168
169	IMPLICIT NONE
170
171	INTEGER(iwp) :: k !:
172
173
174	CALL fft_init
175
176	CALL tridia_init
177
178	poisfft_initialized = .TRUE.
179
180	END SUBROUTINE poisfft_init
181
182
183	#if ! defined ( __check )
184	SUBROUTINE poisfft( ar )
185
186	USE control_parameters, &
187	ONLY: fft_method, transpose_compute_overlap
188
189	USE cpulog, &
190	ONLY: cpu_log, cpu_log_nowait, log_point_s
191
192	USE kinds
193
194	USE pegrid
195
196	IMPLICIT NONE
197
198	INTEGER(iwp) :: ii !:
199	INTEGER(iwp) :: iind !:
200	INTEGER(iwp) :: inew !:
201	INTEGER(iwp) :: jj !:
202	INTEGER(iwp) :: jind !:
203	INTEGER(iwp) :: jnew !:
204	INTEGER(iwp) :: ki !:
205	INTEGER(iwp) :: kk !:
206	INTEGER(iwp) :: knew !:
207	INTEGER(iwp) :: n !:
208	INTEGER(iwp) :: nblk !:
209	INTEGER(iwp) :: nnx_y !:
210	INTEGER(iwp) :: nny_z !:
211	INTEGER(iwp) :: nnz_t !:
212	INTEGER(iwp) :: nnz_x !:
213	INTEGER(iwp) :: nxl_y_bound !:
214	INTEGER(iwp) :: nxr_y_bound !:
215
216	INTEGER(iwp), DIMENSION(4) :: isave !:
217
218	REAL(wp), DIMENSION(1:nz,nys:nyn,nxl:nxr) :: ar !:
219	!$acc declare create( ar_inv )
220	REAL(wp), DIMENSION(nys:nyn,nxl:nxr,1:nz) :: ar_inv !:
221
222	REAL(wp), DIMENSION(:,:,:), ALLOCATABLE :: ar1 !:
223	REAL(wp), DIMENSION(:,:,:), ALLOCATABLE :: f_in !:
224	REAL(wp), DIMENSION(:,:,:), ALLOCATABLE :: f_inv !:
225	REAL(wp), DIMENSION(:,:,:), ALLOCATABLE :: f_out_y !:
226	REAL(wp), DIMENSION(:,:,:), ALLOCATABLE :: f_out_z !:
227
228
229	CALL cpu_log( log_point_s(3), 'poisfft', 'start' )
230
231	IF ( .NOT. poisfft_initialized ) CALL poisfft_init
232
233	!
234	!-- Two-dimensional Fourier Transformation in x- and y-direction.
235	IF ( pdims(2) == 1 .AND. pdims(1) > 1 ) THEN
236
237	!
238	!-- 1d-domain-decomposition along x:
239	!-- FFT along y and transposition y --> x
240	CALL ffty_tr_yx( ar, ar )
241
242	!
243	!-- FFT along x, solving the tridiagonal system and backward FFT
244	CALL fftx_tri_fftx( ar )
245
246	!
247	!-- Transposition x --> y and backward FFT along y
248	CALL tr_xy_ffty( ar, ar )
249
250	ELSEIF ( pdims(1) == 1 .AND. pdims(2) > 1 ) THEN
251
252	!
253	!-- 1d-domain-decomposition along y:
254	!-- FFT along x and transposition x --> y
255	CALL fftx_tr_xy( ar, ar )
256
257	!
258	!-- FFT along y, solving the tridiagonal system and backward FFT
259	CALL ffty_tri_ffty( ar )
260
261	!
262	!-- Transposition y --> x and backward FFT along x
263	CALL tr_yx_fftx( ar, ar )
264
265	ELSEIF ( .NOT. transpose_compute_overlap ) THEN
266
267	!
268	!-- 2d-domain-decomposition or no decomposition (1 PE run)
269	!-- Transposition z --> x
270	CALL cpu_log( log_point_s(5), 'transpo forward', 'start' )
271	CALL resort_for_zx( ar, ar_inv )
272	CALL transpose_zx( ar_inv, ar )
273	CALL cpu_log( log_point_s(5), 'transpo forward', 'pause' )
274
275	CALL cpu_log( log_point_s(4), 'fft_x', 'start' )
276	IF ( fft_method /= 'system-specific' ) THEN
277	!$acc update host( ar )
278	ENDIF
279	CALL fft_x( ar, 'forward' )
280	IF ( fft_method /= 'system-specific' ) THEN
281	!$acc update device( ar )
282	ENDIF
283	CALL cpu_log( log_point_s(4), 'fft_x', 'pause' )
284
285	!
286	!-- Transposition x --> y
287	CALL cpu_log( log_point_s(5), 'transpo forward', 'continue' )
288	CALL resort_for_xy( ar, ar_inv )
289	CALL transpose_xy( ar_inv, ar )
290	CALL cpu_log( log_point_s(5), 'transpo forward', 'pause' )
291
292	CALL cpu_log( log_point_s(7), 'fft_y', 'start' )
293	IF ( fft_method /= 'system-specific' ) THEN
294	!$acc update host( ar )
295	ENDIF
296	CALL fft_y( ar, 'forward', ar_tr = ar, &
297	nxl_y_bound = nxl_y, nxr_y_bound = nxr_y, &
298	nxl_y_l = nxl_y, nxr_y_l = nxr_y )
299	IF ( fft_method /= 'system-specific' ) THEN
300	!$acc update device( ar )
301	ENDIF
302	CALL cpu_log( log_point_s(7), 'fft_y', 'pause' )
303
304	!
305	!-- Transposition y --> z
306	CALL cpu_log( log_point_s(5), 'transpo forward', 'continue' )
307	CALL resort_for_yz( ar, ar_inv )
308	CALL transpose_yz( ar_inv, ar )
309	CALL cpu_log( log_point_s(5), 'transpo forward', 'stop' )
310
311	!
312	!-- Solve the tridiagonal equation system along z
313	CALL cpu_log( log_point_s(6), 'tridia', 'start' )
314	CALL tridia_substi( ar )
315	CALL cpu_log( log_point_s(6), 'tridia', 'stop' )
316
317	!
318	!-- Inverse Fourier Transformation
319	!-- Transposition z --> y
320	CALL cpu_log( log_point_s(8), 'transpo invers', 'start' )
321	CALL transpose_zy( ar, ar_inv )
322	CALL resort_for_zy( ar_inv, ar )
323	CALL cpu_log( log_point_s(8), 'transpo invers', 'pause' )
324
325	CALL cpu_log( log_point_s(7), 'fft_y', 'continue' )
326	IF ( fft_method /= 'system-specific' ) THEN
327	!$acc update host( ar )
328	ENDIF
329	CALL fft_y( ar, 'backward', ar_tr = ar, &
330	nxl_y_bound = nxl_y, nxr_y_bound = nxr_y, &
331	nxl_y_l = nxl_y, nxr_y_l = nxr_y )
332	IF ( fft_method /= 'system-specific' ) THEN
333	!$acc update device( ar )
334	ENDIF
335	CALL cpu_log( log_point_s(7), 'fft_y', 'stop' )
336
337	!
338	!-- Transposition y --> x
339	CALL cpu_log( log_point_s(8), 'transpo invers', 'continue' )
340	CALL transpose_yx( ar, ar_inv )
341	CALL resort_for_yx( ar_inv, ar )
342	CALL cpu_log( log_point_s(8), 'transpo invers', 'pause' )
343
344	CALL cpu_log( log_point_s(4), 'fft_x', 'continue' )
345	IF ( fft_method /= 'system-specific' ) THEN
346	!$acc update host( ar )
347	ENDIF
348	CALL fft_x( ar, 'backward' )
349	IF ( fft_method /= 'system-specific' ) THEN
350	!$acc update device( ar )
351	ENDIF
352	CALL cpu_log( log_point_s(4), 'fft_x', 'stop' )
353
354	!
355	!-- Transposition x --> z
356	CALL cpu_log( log_point_s(8), 'transpo invers', 'continue' )
357	CALL transpose_xz( ar, ar_inv )
358	CALL resort_for_xz( ar_inv, ar )
359	CALL cpu_log( log_point_s(8), 'transpo invers', 'stop' )
360
361	ELSE
362
363	!
364	!-- 2d-domain-decomposition or no decomposition (1 PE run) with
365	!-- overlapping transposition / fft
366	!-- cputime logging must not use barriers, which would prevent overlapping
367	ALLOCATE( f_out_y(0:ny,nxl_y:nxr_y,nzb_y:nzt_y), &
368	f_out_z(0:nx,nys_x:nyn_x,nzb_x:nzt_x) )
369	!
370	!-- Transposition z --> x + subsequent fft along x
371	ALLOCATE( f_inv(nys:nyn,nxl:nxr,1:nz) )
372	CALL resort_for_zx( ar, f_inv )
373	!
374	!-- Save original indices and gridpoint counter
375	isave(1) = nz
376	isave(2) = nzb_x
377	isave(3) = nzt_x
378	isave(4) = sendrecvcount_zx
379	!
380	!-- Set new indices for transformation
381	nblk = nz / pdims(1)
382	nz = pdims(1)
383	nnz_x = 1
384	nzb_x = 1 + myidx * nnz_x
385	nzt_x = ( myidx + 1 ) * nnz_x
386	sendrecvcount_zx = nnx * nny * nnz_x
387
388	ALLOCATE( ar1(0:nx,nys_x:nyn_x,nzb_x:nzt_x) )
389	ALLOCATE( f_in(nys:nyn,nxl:nxr,1:nz) )
390
391	DO kk = 1, nblk
392
393	IF ( kk == 1 ) THEN
394	CALL cpu_log( log_point_s(5), 'transpo forward', 'start', cpu_log_nowait )
395	ELSE
396	CALL cpu_log( log_point_s(5), 'transpo forward', 'continue', cpu_log_nowait )
397	ENDIF
398
399	DO knew = 1, nz
400	ki = kk + nblk * ( knew - 1 )
401	f_in(:,:,knew) = f_inv(:,:,ki)
402	ENDDO
403
404	CALL transpose_zx( f_in, ar1(:,:,:))
405	CALL cpu_log( log_point_s(5), 'transpo forward', 'pause' )
406
407	IF ( kk == 1 ) THEN
408	CALL cpu_log( log_point_s(4), 'fft_x', 'start', cpu_log_nowait )
409	ELSE
410	CALL cpu_log( log_point_s(4), 'fft_x', 'continue', cpu_log_nowait )
411	ENDIF
412
413	n = isave(2) + kk - 1
414	CALL fft_x( ar1(:,:,:), 'forward', ar_2d = f_out_z(:,:,n))
415	CALL cpu_log( log_point_s(4), 'fft_x', 'pause' )
416
417	ENDDO
418	!
419	!-- Restore original indices/counters
420	nz = isave(1)
421	nzb_x = isave(2)
422	nzt_x = isave(3)
423	sendrecvcount_zx = isave(4)
424
425	DEALLOCATE( ar1, f_in, f_inv )
426
427	!
428	!-- Transposition x --> y + subsequent fft along y
429	ALLOCATE( f_inv(nys_x:nyn_x,nzb_x:nzt_x,0:nx) )
430	CALL resort_for_xy( f_out_z, f_inv )
431	!
432	!-- Save original indices and gridpoint counter
433	isave(1) = nx
434	isave(2) = nxl_y
435	isave(3) = nxr_y
436	isave(4) = sendrecvcount_xy
437	!
438	!-- Set new indices for transformation
439	nblk = ( ( nx+1 ) / pdims(2) ) - 1
440	nx = pdims(2)
441	nnx_y = 1
442	nxl_y = myidy * nnx_y
443	nxr_y = ( myidy + 1 ) * nnx_y - 1
444	sendrecvcount_xy = nnx_y * ( nyn_x-nys_x+1 ) * ( nzt_x-nzb_x+1 )
445
446	ALLOCATE( ar1(0:ny,nxl_y:nxr_y,nzb_y:nzt_y) )
447	ALLOCATE( f_in(nys_x:nyn_x,nzb_x:nzt_x,0:nx) )
448
449	DO ii = 0, nblk
450
451	CALL cpu_log( log_point_s(5), 'transpo forward', 'continue', cpu_log_nowait )
452
453	DO inew = 0, nx-1
454	iind = ii + ( nblk + 1 ) * inew
455	f_in(:,:,inew) = f_inv(:,:,iind)
456	ENDDO
457
458	CALL transpose_xy( f_in, ar1(:,:,:) )
459
460	CALL cpu_log( log_point_s(5), 'transpo forward', 'pause' )
461
462	IF ( ii == 1 ) THEN
463	CALL cpu_log( log_point_s(7), 'fft_y', 'start', cpu_log_nowait )
464	ELSE
465	CALL cpu_log( log_point_s(7), 'fft_y', 'continue', cpu_log_nowait )
466	ENDIF
467
468	nxl_y_bound = isave(2)
469	nxr_y_bound = isave(3)
470	n = isave(2) + ii
471	CALL fft_y( ar1(:,:,:), 'forward', ar_tr = f_out_y, &
472	nxl_y_bound = nxl_y_bound, nxr_y_bound = nxr_y_bound, &
473	nxl_y_l = n, nxr_y_l = n )
474
475	CALL cpu_log( log_point_s(7), 'fft_y', 'pause' )
476
477	ENDDO
478	!
479	!-- Restore original indices/counters
480	nx = isave(1)
481	nxl_y = isave(2)
482	nxr_y = isave(3)
483	sendrecvcount_xy = isave(4)
484
485	DEALLOCATE( ar1, f_in, f_inv )
486
487	!
488	!-- Transposition y --> z + subsequent tridia + resort for z --> y
489	ALLOCATE( f_inv(nxl_y:nxr_y,nzb_y:nzt_y,0:ny) )
490	CALL resort_for_yz( f_out_y, f_inv )
491	!
492	!-- Save original indices and gridpoint counter
493	isave(1) = ny
494	isave(2) = nys_z
495	isave(3) = nyn_z
496	isave(4) = sendrecvcount_yz
497	!
498	!-- Set new indices for transformation
499	nblk = ( ( ny+1 ) / pdims(1) ) - 1
500	ny = pdims(1)
501	nny_z = 1
502	nys_z = myidx * nny_z
503	nyn_z = ( myidx + 1 ) * nny_z - 1
504	sendrecvcount_yz = ( nxr_y-nxl_y+1 ) * nny_z * ( nzt_y-nzb_y+1 )
505
506	ALLOCATE( ar1(nxl_z:nxr_z,nys_z:nyn_z,1:nz) )
507	ALLOCATE( f_in(nxl_y:nxr_y,nzb_y:nzt_y,0:ny) )
508
509	DO jj = 0, nblk
510	!
511	!-- Forward Fourier Transformation
512	!-- Transposition y --> z
513	CALL cpu_log( log_point_s(5), 'transpo forward', 'continue', cpu_log_nowait )
514
515	DO jnew = 0, ny-1
516	jind = jj + ( nblk + 1 ) * jnew
517	f_in(:,:,jnew) = f_inv(:,:,jind)
518	ENDDO
519
520	CALL transpose_yz( f_in, ar1(:,:,:) )
521
522	IF ( jj == nblk ) THEN
523	CALL cpu_log( log_point_s(5), 'transpo forward', 'stop' )
524	ELSE
525	CALL cpu_log( log_point_s(5), 'transpo forward', 'pause' )
526	ENDIF
527
528	!
529	!-- Solve the tridiagonal equation system along z
530	CALL cpu_log( log_point_s(6), 'tridia', 'start', cpu_log_nowait )
531
532	n = isave(2) + jj
533	CALL tridia_substi_overlap( ar1(:,:,:), n )
534
535	CALL cpu_log( log_point_s(6), 'tridia', 'stop' )
536
537	!
538	!-- Inverse Fourier Transformation
539	!-- Transposition z --> y
540	!-- Only one thread should call MPI routines, therefore forward and
541	!-- backward tranpose are in the same section
542	IF ( jj == 0 ) THEN
543	CALL cpu_log( log_point_s(8), 'transpo invers', 'start', cpu_log_nowait )
544	ELSE
545	CALL cpu_log( log_point_s(8), 'transpo invers', 'continue', cpu_log_nowait )
546	ENDIF
547
548	CALL transpose_zy( ar1(:,:,:), f_in )
549
550	DO jnew = 0, ny-1
551	jind = jj + ( nblk + 1 ) * jnew
552	f_inv(:,:,jind) = f_in(:,:,jnew)
553	ENDDO
554
555	CALL cpu_log( log_point_s(8), 'transpo invers', 'pause' )
556
557	ENDDO
558	!
559	!-- Restore original indices/counters
560	ny = isave(1)
561	nys_z = isave(2)
562	nyn_z = isave(3)
563	sendrecvcount_yz = isave(4)
564
565	CALL resort_for_zy( f_inv, f_out_y )
566
567	DEALLOCATE( ar1, f_in, f_inv )
568
569	!
570	!-- fft along y backward + subsequent transposition y --> x
571	ALLOCATE( f_inv(nys_x:nyn_x,nzb_x:nzt_x,0:nx) )
572	!
573	!-- Save original indices and gridpoint counter
574	isave(1) = nx
575	isave(2) = nxl_y
576	isave(3) = nxr_y
577	isave(4) = sendrecvcount_xy
578	!
579	!-- Set new indices for transformation
580	nblk = (( nx+1 ) / pdims(2) ) - 1
581	nx = pdims(2)
582	nnx_y = 1
583	nxl_y = myidy * nnx_y
584	nxr_y = ( myidy + 1 ) * nnx_y - 1
585	sendrecvcount_xy = nnx_y * ( nyn_x-nys_x+1 ) * ( nzt_x-nzb_x+1 )
586
587	ALLOCATE( ar1(0:ny,nxl_y:nxr_y,nzb_y:nzt_y) )
588	ALLOCATE( f_in(nys_x:nyn_x,nzb_x:nzt_x,0:nx) )
589
590	DO ii = 0, nblk
591
592	CALL cpu_log( log_point_s(7), 'fft_y', 'continue', cpu_log_nowait )
593
594	n = isave(2) + ii
595	nxl_y_bound = isave(2)
596	nxr_y_bound = isave(3)
597
598	CALL fft_y( ar1(:,:,:), 'backward', ar_tr = f_out_y, &
599	nxl_y_bound = nxl_y_bound, nxr_y_bound = nxr_y_bound, &
600	nxl_y_l = n, nxr_y_l = n )
601
602	IF ( ii == nblk ) THEN
603	CALL cpu_log( log_point_s(7), 'fft_y', 'stop' )
604	ELSE
605	CALL cpu_log( log_point_s(7), 'fft_y', 'pause' )
606	ENDIF
607
608	CALL cpu_log( log_point_s(8), 'transpo invers', 'continue', cpu_log_nowait )
609
610	CALL transpose_yx( ar1(:,:,:), f_in )
611
612	DO inew = 0, nx-1
613	iind = ii + (nblk+1) * inew
614	f_inv(:,:,iind) = f_in(:,:,inew)
615	ENDDO
616
617	CALL cpu_log( log_point_s(8), 'transpo invers', 'pause' )
618
619	ENDDO
620	!
621	!-- Restore original indices/counters
622	nx = isave(1)
623	nxl_y = isave(2)
624	nxr_y = isave(3)
625	sendrecvcount_xy = isave(4)
626
627	CALL resort_for_yx( f_inv, f_out_z )
628
629	DEALLOCATE( ar1, f_in, f_inv )
630
631	!
632	!-- fft along x backward + subsequent final transposition x --> z
633	ALLOCATE( f_inv(nys:nyn,nxl:nxr,1:nz) )
634	!
635	!-- Save original indices and gridpoint counter
636	isave(1) = nz
637	isave(2) = nzb_x
638	isave(3) = nzt_x
639	isave(4) = sendrecvcount_zx
640	!
641	!-- Set new indices for transformation
642	nblk = nz / pdims(1)
643	nz = pdims(1)
644	nnz_x = 1
645	nzb_x = 1 + myidx * nnz_x
646	nzt_x = ( myidx + 1 ) * nnz_x
647	sendrecvcount_zx = nnx * nny * nnz_x
648
649	ALLOCATE( ar1(0:nx,nys_x:nyn_x,nzb_x:nzt_x) )
650	ALLOCATE( f_in(nys:nyn,nxl:nxr,1:nz) )
651
652	DO kk = 1, nblk
653
654	CALL cpu_log( log_point_s(4), 'fft_x', 'continue', cpu_log_nowait )
655
656	n = isave(2) + kk - 1
657	CALL fft_x( ar1(:,:,:), 'backward', f_out_z(:,:,n))
658
659	IF ( kk == nblk ) THEN
660	CALL cpu_log( log_point_s(4), 'fft_x', 'stop' )
661	ELSE
662	CALL cpu_log( log_point_s(4), 'fft_x', 'pause' )
663	ENDIF
664
665	CALL cpu_log( log_point_s(8), 'transpo invers', 'continue', cpu_log_nowait )
666
667	CALL transpose_xz( ar1(:,:,:), f_in )
668
669	DO knew = 1, nz
670	ki = kk + nblk * (knew-1)
671	f_inv(:,:,ki) = f_in(:,:,knew)
672	ENDDO
673
674	IF ( kk == nblk ) THEN
675	CALL cpu_log( log_point_s(8), 'transpo invers', 'stop' )
676	ELSE
677	CALL cpu_log( log_point_s(8), 'transpo invers', 'pause' )
678	ENDIF
679
680	ENDDO
681	!
682	!-- Restore original indices/counters
683	nz = isave(1)
684	nzb_x = isave(2)
685	nzt_x = isave(3)
686	sendrecvcount_zx = isave(4)
687
688	CALL resort_for_xz( f_inv, ar )
689
690	DEALLOCATE( ar1, f_in, f_inv )
691
692	ENDIF
693
694	CALL cpu_log( log_point_s(3), 'poisfft', 'stop' )
695
696	END SUBROUTINE poisfft
697
698
699
700	SUBROUTINE ffty_tr_yx( f_in, f_out )
701
702	!------------------------------------------------------------------------------!
703	! Fourier-transformation along y with subsequent transposition y --> x for
704	! a 1d-decomposition along x
705	!
706	! ATTENTION: The performance of this routine is much faster on the NEC-SX6,
707	! if the first index of work_ffty_vec is odd. Otherwise
708	! memory bank conflicts may occur (especially if the index is a
709	! multiple of 128). That's why work_ffty_vec is dimensioned as
710	! 0:ny+1.
711	! Of course, this will not work if users are using an odd number
712	! of gridpoints along y.
713	!------------------------------------------------------------------------------!
714
715	USE control_parameters, &
716	ONLY: host
717
718	USE cpulog, &
719	ONLY: cpu_log, log_point_s
720
721	USE kinds
722
723	USE pegrid
724
725	IMPLICIT NONE
726
727	INTEGER(iwp) :: i !:
728	INTEGER(iwp) :: iend !:
729	INTEGER(iwp) :: iouter !:
730	INTEGER(iwp) :: ir !:
731	INTEGER(iwp) :: j !:
732	INTEGER(iwp) :: k !:
733
734	INTEGER(iwp), PARAMETER :: stridex = 4 !:
735
736	REAL(wp), DIMENSION(0:ny,stridex) :: work_ffty !:
737	#if defined( __nec )
738	REAL(wp), DIMENSION(0:ny+1,1:nz,nxl:nxr) :: work_ffty_vec !:
739	#endif
740	REAL(wp), DIMENSION(1:nz,0:ny,nxl:nxr) :: f_in !:
741	REAL(wp), DIMENSION(nnx,1:nz,nys_x:nyn_x,pdims(1)) :: f_out !:
742	REAL(wp), DIMENSION(nxl:nxr,1:nz,0:ny) :: work !:
743
744	!
745	!-- Carry out the FFT along y, where all data are present due to the
746	!-- 1d-decomposition along x. Resort the data in a way that x becomes
747	!-- the first index.
748	CALL cpu_log( log_point_s(7), 'fft_y_1d', 'start' )
749
750	IF ( host(1:3) == 'nec' ) THEN
751	#if defined( __nec )
752	!
753	!-- Code optimized for vector processors
754	!$OMP PARALLEL PRIVATE ( i, j, k )
755	!$OMP DO
756	DO i = nxl, nxr
757
758	DO j = 0, ny
759	DO k = 1, nz
760	work_ffty_vec(j,k,i) = f_in(k,j,i)
761	ENDDO
762	ENDDO
763
764	CALL fft_y_m( work_ffty_vec(:,:,i), ny+1, 'forward' )
765
766	ENDDO
767
768	!$OMP DO
769	DO k = 1, nz
770	DO j = 0, ny
771	DO i = nxl, nxr
772	work(i,k,j) = work_ffty_vec(j,k,i)
773	ENDDO
774	ENDDO
775	ENDDO
776	!$OMP END PARALLEL
777	#endif
778
779	ELSE
780
781	!
782	!-- Cache optimized code.
783	!-- The i-(x-)direction is split into a strided outer loop and an inner
784	!-- loop for better cache performance
785	!$OMP PARALLEL PRIVATE (i,iend,iouter,ir,j,k,work_ffty)
786	!$OMP DO
787	DO iouter = nxl, nxr, stridex
788
789	iend = MIN( iouter+stridex-1, nxr ) ! Upper bound for inner i loop
790
791	DO k = 1, nz
792
793	DO i = iouter, iend
794
795	ir = i-iouter+1 ! counter within a stride
796	DO j = 0, ny
797	work_ffty(j,ir) = f_in(k,j,i)
798	ENDDO
799	!
800	!-- FFT along y
801	CALL fft_y_1d( work_ffty(:,ir), 'forward' )
802
803	ENDDO
804
805	!
806	!-- Resort
807	DO j = 0, ny
808	DO i = iouter, iend
809	work(i,k,j) = work_ffty(j,i-iouter+1)
810	ENDDO
811	ENDDO
812
813	ENDDO
814
815	ENDDO
816	!$OMP END PARALLEL
817
818	ENDIF
819	CALL cpu_log( log_point_s(7), 'fft_y_1d', 'pause' )
820
821	!
822	!-- Transpose array
823	#if defined( __parallel )
824	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
825	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
826	CALL MPI_ALLTOALL( work(nxl,1,0), sendrecvcount_xy, MPI_REAL, &
827	f_out(1,1,nys_x,1), sendrecvcount_xy, MPI_REAL, &
828	comm1dx, ierr )
829	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
830	#endif
831
832	END SUBROUTINE ffty_tr_yx
833
834
835	SUBROUTINE tr_xy_ffty( f_in, f_out )
836
837	!------------------------------------------------------------------------------!
838	! Transposition x --> y with a subsequent backward Fourier transformation for
839	! a 1d-decomposition along x
840	!------------------------------------------------------------------------------!
841
842	USE control_parameters, &
843	ONLY: host
844
845	USE cpulog, &
846	ONLY: cpu_log, log_point_s
847
848	USE kinds
849
850	USE pegrid
851
852	IMPLICIT NONE
853
854	INTEGER(iwp) :: i !:
855	INTEGER(iwp) :: iend !:
856	INTEGER(iwp) :: iouter !:
857	INTEGER(iwp) :: ir !:
858	INTEGER(iwp) :: j !:
859	INTEGER(iwp) :: k !:
860
861	INTEGER(iwp), PARAMETER :: stridex = 4 !:
862
863	REAL(wp), DIMENSION(0:ny,stridex) :: work_ffty !:
864	#if defined( __nec )
865	REAL(wp), DIMENSION(0:ny+1,1:nz,nxl:nxr) :: work_ffty_vec !:
866	#endif
867	REAL(wp), DIMENSION(nnx,1:nz,nys_x:nyn_x,pdims(1)) :: f_in !:
868	REAL(wp), DIMENSION(1:nz,0:ny,nxl:nxr) :: f_out !:
869	REAL(wp), DIMENSION(nxl:nxr,1:nz,0:ny) :: work !:
870
871	!
872	!-- Transpose array
873	#if defined( __parallel )
874	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
875	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
876	CALL MPI_ALLTOALL( f_in(1,1,nys_x,1), sendrecvcount_xy, MPI_REAL, &
877	work(nxl,1,0), sendrecvcount_xy, MPI_REAL, &
878	comm1dx, ierr )
879	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
880	#endif
881
882	!
883	!-- Resort the data in a way that y becomes the first index and carry out the
884	!-- backward fft along y.
885	CALL cpu_log( log_point_s(7), 'fft_y_1d', 'continue' )
886
887	IF ( host(1:3) == 'nec' ) THEN
888	#if defined( __nec )
889	!
890	!-- Code optimized for vector processors
891	!$OMP PARALLEL PRIVATE ( i, j, k )
892	!$OMP DO
893	DO k = 1, nz
894	DO j = 0, ny
895	DO i = nxl, nxr
896	work_ffty_vec(j,k,i) = work(i,k,j)
897	ENDDO
898	ENDDO
899	ENDDO
900
901	!$OMP DO
902	DO i = nxl, nxr
903
904	CALL fft_y_m( work_ffty_vec(:,:,i), ny+1, 'backward' )
905
906	DO j = 0, ny
907	DO k = 1, nz
908	f_out(k,j,i) = work_ffty_vec(j,k,i)
909	ENDDO
910	ENDDO
911
912	ENDDO
913	!$OMP END PARALLEL
914	#endif
915
916	ELSE
917
918	!
919	!-- Cache optimized code.
920	!-- The i-(x-)direction is split into a strided outer loop and an inner
921	!-- loop for better cache performance
922	!$OMP PARALLEL PRIVATE ( i, iend, iouter, ir, j, k, work_ffty )
923	!$OMP DO
924	DO iouter = nxl, nxr, stridex
925
926	iend = MIN( iouter+stridex-1, nxr ) ! Upper bound for inner i loop
927
928	DO k = 1, nz
929	!
930	!-- Resort
931	DO j = 0, ny
932	DO i = iouter, iend
933	work_ffty(j,i-iouter+1) = work(i,k,j)
934	ENDDO
935	ENDDO
936
937	DO i = iouter, iend
938
939	!
940	!-- FFT along y
941	ir = i-iouter+1 ! counter within a stride
942	CALL fft_y_1d( work_ffty(:,ir), 'backward' )
943
944	DO j = 0, ny
945	f_out(k,j,i) = work_ffty(j,ir)
946	ENDDO
947	ENDDO
948
949	ENDDO
950
951	ENDDO
952	!$OMP END PARALLEL
953
954	ENDIF
955
956	CALL cpu_log( log_point_s(7), 'fft_y_1d', 'stop' )
957
958	END SUBROUTINE tr_xy_ffty
959
960
961	SUBROUTINE fftx_tri_fftx( ar )
962
963	!------------------------------------------------------------------------------!
964	! FFT along x, solution of the tridiagonal system and backward FFT for
965	! a 1d-decomposition along x
966	!
967	! WARNING: this subroutine may still not work for hybrid parallelization
968	! with OpenMP (for possible necessary changes see the original
969	! routine poisfft_hybrid, developed by Klaus Ketelsen, May 2002)
970	!------------------------------------------------------------------------------!
971
972	USE control_parameters, &
973	ONLY: host
974
975	USE cpulog, &
976	ONLY: cpu_log, log_point_s
977
978	USE grid_variables, &
979	ONLY: ddx2, ddy2
980
981	USE kinds
982
983	USE pegrid
984
985	IMPLICIT NONE
986
987	INTEGER(iwp) :: i !:
988	INTEGER(iwp) :: j !:
989	INTEGER(iwp) :: k !:
990	INTEGER(iwp) :: m !:
991	INTEGER(iwp) :: n !:
992	INTEGER(iwp) :: omp_get_thread_num !:
993	INTEGER(iwp) :: tn !:
994
995	REAL(wp), DIMENSION(0:nx) :: work_fftx !:
996	REAL(wp), DIMENSION(0:nx,1:nz) :: work_trix !:
997	REAL(wp), DIMENSION(nnx,1:nz,nys_x:nyn_x,pdims(1)) :: ar !:
998	REAL(wp), DIMENSION(:,:,:,:), ALLOCATABLE :: tri !:
999
1000
1001	CALL cpu_log( log_point_s(33), 'fft_x_1d + tridia', 'start' )
1002
1003	ALLOCATE( tri(5,0:nx,0:nz-1,0:threads_per_task-1) )
1004
1005	tn = 0 ! Default thread number in case of one thread
1006	!$OMP PARALLEL DO PRIVATE ( i, j, k, m, n, tn, work_fftx, work_trix )
1007	DO j = nys_x, nyn_x
1008
1009	!$ tn = omp_get_thread_num()
1010
1011	IF ( host(1:3) == 'nec' ) THEN
1012	!
1013	!-- Code optimized for vector processors
1014	DO k = 1, nz
1015
1016	m = 0
1017	DO n = 1, pdims(1)
1018	DO i = 1, nnx
1019	work_trix(m,k) = ar(i,k,j,n)
1020	m = m + 1
1021	ENDDO
1022	ENDDO
1023
1024	ENDDO
1025
1026	CALL fft_x_m( work_trix, 'forward' )
1027
1028	ELSE
1029	!
1030	!-- Cache optimized code
1031	DO k = 1, nz
1032
1033	m = 0
1034	DO n = 1, pdims(1)
1035	DO i = 1, nnx
1036	work_fftx(m) = ar(i,k,j,n)
1037	m = m + 1
1038	ENDDO
1039	ENDDO
1040
1041	CALL fft_x_1d( work_fftx, 'forward' )
1042
1043	DO i = 0, nx
1044	work_trix(i,k) = work_fftx(i)
1045	ENDDO
1046
1047	ENDDO
1048
1049	ENDIF
1050
1051	!
1052	!-- Solve the linear equation system
1053	CALL tridia_1dd( ddx2, ddy2, nx, ny, j, work_trix, tri(:,:,:,tn) )
1054
1055	IF ( host(1:3) == 'nec' ) THEN
1056	!
1057	!-- Code optimized for vector processors
1058	CALL fft_x_m( work_trix, 'backward' )
1059
1060	DO k = 1, nz
1061
1062	m = 0
1063	DO n = 1, pdims(1)
1064	DO i = 1, nnx
1065	ar(i,k,j,n) = work_trix(m,k)
1066	m = m + 1
1067	ENDDO
1068	ENDDO
1069
1070	ENDDO
1071
1072	ELSE
1073	!
1074	!-- Cache optimized code
1075	DO k = 1, nz
1076
1077	DO i = 0, nx
1078	work_fftx(i) = work_trix(i,k)
1079	ENDDO
1080
1081	CALL fft_x_1d( work_fftx, 'backward' )
1082
1083	m = 0
1084	DO n = 1, pdims(1)
1085	DO i = 1, nnx
1086	ar(i,k,j,n) = work_fftx(m)
1087	m = m + 1
1088	ENDDO
1089	ENDDO
1090
1091	ENDDO
1092
1093	ENDIF
1094
1095	ENDDO
1096
1097	DEALLOCATE( tri )
1098
1099	CALL cpu_log( log_point_s(33), 'fft_x_1d + tridia', 'stop' )
1100
1101	END SUBROUTINE fftx_tri_fftx
1102
1103
1104	SUBROUTINE fftx_tr_xy( f_in, f_out )
1105
1106	!------------------------------------------------------------------------------!
1107	! Fourier-transformation along x with subsequent transposition x --> y for
1108	! a 1d-decomposition along y
1109	!
1110	! ATTENTION: The NEC-branch of this routine may significantly profit from
1111	! further optimizations. So far, performance is much worse than
1112	! for routine ffty_tr_yx (more than three times slower).
1113	!------------------------------------------------------------------------------!
1114
1115	USE control_parameters, &
1116	ONLY: host
1117
1118	USE cpulog, &
1119	ONLY: cpu_log, log_point_s
1120
1121	USE kinds
1122
1123	USE pegrid
1124
1125	IMPLICIT NONE
1126
1127	INTEGER(iwp) :: i !:
1128	INTEGER(iwp) :: j !:
1129	INTEGER(iwp) :: k !:
1130
1131	REAL(wp), DIMENSION(0:nx,1:nz,nys:nyn) :: work_fftx !:
1132	REAL(wp), DIMENSION(1:nz,nys:nyn,0:nx) :: f_in !:
1133	REAL(wp), DIMENSION(nny,1:nz,nxl_y:nxr_y,pdims(2)) :: f_out !:
1134	REAL(wp), DIMENSION(nys:nyn,1:nz,0:nx) :: work !:
1135
1136	!
1137	!-- Carry out the FFT along x, where all data are present due to the
1138	!-- 1d-decomposition along y. Resort the data in a way that y becomes
1139	!-- the first index.
1140	CALL cpu_log( log_point_s(4), 'fft_x_1d', 'start' )
1141
1142	IF ( host(1:3) == 'nec' ) THEN
1143	!
1144	!-- Code for vector processors
1145	!$OMP PARALLEL PRIVATE ( i, j, k )
1146	!$OMP DO
1147	DO i = 0, nx
1148
1149	DO j = nys, nyn
1150	DO k = 1, nz
1151	work_fftx(i,k,j) = f_in(k,j,i)
1152	ENDDO
1153	ENDDO
1154
1155	ENDDO
1156
1157	!$OMP DO
1158	DO j = nys, nyn
1159
1160	CALL fft_x_m( work_fftx(:,:,j), 'forward' )
1161
1162	DO k = 1, nz
1163	DO i = 0, nx
1164	work(j,k,i) = work_fftx(i,k,j)
1165	ENDDO
1166	ENDDO
1167
1168	ENDDO
1169	!$OMP END PARALLEL
1170
1171	ELSE
1172
1173	!
1174	!-- Cache optimized code (there might be still a potential for better
1175	!-- optimization).
1176	!$OMP PARALLEL PRIVATE (i,j,k)
1177	!$OMP DO
1178	DO i = 0, nx
1179
1180	DO j = nys, nyn
1181	DO k = 1, nz
1182	work_fftx(i,k,j) = f_in(k,j,i)
1183	ENDDO
1184	ENDDO
1185
1186	ENDDO
1187
1188	!$OMP DO
1189	DO j = nys, nyn
1190	DO k = 1, nz
1191
1192	CALL fft_x_1d( work_fftx(0:nx,k,j), 'forward' )
1193
1194	DO i = 0, nx
1195	work(j,k,i) = work_fftx(i,k,j)
1196	ENDDO
1197	ENDDO
1198
1199	ENDDO
1200	!$OMP END PARALLEL
1201
1202	ENDIF
1203	CALL cpu_log( log_point_s(4), 'fft_x_1d', 'pause' )
1204
1205	!
1206	!-- Transpose array
1207	#if defined( __parallel )
1208	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
1209	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
1210	CALL MPI_ALLTOALL( work(nys,1,0), sendrecvcount_xy, MPI_REAL, &
1211	f_out(1,1,nxl_y,1), sendrecvcount_xy, MPI_REAL, &
1212	comm1dy, ierr )
1213	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
1214	#endif
1215
1216	END SUBROUTINE fftx_tr_xy
1217
1218
1219	SUBROUTINE tr_yx_fftx( f_in, f_out )
1220
1221	!------------------------------------------------------------------------------!
1222	! Transposition y --> x with a subsequent backward Fourier transformation for
1223	! a 1d-decomposition along x
1224	!------------------------------------------------------------------------------!
1225
1226	USE control_parameters, &
1227	ONLY: host
1228
1229	USE cpulog, &
1230	ONLY: cpu_log, log_point_s
1231
1232	USE kinds
1233
1234	USE pegrid
1235
1236	IMPLICIT NONE
1237
1238	INTEGER(iwp) :: i !:
1239	INTEGER(iwp) :: j !:
1240	INTEGER(iwp) :: k !:
1241
1242	REAL(wp), DIMENSION(0:nx,1:nz,nys:nyn) :: work_fftx !:
1243	REAL(wp), DIMENSION(nny,1:nz,nxl_y:nxr_y,pdims(2)) :: f_in !:
1244	REAL(wp), DIMENSION(1:nz,nys:nyn,0:nx) :: f_out !:
1245	REAL(wp), DIMENSION(nys:nyn,1:nz,0:nx) :: work !:
1246
1247	!
1248	!-- Transpose array
1249	#if defined( __parallel )
1250	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
1251	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
1252	CALL MPI_ALLTOALL( f_in(1,1,nxl_y,1), sendrecvcount_xy, MPI_REAL, &
1253	work(nys,1,0), sendrecvcount_xy, MPI_REAL, &
1254	comm1dy, ierr )
1255	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
1256	#endif
1257
1258	!
1259	!-- Carry out the FFT along x, where all data are present due to the
1260	!-- 1d-decomposition along y. Resort the data in a way that y becomes
1261	!-- the first index.
1262	CALL cpu_log( log_point_s(4), 'fft_x_1d', 'continue' )
1263
1264	IF ( host(1:3) == 'nec' ) THEN
1265	!
1266	!-- Code optimized for vector processors
1267	!$OMP PARALLEL PRIVATE ( i, j, k )
1268	!$OMP DO
1269	DO j = nys, nyn
1270
1271	DO k = 1, nz
1272	DO i = 0, nx
1273	work_fftx(i,k,j) = work(j,k,i)
1274	ENDDO
1275	ENDDO
1276
1277	CALL fft_x_m( work_fftx(:,:,j), 'backward' )
1278
1279	ENDDO
1280
1281	!$OMP DO
1282	DO i = 0, nx
1283	DO j = nys, nyn
1284	DO k = 1, nz
1285	f_out(k,j,i) = work_fftx(i,k,j)
1286	ENDDO
1287	ENDDO
1288	ENDDO
1289	!$OMP END PARALLEL
1290
1291	ELSE
1292
1293	!
1294	!-- Cache optimized code (there might be still a potential for better
1295	!-- optimization).
1296	!$OMP PARALLEL PRIVATE (i,j,k)
1297	!$OMP DO
1298	DO j = nys, nyn
1299	DO k = 1, nz
1300
1301	DO i = 0, nx
1302	work_fftx(i,k,j) = work(j,k,i)
1303	ENDDO
1304
1305	CALL fft_x_1d( work_fftx(0:nx,k,j), 'backward' )
1306
1307	ENDDO
1308	ENDDO
1309
1310	!$OMP DO
1311	DO i = 0, nx
1312	DO j = nys, nyn
1313	DO k = 1, nz
1314	f_out(k,j,i) = work_fftx(i,k,j)
1315	ENDDO
1316	ENDDO
1317	ENDDO
1318	!$OMP END PARALLEL
1319
1320	ENDIF
1321	CALL cpu_log( log_point_s(4), 'fft_x_1d', 'stop' )
1322
1323	END SUBROUTINE tr_yx_fftx
1324
1325
1326	SUBROUTINE ffty_tri_ffty( ar )
1327
1328	!------------------------------------------------------------------------------!
1329	! FFT along y, solution of the tridiagonal system and backward FFT for
1330	! a 1d-decomposition along y
1331	!
1332	! WARNING: this subroutine may still not work for hybrid parallelization
1333	! with OpenMP (for possible necessary changes see the original
1334	! routine poisfft_hybrid, developed by Klaus Ketelsen, May 2002)
1335	!------------------------------------------------------------------------------!
1336
1337	USE control_parameters, &
1338	ONLY: host
1339
1340	USE cpulog, &
1341	ONLY: cpu_log, log_point_s
1342
1343	USE grid_variables, &
1344	ONLY: ddx2, ddy2
1345
1346	USE kinds
1347
1348	USE pegrid
1349
1350	IMPLICIT NONE
1351
1352	INTEGER(iwp) :: i !:
1353	INTEGER(iwp) :: j !:
1354	INTEGER(iwp) :: k !:
1355	INTEGER(iwp) :: m !:
1356	INTEGER(iwp) :: n !:
1357	INTEGER(iwp) :: omp_get_thread_num !:
1358	INTEGER(iwp) :: tn !:
1359
1360	REAL(wp), DIMENSION(0:ny) :: work_ffty !:
1361	REAL(wp), DIMENSION(0:ny,1:nz) :: work_triy !:
1362	REAL(wp), DIMENSION(nny,1:nz,nxl_y:nxr_y,pdims(2)) :: ar !:
1363	REAL(wp), DIMENSION(:,:,:,:), ALLOCATABLE :: tri !:
1364
1365
1366	CALL cpu_log( log_point_s(39), 'fft_y_1d + tridia', 'start' )
1367
1368	ALLOCATE( tri(5,0:ny,0:nz-1,0:threads_per_task-1) )
1369
1370	tn = 0 ! Default thread number in case of one thread
1371	!$OMP PARALLEL DO PRIVATE ( i, j, k, m, n, tn, work_ffty, work_triy )
1372	DO i = nxl_y, nxr_y
1373
1374	!$ tn = omp_get_thread_num()
1375
1376	IF ( host(1:3) == 'nec' ) THEN
1377	!
1378	!-- Code optimized for vector processors
1379	DO k = 1, nz
1380
1381	m = 0
1382	DO n = 1, pdims(2)
1383	DO j = 1, nny
1384	work_triy(m,k) = ar(j,k,i,n)
1385	m = m + 1
1386	ENDDO
1387	ENDDO
1388
1389	ENDDO
1390
1391	CALL fft_y_m( work_triy, ny, 'forward' )
1392
1393	ELSE
1394	!
1395	!-- Cache optimized code
1396	DO k = 1, nz
1397
1398	m = 0
1399	DO n = 1, pdims(2)
1400	DO j = 1, nny
1401	work_ffty(m) = ar(j,k,i,n)
1402	m = m + 1
1403	ENDDO
1404	ENDDO
1405
1406	CALL fft_y_1d( work_ffty, 'forward' )
1407
1408	DO j = 0, ny
1409	work_triy(j,k) = work_ffty(j)
1410	ENDDO
1411
1412	ENDDO
1413
1414	ENDIF
1415
1416	!
1417	!-- Solve the linear equation system
1418	CALL tridia_1dd( ddy2, ddx2, ny, nx, i, work_triy, tri(:,:,:,tn) )
1419
1420	IF ( host(1:3) == 'nec' ) THEN
1421	!
1422	!-- Code optimized for vector processors
1423	CALL fft_y_m( work_triy, ny, 'backward' )
1424
1425	DO k = 1, nz
1426
1427	m = 0
1428	DO n = 1, pdims(2)
1429	DO j = 1, nny
1430	ar(j,k,i,n) = work_triy(m,k)
1431	m = m + 1
1432	ENDDO
1433	ENDDO
1434
1435	ENDDO
1436
1437	ELSE
1438	!
1439	!-- Cache optimized code
1440	DO k = 1, nz
1441
1442	DO j = 0, ny
1443	work_ffty(j) = work_triy(j,k)
1444	ENDDO
1445
1446	CALL fft_y_1d( work_ffty, 'backward' )
1447
1448	m = 0
1449	DO n = 1, pdims(2)
1450	DO j = 1, nny
1451	ar(j,k,i,n) = work_ffty(m)
1452	m = m + 1
1453	ENDDO
1454	ENDDO
1455
1456	ENDDO
1457
1458	ENDIF
1459
1460	ENDDO
1461
1462	DEALLOCATE( tri )
1463
1464	CALL cpu_log( log_point_s(39), 'fft_y_1d + tridia', 'stop' )
1465
1466	END SUBROUTINE ffty_tri_ffty
1467
1468	#endif
1469
1470	END MODULE poisfft_mod

Note: See TracBrowser for help on using the repository browser.

Download in other formats:

| Impressum | ©Leibniz Universität Hannover |