Home

Context Navigation

source: palm/trunk/SOURCE/poisfft.f90 @ 1804

Last change on this file since 1804 was 1804, checked in by maronga, 9 years ago
removed parameter file check. update of mrungui for compilation with qt5
Property svn:keywords set to `Id`
File size: 45.5 KB

Line
1	!> @file poisfft.f90
2	!--------------------------------------------------------------------------------!
3	! This file is part of PALM.
4	!
5	! PALM is free software: you can redistribute it and/or modify it under the terms
6	! of the GNU General Public License as published by the Free Software Foundation,
7	! either version 3 of the License, or (at your option) any later version.
8	!
9	! PALM is distributed in the hope that it will be useful, but WITHOUT ANY
10	! WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
11	! A PARTICULAR PURPOSE. See the GNU General Public License for more details.
12	!
13	! You should have received a copy of the GNU General Public License along with
14	! PALM. If not, see <http://www.gnu.org/licenses/>.
15	!
16	! Copyright 1997-2014 Leibniz Universitaet Hannover
17	!--------------------------------------------------------------------------------!
18	!
19	! Current revisions:
20	! -----------------
21	! Removed code for parameter file check (__check)
22	!
23	! Former revisions:
24	! -----------------
25	! $Id: poisfft.f90 1804 2016-04-05 16:30:18Z maronga $
26	!
27	! 1682 2015-10-07 23:56:08Z knoop
28	! Code annotations made doxygen readable
29	!
30	! 1482 2014-10-18 12:34:45Z raasch
31	! use 2d-decomposition, if accelerator boards are used
32	!
33	! 1406 2014-05-16 13:47:01Z raasch
34	! bugfix for pgi 14.4: declare create moved after array declaration
35	!
36	! 1320 2014-03-20 08:40:49Z raasch
37	! ONLY-attribute added to USE-statements,
38	! kind-parameters added to all INTEGER and REAL declaration statements,
39	! kinds are defined in new module kinds,
40	! old module precision_kind is removed,
41	! revision history before 2012 removed,
42	! comment fields (!:) to be used for variable explanations added to
43	! all variable declaration statements
44	!
45	! 1318 2014-03-17 13:35:16Z raasch
46	! module interfaces removed
47	!
48	! 1306 2014-03-13 14:30:59Z raasch
49	! openmp sections removed from the overlap branch,
50	! second argument removed from parameter list
51	!
52	! 1216 2013-08-26 09:31:42Z raasch
53	! resorting of arrays moved to separate routines resort_for_...,
54	! one argument, used as temporary work array, removed from all transpose
55	! routines
56	! overlapping fft / transposition implemented
57	!
58	! 1212 2013-08-15 08:46:27Z raasch
59	! tridia routines moved to seperate module tridia_solver
60	!
61	! 1208 2013-08-13 06:41:49Z raasch
62	! acc-update clauses added for "ar" so that ffts other than cufft can also be
63	! used (although they are not ported and will give a poor performance)
64	!
65	! 1111 2013-03-08 23:54:10Z raasch
66	! further openACC porting of non-parallel (MPI) branch:
67	! tridiagonal routines split into extermal subroutines (instead using CONTAINS),
68	! no distinction between parallel/non-parallel in poisfft and tridia any more,
69	! tridia routines moved to end of file because of probable bug in PGI compiler 12.5
70	! (otherwise "invalid device function" is indicated during runtime),
71	! optimization of tridia routines: constant elements and coefficients of tri are
72	! stored in seperate arrays ddzuw and tric, last dimension of tri reduced from 5
73	! to 2,
74	! poisfft_init is now called internally from poisfft, maketri is called from
75	! poisfft_init,
76	! ibc_p_b = 2 removed
77	!
78	! 1106 2013-03-04 05:31:38Z raasch
79	! routines fftx, ffty, fftxp, fftyp removed, calls replaced by fft_x, fft_y,
80	! in the 1D-decomposition routines fft_x, ffty are replaced by fft_x_1d,
81	! fft_y_1d
82	!
83	! 1103 2013-02-20 02:15:53Z raasch
84	! tri, ar, and ar1 arguments in tridia-routines (2d) are removed because they
85	! sometimes cause segmentation faults with intel 12.1 compiler
86	!
87	! 1092 2013-02-02 11:24:22Z raasch
88	! unused variables removed
89	!
90	! 1036 2012-10-22 13:43:42Z raasch
91	! code put under GPL (PALM 3.9)
92	!
93	! 2012-09-21 07:03:55Z raasch
94	! FLOAT type conversion replaced by REAL
95	!
96	! 1003 2012-09-14 14:35:53Z raasch
97	! indices nxa, nya, etc. replaced by nx, ny, etc.
98	!
99	! 940 2012-07-09 14:31:00Z raasch
100	! special handling of tri-array as an argument in tridia_1dd routines switched
101	! off because it caused segmentation faults with intel 12.1 compiler
102	!
103	! 877 2012-04-03 11:21:44Z suehring
104	! Bugfix: Avoid divisions by zero in case of using a 'neumann' bc for the
105	! pressure at the top of the model domain.
106	!
107	! 809 2012-01-30 13:32:58Z maronga
108	! Bugfix: replaced .AND. and .NOT. with && and ! in the preprocessor directives
109	!
110	! 807 2012-01-25 11:53:51Z maronga
111	! New cpp directive "__check" implemented which is used by check_namelist_files
112	! (most of the code is unneeded by check_namelist_files).
113	!
114	! Revision 1.1 1997/07/24 11:24:14 raasch
115	! Initial revision
116	!
117	!
118	! Description:
119	! ------------
120	!> Solves the Poisson equation with a 2D spectral method
121	!> d^2 p / dx^2 + d^2 p / dy^2 + d^2 p / dz^2 = s
122	!>
123	!> Input:
124	!> real ar contains (nnz,nny,nnx) elements of the velocity divergence,
125	!> starting from (1,nys,nxl)
126	!>
127	!> Output:
128	!> real ar contains the solution for perturbation pressure p
129	!------------------------------------------------------------------------------!
130	MODULE poisfft_mod
131
132
133	USE fft_xy, &
134	ONLY: fft_init, fft_y, fft_y_1d, fft_y_m, fft_x, fft_x_1d, fft_x_m
135
136	USE indices, &
137	ONLY: nnx, nny, nx, nxl, nxr, ny, nys, nyn, nz
138
139	USE transpose_indices, &
140	ONLY: nxl_y, nxl_z, nxr_y, nxr_z, nys_x, nys_z, nyn_x, nyn_z, nzb_x, &
141	nzb_y, nzt_x, nzt_y
142
143	USE tridia_solver, &
144	ONLY: tridia_1dd, tridia_init, tridia_substi, tridia_substi_overlap
145
146	IMPLICIT NONE
147
148	LOGICAL, SAVE :: poisfft_initialized = .FALSE.
149
150	PRIVATE
151
152	PUBLIC poisfft, poisfft_init
153
154	INTERFACE poisfft
155	MODULE PROCEDURE poisfft
156	END INTERFACE poisfft
157
158	INTERFACE poisfft_init
159	MODULE PROCEDURE poisfft_init
160	END INTERFACE poisfft_init
161
162
163	CONTAINS
164
165	!------------------------------------------------------------------------------!
166	! Description:
167	! ------------
168	!> @todo Missing subroutine description.
169	!------------------------------------------------------------------------------!
170	SUBROUTINE poisfft_init
171
172	USE arrays_3d, &
173	ONLY: ddzu_pres, ddzw
174
175	USE kinds
176
177	IMPLICIT NONE
178
179	INTEGER(iwp) :: k !<
180
181
182	CALL fft_init
183
184	CALL tridia_init
185
186	poisfft_initialized = .TRUE.
187
188	END SUBROUTINE poisfft_init
189
190
191
192	!------------------------------------------------------------------------------!
193	! Description:
194	! ------------
195	!> Two-dimensional Fourier Transformation in x- and y-direction.
196	!------------------------------------------------------------------------------!
197	SUBROUTINE poisfft( ar )
198
199	USE control_parameters, &
200	ONLY: fft_method, transpose_compute_overlap
201
202	USE cpulog, &
203	ONLY: cpu_log, cpu_log_nowait, log_point_s
204
205	USE kinds
206
207	USE pegrid
208
209	IMPLICIT NONE
210
211	INTEGER(iwp) :: ii !<
212	INTEGER(iwp) :: iind !<
213	INTEGER(iwp) :: inew !<
214	INTEGER(iwp) :: jj !<
215	INTEGER(iwp) :: jind !<
216	INTEGER(iwp) :: jnew !<
217	INTEGER(iwp) :: ki !<
218	INTEGER(iwp) :: kk !<
219	INTEGER(iwp) :: knew !<
220	INTEGER(iwp) :: n !<
221	INTEGER(iwp) :: nblk !<
222	INTEGER(iwp) :: nnx_y !<
223	INTEGER(iwp) :: nny_z !<
224	INTEGER(iwp) :: nnz_t !<
225	INTEGER(iwp) :: nnz_x !<
226	INTEGER(iwp) :: nxl_y_bound !<
227	INTEGER(iwp) :: nxr_y_bound !<
228
229	INTEGER(iwp), DIMENSION(4) :: isave !<
230
231	REAL(wp), DIMENSION(1:nz,nys:nyn,nxl:nxr) :: ar !<
232	REAL(wp), DIMENSION(nys:nyn,nxl:nxr,1:nz) :: ar_inv !<
233	!$acc declare create( ar_inv )
234
235	REAL(wp), DIMENSION(:,:,:), ALLOCATABLE :: ar1 !<
236	REAL(wp), DIMENSION(:,:,:), ALLOCATABLE :: f_in !<
237	REAL(wp), DIMENSION(:,:,:), ALLOCATABLE :: f_inv !<
238	REAL(wp), DIMENSION(:,:,:), ALLOCATABLE :: f_out_y !<
239	REAL(wp), DIMENSION(:,:,:), ALLOCATABLE :: f_out_z !<
240
241
242	CALL cpu_log( log_point_s(3), 'poisfft', 'start' )
243
244	IF ( .NOT. poisfft_initialized ) CALL poisfft_init
245
246	!
247	!-- Two-dimensional Fourier Transformation in x- and y-direction.
248	IF ( pdims(2) == 1 .AND. pdims(1) > 1 .AND. num_acc_per_node == 0 ) &
249	THEN
250
251	!
252	!-- 1d-domain-decomposition along x:
253	!-- FFT along y and transposition y --> x
254	CALL ffty_tr_yx( ar, ar )
255
256	!
257	!-- FFT along x, solving the tridiagonal system and backward FFT
258	CALL fftx_tri_fftx( ar )
259
260	!
261	!-- Transposition x --> y and backward FFT along y
262	CALL tr_xy_ffty( ar, ar )
263
264	ELSEIF ( pdims(1) == 1 .AND. pdims(2) > 1 .AND. num_acc_per_node == 0 ) &
265	THEN
266
267	!
268	!-- 1d-domain-decomposition along y:
269	!-- FFT along x and transposition x --> y
270	CALL fftx_tr_xy( ar, ar )
271
272	!
273	!-- FFT along y, solving the tridiagonal system and backward FFT
274	CALL ffty_tri_ffty( ar )
275
276	!
277	!-- Transposition y --> x and backward FFT along x
278	CALL tr_yx_fftx( ar, ar )
279
280	ELSEIF ( .NOT. transpose_compute_overlap ) THEN
281
282	!
283	!-- 2d-domain-decomposition or no decomposition (1 PE run)
284	!-- Transposition z --> x
285	CALL cpu_log( log_point_s(5), 'transpo forward', 'start' )
286	CALL resort_for_zx( ar, ar_inv )
287	CALL transpose_zx( ar_inv, ar )
288	CALL cpu_log( log_point_s(5), 'transpo forward', 'pause' )
289
290	CALL cpu_log( log_point_s(4), 'fft_x', 'start' )
291	IF ( fft_method /= 'system-specific' ) THEN
292	!$acc update host( ar )
293	ENDIF
294	CALL fft_x( ar, 'forward' )
295	IF ( fft_method /= 'system-specific' ) THEN
296	!$acc update device( ar )
297	ENDIF
298	CALL cpu_log( log_point_s(4), 'fft_x', 'pause' )
299
300	!
301	!-- Transposition x --> y
302	CALL cpu_log( log_point_s(5), 'transpo forward', 'continue' )
303	CALL resort_for_xy( ar, ar_inv )
304	CALL transpose_xy( ar_inv, ar )
305	CALL cpu_log( log_point_s(5), 'transpo forward', 'pause' )
306
307	CALL cpu_log( log_point_s(7), 'fft_y', 'start' )
308	IF ( fft_method /= 'system-specific' ) THEN
309	!$acc update host( ar )
310	ENDIF
311	CALL fft_y( ar, 'forward', ar_tr = ar, &
312	nxl_y_bound = nxl_y, nxr_y_bound = nxr_y, &
313	nxl_y_l = nxl_y, nxr_y_l = nxr_y )
314	IF ( fft_method /= 'system-specific' ) THEN
315	!$acc update device( ar )
316	ENDIF
317	CALL cpu_log( log_point_s(7), 'fft_y', 'pause' )
318
319	!
320	!-- Transposition y --> z
321	CALL cpu_log( log_point_s(5), 'transpo forward', 'continue' )
322	CALL resort_for_yz( ar, ar_inv )
323	CALL transpose_yz( ar_inv, ar )
324	CALL cpu_log( log_point_s(5), 'transpo forward', 'stop' )
325
326	!
327	!-- Solve the tridiagonal equation system along z
328	CALL cpu_log( log_point_s(6), 'tridia', 'start' )
329	CALL tridia_substi( ar )
330	CALL cpu_log( log_point_s(6), 'tridia', 'stop' )
331
332	!
333	!-- Inverse Fourier Transformation
334	!-- Transposition z --> y
335	CALL cpu_log( log_point_s(8), 'transpo invers', 'start' )
336	CALL transpose_zy( ar, ar_inv )
337	CALL resort_for_zy( ar_inv, ar )
338	CALL cpu_log( log_point_s(8), 'transpo invers', 'pause' )
339
340	CALL cpu_log( log_point_s(7), 'fft_y', 'continue' )
341	IF ( fft_method /= 'system-specific' ) THEN
342	!$acc update host( ar )
343	ENDIF
344	CALL fft_y( ar, 'backward', ar_tr = ar, &
345	nxl_y_bound = nxl_y, nxr_y_bound = nxr_y, &
346	nxl_y_l = nxl_y, nxr_y_l = nxr_y )
347	IF ( fft_method /= 'system-specific' ) THEN
348	!$acc update device( ar )
349	ENDIF
350	CALL cpu_log( log_point_s(7), 'fft_y', 'stop' )
351
352	!
353	!-- Transposition y --> x
354	CALL cpu_log( log_point_s(8), 'transpo invers', 'continue' )
355	CALL transpose_yx( ar, ar_inv )
356	CALL resort_for_yx( ar_inv, ar )
357	CALL cpu_log( log_point_s(8), 'transpo invers', 'pause' )
358
359	CALL cpu_log( log_point_s(4), 'fft_x', 'continue' )
360	IF ( fft_method /= 'system-specific' ) THEN
361	!$acc update host( ar )
362	ENDIF
363	CALL fft_x( ar, 'backward' )
364	IF ( fft_method /= 'system-specific' ) THEN
365	!$acc update device( ar )
366	ENDIF
367	CALL cpu_log( log_point_s(4), 'fft_x', 'stop' )
368
369	!
370	!-- Transposition x --> z
371	CALL cpu_log( log_point_s(8), 'transpo invers', 'continue' )
372	CALL transpose_xz( ar, ar_inv )
373	CALL resort_for_xz( ar_inv, ar )
374	CALL cpu_log( log_point_s(8), 'transpo invers', 'stop' )
375
376	ELSE
377
378	!
379	!-- 2d-domain-decomposition or no decomposition (1 PE run) with
380	!-- overlapping transposition / fft
381	!-- cputime logging must not use barriers, which would prevent overlapping
382	ALLOCATE( f_out_y(0:ny,nxl_y:nxr_y,nzb_y:nzt_y), &
383	f_out_z(0:nx,nys_x:nyn_x,nzb_x:nzt_x) )
384	!
385	!-- Transposition z --> x + subsequent fft along x
386	ALLOCATE( f_inv(nys:nyn,nxl:nxr,1:nz) )
387	CALL resort_for_zx( ar, f_inv )
388	!
389	!-- Save original indices and gridpoint counter
390	isave(1) = nz
391	isave(2) = nzb_x
392	isave(3) = nzt_x
393	isave(4) = sendrecvcount_zx
394	!
395	!-- Set new indices for transformation
396	nblk = nz / pdims(1)
397	nz = pdims(1)
398	nnz_x = 1
399	nzb_x = 1 + myidx * nnz_x
400	nzt_x = ( myidx + 1 ) * nnz_x
401	sendrecvcount_zx = nnx * nny * nnz_x
402
403	ALLOCATE( ar1(0:nx,nys_x:nyn_x,nzb_x:nzt_x) )
404	ALLOCATE( f_in(nys:nyn,nxl:nxr,1:nz) )
405
406	DO kk = 1, nblk
407
408	IF ( kk == 1 ) THEN
409	CALL cpu_log( log_point_s(5), 'transpo forward', 'start', cpu_log_nowait )
410	ELSE
411	CALL cpu_log( log_point_s(5), 'transpo forward', 'continue', cpu_log_nowait )
412	ENDIF
413
414	DO knew = 1, nz
415	ki = kk + nblk * ( knew - 1 )
416	f_in(:,:,knew) = f_inv(:,:,ki)
417	ENDDO
418
419	CALL transpose_zx( f_in, ar1(:,:,:))
420	CALL cpu_log( log_point_s(5), 'transpo forward', 'pause' )
421
422	IF ( kk == 1 ) THEN
423	CALL cpu_log( log_point_s(4), 'fft_x', 'start', cpu_log_nowait )
424	ELSE
425	CALL cpu_log( log_point_s(4), 'fft_x', 'continue', cpu_log_nowait )
426	ENDIF
427
428	n = isave(2) + kk - 1
429	CALL fft_x( ar1(:,:,:), 'forward', ar_2d = f_out_z(:,:,n))
430	CALL cpu_log( log_point_s(4), 'fft_x', 'pause' )
431
432	ENDDO
433	!
434	!-- Restore original indices/counters
435	nz = isave(1)
436	nzb_x = isave(2)
437	nzt_x = isave(3)
438	sendrecvcount_zx = isave(4)
439
440	DEALLOCATE( ar1, f_in, f_inv )
441
442	!
443	!-- Transposition x --> y + subsequent fft along y
444	ALLOCATE( f_inv(nys_x:nyn_x,nzb_x:nzt_x,0:nx) )
445	CALL resort_for_xy( f_out_z, f_inv )
446	!
447	!-- Save original indices and gridpoint counter
448	isave(1) = nx
449	isave(2) = nxl_y
450	isave(3) = nxr_y
451	isave(4) = sendrecvcount_xy
452	!
453	!-- Set new indices for transformation
454	nblk = ( ( nx+1 ) / pdims(2) ) - 1
455	nx = pdims(2)
456	nnx_y = 1
457	nxl_y = myidy * nnx_y
458	nxr_y = ( myidy + 1 ) * nnx_y - 1
459	sendrecvcount_xy = nnx_y * ( nyn_x-nys_x+1 ) * ( nzt_x-nzb_x+1 )
460
461	ALLOCATE( ar1(0:ny,nxl_y:nxr_y,nzb_y:nzt_y) )
462	ALLOCATE( f_in(nys_x:nyn_x,nzb_x:nzt_x,0:nx) )
463
464	DO ii = 0, nblk
465
466	CALL cpu_log( log_point_s(5), 'transpo forward', 'continue', cpu_log_nowait )
467
468	DO inew = 0, nx-1
469	iind = ii + ( nblk + 1 ) * inew
470	f_in(:,:,inew) = f_inv(:,:,iind)
471	ENDDO
472
473	CALL transpose_xy( f_in, ar1(:,:,:) )
474
475	CALL cpu_log( log_point_s(5), 'transpo forward', 'pause' )
476
477	IF ( ii == 1 ) THEN
478	CALL cpu_log( log_point_s(7), 'fft_y', 'start', cpu_log_nowait )
479	ELSE
480	CALL cpu_log( log_point_s(7), 'fft_y', 'continue', cpu_log_nowait )
481	ENDIF
482
483	nxl_y_bound = isave(2)
484	nxr_y_bound = isave(3)
485	n = isave(2) + ii
486	CALL fft_y( ar1(:,:,:), 'forward', ar_tr = f_out_y, &
487	nxl_y_bound = nxl_y_bound, nxr_y_bound = nxr_y_bound, &
488	nxl_y_l = n, nxr_y_l = n )
489
490	CALL cpu_log( log_point_s(7), 'fft_y', 'pause' )
491
492	ENDDO
493	!
494	!-- Restore original indices/counters
495	nx = isave(1)
496	nxl_y = isave(2)
497	nxr_y = isave(3)
498	sendrecvcount_xy = isave(4)
499
500	DEALLOCATE( ar1, f_in, f_inv )
501
502	!
503	!-- Transposition y --> z + subsequent tridia + resort for z --> y
504	ALLOCATE( f_inv(nxl_y:nxr_y,nzb_y:nzt_y,0:ny) )
505	CALL resort_for_yz( f_out_y, f_inv )
506	!
507	!-- Save original indices and gridpoint counter
508	isave(1) = ny
509	isave(2) = nys_z
510	isave(3) = nyn_z
511	isave(4) = sendrecvcount_yz
512	!
513	!-- Set new indices for transformation
514	nblk = ( ( ny+1 ) / pdims(1) ) - 1
515	ny = pdims(1)
516	nny_z = 1
517	nys_z = myidx * nny_z
518	nyn_z = ( myidx + 1 ) * nny_z - 1
519	sendrecvcount_yz = ( nxr_y-nxl_y+1 ) * nny_z * ( nzt_y-nzb_y+1 )
520
521	ALLOCATE( ar1(nxl_z:nxr_z,nys_z:nyn_z,1:nz) )
522	ALLOCATE( f_in(nxl_y:nxr_y,nzb_y:nzt_y,0:ny) )
523
524	DO jj = 0, nblk
525	!
526	!-- Forward Fourier Transformation
527	!-- Transposition y --> z
528	CALL cpu_log( log_point_s(5), 'transpo forward', 'continue', cpu_log_nowait )
529
530	DO jnew = 0, ny-1
531	jind = jj + ( nblk + 1 ) * jnew
532	f_in(:,:,jnew) = f_inv(:,:,jind)
533	ENDDO
534
535	CALL transpose_yz( f_in, ar1(:,:,:) )
536
537	IF ( jj == nblk ) THEN
538	CALL cpu_log( log_point_s(5), 'transpo forward', 'stop' )
539	ELSE
540	CALL cpu_log( log_point_s(5), 'transpo forward', 'pause' )
541	ENDIF
542
543	!
544	!-- Solve the tridiagonal equation system along z
545	CALL cpu_log( log_point_s(6), 'tridia', 'start', cpu_log_nowait )
546
547	n = isave(2) + jj
548	CALL tridia_substi_overlap( ar1(:,:,:), n )
549
550	CALL cpu_log( log_point_s(6), 'tridia', 'stop' )
551
552	!
553	!-- Inverse Fourier Transformation
554	!-- Transposition z --> y
555	!-- Only one thread should call MPI routines, therefore forward and
556	!-- backward tranpose are in the same section
557	IF ( jj == 0 ) THEN
558	CALL cpu_log( log_point_s(8), 'transpo invers', 'start', cpu_log_nowait )
559	ELSE
560	CALL cpu_log( log_point_s(8), 'transpo invers', 'continue', cpu_log_nowait )
561	ENDIF
562
563	CALL transpose_zy( ar1(:,:,:), f_in )
564
565	DO jnew = 0, ny-1
566	jind = jj + ( nblk + 1 ) * jnew
567	f_inv(:,:,jind) = f_in(:,:,jnew)
568	ENDDO
569
570	CALL cpu_log( log_point_s(8), 'transpo invers', 'pause' )
571
572	ENDDO
573	!
574	!-- Restore original indices/counters
575	ny = isave(1)
576	nys_z = isave(2)
577	nyn_z = isave(3)
578	sendrecvcount_yz = isave(4)
579
580	CALL resort_for_zy( f_inv, f_out_y )
581
582	DEALLOCATE( ar1, f_in, f_inv )
583
584	!
585	!-- fft along y backward + subsequent transposition y --> x
586	ALLOCATE( f_inv(nys_x:nyn_x,nzb_x:nzt_x,0:nx) )
587	!
588	!-- Save original indices and gridpoint counter
589	isave(1) = nx
590	isave(2) = nxl_y
591	isave(3) = nxr_y
592	isave(4) = sendrecvcount_xy
593	!
594	!-- Set new indices for transformation
595	nblk = (( nx+1 ) / pdims(2) ) - 1
596	nx = pdims(2)
597	nnx_y = 1
598	nxl_y = myidy * nnx_y
599	nxr_y = ( myidy + 1 ) * nnx_y - 1
600	sendrecvcount_xy = nnx_y * ( nyn_x-nys_x+1 ) * ( nzt_x-nzb_x+1 )
601
602	ALLOCATE( ar1(0:ny,nxl_y:nxr_y,nzb_y:nzt_y) )
603	ALLOCATE( f_in(nys_x:nyn_x,nzb_x:nzt_x,0:nx) )
604
605	DO ii = 0, nblk
606
607	CALL cpu_log( log_point_s(7), 'fft_y', 'continue', cpu_log_nowait )
608
609	n = isave(2) + ii
610	nxl_y_bound = isave(2)
611	nxr_y_bound = isave(3)
612
613	CALL fft_y( ar1(:,:,:), 'backward', ar_tr = f_out_y, &
614	nxl_y_bound = nxl_y_bound, nxr_y_bound = nxr_y_bound, &
615	nxl_y_l = n, nxr_y_l = n )
616
617	IF ( ii == nblk ) THEN
618	CALL cpu_log( log_point_s(7), 'fft_y', 'stop' )
619	ELSE
620	CALL cpu_log( log_point_s(7), 'fft_y', 'pause' )
621	ENDIF
622
623	CALL cpu_log( log_point_s(8), 'transpo invers', 'continue', cpu_log_nowait )
624
625	CALL transpose_yx( ar1(:,:,:), f_in )
626
627	DO inew = 0, nx-1
628	iind = ii + (nblk+1) * inew
629	f_inv(:,:,iind) = f_in(:,:,inew)
630	ENDDO
631
632	CALL cpu_log( log_point_s(8), 'transpo invers', 'pause' )
633
634	ENDDO
635	!
636	!-- Restore original indices/counters
637	nx = isave(1)
638	nxl_y = isave(2)
639	nxr_y = isave(3)
640	sendrecvcount_xy = isave(4)
641
642	CALL resort_for_yx( f_inv, f_out_z )
643
644	DEALLOCATE( ar1, f_in, f_inv )
645
646	!
647	!-- fft along x backward + subsequent final transposition x --> z
648	ALLOCATE( f_inv(nys:nyn,nxl:nxr,1:nz) )
649	!
650	!-- Save original indices and gridpoint counter
651	isave(1) = nz
652	isave(2) = nzb_x
653	isave(3) = nzt_x
654	isave(4) = sendrecvcount_zx
655	!
656	!-- Set new indices for transformation
657	nblk = nz / pdims(1)
658	nz = pdims(1)
659	nnz_x = 1
660	nzb_x = 1 + myidx * nnz_x
661	nzt_x = ( myidx + 1 ) * nnz_x
662	sendrecvcount_zx = nnx * nny * nnz_x
663
664	ALLOCATE( ar1(0:nx,nys_x:nyn_x,nzb_x:nzt_x) )
665	ALLOCATE( f_in(nys:nyn,nxl:nxr,1:nz) )
666
667	DO kk = 1, nblk
668
669	CALL cpu_log( log_point_s(4), 'fft_x', 'continue', cpu_log_nowait )
670
671	n = isave(2) + kk - 1
672	CALL fft_x( ar1(:,:,:), 'backward', f_out_z(:,:,n))
673
674	IF ( kk == nblk ) THEN
675	CALL cpu_log( log_point_s(4), 'fft_x', 'stop' )
676	ELSE
677	CALL cpu_log( log_point_s(4), 'fft_x', 'pause' )
678	ENDIF
679
680	CALL cpu_log( log_point_s(8), 'transpo invers', 'continue', cpu_log_nowait )
681
682	CALL transpose_xz( ar1(:,:,:), f_in )
683
684	DO knew = 1, nz
685	ki = kk + nblk * (knew-1)
686	f_inv(:,:,ki) = f_in(:,:,knew)
687	ENDDO
688
689	IF ( kk == nblk ) THEN
690	CALL cpu_log( log_point_s(8), 'transpo invers', 'stop' )
691	ELSE
692	CALL cpu_log( log_point_s(8), 'transpo invers', 'pause' )
693	ENDIF
694
695	ENDDO
696	!
697	!-- Restore original indices/counters
698	nz = isave(1)
699	nzb_x = isave(2)
700	nzt_x = isave(3)
701	sendrecvcount_zx = isave(4)
702
703	CALL resort_for_xz( f_inv, ar )
704
705	DEALLOCATE( ar1, f_in, f_inv )
706
707	ENDIF
708
709	CALL cpu_log( log_point_s(3), 'poisfft', 'stop' )
710
711	END SUBROUTINE poisfft
712
713
714	!------------------------------------------------------------------------------!
715	! Description:
716	! ------------
717	!> Fourier-transformation along y with subsequent transposition y --> x for
718	!> a 1d-decomposition along x.
719	!>
720	!> @attention The performance of this routine is much faster on the NEC-SX6,
721	!> if the first index of work_ffty_vec is odd. Otherwise
722	!> memory bank conflicts may occur (especially if the index is a
723	!> multiple of 128). That's why work_ffty_vec is dimensioned as
724	!> 0:ny+1.
725	!> Of course, this will not work if users are using an odd number
726	!> of gridpoints along y.
727	!------------------------------------------------------------------------------!
728	SUBROUTINE ffty_tr_yx( f_in, f_out )
729
730
731	USE control_parameters, &
732	ONLY: host
733
734	USE cpulog, &
735	ONLY: cpu_log, log_point_s
736
737	USE kinds
738
739	USE pegrid
740
741	IMPLICIT NONE
742
743	INTEGER(iwp) :: i !<
744	INTEGER(iwp) :: iend !<
745	INTEGER(iwp) :: iouter !<
746	INTEGER(iwp) :: ir !<
747	INTEGER(iwp) :: j !<
748	INTEGER(iwp) :: k !<
749
750	INTEGER(iwp), PARAMETER :: stridex = 4 !<
751
752	REAL(wp), DIMENSION(0:ny,stridex) :: work_ffty !<
753	#if defined( __nec )
754	REAL(wp), DIMENSION(0:ny+1,1:nz,nxl:nxr) :: work_ffty_vec !<
755	#endif
756	REAL(wp), DIMENSION(1:nz,0:ny,nxl:nxr) :: f_in !<
757	REAL(wp), DIMENSION(nnx,1:nz,nys_x:nyn_x,pdims(1)) :: f_out !<
758	REAL(wp), DIMENSION(nxl:nxr,1:nz,0:ny) :: work !<
759
760	!
761	!-- Carry out the FFT along y, where all data are present due to the
762	!-- 1d-decomposition along x. Resort the data in a way that x becomes
763	!-- the first index.
764	CALL cpu_log( log_point_s(7), 'fft_y_1d', 'start' )
765
766	IF ( host(1:3) == 'nec' ) THEN
767	#if defined( __nec )
768	!
769	!-- Code optimized for vector processors
770	!$OMP PARALLEL PRIVATE ( i, j, k )
771	!$OMP DO
772	DO i = nxl, nxr
773
774	DO j = 0, ny
775	DO k = 1, nz
776	work_ffty_vec(j,k,i) = f_in(k,j,i)
777	ENDDO
778	ENDDO
779
780	CALL fft_y_m( work_ffty_vec(:,:,i), ny+1, 'forward' )
781
782	ENDDO
783
784	!$OMP DO
785	DO k = 1, nz
786	DO j = 0, ny
787	DO i = nxl, nxr
788	work(i,k,j) = work_ffty_vec(j,k,i)
789	ENDDO
790	ENDDO
791	ENDDO
792	!$OMP END PARALLEL
793	#endif
794
795	ELSE
796
797	!
798	!-- Cache optimized code.
799	!-- The i-(x-)direction is split into a strided outer loop and an inner
800	!-- loop for better cache performance
801	!$OMP PARALLEL PRIVATE (i,iend,iouter,ir,j,k,work_ffty)
802	!$OMP DO
803	DO iouter = nxl, nxr, stridex
804
805	iend = MIN( iouter+stridex-1, nxr ) ! Upper bound for inner i loop
806
807	DO k = 1, nz
808
809	DO i = iouter, iend
810
811	ir = i-iouter+1 ! counter within a stride
812	DO j = 0, ny
813	work_ffty(j,ir) = f_in(k,j,i)
814	ENDDO
815	!
816	!-- FFT along y
817	CALL fft_y_1d( work_ffty(:,ir), 'forward' )
818
819	ENDDO
820
821	!
822	!-- Resort
823	DO j = 0, ny
824	DO i = iouter, iend
825	work(i,k,j) = work_ffty(j,i-iouter+1)
826	ENDDO
827	ENDDO
828
829	ENDDO
830
831	ENDDO
832	!$OMP END PARALLEL
833
834	ENDIF
835	CALL cpu_log( log_point_s(7), 'fft_y_1d', 'pause' )
836
837	!
838	!-- Transpose array
839	#if defined( __parallel )
840	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
841	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
842	CALL MPI_ALLTOALL( work(nxl,1,0), sendrecvcount_xy, MPI_REAL, &
843	f_out(1,1,nys_x,1), sendrecvcount_xy, MPI_REAL, &
844	comm1dx, ierr )
845	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
846	#endif
847
848	END SUBROUTINE ffty_tr_yx
849
850
851	!------------------------------------------------------------------------------!
852	! Description:
853	! ------------
854	!> Transposition x --> y with a subsequent backward Fourier transformation for
855	!> a 1d-decomposition along x
856	!------------------------------------------------------------------------------!
857	SUBROUTINE tr_xy_ffty( f_in, f_out )
858
859
860	USE control_parameters, &
861	ONLY: host
862
863	USE cpulog, &
864	ONLY: cpu_log, log_point_s
865
866	USE kinds
867
868	USE pegrid
869
870	IMPLICIT NONE
871
872	INTEGER(iwp) :: i !<
873	INTEGER(iwp) :: iend !<
874	INTEGER(iwp) :: iouter !<
875	INTEGER(iwp) :: ir !<
876	INTEGER(iwp) :: j !<
877	INTEGER(iwp) :: k !<
878
879	INTEGER(iwp), PARAMETER :: stridex = 4 !<
880
881	REAL(wp), DIMENSION(0:ny,stridex) :: work_ffty !<
882	#if defined( __nec )
883	REAL(wp), DIMENSION(0:ny+1,1:nz,nxl:nxr) :: work_ffty_vec !<
884	#endif
885	REAL(wp), DIMENSION(nnx,1:nz,nys_x:nyn_x,pdims(1)) :: f_in !<
886	REAL(wp), DIMENSION(1:nz,0:ny,nxl:nxr) :: f_out !<
887	REAL(wp), DIMENSION(nxl:nxr,1:nz,0:ny) :: work !<
888
889	!
890	!-- Transpose array
891	#if defined( __parallel )
892	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
893	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
894	CALL MPI_ALLTOALL( f_in(1,1,nys_x,1), sendrecvcount_xy, MPI_REAL, &
895	work(nxl,1,0), sendrecvcount_xy, MPI_REAL, &
896	comm1dx, ierr )
897	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
898	#endif
899
900	!
901	!-- Resort the data in a way that y becomes the first index and carry out the
902	!-- backward fft along y.
903	CALL cpu_log( log_point_s(7), 'fft_y_1d', 'continue' )
904
905	IF ( host(1:3) == 'nec' ) THEN
906	#if defined( __nec )
907	!
908	!-- Code optimized for vector processors
909	!$OMP PARALLEL PRIVATE ( i, j, k )
910	!$OMP DO
911	DO k = 1, nz
912	DO j = 0, ny
913	DO i = nxl, nxr
914	work_ffty_vec(j,k,i) = work(i,k,j)
915	ENDDO
916	ENDDO
917	ENDDO
918
919	!$OMP DO
920	DO i = nxl, nxr
921
922	CALL fft_y_m( work_ffty_vec(:,:,i), ny+1, 'backward' )
923
924	DO j = 0, ny
925	DO k = 1, nz
926	f_out(k,j,i) = work_ffty_vec(j,k,i)
927	ENDDO
928	ENDDO
929
930	ENDDO
931	!$OMP END PARALLEL
932	#endif
933
934	ELSE
935
936	!
937	!-- Cache optimized code.
938	!-- The i-(x-)direction is split into a strided outer loop and an inner
939	!-- loop for better cache performance
940	!$OMP PARALLEL PRIVATE ( i, iend, iouter, ir, j, k, work_ffty )
941	!$OMP DO
942	DO iouter = nxl, nxr, stridex
943
944	iend = MIN( iouter+stridex-1, nxr ) ! Upper bound for inner i loop
945
946	DO k = 1, nz
947	!
948	!-- Resort
949	DO j = 0, ny
950	DO i = iouter, iend
951	work_ffty(j,i-iouter+1) = work(i,k,j)
952	ENDDO
953	ENDDO
954
955	DO i = iouter, iend
956
957	!
958	!-- FFT along y
959	ir = i-iouter+1 ! counter within a stride
960	CALL fft_y_1d( work_ffty(:,ir), 'backward' )
961
962	DO j = 0, ny
963	f_out(k,j,i) = work_ffty(j,ir)
964	ENDDO
965	ENDDO
966
967	ENDDO
968
969	ENDDO
970	!$OMP END PARALLEL
971
972	ENDIF
973
974	CALL cpu_log( log_point_s(7), 'fft_y_1d', 'stop' )
975
976	END SUBROUTINE tr_xy_ffty
977
978
979	!------------------------------------------------------------------------------!
980	! Description:
981	! ------------
982	!> FFT along x, solution of the tridiagonal system and backward FFT for
983	!> a 1d-decomposition along x
984	!>
985	!> @warning this subroutine may still not work for hybrid parallelization
986	!> with OpenMP (for possible necessary changes see the original
987	!> routine poisfft_hybrid, developed by Klaus Ketelsen, May 2002)
988	!------------------------------------------------------------------------------!
989	SUBROUTINE fftx_tri_fftx( ar )
990
991
992	USE control_parameters, &
993	ONLY: host
994
995	USE cpulog, &
996	ONLY: cpu_log, log_point_s
997
998	USE grid_variables, &
999	ONLY: ddx2, ddy2
1000
1001	USE kinds
1002
1003	USE pegrid
1004
1005	IMPLICIT NONE
1006
1007	INTEGER(iwp) :: i !<
1008	INTEGER(iwp) :: j !<
1009	INTEGER(iwp) :: k !<
1010	INTEGER(iwp) :: m !<
1011	INTEGER(iwp) :: n !<
1012	INTEGER(iwp) :: omp_get_thread_num !<
1013	INTEGER(iwp) :: tn !<
1014
1015	REAL(wp), DIMENSION(0:nx) :: work_fftx !<
1016	REAL(wp), DIMENSION(0:nx,1:nz) :: work_trix !<
1017	REAL(wp), DIMENSION(nnx,1:nz,nys_x:nyn_x,pdims(1)) :: ar !<
1018	REAL(wp), DIMENSION(:,:,:,:), ALLOCATABLE :: tri !<
1019
1020
1021	CALL cpu_log( log_point_s(33), 'fft_x_1d + tridia', 'start' )
1022
1023	ALLOCATE( tri(5,0:nx,0:nz-1,0:threads_per_task-1) )
1024
1025	tn = 0 ! Default thread number in case of one thread
1026	!$OMP PARALLEL DO PRIVATE ( i, j, k, m, n, tn, work_fftx, work_trix )
1027	DO j = nys_x, nyn_x
1028
1029	!$ tn = omp_get_thread_num()
1030
1031	IF ( host(1:3) == 'nec' ) THEN
1032	!
1033	!-- Code optimized for vector processors
1034	DO k = 1, nz
1035
1036	m = 0
1037	DO n = 1, pdims(1)
1038	DO i = 1, nnx
1039	work_trix(m,k) = ar(i,k,j,n)
1040	m = m + 1
1041	ENDDO
1042	ENDDO
1043
1044	ENDDO
1045
1046	CALL fft_x_m( work_trix, 'forward' )
1047
1048	ELSE
1049	!
1050	!-- Cache optimized code
1051	DO k = 1, nz
1052
1053	m = 0
1054	DO n = 1, pdims(1)
1055	DO i = 1, nnx
1056	work_fftx(m) = ar(i,k,j,n)
1057	m = m + 1
1058	ENDDO
1059	ENDDO
1060
1061	CALL fft_x_1d( work_fftx, 'forward' )
1062
1063	DO i = 0, nx
1064	work_trix(i,k) = work_fftx(i)
1065	ENDDO
1066
1067	ENDDO
1068
1069	ENDIF
1070
1071	!
1072	!-- Solve the linear equation system
1073	CALL tridia_1dd( ddx2, ddy2, nx, ny, j, work_trix, tri(:,:,:,tn) )
1074
1075	IF ( host(1:3) == 'nec' ) THEN
1076	!
1077	!-- Code optimized for vector processors
1078	CALL fft_x_m( work_trix, 'backward' )
1079
1080	DO k = 1, nz
1081
1082	m = 0
1083	DO n = 1, pdims(1)
1084	DO i = 1, nnx
1085	ar(i,k,j,n) = work_trix(m,k)
1086	m = m + 1
1087	ENDDO
1088	ENDDO
1089
1090	ENDDO
1091
1092	ELSE
1093	!
1094	!-- Cache optimized code
1095	DO k = 1, nz
1096
1097	DO i = 0, nx
1098	work_fftx(i) = work_trix(i,k)
1099	ENDDO
1100
1101	CALL fft_x_1d( work_fftx, 'backward' )
1102
1103	m = 0
1104	DO n = 1, pdims(1)
1105	DO i = 1, nnx
1106	ar(i,k,j,n) = work_fftx(m)
1107	m = m + 1
1108	ENDDO
1109	ENDDO
1110
1111	ENDDO
1112
1113	ENDIF
1114
1115	ENDDO
1116
1117	DEALLOCATE( tri )
1118
1119	CALL cpu_log( log_point_s(33), 'fft_x_1d + tridia', 'stop' )
1120
1121	END SUBROUTINE fftx_tri_fftx
1122
1123
1124	!------------------------------------------------------------------------------!
1125	! Description:
1126	! ------------
1127	!> Fourier-transformation along x with subsequent transposition x --> y for
1128	!> a 1d-decomposition along y.
1129	!>
1130	!> @attention NEC-branch of this routine may significantly profit from
1131	!> further optimizations. So far, performance is much worse than
1132	!> for routine ffty_tr_yx (more than three times slower).
1133	!------------------------------------------------------------------------------!
1134	SUBROUTINE fftx_tr_xy( f_in, f_out )
1135
1136
1137	USE control_parameters, &
1138	ONLY: host
1139
1140	USE cpulog, &
1141	ONLY: cpu_log, log_point_s
1142
1143	USE kinds
1144
1145	USE pegrid
1146
1147	IMPLICIT NONE
1148
1149	INTEGER(iwp) :: i !<
1150	INTEGER(iwp) :: j !<
1151	INTEGER(iwp) :: k !<
1152
1153	REAL(wp), DIMENSION(0:nx,1:nz,nys:nyn) :: work_fftx !<
1154	REAL(wp), DIMENSION(1:nz,nys:nyn,0:nx) :: f_in !<
1155	REAL(wp), DIMENSION(nny,1:nz,nxl_y:nxr_y,pdims(2)) :: f_out !<
1156	REAL(wp), DIMENSION(nys:nyn,1:nz,0:nx) :: work !<
1157
1158	!
1159	!-- Carry out the FFT along x, where all data are present due to the
1160	!-- 1d-decomposition along y. Resort the data in a way that y becomes
1161	!-- the first index.
1162	CALL cpu_log( log_point_s(4), 'fft_x_1d', 'start' )
1163
1164	IF ( host(1:3) == 'nec' ) THEN
1165	!
1166	!-- Code for vector processors
1167	!$OMP PARALLEL PRIVATE ( i, j, k )
1168	!$OMP DO
1169	DO i = 0, nx
1170
1171	DO j = nys, nyn
1172	DO k = 1, nz
1173	work_fftx(i,k,j) = f_in(k,j,i)
1174	ENDDO
1175	ENDDO
1176
1177	ENDDO
1178
1179	!$OMP DO
1180	DO j = nys, nyn
1181
1182	CALL fft_x_m( work_fftx(:,:,j), 'forward' )
1183
1184	DO k = 1, nz
1185	DO i = 0, nx
1186	work(j,k,i) = work_fftx(i,k,j)
1187	ENDDO
1188	ENDDO
1189
1190	ENDDO
1191	!$OMP END PARALLEL
1192
1193	ELSE
1194
1195	!
1196	!-- Cache optimized code (there might be still a potential for better
1197	!-- optimization).
1198	!$OMP PARALLEL PRIVATE (i,j,k)
1199	!$OMP DO
1200	DO i = 0, nx
1201
1202	DO j = nys, nyn
1203	DO k = 1, nz
1204	work_fftx(i,k,j) = f_in(k,j,i)
1205	ENDDO
1206	ENDDO
1207
1208	ENDDO
1209
1210	!$OMP DO
1211	DO j = nys, nyn
1212	DO k = 1, nz
1213
1214	CALL fft_x_1d( work_fftx(0:nx,k,j), 'forward' )
1215
1216	DO i = 0, nx
1217	work(j,k,i) = work_fftx(i,k,j)
1218	ENDDO
1219	ENDDO
1220
1221	ENDDO
1222	!$OMP END PARALLEL
1223
1224	ENDIF
1225	CALL cpu_log( log_point_s(4), 'fft_x_1d', 'pause' )
1226
1227	!
1228	!-- Transpose array
1229	#if defined( __parallel )
1230	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
1231	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
1232	CALL MPI_ALLTOALL( work(nys,1,0), sendrecvcount_xy, MPI_REAL, &
1233	f_out(1,1,nxl_y,1), sendrecvcount_xy, MPI_REAL, &
1234	comm1dy, ierr )
1235	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
1236	#endif
1237
1238	END SUBROUTINE fftx_tr_xy
1239
1240
1241	!------------------------------------------------------------------------------!
1242	! Description:
1243	! ------------
1244	!> Transposition y --> x with a subsequent backward Fourier transformation for
1245	!> a 1d-decomposition along x.
1246	!------------------------------------------------------------------------------!
1247	SUBROUTINE tr_yx_fftx( f_in, f_out )
1248
1249
1250	USE control_parameters, &
1251	ONLY: host
1252
1253	USE cpulog, &
1254	ONLY: cpu_log, log_point_s
1255
1256	USE kinds
1257
1258	USE pegrid
1259
1260	IMPLICIT NONE
1261
1262	INTEGER(iwp) :: i !<
1263	INTEGER(iwp) :: j !<
1264	INTEGER(iwp) :: k !<
1265
1266	REAL(wp), DIMENSION(0:nx,1:nz,nys:nyn) :: work_fftx !<
1267	REAL(wp), DIMENSION(nny,1:nz,nxl_y:nxr_y,pdims(2)) :: f_in !<
1268	REAL(wp), DIMENSION(1:nz,nys:nyn,0:nx) :: f_out !<
1269	REAL(wp), DIMENSION(nys:nyn,1:nz,0:nx) :: work !<
1270
1271	!
1272	!-- Transpose array
1273	#if defined( __parallel )
1274	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
1275	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
1276	CALL MPI_ALLTOALL( f_in(1,1,nxl_y,1), sendrecvcount_xy, MPI_REAL, &
1277	work(nys,1,0), sendrecvcount_xy, MPI_REAL, &
1278	comm1dy, ierr )
1279	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
1280	#endif
1281
1282	!
1283	!-- Carry out the FFT along x, where all data are present due to the
1284	!-- 1d-decomposition along y. Resort the data in a way that y becomes
1285	!-- the first index.
1286	CALL cpu_log( log_point_s(4), 'fft_x_1d', 'continue' )
1287
1288	IF ( host(1:3) == 'nec' ) THEN
1289	!
1290	!-- Code optimized for vector processors
1291	!$OMP PARALLEL PRIVATE ( i, j, k )
1292	!$OMP DO
1293	DO j = nys, nyn
1294
1295	DO k = 1, nz
1296	DO i = 0, nx
1297	work_fftx(i,k,j) = work(j,k,i)
1298	ENDDO
1299	ENDDO
1300
1301	CALL fft_x_m( work_fftx(:,:,j), 'backward' )
1302
1303	ENDDO
1304
1305	!$OMP DO
1306	DO i = 0, nx
1307	DO j = nys, nyn
1308	DO k = 1, nz
1309	f_out(k,j,i) = work_fftx(i,k,j)
1310	ENDDO
1311	ENDDO
1312	ENDDO
1313	!$OMP END PARALLEL
1314
1315	ELSE
1316
1317	!
1318	!-- Cache optimized code (there might be still a potential for better
1319	!-- optimization).
1320	!$OMP PARALLEL PRIVATE (i,j,k)
1321	!$OMP DO
1322	DO j = nys, nyn
1323	DO k = 1, nz
1324
1325	DO i = 0, nx
1326	work_fftx(i,k,j) = work(j,k,i)
1327	ENDDO
1328
1329	CALL fft_x_1d( work_fftx(0:nx,k,j), 'backward' )
1330
1331	ENDDO
1332	ENDDO
1333
1334	!$OMP DO
1335	DO i = 0, nx
1336	DO j = nys, nyn
1337	DO k = 1, nz
1338	f_out(k,j,i) = work_fftx(i,k,j)
1339	ENDDO
1340	ENDDO
1341	ENDDO
1342	!$OMP END PARALLEL
1343
1344	ENDIF
1345	CALL cpu_log( log_point_s(4), 'fft_x_1d', 'stop' )
1346
1347	END SUBROUTINE tr_yx_fftx
1348
1349
1350	!------------------------------------------------------------------------------!
1351	! Description:
1352	! ------------
1353	!> FFT along y, solution of the tridiagonal system and backward FFT for
1354	!> a 1d-decomposition along y.
1355	!>
1356	!> @warning this subroutine may still not work for hybrid parallelization
1357	!> with OpenMP (for possible necessary changes see the original
1358	!> routine poisfft_hybrid, developed by Klaus Ketelsen, May 2002)
1359	!------------------------------------------------------------------------------!
1360	SUBROUTINE ffty_tri_ffty( ar )
1361
1362
1363	USE control_parameters, &
1364	ONLY: host
1365
1366	USE cpulog, &
1367	ONLY: cpu_log, log_point_s
1368
1369	USE grid_variables, &
1370	ONLY: ddx2, ddy2
1371
1372	USE kinds
1373
1374	USE pegrid
1375
1376	IMPLICIT NONE
1377
1378	INTEGER(iwp) :: i !<
1379	INTEGER(iwp) :: j !<
1380	INTEGER(iwp) :: k !<
1381	INTEGER(iwp) :: m !<
1382	INTEGER(iwp) :: n !<
1383	INTEGER(iwp) :: omp_get_thread_num !<
1384	INTEGER(iwp) :: tn !<
1385
1386	REAL(wp), DIMENSION(0:ny) :: work_ffty !<
1387	REAL(wp), DIMENSION(0:ny,1:nz) :: work_triy !<
1388	REAL(wp), DIMENSION(nny,1:nz,nxl_y:nxr_y,pdims(2)) :: ar !<
1389	REAL(wp), DIMENSION(:,:,:,:), ALLOCATABLE :: tri !<
1390
1391
1392	CALL cpu_log( log_point_s(39), 'fft_y_1d + tridia', 'start' )
1393
1394	ALLOCATE( tri(5,0:ny,0:nz-1,0:threads_per_task-1) )
1395
1396	tn = 0 ! Default thread number in case of one thread
1397	!$OMP PARALLEL DO PRIVATE ( i, j, k, m, n, tn, work_ffty, work_triy )
1398	DO i = nxl_y, nxr_y
1399
1400	!$ tn = omp_get_thread_num()
1401
1402	IF ( host(1:3) == 'nec' ) THEN
1403	!
1404	!-- Code optimized for vector processors
1405	DO k = 1, nz
1406
1407	m = 0
1408	DO n = 1, pdims(2)
1409	DO j = 1, nny
1410	work_triy(m,k) = ar(j,k,i,n)
1411	m = m + 1
1412	ENDDO
1413	ENDDO
1414
1415	ENDDO
1416
1417	CALL fft_y_m( work_triy, ny, 'forward' )
1418
1419	ELSE
1420	!
1421	!-- Cache optimized code
1422	DO k = 1, nz
1423
1424	m = 0
1425	DO n = 1, pdims(2)
1426	DO j = 1, nny
1427	work_ffty(m) = ar(j,k,i,n)
1428	m = m + 1
1429	ENDDO
1430	ENDDO
1431
1432	CALL fft_y_1d( work_ffty, 'forward' )
1433
1434	DO j = 0, ny
1435	work_triy(j,k) = work_ffty(j)
1436	ENDDO
1437
1438	ENDDO
1439
1440	ENDIF
1441
1442	!
1443	!-- Solve the linear equation system
1444	CALL tridia_1dd( ddy2, ddx2, ny, nx, i, work_triy, tri(:,:,:,tn) )
1445
1446	IF ( host(1:3) == 'nec' ) THEN
1447	!
1448	!-- Code optimized for vector processors
1449	CALL fft_y_m( work_triy, ny, 'backward' )
1450
1451	DO k = 1, nz
1452
1453	m = 0
1454	DO n = 1, pdims(2)
1455	DO j = 1, nny
1456	ar(j,k,i,n) = work_triy(m,k)
1457	m = m + 1
1458	ENDDO
1459	ENDDO
1460
1461	ENDDO
1462
1463	ELSE
1464	!
1465	!-- Cache optimized code
1466	DO k = 1, nz
1467
1468	DO j = 0, ny
1469	work_ffty(j) = work_triy(j,k)
1470	ENDDO
1471
1472	CALL fft_y_1d( work_ffty, 'backward' )
1473
1474	m = 0
1475	DO n = 1, pdims(2)
1476	DO j = 1, nny
1477	ar(j,k,i,n) = work_ffty(m)
1478	m = m + 1
1479	ENDDO
1480	ENDDO
1481
1482	ENDDO
1483
1484	ENDIF
1485
1486	ENDDO
1487
1488	DEALLOCATE( tri )
1489
1490	CALL cpu_log( log_point_s(39), 'fft_y_1d + tridia', 'stop' )
1491
1492	END SUBROUTINE ffty_tri_ffty
1493
1494	END MODULE poisfft_mod

Note: See TracBrowser for help on using the repository browser.

Download in other formats:

| Impressum | ©Leibniz Universität Hannover |